#if !defined(lint) && !defined(__lint)
/  ARCFOUR implementation optimized for AMD64.
/
/  Author: Marc Bevand <bevand_m (at) epita.fr>
/  Licence: I hereby disclaim the copyright on this code and place it
/  in the public domain.
/
/  The code has been designed to be easily integrated into openssl:
/  the exported RC4() function can replace the implementations openssl
/  already contains. Please note that when linking with openssl, it
/  requires that sizeof(RC4_INT) == 8, so openssl must be compiled
/  with -DRC4_INT='unsigned long' (see the layout sketch below).
/
/  The throughput achieved by this code is about 320 MBytes/sec, on
/  a 1.8 GHz AMD Opteron (rev C0) processor.

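/  The sizeof(RC4_INT) == 8 requirement exists because the loops below
/  index the key's S-box with a scale factor of 8, i.e. one S-box entry
/  per 64-bit word.  As a rough sketch of the OpenSSL-side layout this
/  assumes (based on openssl's rc4.h with RC4_INT redefined; nothing in
/  this file defines it), the key structure would look like:
/
/	typedef unsigned long RC4_INT;		/* forced via -DRC4_INT=... */
/
/	typedef struct rc4_key_st {
/		RC4_INT x, y;			/* stream indices i and j   */
/		RC4_INT data[256];		/* S-box, one byte stored   */
/						/* per 8-byte element       */
/	} RC4_KEY;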


/ ***** BEGIN LICENSE BLOCK *****
/ Version: MPL 1.1/GPL 2.0/LGPL 2.1
/
/ The contents of this file are subject to the Mozilla Public License Version
/ 1.1 (the "License"); you may not use this file except in compliance with
/ the License. You may obtain a copy of the License at
/ http://www.mozilla.org/MPL/
/
/ Software distributed under the License is distributed on an "AS IS" basis,
/ WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
/ for the specific language governing rights and limitations under the
/ License.
/
/ The Original Code is "Marc Bevand's fast AMD64 ARCFOUR source"
/
/ The Initial Developer of the Original Code is
/ Marc Bevand <bevand_m@epita.fr> .
/ Portions created by the Initial Developer are
/ Copyright (C) 2004 the Initial Developer. All Rights Reserved.
/
/ Contributor(s):
/
/ Alternatively, the contents of this file may be used under the terms of
/ either the GNU General Public License Version 2 or later (the "GPL"), or
/ the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
/ in which case the provisions of the GPL or the LGPL are applicable instead
/ of those above. If you wish to allow use of your version of this file only
/ under the terms of either the GPL or the LGPL, and not to allow others to
/ use your version of this file under the terms of the MPL, indicate your
/ decision by deleting the provisions above and replace them with the notice
/ and other provisions required by the GPL or the LGPL. If you do not delete
/ the provisions above, a recipient may use your version of this file under
/ the terms of any one of the MPL, the GPL or the LGPL.
/
/ ***** END LICENSE BLOCK *****

	.ident	"@(#)arcfour_crypt_amd64.s	1.1	08/01/02 SMI"

/
/ void arcfour_crypt(ARCFour_key *key, uchar_t *in,
/		uchar_t *out, size_t len);
/
/ The following is Marc Bevand's RC4 implementation optimized for
/ AMD64.  It has been lifted intact, except for minor interface
/ changes to get along with Solaris crypto common code (the parameter
/ order and the key struct element order are both different).
/ This function works for both aligned and unaligned data ('in' and 'out').
/ The key and key elements must be aligned.
/
/ Register Usage
/ rax		data[x] (aka tx)
/ rbx		ARG(len), then ty/val scratch inside the loops
/ rcx		key->i (aka x)
/ rdx		key->j (aka y)
/ rsi		ARG(in)
/ rdi		ARG(out)
/ rbp		key->arr (aka data or d)
/ rsp		stack
/ r8		8 bytes of rc4 stream
/ r9		in+len-8, then in+len (loop bounds)
/ r11		byte counter for the 8-byte inner loop
/ r10, r12-r15	unused
/

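/ The constants 2048 and 2056 used below come straight from the key
/ layout: the S-box is 256 entries of 8 bytes each (256 * 8 = 2048),
/ with i and j stored as 8-byte words immediately after it.  A minimal
/ C sketch of a layout consistent with those offsets (an illustration
/ only, not the authoritative ARCFour_key definition from the arcfour
/ header):
/
/	typedef struct {
/		uint64_t arr[256];	/* S-box; bytes 0..2047 */
/		uint64_t i;		/* index x; offset 2048 */
/		uint64_t j;		/* index y; offset 2056 */
/	} ARCFour_key;
/
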
#include <sys/asm_linkage.h>


	ENTRY_NP(arcfour_crypt)
	/* EXPORT DELETE START */
					/ load parameters
	push	%rbp
	push	%rbx
	mov	%rdi,		%rbp	/ rbp = ARG(key)
					/ rsi = ARG(in)
	mov	%rdx,		%rdi	/ rdi = ARG(out)
	mov	%rcx,		%rbx	/ rbx = ARG(len)

					/ load key indices and key
	mov	2048(%rbp),	%rcx	/ rcx x = key->i
	mov	2056(%rbp),	%rdx	/ rdx y = key->j
					/ rbp d = key->arr
	inc	%rcx			/ x++
	and	$255,		%rcx	/ x &= 0xff
	lea	-8(%rbx,%rsi),	%rbx	/ rbx = in+len-8
	mov	%rbx,		%r9	/ tmp = in+len-8
	mov	(%rbp,%rcx,8),	%rax	/ tx = d[x]
	cmp	%rsi,		%rbx	/ cmp in with in+len-8
	jl	.Lend			/ jump if (in+len-8 < in)

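/ The loop below is the standard RC4 keystream step, unrolled to build
/ 8 bytes of stream in %r8 before a single 8-byte xor with the input.
/ A minimal C sketch of the per-byte step it implements (names follow
/ the register comments above; this is an illustration, not the Solaris
/ C fallback code):
/
/	x = (x + 1) & 0xff;			/* advance i */
/	tx = d[x];
/	y = (y + tx) & 0xff;			/* j += S[i] */
/	ty = d[y];
/	d[x] = ty; d[y] = tx;			/* swap S[i], S[j] */
/	*out++ = *in++ ^ d[(tx + ty) & 0xff];
/
/ The assembly hoists the "x++; tx = d[x]" of the next byte to the end
/ of the current iteration, and defers the xor/store so it can be done
/ 8 bytes at a time.
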
.Lstart:
	add	$8,		%rsi		/ increment in
	add	$8,		%rdi		/ increment out

	/ generate the next 8 bytes of the rc4 stream into %r8
	mov	$8,		%r11		/ byte counter
1:	add	%al,		%dl		/ y += tx
	mov	(%rbp,%rdx,8),	%ebx		/ ty = d[y]
	mov	%ebx,		(%rbp,%rcx,8)	/ d[x] = ty
	add	%al,		%bl		/ val = ty + tx
	mov	%eax,		(%rbp,%rdx,8)	/ d[y] = tx
	inc	%cl				/ x++		(NEXT ROUND)
	mov	(%rbp,%rcx,8),	%eax		/ tx = d[x]	(NEXT ROUND)
	movb	(%rbp,%rbx,8),	%r8b		/ val = d[val]
	dec	%r11b
	ror	$8,		%r8		/ (ror does not change ZF)
	jnz	1b

	/ xor 8 bytes
	xor	-8(%rsi),	%r8
	cmp	%r9,		%rsi		/ cmp in+len-8 with in
	mov	%r8,		-8(%rdi)
	jle	.Lstart				/ jump if (in <= in+len-8)

.Lend:
	add	$8,		%r9		/ tmp = in+len

	/ handle the last bytes, one by one
1:	cmp	%rsi,		%r9		/ cmp in with in+len
	jle	.Lfinished			/ jump if (in+len <= in)
	add	%al,		%dl		/ y += tx
	mov	(%rbp,%rdx,8),	%ebx		/ ty = d[y]
	mov	%ebx,		(%rbp,%rcx,8)	/ d[x] = ty
	add	%al,		%bl		/ val = ty + tx
	mov	%eax,		(%rbp,%rdx,8)	/ d[y] = tx
	inc	%cl				/ x++		(NEXT ROUND)
	mov	(%rbp,%rcx,8),	%eax		/ tx = d[x]	(NEXT ROUND)
	movb	(%rbp,%rbx,8),	%r8b		/ val = d[val]
	xor	(%rsi),		%r8b		/ xor 1 byte
	movb	%r8b,		(%rdi)
	inc	%rsi				/ in++
	inc	%rdi				/ out++
	jmp	1b

.Lfinished:					/ save key indices i & j
	dec	%rcx				/ x--
	movb	%dl,		2056(%rbp)	/ key->j = y
	movb	%cl,		2048(%rbp)	/ key->i = x
	pop	%rbx
	pop	%rbp

	/* EXPORT DELETE END */

	ret
	SET_SIZE(arcfour_crypt)

#else
	/* LINTED */
	/* Nothing to be linted in this file--it's pure assembly source. */
#endif	/* !lint && !__lint */