1 #if !defined(lint) && !defined(__lint) 2 / ARCFOUR implementation optimized for AMD64. 3 / 4 / Author: Marc Bevand <bevand_m (at) epita.fr> 5 / Licence: I hereby disclaim the copyright on this code and place it 6 / in the public domain. 7 / 8 / The code has been designed to be easily integrated into openssl: 9 / the exported RC4() function can replace the actual implementations 10 / openssl already contains. Please note that when linking with openssl, 11 / it requires that sizeof(RC4_INT) == 8. So openssl must be compiled 12 / with -DRC4_INT='unsigned long'. 13 / 14 / The throughput achieved by this code is about 320 MBytes/sec, on 15 / a 1.8 GHz AMD Opteron (rev C0) processor. 16 17 18 / ***** BEGIN LICENSE BLOCK ***** 19 / Version: MPL 1.1/GPL 2.0/LGPL 2.1 20 / 21 / The contents of this file are subject to the Mozilla Public License Version 22 / 1.1 (the "License"); you may not use this file except in compliance with 23 / the License. You may obtain a copy of the License at 24 / http://www.mozilla.org/MPL/ 25 / 26 / Software distributed under the License is distributed on an "AS IS" basis, 27 / WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 28 / for the specific language governing rights and limitations under the 29 / License. 30 / 31 / The Original Code is "Marc Bevand's fast AMD64 ARCFOUR source" 32 / 33 / The Initial Developer of the Original Code is 34 / Marc Bevand <bevand_m@epita.fr> . 35 / Portions created by the Initial Developer are 36 / Copyright (C) 2004 the Initial Developer. All Rights Reserved. 37 / 38 / Contributor(s): 39 / 40 / Alternatively, the contents of this file may be used under the terms of 41 / either the GNU General Public License Version 2 or later (the "GPL"), or 42 / the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 43 / in which case the provisions of the GPL or the LGPL are applicable instead 44 / of those above. 
If you wish to allow use of your version of this file only 45 / under the terms of either the GPL or the LGPL, and not to allow others to 46 / use your version of this file under the terms of the MPL, indicate your 47 / decision by deleting the provisions above and replace them with the notice 48 / and other provisions required by the GPL or the LGPL. If you do not delete 49 / the provisions above, a recipient may use your version of this file under 50 / the terms of any one of the MPL, the GPL or the LGPL. 51 / 52 / ***** END LICENSE BLOCK ***** 53 54 .ident "@(#)arcfour_crypt_amd64.s 1.1 08/01/02 SMI" 55 56 / 57 / void arcfour_crypt(ARCFour_key *key, uchar_t *in, 58 / uchar_t *out, size_t len); 59 / 60 / The following is Marc Bevand's RC4 implementation optimized for 61 / AMD64. It has been lifted intact, except for minor interface 62 / changes to get along with Solaris crypto common code (the parameter 63 / order and the key struct element order are both different). 64 / This function works for both aligned and unaligned data ('in' and 'out'). 65 / The key and key elements must be aligned. 
/
/ Register Usage
/ rax		tx = data[x]; only al/eax are touched inside the loops
/ rbx		ARG(len) on entry, then in+len-8, then ty/val scratch
/ rcx		key->i (aka x)
/ rdx		ARG(out) on entry, then key->j (aka y)
/ rsi		ARG(in), advanced as input is consumed
/ rdi		ARG(key) on entry, then ARG(out), advanced with rsi
/ rbp		key->arr (aka data or d)
/ rsp		stack
/ r8		8 bytes of rc4 stream
/ r9		in+len-8 while whole 8-byte blocks remain, then in+len
/ r11		byte counter for the 8-byte keystream loop
/ r10, r12-r15	unused
/

#include <sys/asm_linkage.h>


	ENTRY_NP(arcfour_crypt)
	/* EXPORT DELETE START */
	/ Preserve the two callee-saved registers used below, then move
	/ the incoming arguments (rdi, rsi, rdx, rcx) into their working
	/ registers.
	pushq	%rbp
	pushq	%rbx
	movq	%rdi, %rbp		/ rbp = ARG(key)
					/ rsi already holds ARG(in)
	movq	%rdx, %rdi		/ rdi = ARG(out)
	movq	%rcx, %rbx		/ rbx = ARG(len)

	/ Fetch the saved stream indices.  The 256 eight-byte table
	/ entries start at offset 0 of the key, so i and j sit at
	/ offsets 2048 and 2056.
	movq	2048(%rbp), %rcx	/ x = key->i
	movq	2056(%rbp), %rdx	/ y = key->j
					/ d = key->arr, i.e. rbp itself
	incq	%rcx			/ x++
	andq	$0xff, %rcx		/ x &= 0xff
	leaq	-8(%rbx,%rsi), %rbx	/ rbx = in+len-8
	movq	%rbx, %r9		/ r9 = in+len-8 (rbx is reused below)
	movq	(%rbp,%rcx,8), %rax	/ tx = d[x], preloaded for round one
	cmpq	%rsi, %rbx		/ compare in+len-8 against in
	jl	.Ltail			/ fewer than 8 bytes: skip block loop

	/ Main loop: consume the input eight bytes at a time.
.Lblock8:
	addq	$8, %rsi		/ in += 8 (block is now at -8(in))
	addq	$8, %rdi		/ out += 8

	/ Build the next 8 keystream bytes in r8.  Each round drops one
	/ byte into r8b and rotates right by 8, so after eight rounds the
	/ first byte produced sits in the lowest byte of r8.
	movq	$8, %r11		/ rounds left
.Lkeystream_byte:
	addb	%al, %dl		/ y += tx
	movl	(%rbp,%rdx,8), %ebx	/ ty = d[y]
	movl	%ebx, (%rbp,%rcx,8)	/ d[x] = ty
	addb	%al, %bl		/ val = ty + tx
	movl	%eax, (%rbp,%rdx,8)	/ d[y] = tx
	incb	%cl			/ x++ for the NEXT round
	movl	(%rbp,%rcx,8), %eax	/ tx = d[x] for the NEXT round
	movb	(%rbp,%rbx,8), %r8b	/ val = d[val]
	decb	%r11b			/ sets ZF when the last round is done
	rorq	$8, %r8			/ ror leaves ZF alone, so the
	jnz	.Lkeystream_byte	/ branch still tests the decb result

	/ XOR the 8 keystream bytes into the data.  The flags from cmpq
	/ survive the intervening movq store, which does not write flags.
	xorq	-8(%rsi), %r8
	cmpq	%r9, %rsi		/ compare in against in+len-8
	movq	%r8, -8(%rdi)
	jle	.Lblock8		/ loop while in <= in+len-8

.Ltail:
	addq	$8, %r9			/ r9 = in+len (one past the input)

	/ Trailing bytes (0 to 7 of them): same round as above, one
	/ output byte per iteration.
.Ltail_byte:
	cmpq	%rsi, %r9		/ compare in+len against in
	jle	.Lfinished		/ done once in+len <= in
	addb	%al, %dl		/ y += tx
	movl	(%rbp,%rdx,8), %ebx	/ ty = d[y]
	movl	%ebx, (%rbp,%rcx,8)	/ d[x] = ty
	addb	%al, %bl		/ val = ty + tx
	movl	%eax, (%rbp,%rdx,8)	/ d[y] = tx
	incb	%cl			/ x++ for the NEXT round
	movl	(%rbp,%rcx,8), %eax	/ tx = d[x] for the NEXT round
	movb	(%rbp,%rbx,8), %r8b	/ val = d[val]
	xorb	(%rsi), %r8b		/ xor 1 byte
	movb	%r8b, (%rdi)
	incq	%rsi			/ in++
	incq	%rdi			/ out++
	jmp	.Ltail_byte

	/ x always runs one step ahead (it was incremented on entry and
	/ again at the end of every round), so step it back before the
	/ indices are written out.  Only the low byte of each is stored.
.Lfinished:
	decq	%rcx			/ x--
	movb	%dl, 2056(%rbp)		/ key->j = y
	movb	%cl, 2048(%rbp)		/ key->i = x
	popq	%rbx
	popq	%rbp

	/* EXPORT DELETE END */

	ret
	SET_SIZE(arcfour_crypt)

#else
	/* LINTED */
	/* Nothing to be linted in this file--it's pure assembly source. */
#endif	/* !lint && !__lint */