1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # sha1_block procedure for x86_64.
  11 #
  12 # It was brought to my attention that on EM64T compiler-generated code
  13 # was far behind 32-bit assembler implementation. This is unlike on
  14 # Opteron where compiler-generated code was only 15% behind 32-bit
  15 # assembler, which originally made it hard to motivate the effort.
  16 # There was suggestion to mechanically translate 32-bit code, but I
  17 # dismissed it, reasoning that x86_64 offers enough register bank
  18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
  20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
  21 # x86_64 does offer larger *addressable* bank, but out-of-order core
  22 # reaches for even more registers through dynamic aliasing, and EM64T
  23 # core must have managed to run-time optimize even 32-bit code just as
  24 # good as 64-bit one. Performance improvement is summarized in the
  25 # following table:
  26 #
  27 #               gcc 3.4         32-bit asm      cycles/byte
  28 # Opteron       +45%            +20%            6.8
  29 # Xeon P4       +65%            +0%             9.9
  30 # Core2         +60%            +10%            7.0
  31 
  32 #
  33 # OpenSolaris OS modifications
  34 #
  35 # Sun elects to use this software under the BSD license.
  36 #
  37 # This source originates from OpenSSL file sha1-x86_64.pl at
  38 # ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
  39 # (presumably for future OpenSSL release 0.9.8h), with these changes:
  40 #
  41 # 1. Added perl "use strict" and declared variables.
  42 #
  43 # 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
  44 # /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
  45 #
  46 # 3. Added perl function &lea_offset_eax_register_register() to handle
  47 #       Solaris as(1) bug.
  48 #
  49 # 4. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) assemblers).
  50 #
  51 
  52 use strict;
  53 my ($code, $ctx, $inp, $num, $xi, $t0, $t1, $i, @V, $A, $B, $C, $D, $E, $T);
  54 my $output = shift;
  55 open STDOUT,">$output";
  56 
  57 
  58 sub lea_offset_eax_register_register
  59 # Workaround for a Solaris "gas" assembler bug where compiling the source
  60 # errors out and does not generate a valid "lea" instruction.  Specifically,
  61 #       &lea OFFSET(%eax, SOURCE_REGISTER),DESTINATION_REGISTER
  62 #
  63 # For Solaris as, "as -a32" must be used to compile this.
  64 # For Solaris gas 2.15, this errors out with this message:
  65 # Error: `0x5a827999(%eax,%r11d)' is not a valid 64 bit base/index expression
  66 #
  67 # This should be fixed in Solaris gas 2.16.
  68 # It assembles with the Linux "as --64" gas 2.17 assembler and runs OK.
  69 #
  70 # For the ONBLD NV tools, the aw wrapper script fails when -a32 is used:
  71 # /ws/onnv-tools/onbld/bin/i386/aw -xarch=amd64 -P -a32 -o lea.o lea.s
  72 # aw: as->gas mapping failed at or near arg '-a32'
  73 #
  74 # For more information, see CRs 6644870 and 6628627.
  75 {
  76         use Switch;
  77         my ($offset, $reg_src, $reg_dest) = @_;
  78 
  79         # Failed "lea" instruction.
  80         # This instruction errors out from the Solaris as assembler.
  81         # It assembles with the Linux "as --64" assembler and runs OK.
  82         $code .= "      /lea    $offset(%eax,$reg_src),$reg_dest\n";
  83 
  84         # Workaround
  85         # This workaround hand-generates hex machine code for lea.
  86         $code .= "      / Solaris as assembly bug CR 6628627 errors out for\n";
  87         $code .= "      / the above, so we specify the machine code in hex:\n";
  88         $code .= "      .byte   0x67    / lea\n";
  89 
  90         switch ($reg_src) {
  91         case "%ebp"     {
  92                         switch ($reg_dest) {
  93                         case "%r11d" { $code .=
  94                                 "       .byte   0x44,0x8d,0x9c,0x28     "
  95                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
  96                         else    { $code .= "Unknown register $reg_dest\n"; }
  97                         }
  98         }
  99         case "%edi"     {
 100                         switch ($reg_dest) {
 101                         case "%ebp" { $code .=
 102                                 "       .byte   0x8d,0xac,0x38  "
 103                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
 104                         else    { $code .= "Unknown register $reg_dest\n"; }
 105                         }
 106         }
 107         case "%edx"     {
 108                         switch ($reg_dest) {
 109                         case "%esi" { $code .=
 110                                 "       .byte   0x8d,0xb4,0x10  "
 111                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
 112                         else    { $code .= "Unknown register $reg_dest\n"; }
 113                         }
 114         }
 115         case "%esi"     {
 116                         switch ($reg_dest) {
 117                         case "%edi" { $code .=
 118                                 "       .byte   0x8d,0xbc,0x30  "
 119                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
 120                         else    { $code .= "Unknown register $reg_dest\n"; }
 121                         }
 122         }
 123         case "%r11d"    {
 124                         switch ($reg_dest) {
 125                         case "%r12d" { $code .=
 126                                 "       .byte   0x46,0x8d,0xa4,0x18     "
 127                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
 128                         else    { $code .= "Unknown register $reg_dest\n"; }
 129                         }
 130         }
 131         case "%r12d"    {
 132                         switch ($reg_dest) {
 133                         case "%edx" { $code .=
 134                                 "       .byte   0x42,0x8d,0x94,0x20     "
 135                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
 136                         else    { $code .= "Unknown register $reg_dest\n"; }
 137                         }
 138         }
 139         else            { $code .= "Unknown register $reg_src\n"; }
 140         }
 141 
 142         $code .= "      .long   $offset / offset\n";
 143 }
 144 
 145 
 146 #
 147 # void sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks);
 148 #
 149 
 150 # Arguments:
 151 $ctx="%rdi";    # 1st arg
 152 $inp="%rsi";    # 2nd arg
 153 $num="%rdx";    # 3rd arg
 154 
 155 # reassign arguments in order to produce more compact code
 156 $ctx="%r8";
 157 $inp="%r9";
 158 $num="%r10";
 159 
 160 # Temporaries:
 161 $xi="%eax";
 162 $t0="%ebx";
 163 $t1="%ecx";
 164 # State information from SHA-1 context:
 165 $A="%edx";
 166 $B="%esi";
 167 $C="%edi";
 168 $D="%ebp";
 169 $E="%r11d";
 170 # Temporary:
 171 $T="%r12d";
 172 
 173 @V=($A,$B,$C,$D,$E,$T);
 174 
 175 sub PROLOGUE {
 176 my $func=shift;
 177 $code.=<<___;
 178 ENTRY_NP($func)
 179         /* EXPORT DELETE START */
 180         push    %rbx
 181         push    %rbp
 182         push    %r12
 183         mov     %rsp,%rax
 184         mov     %rdi,$ctx       # reassigned argument
 185         sub     \$`8+16*4`,%rsp
 186         mov     %rsi,$inp       # reassigned argument
 187         and     \$-64,%rsp
 188         mov     %rdx,$num       # reassigned argument
 189         mov     %rax,`16*4`(%rsp)
 190 
 191         mov     0($ctx),$A
 192         mov     4($ctx),$B
 193         mov     8($ctx),$C
 194         mov     12($ctx),$D
 195         mov     16($ctx),$E
 196 ___
 197 }
 198 
 199 sub EPILOGUE {
 200 my $func=shift;
 201 $code.=<<___;
 202         mov     `16*4`(%rsp),%rsp
 203         pop     %r12
 204         pop     %rbp
 205         pop     %rbx
 206         /* EXPORT DELETE END */
 207         ret
 208 SET_SIZE($func)
 209 ___
 210 }
 211 
 212 sub BODY_00_19 {
 213 my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
 214 my $j=$i+1;
 215 $code.=<<___ if ($i==0);
 216         mov     `4*$i`($inp),$xi        
 217         `"bswap $xi"    if(!defined($host))`
 218         mov     $xi,`4*$i`(%rsp)
 219 ___
 220         &lea_offset_eax_register_register("0x5a827999", $e, $f) if ($i < 15);
 221 $code.=<<___ if ($i<15);
 222         /lea    0x5a827999($xi,$e),$f
 223         mov     $c,$t0
 224         mov     `4*$j`($inp),$xi
 225         mov     $a,$e
 226         xor     $d,$t0
 227         `"bswap $xi"    if(!defined($host))`    
 228         rol     \$5,$e
 229         and     $b,$t0
 230         mov     $xi,`4*$j`(%rsp)
 231         add     $e,$f
 232         xor     $d,$t0
 233         rol     \$30,$b
 234         add     $t0,$f
 235 ___
 236         &lea_offset_eax_register_register("0x5a827999", $e, $f) if ($i >= 15);
 237 $code.=<<___ if ($i>=15);
 238         /lea    0x5a827999($xi,$e),$f
 239         mov     `4*($j%16)`(%rsp),$xi
 240         mov     $c,$t0
 241         mov     $a,$e
 242         xor     `4*(($j+2)%16)`(%rsp),$xi
 243         xor     $d,$t0
 244         rol     \$5,$e
 245         xor     `4*(($j+8)%16)`(%rsp),$xi
 246         and     $b,$t0
 247         add     $e,$f
 248         xor     `4*(($j+13)%16)`(%rsp),$xi
 249         xor     $d,$t0
 250         rol     \$30,$b
 251         add     $t0,$f
 252         rol     \$1,$xi
 253         mov     $xi,`4*($j%16)`(%rsp)
 254 ___
 255 }
 256 
 257 sub BODY_20_39 {
 258 my ($i,$a,$b,$c,$d,$e,$f)=@_;
 259 my $j=$i+1;
 260 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
 261         &lea_offset_eax_register_register($K, $e, $f) if ($i < 79);
 262 $code.=<<___ if ($i<79);
 263         /lea    $K($xi,$e),$f
 264         mov     `4*($j%16)`(%rsp),$xi
 265         mov     $c,$t0
 266         mov     $a,$e
 267         xor     `4*(($j+2)%16)`(%rsp),$xi
 268         xor     $b,$t0
 269         rol     \$5,$e
 270         xor     `4*(($j+8)%16)`(%rsp),$xi
 271         xor     $d,$t0
 272         add     $e,$f
 273         xor     `4*(($j+13)%16)`(%rsp),$xi
 274         rol     \$30,$b
 275         add     $t0,$f
 276         rol     \$1,$xi
 277 ___
 278 $code.=<<___ if ($i<76);
 279         mov     $xi,`4*($j%16)`(%rsp)
 280 ___
 281         &lea_offset_eax_register_register($K, $e, $f) if ($i == 79);
 282 $code.=<<___ if ($i==79);
 283         /lea    $K($xi,$e),$f
 284         mov     $c,$t0
 285         mov     $a,$e
 286         xor     $b,$t0
 287         rol     \$5,$e
 288         xor     $d,$t0
 289         add     $e,$f
 290         rol     \$30,$b
 291         add     $t0,$f
 292 ___
 293 }
 294 
 295 sub BODY_40_59 {
 296 my ($i,$a,$b,$c,$d,$e,$f)=@_;
 297 my $j=$i+1;
 298         &lea_offset_eax_register_register("0x8f1bbcdc", $e, $f);
 299 $code.=<<___;
 300         /lea    0x8f1bbcdc($xi,$e),$f
 301         mov     `4*($j%16)`(%rsp),$xi
 302         mov     $b,$t0
 303         mov     $b,$t1
 304         xor     `4*(($j+2)%16)`(%rsp),$xi
 305         mov     $a,$e
 306         and     $c,$t0
 307         xor     `4*(($j+8)%16)`(%rsp),$xi
 308         or      $c,$t1
 309         rol     \$5,$e
 310         xor     `4*(($j+13)%16)`(%rsp),$xi
 311         and     $d,$t1
 312         add     $e,$f
 313         rol     \$1,$xi
 314         or      $t1,$t0
 315         rol     \$30,$b
 316         mov     $xi,`4*($j%16)`(%rsp)
 317         add     $t0,$f
 318 ___
 319 }
 320 
 321 $code=<<___;
 322 #if !defined(lint) && !defined(__lint)
 323         .ident  "@(#)sha1-x86_64.pl     1.1     08/03/02 SMI"
 324 #include <sys/asm_linkage.h>
 325 ___
 326 
 327 
 328 &PROLOGUE("sha1_block_data_order");
 329 $code.=".align  4\n.Lloop:\n";
 330 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
 331 for(;$i<40;$i++)     { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 332 for(;$i<60;$i++)     { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
 333 for(;$i<80;$i++)     { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 334 $code.=<<___;
 335         / Update and save state information in SHA-1 context
 336         add     0($ctx),$E
 337         add     4($ctx),$T
 338         add     8($ctx),$A
 339         add     12($ctx),$B
 340         add     16($ctx),$C
 341         mov     $E,0($ctx)
 342         mov     $T,4($ctx)
 343         mov     $A,8($ctx)
 344         mov     $B,12($ctx)
 345         mov     $C,16($ctx)
 346 
 347         xchg    $E,$A   # mov   $E,$A
 348         xchg    $T,$B   # mov   $T,$B
 349         xchg    $E,$C   # mov   $A,$C
 350         xchg    $T,$D   # mov   $B,$D
 351                         # mov   $C,$E
 352         lea     `16*4`($inp),$inp
 353         sub     \$1,$num
 354         jnz     .Lloop
 355 ___
 356 &EPILOGUE("sha1_block_data_order");
 357 $code.=<<___;
 358 .asciz  "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 359 
 360 #else
 361         /* LINTED */
 362         /* Nothing to be linted in this file--it's pure assembly source. */
 363 #endif /* !lint && !__lint */
 364 ___
 365 
 366 ####################################################################
 367 
 368 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 369 print $code;
 370 close STDOUT;