1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # sha1_block procedure for x86_64.
  11 #
  12 # It was brought to my attention that on EM64T compiler-generated code
  13 # was far behind 32-bit assembler implementation. This is unlike on
  14 # Opteron where compiler-generated code was only 15% behind 32-bit
  15 # assembler, which originally made it hard to motivate the effort.
  16 # There was suggestion to mechanically translate 32-bit code, but I
  17 # dismissed it, reasoning that x86_64 offers enough register bank
  18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
   19 # implementation:-) However! While 64-bit code does perform better
  20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
  21 # x86_64 does offer larger *addressable* bank, but out-of-order core
  22 # reaches for even more registers through dynamic aliasing, and EM64T
  23 # core must have managed to run-time optimize even 32-bit code just as
  24 # good as 64-bit one. Performance improvement is summarized in the
  25 # following table:
  26 #
  27 #               gcc 3.4         32-bit asm      cycles/byte
  28 # Opteron       +45%            +20%            6.8
  29 # Xeon P4       +65%            +0%             9.9
  30 # Core2         +60%            +10%            7.0
  31 
  32 #
  33 # OpenSolaris OS modifications
  34 #
  35 # Sun elects to use this software under the BSD license.
  36 #
  37 # This source originates from OpenSSL file sha1-x86_64.pl at
  38 # ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
  39 # (presumably for future OpenSSL release 0.9.8h), with these changes:
  40 #
  41 # 1. Added perl "use strict" and declared variables.
  42 #
  43 # 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
  44 # /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
  45 #
  46 # 3. Added perl function &lea_offset_eax_register_register() to handle
  47 #       Solaris as(1) bug.
  48 #
  49 # 4. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) assemblers).
  50 #
  51 
  52 use strict;
  53 my ($code, $ctx, $inp, $num, $xi, $t0, $t1, $i, @V, $A, $B, $C, $D, $E, $T);
  54 my $output = shift;
  55 open STDOUT,">$output";
  56 
  57 
  58 sub lea_offset_eax_register_register
  59 # Workaround for a Solaris "gas" assembler bug where compiling the source
  60 # errors out and does not generate a valid "lea" instruction.  Specifically,
  61 #       &lea OFFSET(%eax, SOURCE_REGISTER),DESTINATION_REGISTER
  62 #
  63 # For Solaris as, "as -a32" must be used to compile this.
  64 # For Solaris gas 2.15, this errors out with this message:
  65 # Error: `0x5a827999(%eax,%r11d)' is not a valid 64 bit base/index expression
  66 #
  67 # This should be fixed in Solaris gas 2.16.
  68 # It assembles with the Linux "as --64" gas 2.17 assembler and runs OK.
  69 #
  70 # For the ONBLD NV tools, the aw wrapper script fails when -a32 is used:
  71 # /ws/onnv-tools/onbld/bin/i386/aw -xarch=amd64 -P -a32 -o lea.o lea.s
  72 # aw: as->gas mapping failed at or near arg '-a32'
  73 #
  74 # For more information, see CRs 6644870 and 6628627.
  75 {
  76         use Switch;
  77         my ($offset, $reg_src, $reg_dest) = @_;
  78 
  79         # Failed "lea" instruction.
  80         # This instruction errors out from the Solaris as assembler.
  81         # It assembles with the Linux "as --64" assembler and runs OK.
  82         $code .= "      /lea    $offset(%eax,$reg_src),$reg_dest\n";
  83 
  84         # Workaround
  85         # This workaround hand-generates hex machine code for lea.
  86         $code .= "      / Solaris as assembly bug CR 6628627 errors out for\n";
  87         $code .= "      / the above, so we specify the machine code in hex:\n";
  88         $code .= "      .byte   0x67    / lea\n";
  89 
  90         switch ($reg_src) {
  91         case "%ebp"     {
  92                         switch ($reg_dest) {
  93                         case "%r11d" { $code .=
  94                                 "       .byte   0x44,0x8d,0x9c,0x28     "
  95                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
  96                         else    { $code .= "Unknown register $reg_dest\n"; }
  97                         }
  98         }
  99         case "%edi"     {
 100                         switch ($reg_dest) {
 101                         case "%ebp" { $code .=
 102                                 "       .byte   0x8d,0xac,0x38  "
 103                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
 104                         else    { $code .= "Unknown register $reg_dest\n"; }
 105                         }
 106         }
 107         case "%edx"     {
 108                         switch ($reg_dest) {
 109                         case "%esi" { $code .=
 110                                 "       .byte   0x8d,0xb4,0x10  "
 111                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
 112                         else    { $code .= "Unknown register $reg_dest\n"; }
 113                         }
 114         }
 115         case "%esi"     {
 116                         switch ($reg_dest) {
 117                         case "%edi" { $code .=
 118                                 "       .byte   0x8d,0xbc,0x30  "
 119                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
 120                         else    { $code .= "Unknown register $reg_dest\n"; }
 121                         }
 122         }
 123         case "%r11d"    {
 124                         switch ($reg_dest) {
 125                         case "%r12d" { $code .=
 126                                 "       .byte   0x46,0x8d,0xa4,0x18     "
 127                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
 128                         else    { $code .= "Unknown register $reg_dest\n"; }
 129                         }
 130         }
 131         case "%r12d"    {
 132                         switch ($reg_dest) {
 133                         case "%edx" { $code .=
 134                                 "       .byte   0x42,0x8d,0x94,0x20     "
 135                                 . "/ (%eax,$reg_src),$reg_dest\n"; }
 136                         else    { $code .= "Unknown register $reg_dest\n"; }
 137                         }
 138         }
 139         else            { $code .= "Unknown register $reg_src\n"; }
 140         }
 141 
 142         $code .= "      .long   $offset / offset\n";
 143 }
 144 
 145 
 146 #
 147 # void sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks);
 148 #
 149 
 150 # Arguments:
 151 $ctx="%rdi";    # 1st arg
 152 $inp="%rsi";    # 2nd arg
 153 $num="%rdx";    # 3rd arg
 154 
 155 # reassign arguments in order to produce more compact code
 156 $ctx="%r8";
 157 $inp="%r9";
 158 $num="%r10";
 159 
 160 # Temporaries:
 161 $xi="%eax";
 162 $t0="%ebx";
 163 $t1="%ecx";
 164 # State information from SHA-1 context:
 165 $A="%edx";
 166 $B="%esi";
 167 $C="%edi";
 168 $D="%ebp";
 169 $E="%r11d";
 170 # Temporary:
 171 $T="%r12d";
 172 
 173 @V=($A,$B,$C,$D,$E,$T);
 174 
 175 sub PROLOGUE {
 176 my $func=shift;
 177 $code.=<<___;
 178 ENTRY_NP($func)
 179         push    %rbx
 180         push    %rbp
 181         push    %r12
 182         mov     %rsp,%rax
 183         mov     %rdi,$ctx       # reassigned argument
 184         sub     \$`8+16*4`,%rsp
 185         mov     %rsi,$inp       # reassigned argument
 186         and     \$-64,%rsp
 187         mov     %rdx,$num       # reassigned argument
 188         mov     %rax,`16*4`(%rsp)
 189 
 190         mov     0($ctx),$A
 191         mov     4($ctx),$B
 192         mov     8($ctx),$C
 193         mov     12($ctx),$D
 194         mov     16($ctx),$E
 195 ___
 196 }
 197 
 198 sub EPILOGUE {
 199 my $func=shift;
 200 $code.=<<___;
 201         mov     `16*4`(%rsp),%rsp
 202         pop     %r12
 203         pop     %rbp
 204         pop     %rbx
 205         ret
 206 SET_SIZE($func)
 207 ___
 208 }
 209 
 210 sub BODY_00_19 {
 211 my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
 212 my $j=$i+1;
 213 $code.=<<___ if ($i==0);
 214         mov     `4*$i`($inp),$xi        
 215         `"bswap $xi"    if(!defined($host))`
 216         mov     $xi,`4*$i`(%rsp)
 217 ___
 218         &lea_offset_eax_register_register("0x5a827999", $e, $f) if ($i < 15);
 219 $code.=<<___ if ($i<15);
 220         /lea    0x5a827999($xi,$e),$f
 221         mov     $c,$t0
 222         mov     `4*$j`($inp),$xi
 223         mov     $a,$e
 224         xor     $d,$t0
 225         `"bswap $xi"    if(!defined($host))`    
 226         rol     \$5,$e
 227         and     $b,$t0
 228         mov     $xi,`4*$j`(%rsp)
 229         add     $e,$f
 230         xor     $d,$t0
 231         rol     \$30,$b
 232         add     $t0,$f
 233 ___
 234         &lea_offset_eax_register_register("0x5a827999", $e, $f) if ($i >= 15);
 235 $code.=<<___ if ($i>=15);
 236         /lea    0x5a827999($xi,$e),$f
 237         mov     `4*($j%16)`(%rsp),$xi
 238         mov     $c,$t0
 239         mov     $a,$e
 240         xor     `4*(($j+2)%16)`(%rsp),$xi
 241         xor     $d,$t0
 242         rol     \$5,$e
 243         xor     `4*(($j+8)%16)`(%rsp),$xi
 244         and     $b,$t0
 245         add     $e,$f
 246         xor     `4*(($j+13)%16)`(%rsp),$xi
 247         xor     $d,$t0
 248         rol     \$30,$b
 249         add     $t0,$f
 250         rol     \$1,$xi
 251         mov     $xi,`4*($j%16)`(%rsp)
 252 ___
 253 }
 254 
 255 sub BODY_20_39 {
 256 my ($i,$a,$b,$c,$d,$e,$f)=@_;
 257 my $j=$i+1;
 258 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
 259         &lea_offset_eax_register_register($K, $e, $f) if ($i < 79);
 260 $code.=<<___ if ($i<79);
 261         /lea    $K($xi,$e),$f
 262         mov     `4*($j%16)`(%rsp),$xi
 263         mov     $c,$t0
 264         mov     $a,$e
 265         xor     `4*(($j+2)%16)`(%rsp),$xi
 266         xor     $b,$t0
 267         rol     \$5,$e
 268         xor     `4*(($j+8)%16)`(%rsp),$xi
 269         xor     $d,$t0
 270         add     $e,$f
 271         xor     `4*(($j+13)%16)`(%rsp),$xi
 272         rol     \$30,$b
 273         add     $t0,$f
 274         rol     \$1,$xi
 275 ___
 276 $code.=<<___ if ($i<76);
 277         mov     $xi,`4*($j%16)`(%rsp)
 278 ___
 279         &lea_offset_eax_register_register($K, $e, $f) if ($i == 79);
 280 $code.=<<___ if ($i==79);
 281         /lea    $K($xi,$e),$f
 282         mov     $c,$t0
 283         mov     $a,$e
 284         xor     $b,$t0
 285         rol     \$5,$e
 286         xor     $d,$t0
 287         add     $e,$f
 288         rol     \$30,$b
 289         add     $t0,$f
 290 ___
 291 }
 292 
 293 sub BODY_40_59 {
 294 my ($i,$a,$b,$c,$d,$e,$f)=@_;
 295 my $j=$i+1;
 296         &lea_offset_eax_register_register("0x8f1bbcdc", $e, $f);
 297 $code.=<<___;
 298         /lea    0x8f1bbcdc($xi,$e),$f
 299         mov     `4*($j%16)`(%rsp),$xi
 300         mov     $b,$t0
 301         mov     $b,$t1
 302         xor     `4*(($j+2)%16)`(%rsp),$xi
 303         mov     $a,$e
 304         and     $c,$t0
 305         xor     `4*(($j+8)%16)`(%rsp),$xi
 306         or      $c,$t1
 307         rol     \$5,$e
 308         xor     `4*(($j+13)%16)`(%rsp),$xi
 309         and     $d,$t1
 310         add     $e,$f
 311         rol     \$1,$xi
 312         or      $t1,$t0
 313         rol     \$30,$b
 314         mov     $xi,`4*($j%16)`(%rsp)
 315         add     $t0,$f
 316 ___
 317 }
 318 
 319 $code=<<___;
 320 #if !defined(lint) && !defined(__lint)
 321         .ident  "@(#)sha1-x86_64.pl     1.2     08/03/20 SMI"
 322 #include <sys/asm_linkage.h>
 323 ___
 324 
 325 
 326 &PROLOGUE("sha1_block_data_order");
 327 $code.=".align  4\n.Lloop:\n";
 328 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
 329 for(;$i<40;$i++)     { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 330 for(;$i<60;$i++)     { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
 331 for(;$i<80;$i++)     { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 332 $code.=<<___;
 333         / Update and save state information in SHA-1 context
 334         add     0($ctx),$E
 335         add     4($ctx),$T
 336         add     8($ctx),$A
 337         add     12($ctx),$B
 338         add     16($ctx),$C
 339         mov     $E,0($ctx)
 340         mov     $T,4($ctx)
 341         mov     $A,8($ctx)
 342         mov     $B,12($ctx)
 343         mov     $C,16($ctx)
 344 
 345         xchg    $E,$A   # mov   $E,$A
 346         xchg    $T,$B   # mov   $T,$B
 347         xchg    $E,$C   # mov   $A,$C
 348         xchg    $T,$D   # mov   $B,$D
 349                         # mov   $C,$E
 350         lea     `16*4`($inp),$inp
 351         sub     \$1,$num
 352         jnz     .Lloop
 353 ___
 354 &EPILOGUE("sha1_block_data_order");
 355 $code.=<<___;
 356 .asciz  "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 357 
 358 #else
 359         /* LINTED */
 360         /* Nothing to be linted in this file--it's pure assembly source. */
 361 #endif /* !lint && !__lint */
 362 ___
 363 
 364 ####################################################################
 365 
 366 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 367 print $code;
 368 close STDOUT;