6662791 Need a SHA1 implementation optimized for 64-bit x86
--- old/usr/src/common/crypto/sha1/sha1.c
+++ new/usr/src/common/crypto/sha1/sha1.c
1 1 /*
2 - * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
2 + * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
3 3 * Use is subject to license terms.
4 4 */
5 5
6 -#pragma ident "@(#)sha1.c 1.26 07/04/10 SMI"
6 +#pragma ident "@(#)sha1.c 1.27 08/03/02 SMI"
7 7
8 8 /*
9 9 * The basic framework for this code came from the reference
10 10 * implementation for MD5. That implementation is Copyright (C)
11 11 * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved.
12 12 *
13 13 * License to copy and use this software is granted provided that it
14 14 * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
15 15 * Algorithm" in all material mentioning or referencing this software
16 16 * or this function.
17 17 *
18 18 * License is also granted to make and use derivative works provided
19 19 * that such works are identified as "derived from the RSA Data
20 20 * Security, Inc. MD5 Message-Digest Algorithm" in all material
21 21 * mentioning or referencing the derived work.
22 22 *
23 23 * RSA Data Security, Inc. makes no representations concerning either
24 24 * the merchantability of this software or the suitability of this
25 25 * software for any particular purpose. It is provided "as is"
26 26 * without express or implied warranty of any kind.
27 27 *
28 28 * These notices must be retained in any copies of any part of this
29 29 * documentation and/or software.
30 30 *
  31   31   * NOTE: Cleaned-up and optimized version of SHA1, based on the FIPS 180-1
32 32 * standard, available at http://www.itl.nist.gov/div897/pubs/fip180-1.htm
33 33 * Not as fast as one would like -- further optimizations are encouraged
34 34 * and appreciated.
35 35 */
36 36
37 37 #include <sys/types.h>
38 38 #include <sys/param.h>
39 39 #include <sys/systm.h>
40 40 #include <sys/sysmacros.h>
41 41 #include <sys/sha1.h>
42 42 #include <sys/sha1_consts.h>
43 43
44 44 #ifndef _KERNEL
45 45 #include <strings.h>
46 46 #include <stdlib.h>
47 47 #include <errno.h>
48 48 #include <sys/systeminfo.h>
49 49 #endif /* !_KERNEL */
50 50
51 51 static void Encode(uint8_t *, const uint32_t *, size_t);
52 52
53 53 #if defined(__sparc)
54 54
55 55 #define SHA1_TRANSFORM(ctx, in) \
56 56 SHA1Transform((ctx)->state[0], (ctx)->state[1], (ctx)->state[2], \
57 57 (ctx)->state[3], (ctx)->state[4], (ctx), (in))
58 58
59 59 static void SHA1Transform(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t,
60 60 SHA1_CTX *, const uint8_t *);
61 61
62 +#elif defined(__amd64)
63 +
64 +#define SHA1_TRANSFORM(ctx, in) sha1_block_data_order((ctx), (in), 1)
65 +#define SHA1_TRANSFORM_BLOCKS(ctx, in, num) sha1_block_data_order((ctx), \
66 + (in), (num))
67 +
68 +void sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t num_blocks);
69 +
62 70 #else
63 71
64 72 #define SHA1_TRANSFORM(ctx, in) SHA1Transform((ctx), (in))
65 73
66 74 static void SHA1Transform(SHA1_CTX *, const uint8_t *);
67 75
68 76 #endif
69 77
70 78
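Reviewer note: on amd64 both SHA1_TRANSFORM() and SHA1_TRANSFORM_BLOCKS() funnel into the single assembly entry point sha1_block_data_order(), which consumes num_blocks consecutive 64-byte blocks and updates ctx->state in place. A minimal C sketch of that contract, purely illustrative: the generic one-block SHA1Transform() it calls is compiled out on amd64, and the shipped routine is hand-written assembly.

	/*
	 * Hypothetical portable equivalent of the amd64 assembly
	 * routine: hash num_blocks consecutive 64-byte blocks,
	 * updating ctx->state in place.
	 */
	void
	sha1_block_data_order_c(SHA1_CTX *ctx, const void *inpp,
	    size_t num_blocks)
	{
		const uint8_t *input = inpp;

		while (num_blocks-- > 0) {
			SHA1Transform(ctx, input);	/* one 64-byte block */
			input += 64;
		}
	}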
71 79 static uint8_t PADDING[64] = { 0x80, /* all zeros */ };
72 80
73 81 /*
74 82 * F, G, and H are the basic SHA1 functions.
75 83 */
76 84 #define F(b, c, d) (((b) & (c)) | ((~b) & (d)))
77 85 #define G(b, c, d) ((b) ^ (c) ^ (d))
78 86 #define H(b, c, d) (((b) & (c)) | (((b)|(c)) & (d)))
79 87
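Reviewer note: in FIPS 180-1 terms, F is Ch (rounds 0-19), H is Maj (rounds 40-59), and G is Parity, reused for rounds 20-39 and 60-79 -- which is why round 4 of SHA1Transform() below also uses G. The form of H chosen here saves a bitwise operation:

	/*
	 * ((b) & (c)) | (((b) | (c)) & (d)) == (b & c) | (b & d) | (c & d),
	 * the majority function, computed with one fewer bitwise op.
	 */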
80 88 /*
81 89 * ROTATE_LEFT rotates x left n bits.
82 90 */
83 91
84 92 #if defined(__GNUC__) && defined(_LP64)
85 93 static __inline__ uint64_t
86 94 ROTATE_LEFT(uint64_t value, uint32_t n)
87 95 {
88 96 uint32_t t32;
89 97
90 98 t32 = (uint32_t)value;
91 99 return ((t32 << n) | (t32 >> (32 - n)));
92 100 }
93 101
94 102 #else
95 103
96 104 #define ROTATE_LEFT(x, n) \
97 105 (((x) << (n)) | ((x) >> ((sizeof (x) * NBBY)-(n))))
98 106
99 107 #endif
100 108
101 109 #if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
102 110
103 111 #define HAVE_BSWAP
104 112
105 113 extern __inline__ uint32_t bswap(uint32_t value)
106 114 {
107 115 __asm__("bswap %0" : "+r" (value));
108 116 return (value);
109 117 }
110 118
111 119 #endif
112 120
113 121 /*
114 122 * SHA1Init()
115 123 *
 116  124   * purpose: initializes the sha1 context and begins an sha1 digest operation
 117  125   * input: SHA1_CTX *	: the context to initialize.
118 126 * output: void
119 127 */
120 128
121 129 void
122 130 SHA1Init(SHA1_CTX *ctx)
123 131 {
124 132 ctx->count[0] = ctx->count[1] = 0;
125 133
126 134 /*
127 135 * load magic initialization constants. Tell lint
128 136 * that these constants are unsigned by using U.
129 137 */
130 138
131 139 ctx->state[0] = 0x67452301U;
132 140 ctx->state[1] = 0xefcdab89U;
133 141 ctx->state[2] = 0x98badcfeU;
134 142 ctx->state[3] = 0x10325476U;
135 143 ctx->state[4] = 0xc3d2e1f0U;
136 144 }
137 145
138 146 #ifdef VIS_SHA1
139 147 #ifdef _KERNEL
140 148
141 149 #include <sys/regset.h>
142 150 #include <sys/vis.h>
143 151 #include <sys/fpu/fpusystm.h>
144 152
145 153 /* the alignment for block stores to save fp registers */
146 154 #define VIS_ALIGN (64)
147 155
148 156 extern int sha1_savefp(kfpu_t *, int);
149 157 extern void sha1_restorefp(kfpu_t *);
150 158
151 159 uint32_t vis_sha1_svfp_threshold = 128;
152 160
153 161 #endif /* _KERNEL */
154 162
155 163 /*
156 164 * VIS SHA-1 consts.
157 165 */
158 166 static uint64_t VIS[] = {
159 167 0x8000000080000000ULL,
160 168 0x0002000200020002ULL,
161 169 0x5a8279996ed9eba1ULL,
162 170 0x8f1bbcdcca62c1d6ULL,
163 171 0x012389ab456789abULL};
164 172
165 173 extern void SHA1TransformVIS(uint64_t *, uint32_t *, uint32_t *, uint64_t *);
166 174
167 175
168 176 /*
169 177 * SHA1Update()
170 178 *
171 179 * purpose: continues an sha1 digest operation, using the message block
172 180 * to update the context.
173 181 * input: SHA1_CTX * : the context to update
174 182 * void * : the message block
175 183 * size_t : the length of the message block in bytes
176 184 * output: void
177 185 */
178 186
179 187 void
180 188 SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
181 189 {
182 190 uint32_t i, buf_index, buf_len;
183 191 uint64_t X0[40], input64[8];
184 192 const uint8_t *input = inptr;
185 193 #ifdef _KERNEL
186 194 int usevis = 0;
187 195 #else
188 196 int usevis = 1;
189 197 #endif /* _KERNEL */
190 198
191 199 /* check for noop */
192 200 if (input_len == 0)
193 201 return;
194 202
195 203 /* compute number of bytes mod 64 */
196 204 buf_index = (ctx->count[1] >> 3) & 0x3F;
197 205
198 206 /* update number of bits */
199 207 if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
200 208 ctx->count[0]++;
201 209
202 210 ctx->count[0] += (input_len >> 29);
203 211
204 212 buf_len = 64 - buf_index;
205 213
206 214 /* transform as many times as possible */
207 215 i = 0;
208 216 if (input_len >= buf_len) {
209 217 #ifdef _KERNEL
210 218 kfpu_t *fpu;
211 219 if (fpu_exists) {
212 220 uint8_t fpua[sizeof (kfpu_t) + GSR_SIZE + VIS_ALIGN];
213 221 uint32_t len = (input_len + buf_index) & ~0x3f;
214 222 int svfp_ok;
215 223
216 224 fpu = (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, 64);
217 225 svfp_ok = ((len >= vis_sha1_svfp_threshold) ? 1 : 0);
218 226 usevis = fpu_exists && sha1_savefp(fpu, svfp_ok);
219 227 } else {
220 228 usevis = 0;
221 229 }
222 230 #endif /* _KERNEL */
223 231
224 232 /*
225 233 * general optimization:
226 234 *
227 235 * only do initial bcopy() and SHA1Transform() if
228 236 * buf_index != 0. if buf_index == 0, we're just
229 237 * wasting our time doing the bcopy() since there
230 238 * wasn't any data left over from a previous call to
231 239 * SHA1Update().
232 240 */
233 241
234 242 if (buf_index) {
235 243 bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
236 244 if (usevis) {
237 245 SHA1TransformVIS(X0,
238 246 ctx->buf_un.buf32,
239 247 &ctx->state[0], VIS);
240 248 } else {
241 249 SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
242 250 }
243 251 i = buf_len;
244 252 }
245 253
246 254 /*
247 255 * VIS SHA-1: uses the VIS 1.0 instructions to accelerate
248 256 * SHA-1 processing. This is achieved by "offloading" the
249 257 * computation of the message schedule (MS) to the VIS units.
250 258 * This allows the VIS computation of the message schedule
251 259 * to be performed in parallel with the standard integer
 252  260  		 * processing of the remainder of the SHA-1 computation. This
 253  261  		 * improves performance by up to around 1.37X, compared to an optimized
254 262 * integer-only implementation.
255 263 *
256 264 * The VIS implementation of SHA1Transform has a different API
257 265 * to the standard integer version:
258 266 *
259 267 * void SHA1TransformVIS(
260 268 * uint64_t *, // Pointer to MS for ith block
261 269 * uint32_t *, // Pointer to ith block of message data
 262  270  		 *	uint32_t *, // Pointer to SHA state, i.e. ctx->state
263 271 * uint64_t *, // Pointer to various VIS constants
264 272 * )
265 273 *
 266  274  		 * Note: the message data must be 4-byte aligned.
267 275 *
268 276 * Function requires VIS 1.0 support.
269 277 *
 270  278  		 * Handling is provided to deal with arbitrary byte alignment
271 279 * of the input data but the performance gains are reduced
272 280 * for alignments other than 4-bytes.
273 281 */
274 282 if (usevis) {
275 283 if (!IS_P2ALIGNED(&input[i], sizeof (uint32_t))) {
276 284 /*
277 285 * Main processing loop - input misaligned
278 286 */
279 287 for (; i + 63 < input_len; i += 64) {
280 - bcopy(&input[i], input64, 64);
281 - SHA1TransformVIS(X0, (uint32_t *)input64,
282 - &ctx->state[0], VIS);
288 + bcopy(&input[i], input64, 64);
289 + SHA1TransformVIS(X0,
290 + (uint32_t *)input64,
291 + &ctx->state[0], VIS);
283 292 }
284 293 } else {
285 294 /*
 286  295  				 * Main processing loop - input 4-byte aligned
287 296 */
288 297 for (; i + 63 < input_len; i += 64) {
289 298 SHA1TransformVIS(X0,
290 - /* LINTED E_BAD_PTR_CAST_ALIGN */
299 + /* LINTED E_BAD_PTR_CAST_ALIGN */
291 300 (uint32_t *)&input[i],
292 301 &ctx->state[0], VIS);
293 302 }
294 303
295 304 }
296 305 #ifdef _KERNEL
297 306 sha1_restorefp(fpu);
298 307 #endif /* _KERNEL */
299 308 } else {
300 309 for (; i + 63 < input_len; i += 64) {
301 - SHA1_TRANSFORM(ctx, &input[i]);
310 + SHA1_TRANSFORM(ctx, &input[i]);
302 311 }
303 312 }
304 313
305 314 /*
306 315 * general optimization:
307 316 *
308 317 * if i and input_len are the same, return now instead
309 318 * of calling bcopy(), since the bcopy() in this case
310 319 * will be an expensive nop.
311 320 */
312 321
313 322 if (input_len == i)
314 323 return;
315 324
316 325 buf_index = 0;
317 326 }
318 327
319 328 /* buffer remaining input */
320 329 bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
321 330 }
322 331
323 332 #else /* VIS_SHA1 */
324 333
325 334 void
326 335 SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
327 336 {
328 337 uint32_t i, buf_index, buf_len;
329 338 const uint8_t *input = inptr;
339 +#if defined(__amd64)
340 + uint32_t block_count;
341 +#endif /* __amd64 */
330 342
331 343 /* check for noop */
332 344 if (input_len == 0)
333 345 return;
334 346
335 347 /* compute number of bytes mod 64 */
336 348 buf_index = (ctx->count[1] >> 3) & 0x3F;
337 349
338 350 /* update number of bits */
339 351 if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
340 352 ctx->count[0]++;
341 353
342 354 ctx->count[0] += (input_len >> 29);
343 355
344 356 buf_len = 64 - buf_index;
345 357
346 358 /* transform as many times as possible */
347 359 i = 0;
348 360 if (input_len >= buf_len) {
349 361
350 362 /*
351 363 * general optimization:
352 364 *
353 365 * only do initial bcopy() and SHA1Transform() if
354 366 * buf_index != 0. if buf_index == 0, we're just
355 367 * wasting our time doing the bcopy() since there
356 368 * wasn't any data left over from a previous call to
357 369 * SHA1Update().
358 370 */
359 371
360 372 if (buf_index) {
361 373 bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
362 374 SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
363 375 i = buf_len;
364 376 }
365 377
378 +#if !defined(__amd64)
366 379 for (; i + 63 < input_len; i += 64)
367 380 SHA1_TRANSFORM(ctx, &input[i]);
381 +#else
382 + block_count = (input_len - i) >> 6;
383 + if (block_count > 0) {
384 + SHA1_TRANSFORM_BLOCKS(ctx, &input[i], block_count);
385 + i += block_count << 6;
386 + }
387 +#endif /* !__amd64 */
368 388
369 389 /*
370 390 * general optimization:
371 391 *
372 392 * if i and input_len are the same, return now instead
373 393 * of calling bcopy(), since the bcopy() in this case
374 394 * will be an expensive nop.
375 395 */
376 396
377 397 if (input_len == i)
378 398 return;
379 399
380 400 buf_index = 0;
381 401 }
382 402
383 403 /* buffer remaining input */
384 404 bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
385 405 }
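Reviewer note: the buffering above makes SHA1Update() safe to call with arbitrary chunk sizes; splitting the input cannot change the digest. A small sanity check of that property (hypothetical harness; msg is assumed to be a 128-byte buffer):

	SHA1_CTX c1, c2;
	uint8_t d1[20], d2[20];

	SHA1Init(&c1);
	SHA1Update(&c1, msg, 100);	/* one block hashed, 36 bytes buffered */
	SHA1Update(&c1, msg + 100, 28);
	SHA1Final(d1, &c1);

	SHA1Init(&c2);
	SHA1Update(&c2, msg, 128);	/* same 128 bytes in one call */
	SHA1Final(d2, &c2);

	/* d1 and d2 must be identical */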
386 406
387 407 #endif /* VIS_SHA1 */
388 408
389 409 /*
390 410 * SHA1Final()
391 411 *
392 412 * purpose: ends an sha1 digest operation, finalizing the message digest and
393 413 * zeroing the context.
394 - * input: uchar_t * : a buffer to store the digest in
414 + * input: uchar_t * : A buffer to store the digest.
395 415 * : The function actually uses void* because many
396 416 * : callers pass things other than uchar_t here.
397 417 * SHA1_CTX * : the context to finalize, save, and zero
398 418 * output: void
399 419 */
400 420
401 421 void
402 422 SHA1Final(void *digest, SHA1_CTX *ctx)
403 423 {
404 424 uint8_t bitcount_be[sizeof (ctx->count)];
405 425 uint32_t index = (ctx->count[1] >> 3) & 0x3f;
406 426
407 427 /* store bit count, big endian */
408 428 Encode(bitcount_be, ctx->count, sizeof (bitcount_be));
409 429
410 430 /* pad out to 56 mod 64 */
411 431 SHA1Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
412 432
413 433 /* append length (before padding) */
414 434 SHA1Update(ctx, bitcount_be, sizeof (bitcount_be));
415 435
416 436 /* store state in digest */
417 437 Encode(digest, ctx->state, sizeof (ctx->state));
418 438
419 439 /* zeroize sensitive information */
420 440 bzero(ctx, sizeof (*ctx));
421 441 }
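Reviewer note: for a 3-byte message, index is 3, so the SHA1Update() calls above add 56 - 3 = 53 bytes of PADDING plus the 8-byte big-endian bit count, completing exactly one 64-byte block. An end-to-end check against the well-known FIPS 180-1 "abc" test vector (userland fragment, assuming <stdio.h>):

	SHA1_CTX ctx;
	uint8_t digest[20];
	int i;

	SHA1Init(&ctx);
	SHA1Update(&ctx, "abc", 3);
	SHA1Final(digest, &ctx);

	/* expect a9993e364706816aba3e25717850c26c9cd0d89d */
	for (i = 0; i < 20; i++)
		(void) printf("%02x", digest[i]);
	(void) printf("\n");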
422 442
443 +
444 +#if !defined(__amd64)
445 +
423 446 typedef uint32_t sha1word;
424 447
425 448 /*
426 449 * sparc optimization:
427 450 *
428 451 * on the sparc, we can load big endian 32-bit data easily. note that
429 452 * special care must be taken to ensure the address is 32-bit aligned.
430 453 * in the interest of speed, we don't check to make sure, since
431 454 * careful programming can guarantee this for us.
432 455 */
433 456
434 457 #if defined(_BIG_ENDIAN)
435 458
436 459 #define LOAD_BIG_32(addr) (*(uint32_t *)(addr))
437 460
438 461 #else /* !defined(_BIG_ENDIAN) */
439 462
440 463 #if defined(HAVE_BSWAP)
441 464
442 465 #define LOAD_BIG_32(addr) bswap(*((uint32_t *)(addr)))
443 466
444 467 #else /* !defined(HAVE_BSWAP) */
445 468
446 469 /* little endian -- will work on big endian, but slowly */
447 470 #define LOAD_BIG_32(addr) \
448 471 (((addr)[0] << 24) | ((addr)[1] << 16) | ((addr)[2] << 8) | (addr)[3])
449 472
450 473 #endif /* !defined(HAVE_BSWAP) */
451 474
452 475 #endif /* !defined(_BIG_ENDIAN) */
453 476
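Reviewer note: whichever variant is selected, LOAD_BIG_32() has one contract -- interpret four bytes in memory order as a big-endian 32-bit word. The direct-cast variants additionally require 4-byte alignment, hence the union in this illustrative userland fragment (assuming <assert.h>):

	union {
		uint32_t align;		/* forces 4-byte alignment */
		uint8_t b[4];
	} u;

	u.b[0] = 0x01; u.b[1] = 0x02; u.b[2] = 0x03; u.b[3] = 0x04;

	/* must hold on any host, little or big endian */
	assert(LOAD_BIG_32(u.b) == 0x01020304U);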
454 477 /*
455 478 * SHA1Transform()
456 479 */
457 480 #if defined(W_ARRAY)
458 481 #define W(n) w[n]
459 482 #else /* !defined(W_ARRAY) */
460 483 #define W(n) w_ ## n
461 484 #endif /* !defined(W_ARRAY) */
462 485
463 486
464 487 #if defined(__sparc)
465 488
466 489 /*
467 490 * sparc register window optimization:
468 491 *
469 492 * `a', `b', `c', `d', and `e' are passed into SHA1Transform
470 493 * explicitly since it increases the number of registers available to
471 494 * the compiler. under this scheme, these variables can be held in
472 495 * %i0 - %i4, which leaves more local and out registers available.
473 496 *
474 497 * purpose: sha1 transformation -- updates the digest based on `block'
475 498 * input: uint32_t : bytes 1 - 4 of the digest
476 499 * uint32_t : bytes 5 - 8 of the digest
477 500 * uint32_t : bytes 9 - 12 of the digest
 478  501   *		uint32_t : bytes 13 - 16 of the digest
 479  502   *		uint32_t : bytes 17 - 20 of the digest
480 503 * SHA1_CTX * : the context to update
481 504 * uint8_t [64]: the block to use to update the digest
482 505 * output: void
483 506 */
484 507
485 508 void
486 509 SHA1Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e,
487 510 SHA1_CTX *ctx, const uint8_t blk[64])
488 511 {
489 512 /*
490 513 * sparc optimization:
491 514 *
492 515 * while it is somewhat counter-intuitive, on sparc, it is
493 516 * more efficient to place all the constants used in this
494 517 * function in an array and load the values out of the array
495 518 * than to manually load the constants. this is because
496 519 * setting a register to a 32-bit value takes two ops in most
497 520 * cases: a `sethi' and an `or', but loading a 32-bit value
498 521 * from memory only takes one `ld' (or `lduw' on v9). while
499 522 * this increases memory usage, the compiler can find enough
500 523 * other things to do while waiting to keep the pipeline does
 501  524  	 * other things to do while waiting so that the pipeline does
502 525 * constants are cached so that later accesses do not even go
503 526 * out to the bus.
504 527 *
505 528 * this array is declared `static' to keep the compiler from
506 529 * having to bcopy() this array onto the stack frame of
507 530 * SHA1Transform() each time it is called -- which is
508 531 * unacceptably expensive.
509 532 *
510 533 * the `const' is to ensure that callers are good citizens and
511 534 * do not try to munge the array. since these routines are
512 535 * going to be called from inside multithreaded kernelland,
 513  536  	 * this is a good safety check -- `sha1_consts' will end up in
514 537 * .rodata.
515 538 *
516 539 * unfortunately, loading from an array in this manner hurts
517 540 * performance under intel. so, there is a macro,
518 541 * SHA1_CONST(), used in SHA1Transform(), that either expands to
519 542 * a reference to this array, or to the actual constant,
520 543 * depending on what platform this code is compiled for.
521 544 */
522 545
523 546 static const uint32_t sha1_consts[] = {
524 547 SHA1_CONST_0, SHA1_CONST_1, SHA1_CONST_2, SHA1_CONST_3,
525 548 };
526 549
527 550 /*
528 551 * general optimization:
529 552 *
530 553 * use individual integers instead of using an array. this is a
531 554 * win, although the amount it wins by seems to vary quite a bit.
532 555 */
533 556
534 557 uint32_t w_0, w_1, w_2, w_3, w_4, w_5, w_6, w_7;
535 558 uint32_t w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
536 559
537 560 /*
538 561 * sparc optimization:
539 562 *
540 563 * if `block' is already aligned on a 4-byte boundary, use
541 564 * LOAD_BIG_32() directly. otherwise, bcopy() into a
542 565 * buffer that *is* aligned on a 4-byte boundary and then do
543 566 * the LOAD_BIG_32() on that buffer. benchmarks have shown
544 567 * that using the bcopy() is better than loading the bytes
545 568 * individually and doing the endian-swap by hand.
546 569 *
 547  570  	 * even though it's quite tempting to do:
548 571 *
549 572 * blk = bcopy(ctx->buf_un.buf32, blk, sizeof (ctx->buf_un.buf32));
550 573 *
551 574 * and only have one set of LOAD_BIG_32()'s, the compiler
552 575 * *does not* like that, so please resist the urge.
553 576 */
554 577
555 578 if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? */
556 579 bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
557 580 w_15 = LOAD_BIG_32(ctx->buf_un.buf32 + 15);
558 581 w_14 = LOAD_BIG_32(ctx->buf_un.buf32 + 14);
559 582 w_13 = LOAD_BIG_32(ctx->buf_un.buf32 + 13);
560 583 w_12 = LOAD_BIG_32(ctx->buf_un.buf32 + 12);
561 584 w_11 = LOAD_BIG_32(ctx->buf_un.buf32 + 11);
562 585 w_10 = LOAD_BIG_32(ctx->buf_un.buf32 + 10);
563 586 w_9 = LOAD_BIG_32(ctx->buf_un.buf32 + 9);
564 587 w_8 = LOAD_BIG_32(ctx->buf_un.buf32 + 8);
565 588 w_7 = LOAD_BIG_32(ctx->buf_un.buf32 + 7);
566 589 w_6 = LOAD_BIG_32(ctx->buf_un.buf32 + 6);
567 590 w_5 = LOAD_BIG_32(ctx->buf_un.buf32 + 5);
568 591 w_4 = LOAD_BIG_32(ctx->buf_un.buf32 + 4);
569 592 w_3 = LOAD_BIG_32(ctx->buf_un.buf32 + 3);
570 593 w_2 = LOAD_BIG_32(ctx->buf_un.buf32 + 2);
571 594 w_1 = LOAD_BIG_32(ctx->buf_un.buf32 + 1);
572 595 w_0 = LOAD_BIG_32(ctx->buf_un.buf32 + 0);
573 596 } else {
574 597 /*LINTED*/
575 598 w_15 = LOAD_BIG_32(blk + 60);
576 599 /*LINTED*/
577 600 w_14 = LOAD_BIG_32(blk + 56);
578 601 /*LINTED*/
579 602 w_13 = LOAD_BIG_32(blk + 52);
580 603 /*LINTED*/
581 604 w_12 = LOAD_BIG_32(blk + 48);
582 605 /*LINTED*/
583 606 w_11 = LOAD_BIG_32(blk + 44);
584 607 /*LINTED*/
585 608 w_10 = LOAD_BIG_32(blk + 40);
586 609 /*LINTED*/
587 610 w_9 = LOAD_BIG_32(blk + 36);
588 611 /*LINTED*/
589 612 w_8 = LOAD_BIG_32(blk + 32);
590 613 /*LINTED*/
591 614 w_7 = LOAD_BIG_32(blk + 28);
592 615 /*LINTED*/
593 616 w_6 = LOAD_BIG_32(blk + 24);
594 617 /*LINTED*/
595 618 w_5 = LOAD_BIG_32(blk + 20);
596 619 /*LINTED*/
597 620 w_4 = LOAD_BIG_32(blk + 16);
598 621 /*LINTED*/
599 622 w_3 = LOAD_BIG_32(blk + 12);
600 623 /*LINTED*/
601 624 w_2 = LOAD_BIG_32(blk + 8);
602 625 /*LINTED*/
603 626 w_1 = LOAD_BIG_32(blk + 4);
604 627 /*LINTED*/
605 628 w_0 = LOAD_BIG_32(blk + 0);
606 629 }
607 630 #else /* !defined(__sparc) */
608 631
609 632 void
610 633 SHA1Transform(SHA1_CTX *ctx, const uint8_t blk[64])
611 634 {
612 635 sha1word a = ctx->state[0];
613 636 sha1word b = ctx->state[1];
614 637 sha1word c = ctx->state[2];
615 638 sha1word d = ctx->state[3];
616 639 sha1word e = ctx->state[4];
617 640
618 641 #if defined(W_ARRAY)
619 642 sha1word w[16];
620 643 #else /* !defined(W_ARRAY) */
621 644 sha1word w_0, w_1, w_2, w_3, w_4, w_5, w_6, w_7;
622 645 sha1word w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
623 646 #endif /* !defined(W_ARRAY) */
624 647
625 648 W(0) = LOAD_BIG_32(blk + 0);
626 649 W(1) = LOAD_BIG_32(blk + 4);
627 650 W(2) = LOAD_BIG_32(blk + 8);
628 651 W(3) = LOAD_BIG_32(blk + 12);
629 652 W(4) = LOAD_BIG_32(blk + 16);
630 653 W(5) = LOAD_BIG_32(blk + 20);
631 654 W(6) = LOAD_BIG_32(blk + 24);
632 655 W(7) = LOAD_BIG_32(blk + 28);
633 656 W(8) = LOAD_BIG_32(blk + 32);
634 657 W(9) = LOAD_BIG_32(blk + 36);
635 658 W(10) = LOAD_BIG_32(blk + 40);
636 659 W(11) = LOAD_BIG_32(blk + 44);
637 660 W(12) = LOAD_BIG_32(blk + 48);
638 661 W(13) = LOAD_BIG_32(blk + 52);
639 662 W(14) = LOAD_BIG_32(blk + 56);
640 663 W(15) = LOAD_BIG_32(blk + 60);
641 664
642 665 #endif /* !defined(__sparc) */
643 666
644 667 /*
645 668 * general optimization:
646 669 *
647 670 * even though this approach is described in the standard as
648 671 * being slower algorithmically, it is 30-40% faster than the
649 672 * "faster" version under SPARC, because this version has more
650 673 * of the constraints specified at compile-time and uses fewer
651 674 * variables (and therefore has better register utilization)
652 675 * than its "speedier" brother. (i've tried both, trust me)
653 676 *
654 677 * for either method given in the spec, there is an "assignment"
655 678 * phase where the following takes place:
656 679 *
657 680 * tmp = (main_computation);
658 681 * e = d; d = c; c = rotate_left(b, 30); b = a; a = tmp;
659 682 *
660 683 * we can make the algorithm go faster by not doing this work,
661 684 * but just pretending that `d' is now `e', etc. this works
662 685 * really well and obviates the need for a temporary variable.
663 - * however, we still explictly perform the rotate action,
686 + * however, we still explicitly perform the rotate action,
664 687 * since it is cheaper on SPARC to do it once than to have to
665 688 * do it over and over again.
666 689 */
667 690
668 691 /* round 1 */
669 692 e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(0) + SHA1_CONST(0); /* 0 */
670 693 b = ROTATE_LEFT(b, 30);
671 694
672 695 d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(1) + SHA1_CONST(0); /* 1 */
673 696 a = ROTATE_LEFT(a, 30);
674 697
675 698 c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(2) + SHA1_CONST(0); /* 2 */
676 699 e = ROTATE_LEFT(e, 30);
677 700
678 701 b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(3) + SHA1_CONST(0); /* 3 */
679 702 d = ROTATE_LEFT(d, 30);
680 703
681 704 a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(4) + SHA1_CONST(0); /* 4 */
682 705 c = ROTATE_LEFT(c, 30);
683 706
684 707 e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(5) + SHA1_CONST(0); /* 5 */
685 708 b = ROTATE_LEFT(b, 30);
686 709
687 710 d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(6) + SHA1_CONST(0); /* 6 */
688 711 a = ROTATE_LEFT(a, 30);
689 712
690 713 c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(7) + SHA1_CONST(0); /* 7 */
691 714 e = ROTATE_LEFT(e, 30);
692 715
693 716 b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(8) + SHA1_CONST(0); /* 8 */
694 717 d = ROTATE_LEFT(d, 30);
695 718
696 719 a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(9) + SHA1_CONST(0); /* 9 */
697 720 c = ROTATE_LEFT(c, 30);
698 721
699 722 e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(10) + SHA1_CONST(0); /* 10 */
700 723 b = ROTATE_LEFT(b, 30);
701 724
702 725 d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(11) + SHA1_CONST(0); /* 11 */
703 726 a = ROTATE_LEFT(a, 30);
704 727
705 728 c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(12) + SHA1_CONST(0); /* 12 */
706 729 e = ROTATE_LEFT(e, 30);
707 730
708 731 b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(13) + SHA1_CONST(0); /* 13 */
709 732 d = ROTATE_LEFT(d, 30);
710 733
711 734 a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(14) + SHA1_CONST(0); /* 14 */
712 735 c = ROTATE_LEFT(c, 30);
713 736
714 737 e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(15) + SHA1_CONST(0); /* 15 */
715 738 b = ROTATE_LEFT(b, 30);
716 739
717 740 W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 16 */
718 741 d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(0) + SHA1_CONST(0);
719 742 a = ROTATE_LEFT(a, 30);
720 743
721 744 W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 17 */
722 745 c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(1) + SHA1_CONST(0);
723 746 e = ROTATE_LEFT(e, 30);
724 747
725 748 W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 18 */
726 749 b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(2) + SHA1_CONST(0);
727 750 d = ROTATE_LEFT(d, 30);
728 751
729 752 W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 19 */
730 753 a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(3) + SHA1_CONST(0);
731 754 c = ROTATE_LEFT(c, 30);
732 755
733 756 /* round 2 */
734 757 W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 20 */
735 758 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(4) + SHA1_CONST(1);
736 759 b = ROTATE_LEFT(b, 30);
737 760
738 761 W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 21 */
739 762 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(5) + SHA1_CONST(1);
740 763 a = ROTATE_LEFT(a, 30);
741 764
742 765 W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 22 */
743 766 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(6) + SHA1_CONST(1);
744 767 e = ROTATE_LEFT(e, 30);
745 768
746 769 W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 23 */
747 770 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(7) + SHA1_CONST(1);
748 771 d = ROTATE_LEFT(d, 30);
749 772
750 773 W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 24 */
751 774 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(8) + SHA1_CONST(1);
752 775 c = ROTATE_LEFT(c, 30);
753 776
754 777 W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 25 */
755 778 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(9) + SHA1_CONST(1);
756 779 b = ROTATE_LEFT(b, 30);
757 780
758 781 W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 26 */
759 782 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(10) + SHA1_CONST(1);
760 783 a = ROTATE_LEFT(a, 30);
761 784
762 785 W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 27 */
763 786 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(11) + SHA1_CONST(1);
764 787 e = ROTATE_LEFT(e, 30);
765 788
766 789 W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 28 */
767 790 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(12) + SHA1_CONST(1);
768 791 d = ROTATE_LEFT(d, 30);
769 792
770 793 W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 29 */
771 794 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(13) + SHA1_CONST(1);
772 795 c = ROTATE_LEFT(c, 30);
773 796
774 797 W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 30 */
775 798 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(14) + SHA1_CONST(1);
776 799 b = ROTATE_LEFT(b, 30);
777 800
778 801 W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 31 */
779 802 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(15) + SHA1_CONST(1);
780 803 a = ROTATE_LEFT(a, 30);
781 804
782 805 W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 32 */
783 806 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(0) + SHA1_CONST(1);
784 807 e = ROTATE_LEFT(e, 30);
785 808
786 809 W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 33 */
787 810 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(1) + SHA1_CONST(1);
788 811 d = ROTATE_LEFT(d, 30);
789 812
790 813 W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 34 */
791 814 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(2) + SHA1_CONST(1);
792 815 c = ROTATE_LEFT(c, 30);
793 816
794 817 W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 35 */
795 818 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(3) + SHA1_CONST(1);
796 819 b = ROTATE_LEFT(b, 30);
797 820
798 821 W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 36 */
799 822 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(4) + SHA1_CONST(1);
800 823 a = ROTATE_LEFT(a, 30);
801 824
802 825 W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 37 */
803 826 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(5) + SHA1_CONST(1);
804 827 e = ROTATE_LEFT(e, 30);
805 828
806 829 W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 38 */
807 830 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(6) + SHA1_CONST(1);
808 831 d = ROTATE_LEFT(d, 30);
809 832
810 833 W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 39 */
811 834 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(7) + SHA1_CONST(1);
812 835 c = ROTATE_LEFT(c, 30);
813 836
814 837 /* round 3 */
815 838 W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 40 */
816 839 e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(8) + SHA1_CONST(2);
817 840 b = ROTATE_LEFT(b, 30);
818 841
819 842 W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 41 */
820 843 d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(9) + SHA1_CONST(2);
821 844 a = ROTATE_LEFT(a, 30);
822 845
823 846 W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 42 */
824 847 c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(10) + SHA1_CONST(2);
825 848 e = ROTATE_LEFT(e, 30);
826 849
827 850 W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 43 */
828 851 b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(11) + SHA1_CONST(2);
829 852 d = ROTATE_LEFT(d, 30);
830 853
831 854 W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 44 */
832 855 a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(12) + SHA1_CONST(2);
833 856 c = ROTATE_LEFT(c, 30);
834 857
835 858 W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 45 */
836 859 e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(13) + SHA1_CONST(2);
837 860 b = ROTATE_LEFT(b, 30);
838 861
839 862 W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 46 */
840 863 d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(14) + SHA1_CONST(2);
841 864 a = ROTATE_LEFT(a, 30);
842 865
843 866 W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 47 */
844 867 c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(15) + SHA1_CONST(2);
845 868 e = ROTATE_LEFT(e, 30);
846 869
847 870 W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 48 */
848 871 b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(0) + SHA1_CONST(2);
849 872 d = ROTATE_LEFT(d, 30);
850 873
851 874 W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 49 */
852 875 a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(1) + SHA1_CONST(2);
853 876 c = ROTATE_LEFT(c, 30);
854 877
855 878 W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 50 */
856 879 e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(2) + SHA1_CONST(2);
857 880 b = ROTATE_LEFT(b, 30);
858 881
859 882 W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 51 */
860 883 d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(3) + SHA1_CONST(2);
861 884 a = ROTATE_LEFT(a, 30);
862 885
863 886 W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 52 */
864 887 c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(4) + SHA1_CONST(2);
865 888 e = ROTATE_LEFT(e, 30);
866 889
867 890 W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 53 */
868 891 b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(5) + SHA1_CONST(2);
869 892 d = ROTATE_LEFT(d, 30);
870 893
871 894 W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 54 */
872 895 a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(6) + SHA1_CONST(2);
873 896 c = ROTATE_LEFT(c, 30);
874 897
875 898 W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 55 */
876 899 e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(7) + SHA1_CONST(2);
877 900 b = ROTATE_LEFT(b, 30);
878 901
879 902 W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 56 */
880 903 d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(8) + SHA1_CONST(2);
881 904 a = ROTATE_LEFT(a, 30);
882 905
883 906 W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 57 */
884 907 c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(9) + SHA1_CONST(2);
885 908 e = ROTATE_LEFT(e, 30);
886 909
887 910 W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 58 */
888 911 b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(10) + SHA1_CONST(2);
889 912 d = ROTATE_LEFT(d, 30);
890 913
891 914 W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 59 */
892 915 a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(11) + SHA1_CONST(2);
893 916 c = ROTATE_LEFT(c, 30);
894 917
895 918 /* round 4 */
896 919 W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 60 */
897 920 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(12) + SHA1_CONST(3);
898 921 b = ROTATE_LEFT(b, 30);
899 922
900 923 W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 61 */
901 924 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(13) + SHA1_CONST(3);
902 925 a = ROTATE_LEFT(a, 30);
903 926
904 927 W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 62 */
905 928 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(14) + SHA1_CONST(3);
906 929 e = ROTATE_LEFT(e, 30);
907 930
908 931 W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 63 */
909 932 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(15) + SHA1_CONST(3);
910 933 d = ROTATE_LEFT(d, 30);
911 934
912 935 W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 64 */
913 936 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(0) + SHA1_CONST(3);
914 937 c = ROTATE_LEFT(c, 30);
915 938
916 939 W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 65 */
917 940 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(1) + SHA1_CONST(3);
918 941 b = ROTATE_LEFT(b, 30);
919 942
920 943 W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 66 */
921 944 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(2) + SHA1_CONST(3);
922 945 a = ROTATE_LEFT(a, 30);
923 946
924 947 W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 67 */
925 948 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(3) + SHA1_CONST(3);
926 949 e = ROTATE_LEFT(e, 30);
927 950
928 951 W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 68 */
929 952 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(4) + SHA1_CONST(3);
930 953 d = ROTATE_LEFT(d, 30);
931 954
932 955 W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 69 */
933 956 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(5) + SHA1_CONST(3);
934 957 c = ROTATE_LEFT(c, 30);
935 958
936 959 W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 70 */
937 960 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(6) + SHA1_CONST(3);
938 961 b = ROTATE_LEFT(b, 30);
939 962
940 963 W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 71 */
941 964 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(7) + SHA1_CONST(3);
942 965 a = ROTATE_LEFT(a, 30);
943 966
944 967 W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 72 */
945 968 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(8) + SHA1_CONST(3);
946 969 e = ROTATE_LEFT(e, 30);
947 970
948 971 W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 73 */
949 972 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(9) + SHA1_CONST(3);
950 973 d = ROTATE_LEFT(d, 30);
951 974
952 975 W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 74 */
953 976 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(10) + SHA1_CONST(3);
954 977 c = ROTATE_LEFT(c, 30);
955 978
956 979 W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 75 */
957 980 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(11) + SHA1_CONST(3);
958 981 b = ROTATE_LEFT(b, 30);
959 982
960 983 W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 76 */
961 984 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(12) + SHA1_CONST(3);
962 985 a = ROTATE_LEFT(a, 30);
963 986
964 987 W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 77 */
965 988 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(13) + SHA1_CONST(3);
966 989 e = ROTATE_LEFT(e, 30);
967 990
968 991 W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 78 */
969 992 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(14) + SHA1_CONST(3);
970 993 d = ROTATE_LEFT(d, 30);
971 994
972 995 W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 79 */
973 996
974 997 ctx->state[0] += ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(15) +
975 998 SHA1_CONST(3);
976 999 ctx->state[1] += b;
977 1000 ctx->state[2] += ROTATE_LEFT(c, 30);
978 1001 ctx->state[3] += d;
979 1002 ctx->state[4] += e;
980 1003
981 1004 /* zeroize sensitive information */
982 1005 W(0) = W(1) = W(2) = W(3) = W(4) = W(5) = W(6) = W(7) = W(8) = 0;
983 1006 W(9) = W(10) = W(11) = W(12) = W(13) = W(14) = W(15) = 0;
984 1007 }
1008 +#endif /* !__amd64 */
985 1009
1010 +
986 1011 /*
987 1012 * Encode()
988 1013 *
989 1014 * purpose: to convert a list of numbers from little endian to big endian
990 1015 * input: uint8_t * : place to store the converted big endian numbers
991 1016 * uint32_t * : place to get numbers to convert from
992 1017 * size_t : the length of the input in bytes
993 1018 * output: void
994 1019 */
995 1020
996 1021 static void
997 1022 Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input,
998 1023 size_t len)
999 1024 {
1000 1025 size_t i, j;
1001 1026
1002 1027 #if defined(__sparc)
1003 1028 if (IS_P2ALIGNED(output, sizeof (uint32_t))) {
1004 1029 for (i = 0, j = 0; j < len; i++, j += 4) {
1005 1030 /* LINTED: pointer alignment */
1006 1031 *((uint32_t *)(output + j)) = input[i];
1007 1032 }
1008 1033 } else {
1009 1034 #endif /* little endian -- will work on big endian, but slowly */
1010 1035 for (i = 0, j = 0; j < len; i++, j += 4) {
1011 1036 output[j] = (input[i] >> 24) & 0xff;
1012 1037 output[j + 1] = (input[i] >> 16) & 0xff;
1013 1038 output[j + 2] = (input[i] >> 8) & 0xff;
1014 1039 output[j + 3] = input[i] & 0xff;
1015 1040 }
1016 1041 #if defined(__sparc)
1017 1042 }
1018 1043 #endif
1019 1044 }
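Reviewer note: a quick illustration of Encode()'s contract -- the host word 0x01020304 must serialize as the bytes 01 02 03 04 on both sparc and x86 (fragment as if written inside this file, since Encode() is static):

	uint32_t in = 0x01020304U;
	uint8_t out[4];

	Encode(out, &in, sizeof (out));
	/* out[0] == 0x01, out[1] == 0x02, out[2] == 0x03, out[3] == 0x04 */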