5072961 Need an optimized MD5 implementation for amd64
--- old/usr/src/common/crypto/md5/md5.c
+++ new/usr/src/common/crypto/md5/md5.c
1 1 /*
2 - * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
2 + * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
3 3 * Use is subject to license terms.
4 4 */
5 5
6 6 /*
7 7 * Cleaned-up and optimized version of MD5, based on the reference
8 8 * implementation provided in RFC 1321. See RSA Copyright information
9 9 * below.
10 10 */
11 11
12 -#pragma ident "@(#)md5.c 1.27 07/04/10 SMI"
12 +#pragma ident "@(#)md5.c 1.28 08/01/02 SMI"
13 13
14 14 /*
15 15 * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
16 16 */
17 17
18 18 /*
19 19 * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
20 20 * rights reserved.
21 21 *
22 22 * License to copy and use this software is granted provided that it
23 23 * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
24 24 * Algorithm" in all material mentioning or referencing this software
25 25 * or this function.
26 26 *
27 27 * License is also granted to make and use derivative works provided
28 28 * that such works are identified as "derived from the RSA Data
29 29 * Security, Inc. MD5 Message-Digest Algorithm" in all material
30 30 * mentioning or referencing the derived work.
31 31 *
32 32 * RSA Data Security, Inc. makes no representations concerning either
33 33 * the merchantability of this software or the suitability of this
34 34 * software for any particular purpose. It is provided "as is"
35 35 * without express or implied warranty of any kind.
36 36 *
37 37 * These notices must be retained in any copies of any part of this
38 38 * documentation and/or software.
39 39 */
40 40
41 41 #include <sys/types.h>
42 42 #include <sys/md5.h>
43 43 #include <sys/md5_consts.h> /* MD5_CONST() optimization */
44 44 #include "md5_byteswap.h"
45 45 #if !defined(_KERNEL) || defined(_BOOT)
46 46 #include <strings.h>
47 47 #endif /* !_KERNEL || _BOOT */
48 48
49 49 #ifdef _KERNEL
50 50 #include <sys/systm.h>
51 51 #endif /* _KERNEL */
52 52
53 53 static void Encode(uint8_t *, const uint32_t *, size_t);
54 +
55 +#if !defined(__amd64)
54 56 static void MD5Transform(uint32_t, uint32_t, uint32_t, uint32_t, MD5_CTX *,
55 57 const uint8_t [64]);
58 +#else
59 +void md5_block_asm_host_order(MD5_CTX *ctx, const void *inpp,
60 + unsigned int input_length_in_blocks);
61 +#endif /* !defined(__amd64) */
56 62
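
md5_block_asm_host_order() is the amd64 assembly entry point added by this change: it consumes input_length_in_blocks consecutive 64-byte blocks starting at inpp and updates ctx->state in place. A hedged C sketch of that contract, under the assumption that one call matches transforming each block in turn (md5_block_c_reference is an illustrative name, not part of the change):

	static void
	md5_block_c_reference(MD5_CTX *ctx, const void *inpp,
	    unsigned int nblocks)
	{
		const uint8_t *p = inpp;

		/* one asm call == one MD5Transform per 64-byte block */
		while (nblocks-- > 0) {
			MD5Transform(ctx->state[0], ctx->state[1],
			    ctx->state[2], ctx->state[3], ctx, p);
			p += 64;
		}
	}
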
57 63 static uint8_t PADDING[64] = { 0x80, /* all zeros */ };
58 64
59 65 /*
60 66 * F, G, H and I are the basic MD5 functions.
61 67 */
62 68 #define F(b, c, d) (((b) & (c)) | ((~b) & (d)))
63 69 #define G(b, c, d) (((b) & (d)) | ((c) & (~d)))
64 70 #define H(b, c, d) ((b) ^ (c) ^ (d))
65 71 #define I(b, c, d) ((c) ^ ((b) | (~d)))
66 72
67 73 /*
68 74 * ROTATE_LEFT rotates x left n bits.
69 75 */
70 76 #define ROTATE_LEFT(x, n) \
71 77 (((x) << (n)) | ((x) >> ((sizeof (x) << 3) - (n))))
72 78
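Since sizeof (x) << 3 is the bit width of x, the right-shift count adapts to the operand's type automatically. A minimal sketch of what one expansion computes for a uint32_t (rotl7 is an illustrative name):

	#include <stdint.h>

	/* ROTATE_LEFT(x, 7) on a uint32_t: (32 - 7) == 25-bit right shift */
	static uint32_t
	rotl7(uint32_t x)
	{
		return ((x << 7) | (x >> 25));
	}
	/* rotl7(0x80000001) == 0xC0: bit 31 wraps around to bit 6 */
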
73 79 /*
74 80 * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
75 81 * Rotation is separate from addition to prevent recomputation.
76 82 */
77 83
78 84 #define FF(a, b, c, d, x, s, ac) { \
79 85 (a) += F((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
80 86 (a) = ROTATE_LEFT((a), (s)); \
81 87 (a) += (b); \
82 88 }
83 89
84 90 #define GG(a, b, c, d, x, s, ac) { \
85 91 (a) += G((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
86 92 (a) = ROTATE_LEFT((a), (s)); \
87 93 (a) += (b); \
88 94 }
89 95
90 96 #define HH(a, b, c, d, x, s, ac) { \
91 97 (a) += H((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
92 98 (a) = ROTATE_LEFT((a), (s)); \
93 99 (a) += (b); \
94 100 }
95 101
96 102 #define II(a, b, c, d, x, s, ac) { \
97 103 (a) += I((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
98 104 (a) = ROTATE_LEFT((a), (s)); \
99 105 (a) += (b); \
100 106 }
101 107
102 108 /*
103 109 * Loading 32-bit constants on a RISC is expensive since it involves both a
104 110 * `sethi' and an `or'. thus, we instead have the compiler generate `ld's to
105 111 * load the constants from an array called `md5_consts'. however, on intel
106 112 * (and other CISC processors), it is cheaper to load the constant
107 113 * directly. thus, the c code in MD5Transform() uses the macro MD5_CONST()
108 114 * which either expands to a constant or an array reference, depending on the
109 115 * architecture the code is being compiled for.
110 116 *
111 117 * Right now, i386 and amd64 are the CISC exceptions.
112 118 * If we get another CISC ISA, we'll have to change the ifdef.
113 119 */
114 120
115 121 #if defined(__i386) || defined(__amd64)
116 122
117 123 #define MD5_CONST(x) (MD5_CONST_ ## x)
118 124 #define MD5_CONST_e(x) MD5_CONST(x)
119 125 #define MD5_CONST_o(x) MD5_CONST(x)
120 126
121 127 #else
122 128 /*
123 129 * sparc/RISC optimization:
124 130 *
125 131 * while it is somewhat counter-intuitive, on sparc (and presumably other RISC
126 132 * machines), it is more efficient to place all the constants used in this
127 133 * function in an array and load the values out of the array than to manually
128 134 * load the constants. this is because setting a register to a 32-bit value
129 135 * takes two ops in most cases: a `sethi' and an `or', but loading a 32-bit
130 136 * value from memory only takes one `ld' (or `lduw' on v9). while this
131 137 * increases memory usage, the compiler can find enough other things to do
132 138 * while waiting to keep the pipeline does not stall. additionally, it is
133 139 * while waiting to keep the pipeline full. additionally, it is
134 140 * not even go out to the bus.
135 141 *
136 142 * this array is declared `static' to keep the compiler from having to
137 143 * bcopy() this array onto the stack frame of MD5Transform() each time it is
138 144 * called -- which is unacceptably expensive.
139 145 *
140 146 * the `const' is to ensure that callers are good citizens and do not try to
141 147 * munge the array. since these routines are going to be called from inside
142 148 * multithreaded kernelland, this is a good safety check. -- `constants' will
143 149 * end up in .rodata.
144 150 *
145 151 * unfortunately, loading from an array in this manner hurts performance under
146 152 * intel (and presumably other CISC machines). so, there is a macro,
147 153 * MD5_CONST(), used in MD5Transform(), that either expands to a reference to
148 154 * this array, or to the actual constant, depending on what platform this code
149 155 * is compiled for.
150 156 */
151 157
152 158 #ifdef sun4v
153 159
154 160 /*
155 161 * Going to load these consts in 8B chunks, so need to enforce 8B alignment
156 162 */
157 163
158 164 /* CSTYLED */
159 165 #pragma align 64 (md5_consts)
160 166 #define _MD5_CHECK_ALIGNMENT
161 167
162 168 #endif /* sun4v */
163 169
164 170 static const uint32_t md5_consts[] = {
165 171 MD5_CONST_0, MD5_CONST_1, MD5_CONST_2, MD5_CONST_3,
166 172 MD5_CONST_4, MD5_CONST_5, MD5_CONST_6, MD5_CONST_7,
167 173 MD5_CONST_8, MD5_CONST_9, MD5_CONST_10, MD5_CONST_11,
168 174 MD5_CONST_12, MD5_CONST_13, MD5_CONST_14, MD5_CONST_15,
169 175 MD5_CONST_16, MD5_CONST_17, MD5_CONST_18, MD5_CONST_19,
170 176 MD5_CONST_20, MD5_CONST_21, MD5_CONST_22, MD5_CONST_23,
171 177 MD5_CONST_24, MD5_CONST_25, MD5_CONST_26, MD5_CONST_27,
172 178 MD5_CONST_28, MD5_CONST_29, MD5_CONST_30, MD5_CONST_31,
173 179 MD5_CONST_32, MD5_CONST_33, MD5_CONST_34, MD5_CONST_35,
174 180 MD5_CONST_36, MD5_CONST_37, MD5_CONST_38, MD5_CONST_39,
175 181 MD5_CONST_40, MD5_CONST_41, MD5_CONST_42, MD5_CONST_43,
176 182 MD5_CONST_44, MD5_CONST_45, MD5_CONST_46, MD5_CONST_47,
177 183 MD5_CONST_48, MD5_CONST_49, MD5_CONST_50, MD5_CONST_51,
178 184 MD5_CONST_52, MD5_CONST_53, MD5_CONST_54, MD5_CONST_55,
179 185 MD5_CONST_56, MD5_CONST_57, MD5_CONST_58, MD5_CONST_59,
180 186 MD5_CONST_60, MD5_CONST_61, MD5_CONST_62, MD5_CONST_63
181 187 };
182 188
183 189
184 190 #ifdef sun4v
185 191 /*
186 192 * To reduce the number of loads, load consts in 64-bit
187 193 * chunks and then split.
188 194 *
189 195 * No need to mask upper 32-bits, as just interested in
190 196 * low 32-bits (saves an & operation and means that this
191 197 * optimization doesn't increase the icount).
192 198 */
193 199 #define MD5_CONST_e(x) (md5_consts64[x/2] >> 32)
194 200 #define MD5_CONST_o(x) (md5_consts64[x/2])
195 201
196 202 #else
197 203
198 204 #define MD5_CONST_e(x) (md5_consts[x])
199 205 #define MD5_CONST_o(x) (md5_consts[x])
200 206
201 207 #endif /* sun4v */
202 208
203 209 #endif
204 210
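The even/odd split works because sparc is big endian: an aligned 8-byte load of md5_consts[2k] and md5_consts[2k+1] leaves the even constant in the upper half of the doubleword and the odd one in the lower half, where plain 32-bit truncation recovers it without a mask. A hedged sketch using the first two RFC 1321 constants (const_split_sketch is an illustrative name):

	#include <stdint.h>

	static void
	const_split_sketch(void)
	{
		/* md5_consts[0], md5_consts[1] as one big-endian doubleword */
		uint64_t pair = 0xd76aa478e8c7b756ULL;

		uint32_t even = (uint32_t)(pair >> 32);	/* MD5_CONST_e(0) */
		uint32_t odd = (uint32_t)pair;		/* MD5_CONST_o(1) */
	}
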
205 211 /*
206 212 * MD5Init()
207 213 *
208 214 * purpose: initializes the md5 context and begins an md5 digest operation
209 215 * input: MD5_CTX * : the context to initialize.
210 216 * output: void
211 217 */
212 218
213 219 void
214 220 MD5Init(MD5_CTX *ctx)
215 221 {
216 222 ctx->count[0] = ctx->count[1] = 0;
217 223
218 224 /* load magic initialization constants */
219 225 ctx->state[0] = MD5_INIT_CONST_1;
220 226 ctx->state[1] = MD5_INIT_CONST_2;
221 227 ctx->state[2] = MD5_INIT_CONST_3;
222 228 ctx->state[3] = MD5_INIT_CONST_4;
223 229 }
224 230
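For reference, the three calls go together as follows; a minimal hedged usage sketch (digest_example is an illustrative name, and the digest value is the RFC 1321 test vector for "abc"):

	#include <sys/types.h>
	#include <sys/md5.h>

	static void
	digest_example(void)
	{
		MD5_CTX ctx;
		uint8_t digest[16];	/* an MD5 digest is always 16 bytes */

		MD5Init(&ctx);
		MD5Update(&ctx, "abc", 3);
		MD5Final(digest, &ctx);
		/* digest == 900150983cd24fb0d6963f7d28e17f72 */
	}
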
225 231 /*
226 232 * MD5Update()
227 233 *
228 234 * purpose: continues an md5 digest operation, using the message block
229 235 * to update the context.
230 236 * input: MD5_CTX * : the context to update
231 237 * uint8_t * : the message block
232 238 * uint32_t : the length of the message block in bytes
233 239 * output: void
234 240 *
235 241 * MD5 crunches in 64-byte blocks. All numeric constants here are related to
236 242 * that property of MD5.
237 243 */
238 244
239 245 void
240 246 MD5Update(MD5_CTX *ctx, const void *inpp, unsigned int input_len)
241 247 {
242 248 uint32_t i, buf_index, buf_len;
243 249 #ifdef sun4v
244 250 uint32_t old_asi;
245 251 #endif /* sun4v */
252 +#if defined(__amd64)
253 + uint32_t block_count;
254 +#endif /* defined(__amd64) */
246 255 const unsigned char *input = (const unsigned char *)inpp;
247 256
248 257 /* compute (number of bytes computed so far) mod 64 */
249 258 buf_index = (ctx->count[0] >> 3) & 0x3F;
250 259
251 260 /* update number of bits hashed into this MD5 computation so far */
252 261 if ((ctx->count[0] += (input_len << 3)) < (input_len << 3))
253 - ctx->count[1]++;
262 + ctx->count[1]++;
254 263 ctx->count[1] += (input_len >> 29);
255 264
256 265 buf_len = 64 - buf_index;
257 266
258 267 /* transform as many times as possible */
259 268 i = 0;
260 269 if (input_len >= buf_len) {
261 270
262 271 /*
263 272 * general optimization:
264 273 *
265 274 * only do initial bcopy() and MD5Transform() if
266 275 * buf_index != 0. if buf_index == 0, we're just
267 276 * wasting our time doing the bcopy() since there
268 277 * wasn't any data left over from a previous call to
269 278 * MD5Update().
270 279 */
271 280
272 281 #ifdef sun4v
273 282 /*
274 283 * For N1 use %asi register. However, costly to repeatedly set
275 284 * in MD5Transform. Therefore, set once here.
276 285 * Should probably restore the old value afterwards...
277 286 */
278 287 old_asi = get_little();
279 288 set_little(0x88);
280 289 #endif /* sun4v */
281 290
282 291 if (buf_index) {
283 292 bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
284 293
294 +#if !defined(__amd64)
285 295 MD5Transform(ctx->state[0], ctx->state[1],
286 296 ctx->state[2], ctx->state[3], ctx,
287 297 ctx->buf_un.buf8);
298 +#else
299 + md5_block_asm_host_order(ctx, ctx->buf_un.buf8, 1);
300 +#endif /* !defined(__amd64) */
288 301
289 302 i = buf_len;
290 303 }
291 304
305 +#if !defined(__amd64)
292 306 for (; i + 63 < input_len; i += 64)
293 307 MD5Transform(ctx->state[0], ctx->state[1],
294 308 ctx->state[2], ctx->state[3], ctx, &input[i]);
295 309
310 +#else
311 + block_count = (input_len - i) >> 6;
312 + if (block_count > 0) {
313 + md5_block_asm_host_order(ctx, &input[i], block_count);
314 + i += block_count << 6;
315 + }
316 +#endif /* !defined(__amd64) */
296 317
318 +
297 319 #ifdef sun4v
298 320 /*
299 321 * Restore old %ASI value
300 322 */
301 323 set_little(old_asi);
302 324 #endif /* sun4v */
303 325
304 326 /*
305 327 * general optimization:
306 328 *
307 329 * if i and input_len are the same, return now instead
308 330 * of calling bcopy(), since the bcopy() in this
309 331 * case will be an expensive nop.
310 332 */
311 333
312 334 if (input_len == i)
313 335 return;
314 336
315 337 buf_index = 0;
316 338 }
317 339
318 340 /* buffer remaining input */
319 341 bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
320 342 }
321 343
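Two pieces of arithmetic above are easy to misread. First, the message length is a 64-bit bit count split across count[0] and count[1]: the unsigned wraparound test detects the carry out of the low word, and input_len >> 29 contributes the high bits of input_len * 8. Second, the amd64 path does whole-block math with shifts: for input_len - i == 200, block_count == 3 and i advances by 192, leaving 8 bytes to be buffered. A hedged sketch of the counter update (count_sketch is an illustrative name):

	#include <sys/types.h>

	static void
	count_sketch(uint32_t count[2], uint32_t input_len)
	{
		uint32_t bits = input_len << 3;	/* bytes -> bits, low word */

		count[0] += bits;
		if (count[0] < bits)		/* wrapped, so carry out */
			count[1]++;
		count[1] += input_len >> 29;	/* high bits of input_len * 8 */
	}
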
322 344 /*
323 345 * MD5Final()
324 346 *
325 347 * purpose: ends an md5 digest operation, finalizing the message digest and
326 348 * zeroing the context.
327 349 * input: uchar_t * : a buffer to store the digest in
328 350 * : The function actually uses void* because many
329 351 * : callers pass things other than uchar_t here.
330 352 * MD5_CTX * : the context to finalize, save, and zero
331 353 * output: void
332 354 */
333 355
334 356 void
335 357 MD5Final(void *digest, MD5_CTX *ctx)
336 358 {
337 359 uint8_t bitcount_le[sizeof (ctx->count)];
338 360 uint32_t index = (ctx->count[0] >> 3) & 0x3f;
339 361
340 362 /* store bit count, little endian */
341 363 Encode(bitcount_le, ctx->count, sizeof (bitcount_le));
342 364
343 365 /* pad out to 56 mod 64 */
344 366 MD5Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
345 367
346 368 /* append length (before padding) */
347 369 MD5Update(ctx, bitcount_le, sizeof (bitcount_le));
348 370
349 371 /* store state in digest */
350 372 Encode(digest, ctx->state, sizeof (ctx->state));
351 373
352 374 /* zeroize sensitive information */
353 375 bzero(ctx, sizeof (*ctx));
354 376 }
355 377
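The pad length ((index < 56) ? 56 : 120) - index always lands the buffer at 56 mod 64, leaving exactly 8 bytes of the final block for the bit count. A hedged self-check of that expression (pad_len_check is an illustrative name):

	#include <assert.h>
	#include <sys/types.h>

	static void
	pad_len_check(void)
	{
		uint32_t index;

		for (index = 0; index < 64; index++) {
			uint32_t pad = ((index < 56) ? 56 : 120) - index;

			assert(pad >= 1 && pad <= 64);	  /* never empty */
			assert((index + pad) % 64 == 56); /* room for length */
		}
	}
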
356 378 #ifndef _KERNEL
357 379
358 380 void
359 381 md5_calc(unsigned char *output, unsigned char *input, unsigned int inlen)
360 382 {
361 383 MD5_CTX context;
362 384
363 385 MD5Init(&context);
364 386 MD5Update(&context, input, inlen);
365 387 MD5Final(output, &context);
366 388 }
367 389
368 390 #endif /* !_KERNEL */
369 391
392 +#if !defined(__amd64)
370 393 /*
371 394 * sparc register window optimization:
372 395 *
373 396 * `a', `b', `c', and `d' are passed into MD5Transform explicitly
374 397 * since it increases the number of registers available to the
375 398 * compiler. under this scheme, these variables can be held in
376 399 * %i0 - %i3, which leaves more local and out registers available.
377 400 */
378 401
379 402 /*
380 403 * MD5Transform()
381 404 *
382 405 * purpose: md5 transformation -- updates the digest based on `block'
383 406 * input: uint32_t : bytes 1 - 4 of the digest
384 407 * uint32_t : bytes 5 - 8 of the digest
385 408 * uint32_t : bytes 9 - 12 of the digest
386 409 * uint32_t : bytes 13 - 16 of the digest
387 410 * MD5_CTX * : the context to update
388 411 * uint8_t [64]: the block to use to update the digest
389 412 * output: void
390 413 */
391 414
392 415 static void
393 416 MD5Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
394 417 MD5_CTX *ctx, const uint8_t block[64])
395 418 {
396 419 /*
397 420 * general optimization:
398 421 *
399 422 * use individual integers instead of using an array. this is a
400 423 * win, although the amount it wins by seems to vary quite a bit.
401 424 */
402 425
403 426 register uint32_t x_0, x_1, x_2, x_3, x_4, x_5, x_6, x_7;
404 427 register uint32_t x_8, x_9, x_10, x_11, x_12, x_13, x_14, x_15;
405 428 #ifdef sun4v
406 429 unsigned long long *md5_consts64;
407 430
408 431 /* LINTED E_BAD_PTR_CAST_ALIGN */
409 432 md5_consts64 = (unsigned long long *) md5_consts;
410 433 #endif /* sun4v */
411 434
412 435 /*
413 436 * general optimization:
414 437 *
415 438 * the compiler (at least SC4.2/5.x) generates better code if
416 439 * variable use is localized. in this case, swapping the integers in
417 440 * this order allows `x_0' to be swapped nearest to its first use in
418 441 * FF(), and likewise for `x_1' and up. note that the compiler
419 442 * prefers this to doing each swap right before the FF() that
420 443 * uses it.
421 444 */
422 445
423 446 /*
424 447 * sparc v9/v8plus optimization:
425 448 *
426 449 * if `block' is already aligned on a 4-byte boundary, use the
427 450 * optimized load_little_32() directly. otherwise, bcopy()
428 451 * into a buffer that *is* aligned on a 4-byte boundary and
429 452 * then do the load_little_32() on that buffer. benchmarks
430 453 * have shown that using the bcopy() is better than loading
431 454 * the bytes individually and doing the endian-swap by hand.
432 455 *
433 456 * even though it's quite tempting to simply do:
434 457 *
435 458 * blk = bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
436 459 *
437 460 * and only have one set of LOAD_LITTLE_32()'s, the compiler (at least
438 461 * SC4.2/5.x) *does not* like that, so please resist the urge.
439 462 */
440 463
441 464 #ifdef _MD5_CHECK_ALIGNMENT
442 465 if ((uintptr_t)block & 0x3) { /* not 4-byte aligned? */
443 466 bcopy(block, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
444 467
445 468 #ifdef sun4v
446 469 x_15 = LOAD_LITTLE_32_f(ctx->buf_un.buf32);
447 470 x_14 = LOAD_LITTLE_32_e(ctx->buf_un.buf32);
448 471 x_13 = LOAD_LITTLE_32_d(ctx->buf_un.buf32);
449 472 x_12 = LOAD_LITTLE_32_c(ctx->buf_un.buf32);
450 473 x_11 = LOAD_LITTLE_32_b(ctx->buf_un.buf32);
451 474 x_10 = LOAD_LITTLE_32_a(ctx->buf_un.buf32);
452 475 x_9 = LOAD_LITTLE_32_9(ctx->buf_un.buf32);
453 476 x_8 = LOAD_LITTLE_32_8(ctx->buf_un.buf32);
454 477 x_7 = LOAD_LITTLE_32_7(ctx->buf_un.buf32);
455 478 x_6 = LOAD_LITTLE_32_6(ctx->buf_un.buf32);
456 479 x_5 = LOAD_LITTLE_32_5(ctx->buf_un.buf32);
457 480 x_4 = LOAD_LITTLE_32_4(ctx->buf_un.buf32);
458 481 x_3 = LOAD_LITTLE_32_3(ctx->buf_un.buf32);
459 482 x_2 = LOAD_LITTLE_32_2(ctx->buf_un.buf32);
460 483 x_1 = LOAD_LITTLE_32_1(ctx->buf_un.buf32);
461 484 x_0 = LOAD_LITTLE_32_0(ctx->buf_un.buf32);
462 485 #else
463 486 x_15 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 15);
464 487 x_14 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 14);
465 488 x_13 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 13);
466 489 x_12 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 12);
467 490 x_11 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 11);
468 491 x_10 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 10);
469 492 x_9 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 9);
470 493 x_8 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 8);
471 494 x_7 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 7);
472 495 x_6 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 6);
473 496 x_5 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 5);
474 497 x_4 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 4);
475 498 x_3 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 3);
476 499 x_2 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 2);
477 500 x_1 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 1);
478 501 x_0 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 0);
479 502 #endif /* sun4v */
480 503 } else
481 504 #endif
482 505 {
483 506
484 507 #ifdef sun4v
485 508 /* LINTED E_BAD_PTR_CAST_ALIGN */
486 509 x_15 = LOAD_LITTLE_32_f(block);
487 510 /* LINTED E_BAD_PTR_CAST_ALIGN */
488 511 x_14 = LOAD_LITTLE_32_e(block);
489 512 /* LINTED E_BAD_PTR_CAST_ALIGN */
490 513 x_13 = LOAD_LITTLE_32_d(block);
491 514 /* LINTED E_BAD_PTR_CAST_ALIGN */
492 515 x_12 = LOAD_LITTLE_32_c(block);
493 516 /* LINTED E_BAD_PTR_CAST_ALIGN */
494 517 x_11 = LOAD_LITTLE_32_b(block);
495 518 /* LINTED E_BAD_PTR_CAST_ALIGN */
496 519 x_10 = LOAD_LITTLE_32_a(block);
497 520 /* LINTED E_BAD_PTR_CAST_ALIGN */
498 521 x_9 = LOAD_LITTLE_32_9(block);
499 522 /* LINTED E_BAD_PTR_CAST_ALIGN */
500 523 x_8 = LOAD_LITTLE_32_8(block);
501 524 /* LINTED E_BAD_PTR_CAST_ALIGN */
502 525 x_7 = LOAD_LITTLE_32_7(block);
503 526 /* LINTED E_BAD_PTR_CAST_ALIGN */
504 527 x_6 = LOAD_LITTLE_32_6(block);
505 528 /* LINTED E_BAD_PTR_CAST_ALIGN */
506 529 x_5 = LOAD_LITTLE_32_5(block);
507 530 /* LINTED E_BAD_PTR_CAST_ALIGN */
508 531 x_4 = LOAD_LITTLE_32_4(block);
509 532 /* LINTED E_BAD_PTR_CAST_ALIGN */
510 533 x_3 = LOAD_LITTLE_32_3(block);
511 534 /* LINTED E_BAD_PTR_CAST_ALIGN */
512 535 x_2 = LOAD_LITTLE_32_2(block);
513 536 /* LINTED E_BAD_PTR_CAST_ALIGN */
514 537 x_1 = LOAD_LITTLE_32_1(block);
515 538 /* LINTED E_BAD_PTR_CAST_ALIGN */
516 539 x_0 = LOAD_LITTLE_32_0(block);
517 540 #else
518 541 /* LINTED E_BAD_PTR_CAST_ALIGN */
519 542 x_15 = LOAD_LITTLE_32(block + 60);
520 543 /* LINTED E_BAD_PTR_CAST_ALIGN */
521 544 x_14 = LOAD_LITTLE_32(block + 56);
522 545 /* LINTED E_BAD_PTR_CAST_ALIGN */
523 546 x_13 = LOAD_LITTLE_32(block + 52);
524 547 /* LINTED E_BAD_PTR_CAST_ALIGN */
525 548 x_12 = LOAD_LITTLE_32(block + 48);
526 549 /* LINTED E_BAD_PTR_CAST_ALIGN */
527 550 x_11 = LOAD_LITTLE_32(block + 44);
528 551 /* LINTED E_BAD_PTR_CAST_ALIGN */
529 552 x_10 = LOAD_LITTLE_32(block + 40);
530 553 /* LINTED E_BAD_PTR_CAST_ALIGN */
531 554 x_9 = LOAD_LITTLE_32(block + 36);
532 555 /* LINTED E_BAD_PTR_CAST_ALIGN */
533 556 x_8 = LOAD_LITTLE_32(block + 32);
534 557 /* LINTED E_BAD_PTR_CAST_ALIGN */
535 558 x_7 = LOAD_LITTLE_32(block + 28);
536 559 /* LINTED E_BAD_PTR_CAST_ALIGN */
537 560 x_6 = LOAD_LITTLE_32(block + 24);
538 561 /* LINTED E_BAD_PTR_CAST_ALIGN */
539 562 x_5 = LOAD_LITTLE_32(block + 20);
540 563 /* LINTED E_BAD_PTR_CAST_ALIGN */
541 564 x_4 = LOAD_LITTLE_32(block + 16);
542 565 /* LINTED E_BAD_PTR_CAST_ALIGN */
543 566 x_3 = LOAD_LITTLE_32(block + 12);
544 567 /* LINTED E_BAD_PTR_CAST_ALIGN */
545 568 x_2 = LOAD_LITTLE_32(block + 8);
546 569 /* LINTED E_BAD_PTR_CAST_ALIGN */
547 570 x_1 = LOAD_LITTLE_32(block + 4);
548 571 /* LINTED E_BAD_PTR_CAST_ALIGN */
549 572 x_0 = LOAD_LITTLE_32(block + 0);
550 573 #endif /* sun4v */
551 574 }
552 575
553 576 /* round 1 */
554 577 FF(a, b, c, d, x_0, MD5_SHIFT_11, MD5_CONST_e(0)); /* 1 */
555 578 FF(d, a, b, c, x_1, MD5_SHIFT_12, MD5_CONST_o(1)); /* 2 */
556 579 FF(c, d, a, b, x_2, MD5_SHIFT_13, MD5_CONST_e(2)); /* 3 */
557 580 FF(b, c, d, a, x_3, MD5_SHIFT_14, MD5_CONST_o(3)); /* 4 */
558 581 FF(a, b, c, d, x_4, MD5_SHIFT_11, MD5_CONST_e(4)); /* 5 */
559 582 FF(d, a, b, c, x_5, MD5_SHIFT_12, MD5_CONST_o(5)); /* 6 */
560 583 FF(c, d, a, b, x_6, MD5_SHIFT_13, MD5_CONST_e(6)); /* 7 */
561 584 FF(b, c, d, a, x_7, MD5_SHIFT_14, MD5_CONST_o(7)); /* 8 */
562 585 FF(a, b, c, d, x_8, MD5_SHIFT_11, MD5_CONST_e(8)); /* 9 */
563 586 FF(d, a, b, c, x_9, MD5_SHIFT_12, MD5_CONST_o(9)); /* 10 */
564 587 FF(c, d, a, b, x_10, MD5_SHIFT_13, MD5_CONST_e(10)); /* 11 */
565 588 FF(b, c, d, a, x_11, MD5_SHIFT_14, MD5_CONST_o(11)); /* 12 */
566 589 FF(a, b, c, d, x_12, MD5_SHIFT_11, MD5_CONST_e(12)); /* 13 */
567 590 FF(d, a, b, c, x_13, MD5_SHIFT_12, MD5_CONST_o(13)); /* 14 */
568 591 FF(c, d, a, b, x_14, MD5_SHIFT_13, MD5_CONST_e(14)); /* 15 */
569 592 FF(b, c, d, a, x_15, MD5_SHIFT_14, MD5_CONST_o(15)); /* 16 */
570 593
571 594 /* round 2 */
572 595 GG(a, b, c, d, x_1, MD5_SHIFT_21, MD5_CONST_e(16)); /* 17 */
573 596 GG(d, a, b, c, x_6, MD5_SHIFT_22, MD5_CONST_o(17)); /* 18 */
574 597 GG(c, d, a, b, x_11, MD5_SHIFT_23, MD5_CONST_e(18)); /* 19 */
575 598 GG(b, c, d, a, x_0, MD5_SHIFT_24, MD5_CONST_o(19)); /* 20 */
576 599 GG(a, b, c, d, x_5, MD5_SHIFT_21, MD5_CONST_e(20)); /* 21 */
577 600 GG(d, a, b, c, x_10, MD5_SHIFT_22, MD5_CONST_o(21)); /* 22 */
578 601 GG(c, d, a, b, x_15, MD5_SHIFT_23, MD5_CONST_e(22)); /* 23 */
579 602 GG(b, c, d, a, x_4, MD5_SHIFT_24, MD5_CONST_o(23)); /* 24 */
580 603 GG(a, b, c, d, x_9, MD5_SHIFT_21, MD5_CONST_e(24)); /* 25 */
581 604 GG(d, a, b, c, x_14, MD5_SHIFT_22, MD5_CONST_o(25)); /* 26 */
582 605 GG(c, d, a, b, x_3, MD5_SHIFT_23, MD5_CONST_e(26)); /* 27 */
583 606 GG(b, c, d, a, x_8, MD5_SHIFT_24, MD5_CONST_o(27)); /* 28 */
584 607 GG(a, b, c, d, x_13, MD5_SHIFT_21, MD5_CONST_e(28)); /* 29 */
585 608 GG(d, a, b, c, x_2, MD5_SHIFT_22, MD5_CONST_o(29)); /* 30 */
586 609 GG(c, d, a, b, x_7, MD5_SHIFT_23, MD5_CONST_e(30)); /* 31 */
587 610 GG(b, c, d, a, x_12, MD5_SHIFT_24, MD5_CONST_o(31)); /* 32 */
588 611
589 612 /* round 3 */
590 613 HH(a, b, c, d, x_5, MD5_SHIFT_31, MD5_CONST_e(32)); /* 33 */
591 614 HH(d, a, b, c, x_8, MD5_SHIFT_32, MD5_CONST_o(33)); /* 34 */
592 615 HH(c, d, a, b, x_11, MD5_SHIFT_33, MD5_CONST_e(34)); /* 35 */
593 616 HH(b, c, d, a, x_14, MD5_SHIFT_34, MD5_CONST_o(35)); /* 36 */
594 617 HH(a, b, c, d, x_1, MD5_SHIFT_31, MD5_CONST_e(36)); /* 37 */
595 618 HH(d, a, b, c, x_4, MD5_SHIFT_32, MD5_CONST_o(37)); /* 38 */
596 619 HH(c, d, a, b, x_7, MD5_SHIFT_33, MD5_CONST_e(38)); /* 39 */
597 620 HH(b, c, d, a, x_10, MD5_SHIFT_34, MD5_CONST_o(39)); /* 40 */
598 621 HH(a, b, c, d, x_13, MD5_SHIFT_31, MD5_CONST_e(40)); /* 41 */
599 622 HH(d, a, b, c, x_0, MD5_SHIFT_32, MD5_CONST_o(41)); /* 42 */
600 623 HH(c, d, a, b, x_3, MD5_SHIFT_33, MD5_CONST_e(42)); /* 43 */
601 624 HH(b, c, d, a, x_6, MD5_SHIFT_34, MD5_CONST_o(43)); /* 44 */
602 625 HH(a, b, c, d, x_9, MD5_SHIFT_31, MD5_CONST_e(44)); /* 45 */
603 626 HH(d, a, b, c, x_12, MD5_SHIFT_32, MD5_CONST_o(45)); /* 46 */
604 627 HH(c, d, a, b, x_15, MD5_SHIFT_33, MD5_CONST_e(46)); /* 47 */
605 628 HH(b, c, d, a, x_2, MD5_SHIFT_34, MD5_CONST_o(47)); /* 48 */
606 629
607 630 /* round 4 */
608 631 II(a, b, c, d, x_0, MD5_SHIFT_41, MD5_CONST_e(48)); /* 49 */
609 632 II(d, a, b, c, x_7, MD5_SHIFT_42, MD5_CONST_o(49)); /* 50 */
610 633 II(c, d, a, b, x_14, MD5_SHIFT_43, MD5_CONST_e(50)); /* 51 */
611 634 II(b, c, d, a, x_5, MD5_SHIFT_44, MD5_CONST_o(51)); /* 52 */
612 635 II(a, b, c, d, x_12, MD5_SHIFT_41, MD5_CONST_e(52)); /* 53 */
613 636 II(d, a, b, c, x_3, MD5_SHIFT_42, MD5_CONST_o(53)); /* 54 */
614 637 II(c, d, a, b, x_10, MD5_SHIFT_43, MD5_CONST_e(54)); /* 55 */
615 638 II(b, c, d, a, x_1, MD5_SHIFT_44, MD5_CONST_o(55)); /* 56 */
616 639 II(a, b, c, d, x_8, MD5_SHIFT_41, MD5_CONST_e(56)); /* 57 */
617 640 II(d, a, b, c, x_15, MD5_SHIFT_42, MD5_CONST_o(57)); /* 58 */
618 641 II(c, d, a, b, x_6, MD5_SHIFT_43, MD5_CONST_e(58)); /* 59 */
619 642 II(b, c, d, a, x_13, MD5_SHIFT_44, MD5_CONST_o(59)); /* 60 */
620 643 II(a, b, c, d, x_4, MD5_SHIFT_41, MD5_CONST_e(60)); /* 61 */
621 644 II(d, a, b, c, x_11, MD5_SHIFT_42, MD5_CONST_o(61)); /* 62 */
622 645 II(c, d, a, b, x_2, MD5_SHIFT_43, MD5_CONST_e(62)); /* 63 */
623 646 II(b, c, d, a, x_9, MD5_SHIFT_44, MD5_CONST_o(63)); /* 64 */
624 647
625 648 ctx->state[0] += a;
626 649 ctx->state[1] += b;
627 650 ctx->state[2] += c;
628 651 ctx->state[3] += d;
629 652
630 653 /*
631 654 * zeroize sensitive information -- compiler will optimize
632 655 * this out if everything is kept in registers
633 656 */
634 657
635 658 x_0 = x_1 = x_2 = x_3 = x_4 = x_5 = x_6 = x_7 = x_8 = 0;
636 659 x_9 = x_10 = x_11 = x_12 = x_13 = x_14 = x_15 = 0;
637 660 }
661 +#endif /* !defined(__amd64) */
638 662
639 663 /*
640 664 * Encode()
641 665 *
642 666 * purpose: to convert a list of numbers from host byte order to little endian
643 667 * input: uint8_t * : place to store the converted little endian numbers
644 668 * uint32_t * : place to get numbers to convert from
645 669 * size_t : the length of the input in bytes
646 670 * output: void
647 671 */
648 672
649 673 static void
650 674 Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input,
651 675 size_t input_len)
652 676 {
653 677 size_t i, j;
654 678
655 679 for (i = 0, j = 0; j < input_len; i++, j += sizeof (uint32_t)) {
656 680
657 681 #ifdef _LITTLE_ENDIAN
658 682
659 683 #ifdef _MD5_CHECK_ALIGNMENT
660 684 if ((uintptr_t)output & 0x3) /* Not 4-byte aligned */
661 685 bcopy(input + i, output + j, 4);
662 686 else *(uint32_t *)(output + j) = input[i];
663 687 #else
664 688 /*LINTED E_BAD_PTR_CAST_ALIGN*/
665 689 *(uint32_t *)(output + j) = input[i];
666 690 #endif /* _MD5_CHECK_ALIGNMENT */
667 691
668 692 #else /* big endian -- will work on little endian, but slowly */
669 693
670 694 output[j] = input[i] & 0xff;
671 695 output[j + 1] = (input[i] >> 8) & 0xff;
672 696 output[j + 2] = (input[i] >> 16) & 0xff;
673 697 output[j + 3] = (input[i] >> 24) & 0xff;
674 698 #endif
675 699 }
676 700 }
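
The portable branch stores each word least-significant byte first, which is the order RFC 1321 specifies for both the length field and the digest. A hedged example for one word (encode_one is an illustrative name):

	#include <sys/types.h>

	static void
	encode_one(uint8_t out[4], uint32_t w)	/* e.g. w == 0x0a0b0c0d */
	{
		out[0] = w & 0xff;		/* 0x0d */
		out[1] = (w >> 8) & 0xff;	/* 0x0c */
		out[2] = (w >> 16) & 0xff;	/* 0x0b */
		out[3] = (w >> 24) & 0xff;	/* 0x0a */
	}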