6662791 Need a SHA1 implementation optimized for 64-bit x86
--- old/usr/src/common/crypto/sha1/sha1.c
+++ new/usr/src/common/crypto/sha1/sha1.c
1 1 /*
2 - * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
2 + * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
3 3 * Use is subject to license terms.
4 4 */
5 5
6 -#pragma ident "@(#)sha1.c 1.26 07/04/10 SMI"
6 +#pragma ident "@(#)sha1.c 1.27 08/03/02 SMI"
7 7
8 8 /*
9 9 * The basic framework for this code came from the reference
10 10 * implementation for MD5. That implementation is Copyright (C)
11 11 * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved.
12 12 *
13 13 * License to copy and use this software is granted provided that it
14 14 * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
15 15 * Algorithm" in all material mentioning or referencing this software
16 16 * or this function.
17 17 *
18 18 * License is also granted to make and use derivative works provided
19 19 * that such works are identified as "derived from the RSA Data
20 20 * Security, Inc. MD5 Message-Digest Algorithm" in all material
21 21 * mentioning or referencing the derived work.
22 22 *
23 23 * RSA Data Security, Inc. makes no representations concerning either
24 24 * the merchantability of this software or the suitability of this
25 25 * software for any particular purpose. It is provided "as is"
26 26 * without express or implied warranty of any kind.
27 27 *
28 28 * These notices must be retained in any copies of any part of this
29 29 * documentation and/or software.
30 30 *
  31   31   * NOTE: Cleaned-up and optimized version of SHA1, based on the FIPS 180-1
32 32 * standard, available at http://www.itl.nist.gov/div897/pubs/fip180-1.htm
33 33 * Not as fast as one would like -- further optimizations are encouraged
34 34 * and appreciated.
35 35 */
36 36
37 37 #include <sys/types.h>
38 38 #include <sys/param.h>
39 39 #include <sys/systm.h>
40 40 #include <sys/sysmacros.h>
41 41 #include <sys/sha1.h>
42 42 #include <sys/sha1_consts.h>
43 43
44 44 #ifndef _KERNEL
45 45 #include <strings.h>
46 46 #include <stdlib.h>
47 47 #include <errno.h>
48 48 #include <sys/systeminfo.h>
49 49 #endif /* !_KERNEL */
50 50
51 51 static void Encode(uint8_t *, const uint32_t *, size_t);
52 52
53 53 #if defined(__sparc)
54 54
55 55 #define SHA1_TRANSFORM(ctx, in) \
56 56 SHA1Transform((ctx)->state[0], (ctx)->state[1], (ctx)->state[2], \
57 57 (ctx)->state[3], (ctx)->state[4], (ctx), (in))
58 58
59 59 static void SHA1Transform(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t,
60 60 SHA1_CTX *, const uint8_t *);
61 61
62 +#elif defined(__amd64)
63 +
64 +#define SHA1_TRANSFORM(ctx, in) sha1_block_data_order((ctx), (in), 1)
65 +#define SHA1_TRANSFORM_BLOCKS(ctx, in, num) sha1_block_data_order((ctx), \
66 + (in), (num))
67 +
68 +void sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t num_blocks);
69 +
62 70 #else
63 71
64 72 #define SHA1_TRANSFORM(ctx, in) SHA1Transform((ctx), (in))
65 73
66 74 static void SHA1Transform(SHA1_CTX *, const uint8_t *);
67 75
68 76 #endif
69 77
70 78
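Reviewer note: on amd64 both SHA1_TRANSFORM() and SHA1_TRANSFORM_BLOCKS() funnel into the single assembly entry point sha1_block_data_order(), which consumes num_blocks consecutive 64-byte blocks and updates ctx->state in place. A minimal C sketch of that contract, purely illustrative: the generic one-block SHA1Transform() it calls is compiled out on amd64, and the shipped routine is hand-written assembly.

	/*
	 * Hypothetical portable equivalent of the amd64 assembly
	 * routine: hash num_blocks consecutive 64-byte blocks,
	 * updating ctx->state in place.
	 */
	void
	sha1_block_data_order_c(SHA1_CTX *ctx, const void *inpp,
	    size_t num_blocks)
	{
		const uint8_t *input = inpp;

		while (num_blocks-- > 0) {
			SHA1Transform(ctx, input);	/* one 64-byte block */
			input += 64;
		}
	}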
71 79 static uint8_t PADDING[64] = { 0x80, /* all zeros */ };
72 80
73 81 /*
74 82 * F, G, and H are the basic SHA1 functions.
75 83 */
76 84 #define F(b, c, d) (((b) & (c)) | ((~b) & (d)))
77 85 #define G(b, c, d) ((b) ^ (c) ^ (d))
78 86 #define H(b, c, d) (((b) & (c)) | (((b)|(c)) & (d)))
79 87
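Reviewer note: in FIPS 180-1 terms, F is Ch (rounds 0-19), H is Maj (rounds 40-59), and G is Parity, reused for rounds 20-39 and 60-79 -- which is why round 4 of SHA1Transform() below also uses G. The form of H chosen here saves a bitwise operation:

	/*
	 * ((b) & (c)) | (((b) | (c)) & (d)) == (b & c) | (b & d) | (c & d),
	 * the majority function, computed with one fewer bitwise op.
	 */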
80 88 /*
81 89 * ROTATE_LEFT rotates x left n bits.
82 90 */
83 91
84 92 #if defined(__GNUC__) && defined(_LP64)
85 93 static __inline__ uint64_t
86 94 ROTATE_LEFT(uint64_t value, uint32_t n)
87 95 {
88 96 uint32_t t32;
89 97
90 98 t32 = (uint32_t)value;
91 99 return ((t32 << n) | (t32 >> (32 - n)));
92 100 }
93 101
94 102 #else
95 103
96 104 #define ROTATE_LEFT(x, n) \
97 105 (((x) << (n)) | ((x) >> ((sizeof (x) * NBBY)-(n))))
98 106
99 107 #endif
100 108
101 109 #if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
102 110
103 111 #define HAVE_BSWAP
104 112
105 113 extern __inline__ uint32_t bswap(uint32_t value)
106 114 {
107 115 __asm__("bswap %0" : "+r" (value));
108 116 return (value);
109 117 }
110 118
111 119 #endif
112 120
113 121 /*
114 122 * SHA1Init()
115 123 *
 116  124   * purpose: initializes the sha1 context and begins an sha1 digest operation
 117  125   * input: SHA1_CTX *	: the context to initialize.
118 126 * output: void
119 127 */
120 128
121 129 void
122 130 SHA1Init(SHA1_CTX *ctx)
123 131 {
124 132 ctx->count[0] = ctx->count[1] = 0;
125 133
126 134 /*
127 135 * load magic initialization constants. Tell lint
128 136 * that these constants are unsigned by using U.
129 137 */
130 138
131 139 ctx->state[0] = 0x67452301U;
132 140 ctx->state[1] = 0xefcdab89U;
133 141 ctx->state[2] = 0x98badcfeU;
134 142 ctx->state[3] = 0x10325476U;
135 143 ctx->state[4] = 0xc3d2e1f0U;
136 144 }
137 145
138 146 #ifdef VIS_SHA1
139 147 #ifdef _KERNEL
140 148
141 149 #include <sys/regset.h>
142 150 #include <sys/vis.h>
143 151 #include <sys/fpu/fpusystm.h>
144 152
145 153 /* the alignment for block stores to save fp registers */
146 154 #define VIS_ALIGN (64)
147 155
148 156 extern int sha1_savefp(kfpu_t *, int);
149 157 extern void sha1_restorefp(kfpu_t *);
150 158
151 159 uint32_t vis_sha1_svfp_threshold = 128;
152 160
153 161 #endif /* _KERNEL */
154 162
155 163 /*
156 164 * VIS SHA-1 consts.
157 165 */
158 166 static uint64_t VIS[] = {
159 167 0x8000000080000000ULL,
160 168 0x0002000200020002ULL,
161 169 0x5a8279996ed9eba1ULL,
162 170 0x8f1bbcdcca62c1d6ULL,
163 171 0x012389ab456789abULL};
164 172
165 173 extern void SHA1TransformVIS(uint64_t *, uint32_t *, uint32_t *, uint64_t *);
166 174
167 175
168 176 /*
169 177 * SHA1Update()
170 178 *
171 179 * purpose: continues an sha1 digest operation, using the message block
172 180 * to update the context.
173 181 * input: SHA1_CTX * : the context to update
174 182 * void * : the message block
175 183 * size_t : the length of the message block in bytes
176 184 * output: void
177 185 */
178 186
179 187 void
180 188 SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
181 189 {
182 190 uint32_t i, buf_index, buf_len;
183 191 uint64_t X0[40], input64[8];
184 192 const uint8_t *input = inptr;
185 193 #ifdef _KERNEL
186 194 int usevis = 0;
187 195 #else
188 196 int usevis = 1;
189 197 #endif /* _KERNEL */
190 198
191 199 /* check for noop */
192 200 if (input_len == 0)
193 201 return;
194 202
195 203 /* compute number of bytes mod 64 */
196 204 buf_index = (ctx->count[1] >> 3) & 0x3F;
197 205
198 206 /* update number of bits */
199 207 if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
200 208 ctx->count[0]++;
201 209
202 210 ctx->count[0] += (input_len >> 29);
203 211
204 212 buf_len = 64 - buf_index;
205 213
206 214 /* transform as many times as possible */
207 215 i = 0;
208 216 if (input_len >= buf_len) {
209 217 #ifdef _KERNEL
210 218 kfpu_t *fpu;
211 219 if (fpu_exists) {
212 220 uint8_t fpua[sizeof (kfpu_t) + GSR_SIZE + VIS_ALIGN];
213 221 uint32_t len = (input_len + buf_index) & ~0x3f;
214 222 int svfp_ok;
215 223
216 224 fpu = (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, 64);
217 225 svfp_ok = ((len >= vis_sha1_svfp_threshold) ? 1 : 0);
218 226 usevis = fpu_exists && sha1_savefp(fpu, svfp_ok);
219 227 } else {
220 228 usevis = 0;
221 229 }
222 230 #endif /* _KERNEL */
223 231
224 232 /*
225 233 * general optimization:
226 234 *
227 235 * only do initial bcopy() and SHA1Transform() if
228 236 * buf_index != 0. if buf_index == 0, we're just
229 237 * wasting our time doing the bcopy() since there
230 238 * wasn't any data left over from a previous call to
231 239 * SHA1Update().
232 240 */
233 241
234 242 if (buf_index) {
235 243 bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
236 244 if (usevis) {
237 245 SHA1TransformVIS(X0,
238 246 ctx->buf_un.buf32,
239 247 &ctx->state[0], VIS);
240 248 } else {
241 249 SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
242 250 }
243 251 i = buf_len;
244 252 }
245 253
246 254 /*
247 255 * VIS SHA-1: uses the VIS 1.0 instructions to accelerate
248 256 * SHA-1 processing. This is achieved by "offloading" the
249 257 * computation of the message schedule (MS) to the VIS units.
250 258 * This allows the VIS computation of the message schedule
251 259 * to be performed in parallel with the standard integer
 252  260  		 * processing of the remainder of the SHA-1 computation. This
 253  261  		 * improves performance by up to around 1.37X, compared to an optimized
254 262 * integer-only implementation.
255 263 *
256 264 * The VIS implementation of SHA1Transform has a different API
257 265 * to the standard integer version:
258 266 *
259 267 * void SHA1TransformVIS(
260 268 * uint64_t *, // Pointer to MS for ith block
261 269 * uint32_t *, // Pointer to ith block of message data
 262  270  		 *	uint32_t *, // Pointer to SHA state, i.e. ctx->state
263 271 * uint64_t *, // Pointer to various VIS constants
264 272 * )
265 273 *
 266  274  		 * Note: the message data must be 4-byte aligned.
267 275 *
268 276 * Function requires VIS 1.0 support.
269 277 *
 270  278  		 * Handling is provided to deal with arbitrary byte alignment
271 279 * of the input data but the performance gains are reduced
272 280 * for alignments other than 4-bytes.
273 281 */
274 282 if (usevis) {
275 283 if (!IS_P2ALIGNED(&input[i], sizeof (uint32_t))) {
276 284 /*
277 285 * Main processing loop - input misaligned
278 286 */
279 287 for (; i + 63 < input_len; i += 64) {
280 - bcopy(&input[i], input64, 64);
281 - SHA1TransformVIS(X0, (uint32_t *)input64,
282 - &ctx->state[0], VIS);
288 + bcopy(&input[i], input64, 64);
289 + SHA1TransformVIS(X0,
290 + (uint32_t *)input64,
291 + &ctx->state[0], VIS);
283 292 }
284 293 } else {
285 294 /*
 286  295  				 * Main processing loop - input 4-byte aligned
287 296 */
288 297 for (; i + 63 < input_len; i += 64) {
289 298 SHA1TransformVIS(X0,
290 - /* LINTED E_BAD_PTR_CAST_ALIGN */
299 + /* LINTED E_BAD_PTR_CAST_ALIGN */
291 300 (uint32_t *)&input[i],
292 301 &ctx->state[0], VIS);
293 302 }
294 303
295 304 }
296 305 #ifdef _KERNEL
297 306 sha1_restorefp(fpu);
298 307 #endif /* _KERNEL */
299 308 } else {
300 309 for (; i + 63 < input_len; i += 64) {
301 - SHA1_TRANSFORM(ctx, &input[i]);
310 + SHA1_TRANSFORM(ctx, &input[i]);
302 311 }
303 312 }
304 313
305 314 /*
306 315 * general optimization:
307 316 *
308 317 * if i and input_len are the same, return now instead
309 318 * of calling bcopy(), since the bcopy() in this case
310 319 * will be an expensive nop.
311 320 */
312 321
313 322 if (input_len == i)
314 323 return;
315 324
316 325 buf_index = 0;
317 326 }
318 327
319 328 /* buffer remaining input */
320 329 bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
321 330 }
322 331
323 332 #else /* VIS_SHA1 */
324 333
325 334 void
326 335 SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
327 336 {
328 337 uint32_t i, buf_index, buf_len;
329 338 const uint8_t *input = inptr;
339 +#if defined(__amd64)
340 + uint32_t block_count;
341 +#endif /* __amd64 */
330 342
331 343 /* check for noop */
332 344 if (input_len == 0)
333 345 return;
334 346
335 347 /* compute number of bytes mod 64 */
336 348 buf_index = (ctx->count[1] >> 3) & 0x3F;
337 349
338 350 /* update number of bits */
339 351 if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
340 352 ctx->count[0]++;
341 353
342 354 ctx->count[0] += (input_len >> 29);
343 355
344 356 buf_len = 64 - buf_index;
345 357
346 358 /* transform as many times as possible */
347 359 i = 0;
348 360 if (input_len >= buf_len) {
349 361
350 362 /*
351 363 * general optimization:
352 364 *
353 365 * only do initial bcopy() and SHA1Transform() if
354 366 * buf_index != 0. if buf_index == 0, we're just
355 367 * wasting our time doing the bcopy() since there
356 368 * wasn't any data left over from a previous call to
357 369 * SHA1Update().
358 370 */
359 371
360 372 if (buf_index) {
361 373 bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
362 374 SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
363 375 i = buf_len;
364 376 }
365 377
378 +#if !defined(__amd64)
366 379 for (; i + 63 < input_len; i += 64)
367 380 SHA1_TRANSFORM(ctx, &input[i]);
381 +#else
382 + block_count = (input_len - i) >> 6;
383 + if (block_count > 0) {
384 + SHA1_TRANSFORM_BLOCKS(ctx, &input[i], block_count);
385 + i += block_count << 6;
386 + }
387 +#endif /* !__amd64 */
368 388
369 389 /*
370 390 * general optimization:
371 391 *
372 392 * if i and input_len are the same, return now instead
373 393 * of calling bcopy(), since the bcopy() in this case
374 394 * will be an expensive nop.
375 395 */
376 396
377 397 if (input_len == i)
378 398 return;
379 399
380 400 buf_index = 0;
381 401 }
382 402
383 403 /* buffer remaining input */
384 404 bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
385 405 }
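Reviewer note: the buffering above makes SHA1Update() safe to call with arbitrary chunk sizes; splitting the input cannot change the digest. A small sanity check of that property (hypothetical harness; msg is assumed to be a 128-byte buffer):

	SHA1_CTX c1, c2;
	uint8_t d1[20], d2[20];

	SHA1Init(&c1);
	SHA1Update(&c1, msg, 100);	/* one block hashed, 36 bytes buffered */
	SHA1Update(&c1, msg + 100, 28);
	SHA1Final(d1, &c1);

	SHA1Init(&c2);
	SHA1Update(&c2, msg, 128);	/* same 128 bytes in one call */
	SHA1Final(d2, &c2);

	/* d1 and d2 must be identical */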
386 406
387 407 #endif /* VIS_SHA1 */
388 408
389 409 /*
390 410 * SHA1Final()
391 411 *
392 412 * purpose: ends an sha1 digest operation, finalizing the message digest and
393 413 * zeroing the context.
394 - * input: uchar_t * : a buffer to store the digest in
414 + * input: uchar_t * : A buffer to store the digest.
395 415 * : The function actually uses void* because many
396 416 * : callers pass things other than uchar_t here.
397 417 * SHA1_CTX * : the context to finalize, save, and zero
398 418 * output: void
399 419 */
400 420
401 421 void
402 422 SHA1Final(void *digest, SHA1_CTX *ctx)
403 423 {
404 424 uint8_t bitcount_be[sizeof (ctx->count)];
405 425 uint32_t index = (ctx->count[1] >> 3) & 0x3f;
406 426
407 427 /* store bit count, big endian */
408 428 Encode(bitcount_be, ctx->count, sizeof (bitcount_be));
409 429
410 430 /* pad out to 56 mod 64 */
411 431 SHA1Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
412 432
413 433 /* append length (before padding) */
414 434 SHA1Update(ctx, bitcount_be, sizeof (bitcount_be));
415 435
416 436 /* store state in digest */
417 437 Encode(digest, ctx->state, sizeof (ctx->state));
418 438
419 439 /* zeroize sensitive information */
420 440 bzero(ctx, sizeof (*ctx));
421 441 }
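Reviewer note: for a 3-byte message, index is 3, so the SHA1Update() calls above add 56 - 3 = 53 bytes of PADDING plus the 8-byte big-endian bit count, completing exactly one 64-byte block. An end-to-end check against the well-known FIPS 180-1 "abc" test vector (userland fragment, assuming <stdio.h>):

	SHA1_CTX ctx;
	uint8_t digest[20];
	int i;

	SHA1Init(&ctx);
	SHA1Update(&ctx, "abc", 3);
	SHA1Final(digest, &ctx);

	/* expect a9993e364706816aba3e25717850c26c9cd0d89d */
	for (i = 0; i < 20; i++)
		(void) printf("%02x", digest[i]);
	(void) printf("\n");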
422 442
443 +
444 +#if !defined(__amd64)
445 +
423 446 typedef uint32_t sha1word;
424 447
425 448 /*
426 449 * sparc optimization:
427 450 *
428 451 * on the sparc, we can load big endian 32-bit data easily. note that
429 452 * special care must be taken to ensure the address is 32-bit aligned.
430 453 * in the interest of speed, we don't check to make sure, since
431 454 * careful programming can guarantee this for us.
432 455 */
433 456
434 457 #if defined(_BIG_ENDIAN)
435 458
436 459 #define LOAD_BIG_32(addr) (*(uint32_t *)(addr))
437 460
438 461 #else /* !defined(_BIG_ENDIAN) */
439 462
440 463 #if defined(HAVE_BSWAP)
441 464
442 465 #define LOAD_BIG_32(addr) bswap(*((uint32_t *)(addr)))
443 466
444 467 #else /* !defined(HAVE_BSWAP) */
445 468
446 469 /* little endian -- will work on big endian, but slowly */
447 470 #define LOAD_BIG_32(addr) \
448 471 (((addr)[0] << 24) | ((addr)[1] << 16) | ((addr)[2] << 8) | (addr)[3])
449 472
450 473 #endif /* !defined(HAVE_BSWAP) */
451 474
452 475 #endif /* !defined(_BIG_ENDIAN) */
453 476
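Reviewer note: whichever variant is selected, LOAD_BIG_32() has one contract -- interpret four bytes in memory order as a big-endian 32-bit word. The direct-cast variants additionally require 4-byte alignment, hence the union in this illustrative userland fragment (assuming <assert.h>):

	union {
		uint32_t align;		/* forces 4-byte alignment */
		uint8_t b[4];
	} u;

	u.b[0] = 0x01; u.b[1] = 0x02; u.b[2] = 0x03; u.b[3] = 0x04;

	/* must hold on any host, little or big endian */
	assert(LOAD_BIG_32(u.b) == 0x01020304U);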
454 477 /*
455 478 * SHA1Transform()
456 479 */
457 480 #if defined(W_ARRAY)
458 481 #define W(n) w[n]
459 482 #else /* !defined(W_ARRAY) */
460 483 #define W(n) w_ ## n
461 484 #endif /* !defined(W_ARRAY) */
462 485
463 486
464 487 #if defined(__sparc)
465 488
466 489 /*
467 490 * sparc register window optimization:
468 491 *
469 492 * `a', `b', `c', `d', and `e' are passed into SHA1Transform
470 493 * explicitly since it increases the number of registers available to
471 494 * the compiler. under this scheme, these variables can be held in
472 495 * %i0 - %i4, which leaves more local and out registers available.
473 496 *
474 497 * purpose: sha1 transformation -- updates the digest based on `block'
475 498 * input: uint32_t : bytes 1 - 4 of the digest
476 499 * uint32_t : bytes 5 - 8 of the digest
477 500 * uint32_t : bytes 9 - 12 of the digest
 478  501   *		uint32_t : bytes 13 - 16 of the digest
 479  502   *		uint32_t : bytes 17 - 20 of the digest
480 503 * SHA1_CTX * : the context to update
481 504 * uint8_t [64]: the block to use to update the digest
482 505 * output: void
483 506 */
484 507
485 508 void
486 509 SHA1Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e,
487 510 SHA1_CTX *ctx, const uint8_t blk[64])
488 511 {
489 512 /*
490 513 * sparc optimization:
491 514 *
492 515 * while it is somewhat counter-intuitive, on sparc, it is
493 516 * more efficient to place all the constants used in this
494 517 * function in an array and load the values out of the array
495 518 * than to manually load the constants. this is because
496 519 * setting a register to a 32-bit value takes two ops in most
497 520 * cases: a `sethi' and an `or', but loading a 32-bit value
498 521 * from memory only takes one `ld' (or `lduw' on v9). while
499 522 * this increases memory usage, the compiler can find enough
500 523 * other things to do while waiting to keep the pipeline does
 501  524  	 * other things to do while waiting so that the pipeline does
502 525 * constants are cached so that later accesses do not even go
503 526 * out to the bus.
504 527 *
505 528 * this array is declared `static' to keep the compiler from
506 529 * having to bcopy() this array onto the stack frame of
507 530 * SHA1Transform() each time it is called -- which is
508 531 * unacceptably expensive.
509 532 *
510 533 * the `const' is to ensure that callers are good citizens and
511 534 * do not try to munge the array. since these routines are
512 535 * going to be called from inside multithreaded kernelland,
 513  536  	 * this is a good safety check -- `sha1_consts' will end up in
514 537 * .rodata.
515 538 *
516 539 * unfortunately, loading from an array in this manner hurts
517 540 * performance under intel. so, there is a macro,
518 541 * SHA1_CONST(), used in SHA1Transform(), that either expands to
519 542 * a reference to this array, or to the actual constant,
520 543 * depending on what platform this code is compiled for.
521 544 */
522 545
523 546 static const uint32_t sha1_consts[] = {
524 547 SHA1_CONST_0, SHA1_CONST_1, SHA1_CONST_2, SHA1_CONST_3,
525 548 };
526 549
527 550 /*
528 551 * general optimization:
529 552 *
530 553 * use individual integers instead of using an array. this is a
531 554 * win, although the amount it wins by seems to vary quite a bit.
532 555 */
533 556
534 557 uint32_t w_0, w_1, w_2, w_3, w_4, w_5, w_6, w_7;
535 558 uint32_t w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
536 559
537 560 /*
538 561 * sparc optimization:
539 562 *
540 563 * if `block' is already aligned on a 4-byte boundary, use
541 564 * LOAD_BIG_32() directly. otherwise, bcopy() into a
542 565 * buffer that *is* aligned on a 4-byte boundary and then do
543 566 * the LOAD_BIG_32() on that buffer. benchmarks have shown
544 567 * that using the bcopy() is better than loading the bytes
545 568 * individually and doing the endian-swap by hand.
546 569 *
 547  570  	 * even though it's quite tempting to do:
548 571 *
549 572 * blk = bcopy(ctx->buf_un.buf32, blk, sizeof (ctx->buf_un.buf32));
550 573 *
551 574 * and only have one set of LOAD_BIG_32()'s, the compiler
552 575 * *does not* like that, so please resist the urge.
553 576 */
554 577
555 578 if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? */
556 579 bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
557 580 w_15 = LOAD_BIG_32(ctx->buf_un.buf32 + 15);
558 581 w_14 = LOAD_BIG_32(ctx->buf_un.buf32 + 14);
559 582 w_13 = LOAD_BIG_32(ctx->buf_un.buf32 + 13);
560 583 w_12 = LOAD_BIG_32(ctx->buf_un.buf32 + 12);
561 584 w_11 = LOAD_BIG_32(ctx->buf_un.buf32 + 11);
562 585 w_10 = LOAD_BIG_32(ctx->buf_un.buf32 + 10);
563 586 w_9 = LOAD_BIG_32(ctx->buf_un.buf32 + 9);
564 587 w_8 = LOAD_BIG_32(ctx->buf_un.buf32 + 8);
565 588 w_7 = LOAD_BIG_32(ctx->buf_un.buf32 + 7);
566 589 w_6 = LOAD_BIG_32(ctx->buf_un.buf32 + 6);
567 590 w_5 = LOAD_BIG_32(ctx->buf_un.buf32 + 5);
568 591 w_4 = LOAD_BIG_32(ctx->buf_un.buf32 + 4);
569 592 w_3 = LOAD_BIG_32(ctx->buf_un.buf32 + 3);
570 593 w_2 = LOAD_BIG_32(ctx->buf_un.buf32 + 2);
571 594 w_1 = LOAD_BIG_32(ctx->buf_un.buf32 + 1);
572 595 w_0 = LOAD_BIG_32(ctx->buf_un.buf32 + 0);
573 596 } else {
574 597 /*LINTED*/
575 598 w_15 = LOAD_BIG_32(blk + 60);
576 599 /*LINTED*/
577 600 w_14 = LOAD_BIG_32(blk + 56);
578 601 /*LINTED*/
579 602 w_13 = LOAD_BIG_32(blk + 52);
580 603 /*LINTED*/
581 604 w_12 = LOAD_BIG_32(blk + 48);
582 605 /*LINTED*/
583 606 w_11 = LOAD_BIG_32(blk + 44);
584 607 /*LINTED*/
585 608 w_10 = LOAD_BIG_32(blk + 40);
586 609 /*LINTED*/
587 610 w_9 = LOAD_BIG_32(blk + 36);
588 611 /*LINTED*/
589 612 w_8 = LOAD_BIG_32(blk + 32);
590 613 /*LINTED*/
591 614 w_7 = LOAD_BIG_32(blk + 28);
592 615 /*LINTED*/
593 616 w_6 = LOAD_BIG_32(blk + 24);
594 617 /*LINTED*/
595 618 w_5 = LOAD_BIG_32(blk + 20);
596 619 /*LINTED*/
597 620 w_4 = LOAD_BIG_32(blk + 16);
598 621 /*LINTED*/
599 622 w_3 = LOAD_BIG_32(blk + 12);
600 623 /*LINTED*/
601 624 w_2 = LOAD_BIG_32(blk + 8);
602 625 /*LINTED*/
603 626 w_1 = LOAD_BIG_32(blk + 4);
604 627 /*LINTED*/
605 628 w_0 = LOAD_BIG_32(blk + 0);
606 629 }
607 630 #else /* !defined(__sparc) */
608 631
609 632 void
610 633 SHA1Transform(SHA1_CTX *ctx, const uint8_t blk[64])
611 634 {
612 635 sha1word a = ctx->state[0];
613 636 sha1word b = ctx->state[1];
614 637 sha1word c = ctx->state[2];
615 638 sha1word d = ctx->state[3];
616 639 sha1word e = ctx->state[4];
617 640
618 641 #if defined(W_ARRAY)
619 642 sha1word w[16];
620 643 #else /* !defined(W_ARRAY) */
621 644 sha1word w_0, w_1, w_2, w_3, w_4, w_5, w_6, w_7;
622 645 sha1word w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
623 646 #endif /* !defined(W_ARRAY) */
624 647
625 648 W(0) = LOAD_BIG_32(blk + 0);
626 649 W(1) = LOAD_BIG_32(blk + 4);
627 650 W(2) = LOAD_BIG_32(blk + 8);
628 651 W(3) = LOAD_BIG_32(blk + 12);
629 652 W(4) = LOAD_BIG_32(blk + 16);
630 653 W(5) = LOAD_BIG_32(blk + 20);
631 654 W(6) = LOAD_BIG_32(blk + 24);
632 655 W(7) = LOAD_BIG_32(blk + 28);
633 656 W(8) = LOAD_BIG_32(blk + 32);
634 657 W(9) = LOAD_BIG_32(blk + 36);
635 658 W(10) = LOAD_BIG_32(blk + 40);
636 659 W(11) = LOAD_BIG_32(blk + 44);
637 660 W(12) = LOAD_BIG_32(blk + 48);
638 661 W(13) = LOAD_BIG_32(blk + 52);
639 662 W(14) = LOAD_BIG_32(blk + 56);
640 663 W(15) = LOAD_BIG_32(blk + 60);
641 664
642 665 #endif /* !defined(__sparc) */
643 666
644 667 /*
645 668 * general optimization:
646 669 *
647 670 * even though this approach is described in the standard as
648 671 * being slower algorithmically, it is 30-40% faster than the
649 672 * "faster" version under SPARC, because this version has more
650 673 * of the constraints specified at compile-time and uses fewer
651 674 * variables (and therefore has better register utilization)
652 675 * than its "speedier" brother. (i've tried both, trust me)
653 676 *
654 677 * for either method given in the spec, there is an "assignment"
655 678 * phase where the following takes place:
656 679 *
657 680 * tmp = (main_computation);
658 681 * e = d; d = c; c = rotate_left(b, 30); b = a; a = tmp;
659 682 *
660 683 * we can make the algorithm go faster by not doing this work,
661 684 * but just pretending that `d' is now `e', etc. this works
662 685 * really well and obviates the need for a temporary variable.
663 - * however, we still explictly perform the rotate action,
686 + * however, we still explicitly perform the rotate action,
664 687 * since it is cheaper on SPARC to do it once than to have to
665 688 * do it over and over again.
666 689 */
667 690
668 691 /* round 1 */
669 692 e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(0) + SHA1_CONST(0); /* 0 */
670 693 b = ROTATE_LEFT(b, 30);
671 694
672 695 d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(1) + SHA1_CONST(0); /* 1 */
673 696 a = ROTATE_LEFT(a, 30);
674 697
675 698 c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(2) + SHA1_CONST(0); /* 2 */
676 699 e = ROTATE_LEFT(e, 30);
677 700
678 701 b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(3) + SHA1_CONST(0); /* 3 */
679 702 d = ROTATE_LEFT(d, 30);
680 703
681 704 a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(4) + SHA1_CONST(0); /* 4 */
682 705 c = ROTATE_LEFT(c, 30);
683 706
684 707 e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(5) + SHA1_CONST(0); /* 5 */
685 708 b = ROTATE_LEFT(b, 30);
686 709
687 710 d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(6) + SHA1_CONST(0); /* 6 */
688 711 a = ROTATE_LEFT(a, 30);
689 712
690 713 c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(7) + SHA1_CONST(0); /* 7 */
691 714 e = ROTATE_LEFT(e, 30);
692 715
693 716 b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(8) + SHA1_CONST(0); /* 8 */
694 717 d = ROTATE_LEFT(d, 30);
695 718
696 719 a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(9) + SHA1_CONST(0); /* 9 */
697 720 c = ROTATE_LEFT(c, 30);
698 721
699 722 e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(10) + SHA1_CONST(0); /* 10 */
700 723 b = ROTATE_LEFT(b, 30);
701 724
702 725 d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(11) + SHA1_CONST(0); /* 11 */
703 726 a = ROTATE_LEFT(a, 30);
704 727
705 728 c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(12) + SHA1_CONST(0); /* 12 */
706 729 e = ROTATE_LEFT(e, 30);
707 730
708 731 b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(13) + SHA1_CONST(0); /* 13 */
709 732 d = ROTATE_LEFT(d, 30);
710 733
711 734 a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(14) + SHA1_CONST(0); /* 14 */
712 735 c = ROTATE_LEFT(c, 30);
713 736
714 737 e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(15) + SHA1_CONST(0); /* 15 */
715 738 b = ROTATE_LEFT(b, 30);
716 739
717 740 W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 16 */
718 741 d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(0) + SHA1_CONST(0);
719 742 a = ROTATE_LEFT(a, 30);
720 743
721 744 W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 17 */
722 745 c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(1) + SHA1_CONST(0);
723 746 e = ROTATE_LEFT(e, 30);
724 747
725 748 W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 18 */
726 749 b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(2) + SHA1_CONST(0);
727 750 d = ROTATE_LEFT(d, 30);
728 751
729 752 W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 19 */
730 753 a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(3) + SHA1_CONST(0);
731 754 c = ROTATE_LEFT(c, 30);
732 755
733 756 /* round 2 */
734 757 W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 20 */
735 758 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(4) + SHA1_CONST(1);
736 759 b = ROTATE_LEFT(b, 30);
737 760
738 761 W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 21 */
739 762 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(5) + SHA1_CONST(1);
740 763 a = ROTATE_LEFT(a, 30);
741 764
742 765 W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 22 */
743 766 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(6) + SHA1_CONST(1);
744 767 e = ROTATE_LEFT(e, 30);
745 768
746 769 W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 23 */
747 770 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(7) + SHA1_CONST(1);
748 771 d = ROTATE_LEFT(d, 30);
749 772
750 773 W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 24 */
751 774 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(8) + SHA1_CONST(1);
752 775 c = ROTATE_LEFT(c, 30);
753 776
754 777 W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 25 */
755 778 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(9) + SHA1_CONST(1);
756 779 b = ROTATE_LEFT(b, 30);
757 780
758 781 W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 26 */
759 782 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(10) + SHA1_CONST(1);
760 783 a = ROTATE_LEFT(a, 30);
761 784
762 785 W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 27 */
763 786 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(11) + SHA1_CONST(1);
764 787 e = ROTATE_LEFT(e, 30);
765 788
766 789 W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 28 */
767 790 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(12) + SHA1_CONST(1);
768 791 d = ROTATE_LEFT(d, 30);
769 792
770 793 W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 29 */
771 794 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(13) + SHA1_CONST(1);
772 795 c = ROTATE_LEFT(c, 30);
773 796
774 797 W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 30 */
775 798 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(14) + SHA1_CONST(1);
776 799 b = ROTATE_LEFT(b, 30);
777 800
778 801 W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 31 */
779 802 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(15) + SHA1_CONST(1);
780 803 a = ROTATE_LEFT(a, 30);
781 804
782 805 W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 32 */
783 806 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(0) + SHA1_CONST(1);
784 807 e = ROTATE_LEFT(e, 30);
785 808
786 809 W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 33 */
787 810 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(1) + SHA1_CONST(1);
788 811 d = ROTATE_LEFT(d, 30);
789 812
790 813 W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 34 */
791 814 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(2) + SHA1_CONST(1);
792 815 c = ROTATE_LEFT(c, 30);
793 816
794 817 W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 35 */
795 818 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(3) + SHA1_CONST(1);
796 819 b = ROTATE_LEFT(b, 30);
797 820
798 821 W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 36 */
799 822 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(4) + SHA1_CONST(1);
800 823 a = ROTATE_LEFT(a, 30);
801 824
802 825 W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 37 */
803 826 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(5) + SHA1_CONST(1);
804 827 e = ROTATE_LEFT(e, 30);
805 828
806 829 W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 38 */
807 830 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(6) + SHA1_CONST(1);
808 831 d = ROTATE_LEFT(d, 30);
809 832
810 833 W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 39 */
811 834 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(7) + SHA1_CONST(1);
812 835 c = ROTATE_LEFT(c, 30);
813 836
814 837 /* round 3 */
815 838 W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 40 */
816 839 e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(8) + SHA1_CONST(2);
817 840 b = ROTATE_LEFT(b, 30);
818 841
819 842 W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 41 */
820 843 d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(9) + SHA1_CONST(2);
821 844 a = ROTATE_LEFT(a, 30);
822 845
823 846 W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 42 */
824 847 c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(10) + SHA1_CONST(2);
825 848 e = ROTATE_LEFT(e, 30);
826 849
827 850 W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 43 */
828 851 b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(11) + SHA1_CONST(2);
829 852 d = ROTATE_LEFT(d, 30);
830 853
831 854 W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 44 */
832 855 a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(12) + SHA1_CONST(2);
833 856 c = ROTATE_LEFT(c, 30);
834 857
835 858 W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 45 */
836 859 e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(13) + SHA1_CONST(2);
837 860 b = ROTATE_LEFT(b, 30);
838 861
839 862 W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 46 */
840 863 d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(14) + SHA1_CONST(2);
841 864 a = ROTATE_LEFT(a, 30);
842 865
843 866 W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 47 */
844 867 c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(15) + SHA1_CONST(2);
845 868 e = ROTATE_LEFT(e, 30);
846 869
847 870 W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 48 */
848 871 b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(0) + SHA1_CONST(2);
849 872 d = ROTATE_LEFT(d, 30);
850 873
851 874 W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 49 */
852 875 a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(1) + SHA1_CONST(2);
853 876 c = ROTATE_LEFT(c, 30);
854 877
855 878 W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 50 */
856 879 e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(2) + SHA1_CONST(2);
857 880 b = ROTATE_LEFT(b, 30);
858 881
859 882 W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 51 */
860 883 d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(3) + SHA1_CONST(2);
861 884 a = ROTATE_LEFT(a, 30);
862 885
863 886 W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 52 */
864 887 c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(4) + SHA1_CONST(2);
865 888 e = ROTATE_LEFT(e, 30);
866 889
867 890 W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 53 */
868 891 b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(5) + SHA1_CONST(2);
869 892 d = ROTATE_LEFT(d, 30);
870 893
871 894 W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 54 */
872 895 a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(6) + SHA1_CONST(2);
873 896 c = ROTATE_LEFT(c, 30);
874 897
875 898 W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 55 */
876 899 e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(7) + SHA1_CONST(2);
877 900 b = ROTATE_LEFT(b, 30);
878 901
879 902 W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 56 */
880 903 d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(8) + SHA1_CONST(2);
881 904 a = ROTATE_LEFT(a, 30);
882 905
883 906 W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 57 */
884 907 c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(9) + SHA1_CONST(2);
885 908 e = ROTATE_LEFT(e, 30);
886 909
887 910 W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 58 */
888 911 b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(10) + SHA1_CONST(2);
889 912 d = ROTATE_LEFT(d, 30);
890 913
891 914 W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 59 */
892 915 a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(11) + SHA1_CONST(2);
893 916 c = ROTATE_LEFT(c, 30);
894 917
895 918 /* round 4 */
896 919 W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 60 */
897 920 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(12) + SHA1_CONST(3);
898 921 b = ROTATE_LEFT(b, 30);
899 922
900 923 W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 61 */
901 924 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(13) + SHA1_CONST(3);
902 925 a = ROTATE_LEFT(a, 30);
903 926
904 927 W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 62 */
905 928 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(14) + SHA1_CONST(3);
906 929 e = ROTATE_LEFT(e, 30);
907 930
908 931 W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 63 */
909 932 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(15) + SHA1_CONST(3);
910 933 d = ROTATE_LEFT(d, 30);
911 934
912 935 W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 64 */
913 936 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(0) + SHA1_CONST(3);
914 937 c = ROTATE_LEFT(c, 30);
915 938
916 939 W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 65 */
917 940 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(1) + SHA1_CONST(3);
918 941 b = ROTATE_LEFT(b, 30);
919 942
920 943 W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 66 */
921 944 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(2) + SHA1_CONST(3);
922 945 a = ROTATE_LEFT(a, 30);
923 946
924 947 W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 67 */
925 948 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(3) + SHA1_CONST(3);
926 949 e = ROTATE_LEFT(e, 30);
927 950
928 951 W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 68 */
929 952 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(4) + SHA1_CONST(3);
930 953 d = ROTATE_LEFT(d, 30);
931 954
932 955 W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 69 */
933 956 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(5) + SHA1_CONST(3);
934 957 c = ROTATE_LEFT(c, 30);
935 958
936 959 W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 70 */
937 960 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(6) + SHA1_CONST(3);
938 961 b = ROTATE_LEFT(b, 30);
939 962
940 963 W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 71 */
941 964 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(7) + SHA1_CONST(3);
942 965 a = ROTATE_LEFT(a, 30);
943 966
944 967 W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 72 */
945 968 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(8) + SHA1_CONST(3);
946 969 e = ROTATE_LEFT(e, 30);
947 970
948 971 W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 73 */
949 972 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(9) + SHA1_CONST(3);
950 973 d = ROTATE_LEFT(d, 30);
951 974
952 975 W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 74 */
953 976 a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(10) + SHA1_CONST(3);
954 977 c = ROTATE_LEFT(c, 30);
955 978
956 979 W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 75 */
957 980 e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(11) + SHA1_CONST(3);
958 981 b = ROTATE_LEFT(b, 30);
959 982
960 983 W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 76 */
961 984 d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(12) + SHA1_CONST(3);
962 985 a = ROTATE_LEFT(a, 30);
963 986
964 987 W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 77 */
965 988 c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(13) + SHA1_CONST(3);
966 989 e = ROTATE_LEFT(e, 30);
967 990
968 991 W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 78 */
969 992 b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(14) + SHA1_CONST(3);
970 993 d = ROTATE_LEFT(d, 30);
971 994
972 995 W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 79 */
973 996
974 997 ctx->state[0] += ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(15) +
975 998 SHA1_CONST(3);
976 999 ctx->state[1] += b;
977 1000 ctx->state[2] += ROTATE_LEFT(c, 30);
978 1001 ctx->state[3] += d;
979 1002 ctx->state[4] += e;
980 1003
981 1004 /* zeroize sensitive information */
982 1005 W(0) = W(1) = W(2) = W(3) = W(4) = W(5) = W(6) = W(7) = W(8) = 0;
983 1006 W(9) = W(10) = W(11) = W(12) = W(13) = W(14) = W(15) = 0;
984 1007 }
1008 +#endif /* !__amd64 */
985 1009
1010 +
986 1011 /*
987 1012 * Encode()
988 1013 *
989 1014 * purpose: to convert a list of numbers from little endian to big endian
990 1015 * input: uint8_t * : place to store the converted big endian numbers
991 1016 * uint32_t * : place to get numbers to convert from
992 1017 * size_t : the length of the input in bytes
993 1018 * output: void
994 1019 */
995 1020
996 1021 static void
997 1022 Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input,
998 1023 size_t len)
999 1024 {
1000 1025 size_t i, j;
1001 1026
1002 1027 #if defined(__sparc)
1003 1028 if (IS_P2ALIGNED(output, sizeof (uint32_t))) {
1004 1029 for (i = 0, j = 0; j < len; i++, j += 4) {
1005 1030 /* LINTED: pointer alignment */
1006 1031 *((uint32_t *)(output + j)) = input[i];
1007 1032 }
1008 1033 } else {
1009 1034 #endif /* little endian -- will work on big endian, but slowly */
1010 1035 for (i = 0, j = 0; j < len; i++, j += 4) {
1011 1036 output[j] = (input[i] >> 24) & 0xff;
1012 1037 output[j + 1] = (input[i] >> 16) & 0xff;
1013 1038 output[j + 2] = (input[i] >> 8) & 0xff;
1014 1039 output[j + 3] = input[i] & 0xff;
1015 1040 }
1016 1041 #if defined(__sparc)
1017 1042 }
1018 1043 #endif
1019 1044 }
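Reviewer note: a quick illustration of Encode()'s contract -- the host word 0x01020304 must serialize as the bytes 01 02 03 04 on both sparc and x86 (fragment as if written inside this file, since Encode() is static):

	uint32_t in = 0x01020304U;
	uint8_t out[4];

	Encode(out, &in, sizeof (out));
	/* out[0] == 0x01, out[1] == 0x02, out[2] == 0x03, out[3] == 0x04 */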