5072961 Need an optimized MD5 implementation for amd64
--- old/usr/src/common/crypto/md5/md5.c
+++ new/usr/src/common/crypto/md5/md5.c
1 1 /*
2 - * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
2 + * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
3 3 * Use is subject to license terms.
4 4 */
5 5
6 6 /*
7 7 * Cleaned-up and optimized version of MD5, based on the reference
8 8 * implementation provided in RFC 1321. See RSA Copyright information
9 9 * below.
10 10 */
11 11
12 -#pragma ident "@(#)md5.c 1.27 07/04/10 SMI"
12 +#pragma ident "@(#)md5.c 1.28 08/01/02 SMI"
13 13
14 14 /*
15 15 * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
16 16 */
17 17
18 18 /*
19 19 * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
20 20 * rights reserved.
21 21 *
22 22 * License to copy and use this software is granted provided that it
23 23 * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
24 24 * Algorithm" in all material mentioning or referencing this software
25 25 * or this function.
26 26 *
27 27 * License is also granted to make and use derivative works provided
28 28 * that such works are identified as "derived from the RSA Data
29 29 * Security, Inc. MD5 Message-Digest Algorithm" in all material
30 30 * mentioning or referencing the derived work.
31 31 *
32 32 * RSA Data Security, Inc. makes no representations concerning either
33 33 * the merchantability of this software or the suitability of this
34 34 * software for any particular purpose. It is provided "as is"
35 35 * without express or implied warranty of any kind.
36 36 *
37 37 * These notices must be retained in any copies of any part of this
38 38 * documentation and/or software.
39 39 */
40 40
41 41 #include <sys/types.h>
42 42 #include <sys/md5.h>
43 43 #include <sys/md5_consts.h> /* MD5_CONST() optimization */
44 44 #include "md5_byteswap.h"
45 45 #if !defined(_KERNEL) || defined(_BOOT)
46 46 #include <strings.h>
47 47 #endif /* !_KERNEL || _BOOT */
48 48
49 49 #ifdef _KERNEL
50 50 #include <sys/systm.h>
51 51 #endif /* _KERNEL */
52 52
53 53 static void Encode(uint8_t *, const uint32_t *, size_t);
54 +
55 +#if !defined(__amd64)
54 56 static void MD5Transform(uint32_t, uint32_t, uint32_t, uint32_t, MD5_CTX *,
55 57 const uint8_t [64]);
58 +#else
59 +void md5_block_asm_host_order(MD5_CTX *ctx, const void *inpp,
60 + unsigned int input_length_in_blocks);
61 +#endif /* !defined(__amd64) */
56 62
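
md5_block_asm_host_order() is the amd64 assembly entry point added by this change: it consumes input_length_in_blocks consecutive 64-byte blocks starting at inpp and updates ctx->state in place. A hedged C sketch of that contract, under the assumption that one call matches transforming each block in turn (md5_block_c_reference is an illustrative name, not part of the change):

	static void
	md5_block_c_reference(MD5_CTX *ctx, const void *inpp,
	    unsigned int nblocks)
	{
		const uint8_t *p = inpp;

		/* one asm call == one MD5Transform per 64-byte block */
		while (nblocks-- > 0) {
			MD5Transform(ctx->state[0], ctx->state[1],
			    ctx->state[2], ctx->state[3], ctx, p);
			p += 64;
		}
	}
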
57 63 static uint8_t PADDING[64] = { 0x80, /* all zeros */ };
58 64
59 65 /*
60 66 * F, G, H and I are the basic MD5 functions.
61 67 */
62 68 #define F(b, c, d) (((b) & (c)) | ((~b) & (d)))
63 69 #define G(b, c, d) (((b) & (d)) | ((c) & (~d)))
64 70 #define H(b, c, d) ((b) ^ (c) ^ (d))
65 71 #define I(b, c, d) ((c) ^ ((b) | (~d)))
66 72
67 73 /*
68 74 * ROTATE_LEFT rotates x left n bits.
69 75 */
70 76 #define ROTATE_LEFT(x, n) \
71 77 (((x) << (n)) | ((x) >> ((sizeof (x) << 3) - (n))))
72 78
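Since sizeof (x) << 3 is the bit width of x, the right-shift count adapts to the operand's type automatically. A minimal sketch of what one expansion computes for a uint32_t (rotl7 is an illustrative name):

	#include <stdint.h>

	/* ROTATE_LEFT(x, 7) on a uint32_t: (32 - 7) == 25-bit right shift */
	static uint32_t
	rotl7(uint32_t x)
	{
		return ((x << 7) | (x >> 25));
	}
	/* rotl7(0x80000001) == 0xC0: bit 31 wraps around to bit 6 */
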
73 79 /*
74 80 * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
75 81 * Rotation is separate from addition to prevent recomputation.
76 82 */
77 83
78 84 #define FF(a, b, c, d, x, s, ac) { \
79 85 (a) += F((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
80 86 (a) = ROTATE_LEFT((a), (s)); \
81 87 (a) += (b); \
82 88 }
83 89
84 90 #define GG(a, b, c, d, x, s, ac) { \
85 91 (a) += G((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
86 92 (a) = ROTATE_LEFT((a), (s)); \
87 93 (a) += (b); \
88 94 }
89 95
90 96 #define HH(a, b, c, d, x, s, ac) { \
91 97 (a) += H((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
92 98 (a) = ROTATE_LEFT((a), (s)); \
93 99 (a) += (b); \
94 100 }
95 101
96 102 #define II(a, b, c, d, x, s, ac) { \
97 103 (a) += I((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
98 104 (a) = ROTATE_LEFT((a), (s)); \
99 105 (a) += (b); \
100 106 }
101 107
102 108 /*
103 109 * Loading 32-bit constants on a RISC is expensive since it involves both a
104 110 * `sethi' and an `or'. thus, we instead have the compiler generate `ld's to
105 111 * load the constants from an array called `md5_consts'. however, on intel
106 112 * (and other CISC processors), it is cheaper to load the constant
107 113 * directly. thus, the c code in MD5Transform() uses the macro MD5_CONST()
108 114 * which either expands to a constant or an array reference, depending on the
109 115 * architecture the code is being compiled for.
110 116 *
111 117 * Right now, i386 and amd64 are the CISC exceptions.
112 118 * If we get another CISC ISA, we'll have to change the ifdef.
113 119 */
114 120
115 121 #if defined(__i386) || defined(__amd64)
116 122
117 123 #define MD5_CONST(x) (MD5_CONST_ ## x)
118 124 #define MD5_CONST_e(x) MD5_CONST(x)
119 125 #define MD5_CONST_o(x) MD5_CONST(x)
120 126
121 127 #else
122 128 /*
123 129 * sparc/RISC optimization:
124 130 *
125 131 * while it is somewhat counter-intuitive, on sparc (and presumably other RISC
126 132 * machines), it is more efficient to place all the constants used in this
127 133 * function in an array and load the values out of the array than to manually
128 134 * load the constants. this is because setting a register to a 32-bit value
129 135 * takes two ops in most cases: a `sethi' and an `or', but loading a 32-bit
130 136 * value from memory only takes one `ld' (or `lduw' on v9). while this
131 137 * increases memory usage, the compiler can find enough other things to do
132 138 * while waiting to keep the pipeline does not stall. additionally, it is
133 139 * while waiting to keep the pipeline full. additionally, it is
134 140 * not even go out to the bus.
135 141 *
136 142 * this array is declared `static' to keep the compiler from having to
137 143 * bcopy() this array onto the stack frame of MD5Transform() each time it is
138 144 * called -- which is unacceptably expensive.
139 145 *
140 146 * the `const' is to ensure that callers are good citizens and do not try to
141 147 * munge the array. since these routines are going to be called from inside
142 148 * multithreaded kernelland, this is a good safety check. -- `constants' will
143 149 * end up in .rodata.
144 150 *
145 151 * unfortunately, loading from an array in this manner hurts performance under
146 152 * intel (and presumably other CISC machines). so, there is a macro,
147 153 * MD5_CONST(), used in MD5Transform(), that either expands to a reference to
148 154 * this array, or to the actual constant, depending on what platform this code
149 155 * is compiled for.
150 156 */
151 157
152 158 #ifdef sun4v
153 159
154 160 /*
155 161 * Going to load these consts in 8B chunks, so need to enforce 8B alignment
156 162 */
157 163
158 164 /* CSTYLED */
159 165 #pragma align 64 (md5_consts)
160 166 #define _MD5_CHECK_ALIGNMENT
161 167
162 168 #endif /* sun4v */
163 169
164 170 static const uint32_t md5_consts[] = {
165 171 MD5_CONST_0, MD5_CONST_1, MD5_CONST_2, MD5_CONST_3,
166 172 MD5_CONST_4, MD5_CONST_5, MD5_CONST_6, MD5_CONST_7,
167 173 MD5_CONST_8, MD5_CONST_9, MD5_CONST_10, MD5_CONST_11,
168 174 MD5_CONST_12, MD5_CONST_13, MD5_CONST_14, MD5_CONST_15,
169 175 MD5_CONST_16, MD5_CONST_17, MD5_CONST_18, MD5_CONST_19,
170 176 MD5_CONST_20, MD5_CONST_21, MD5_CONST_22, MD5_CONST_23,
171 177 MD5_CONST_24, MD5_CONST_25, MD5_CONST_26, MD5_CONST_27,
172 178 MD5_CONST_28, MD5_CONST_29, MD5_CONST_30, MD5_CONST_31,
173 179 MD5_CONST_32, MD5_CONST_33, MD5_CONST_34, MD5_CONST_35,
174 180 MD5_CONST_36, MD5_CONST_37, MD5_CONST_38, MD5_CONST_39,
175 181 MD5_CONST_40, MD5_CONST_41, MD5_CONST_42, MD5_CONST_43,
176 182 MD5_CONST_44, MD5_CONST_45, MD5_CONST_46, MD5_CONST_47,
177 183 MD5_CONST_48, MD5_CONST_49, MD5_CONST_50, MD5_CONST_51,
178 184 MD5_CONST_52, MD5_CONST_53, MD5_CONST_54, MD5_CONST_55,
179 185 MD5_CONST_56, MD5_CONST_57, MD5_CONST_58, MD5_CONST_59,
180 186 MD5_CONST_60, MD5_CONST_61, MD5_CONST_62, MD5_CONST_63
181 187 };
182 188
183 189
184 190 #ifdef sun4v
185 191 /*
186 192 * To reduce the number of loads, load consts in 64-bit
187 193 * chunks and then split.
188 194 *
189 195 * No need to mask upper 32-bits, as just interested in
190 196 * low 32-bits (saves an & operation and means that this
191 197 * optimization doesn't increase the icount).
192 198 */
193 199 #define MD5_CONST_e(x) (md5_consts64[x/2] >> 32)
194 200 #define MD5_CONST_o(x) (md5_consts64[x/2])
195 201
196 202 #else
197 203
198 204 #define MD5_CONST_e(x) (md5_consts[x])
199 205 #define MD5_CONST_o(x) (md5_consts[x])
200 206
201 207 #endif /* sun4v */
202 208
203 209 #endif
204 210
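The even/odd split works because sparc is big endian: an aligned 8-byte load of md5_consts[2k] and md5_consts[2k+1] leaves the even constant in the upper half of the doubleword and the odd one in the lower half, where plain 32-bit truncation recovers it without a mask. A hedged sketch using the first two RFC 1321 constants (const_split_sketch is an illustrative name):

	#include <stdint.h>

	static void
	const_split_sketch(void)
	{
		/* md5_consts[0], md5_consts[1] as one big-endian doubleword */
		uint64_t pair = 0xd76aa478e8c7b756ULL;

		uint32_t even = (uint32_t)(pair >> 32);	/* MD5_CONST_e(0) */
		uint32_t odd = (uint32_t)pair;		/* MD5_CONST_o(1) */
	}
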
205 211 /*
206 212 * MD5Init()
207 213 *
208 214 * purpose: initializes the md5 context and begins an md5 digest operation
209 215 * input: MD5_CTX * : the context to initialize.
210 216 * output: void
211 217 */
212 218
213 219 void
214 220 MD5Init(MD5_CTX *ctx)
215 221 {
216 222 ctx->count[0] = ctx->count[1] = 0;
217 223
218 224 /* load magic initialization constants */
219 225 ctx->state[0] = MD5_INIT_CONST_1;
220 226 ctx->state[1] = MD5_INIT_CONST_2;
221 227 ctx->state[2] = MD5_INIT_CONST_3;
222 228 ctx->state[3] = MD5_INIT_CONST_4;
223 229 }
224 230
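For reference, the three calls go together as follows; a minimal hedged usage sketch (digest_example is an illustrative name, and the digest value is the RFC 1321 test vector for "abc"):

	#include <sys/types.h>
	#include <sys/md5.h>

	static void
	digest_example(void)
	{
		MD5_CTX ctx;
		uint8_t digest[16];	/* an MD5 digest is always 16 bytes */

		MD5Init(&ctx);
		MD5Update(&ctx, "abc", 3);
		MD5Final(digest, &ctx);
		/* digest == 900150983cd24fb0d6963f7d28e17f72 */
	}
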
225 231 /*
226 232 * MD5Update()
227 233 *
228 234 * purpose: continues an md5 digest operation, using the message block
229 235 * to update the context.
230 236 * input: MD5_CTX * : the context to update
231 237 * uint8_t * : the message block
232 238 * uint32_t : the length of the message block in bytes
233 239 * output: void
234 240 *
235 241 * MD5 crunches in 64-byte blocks. All numeric constants here are related to
236 242 * that property of MD5.
237 243 */
238 244
239 245 void
240 246 MD5Update(MD5_CTX *ctx, const void *inpp, unsigned int input_len)
241 247 {
242 248 uint32_t i, buf_index, buf_len;
243 249 #ifdef sun4v
244 250 uint32_t old_asi;
245 251 #endif /* sun4v */
252 +#if defined(__amd64)
253 + uint32_t block_count;
254 +#endif /* defined(__amd64) */
246 255 const unsigned char *input = (const unsigned char *)inpp;
247 256
248 257 /* compute (number of bytes computed so far) mod 64 */
249 258 buf_index = (ctx->count[0] >> 3) & 0x3F;
250 259
251 260 /* update number of bits hashed into this MD5 computation so far */
252 261 if ((ctx->count[0] += (input_len << 3)) < (input_len << 3))
253 - ctx->count[1]++;
262 + ctx->count[1]++;
254 263 ctx->count[1] += (input_len >> 29);
255 264
256 265 buf_len = 64 - buf_index;
257 266
258 267 /* transform as many times as possible */
259 268 i = 0;
260 269 if (input_len >= buf_len) {
261 270
262 271 /*
263 272 * general optimization:
264 273 *
265 274 * only do initial bcopy() and MD5Transform() if
266 275 * buf_index != 0. if buf_index == 0, we're just
267 276 * wasting our time doing the bcopy() since there
268 277 * wasn't any data left over from a previous call to
269 278 * MD5Update().
270 279 */
271 280
272 281 #ifdef sun4v
273 282 /*
274 283 * For N1 use %asi register. However, costly to repeatedly set
275 284 * in MD5Transform. Therefore, set once here.
276 285 * Should probably restore the old value afterwards...
277 286 */
278 287 old_asi = get_little();
279 288 set_little(0x88);
280 289 #endif /* sun4v */
281 290
282 291 if (buf_index) {
283 292 bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
284 293
294 +#if !defined(__amd64)
285 295 MD5Transform(ctx->state[0], ctx->state[1],
286 296 ctx->state[2], ctx->state[3], ctx,
287 297 ctx->buf_un.buf8);
298 +#else
299 + md5_block_asm_host_order(ctx, ctx->buf_un.buf8, 1);
300 +#endif /* !defined(__amd64) */
288 301
289 302 i = buf_len;
290 303 }
291 304
305 +#if !defined(__amd64)
292 306 for (; i + 63 < input_len; i += 64)
293 307 MD5Transform(ctx->state[0], ctx->state[1],
294 308 ctx->state[2], ctx->state[3], ctx, &input[i]);
295 309
310 +#else
311 + block_count = (input_len - i) >> 6;
312 + if (block_count > 0) {
313 + md5_block_asm_host_order(ctx, &input[i], block_count);
314 + i += block_count << 6;
315 + }
316 +#endif /* !defined(__amd64) */
296 317
318 +
297 319 #ifdef sun4v
298 320 /*
299 321 * Restore old %ASI value
300 322 */
301 323 set_little(old_asi);
302 324 #endif /* sun4v */
303 325
304 326 /*
305 327 * general optimization:
306 328 *
307 329 * if i and input_len are the same, return now instead
308 330 * of calling bcopy(), since the bcopy() in this
309 331 * case will be an expensive nop.
310 332 */
311 333
312 334 if (input_len == i)
313 335 return;
314 336
315 337 buf_index = 0;
316 338 }
317 339
318 340 /* buffer remaining input */
319 341 bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
320 342 }
321 343
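Two pieces of arithmetic above are easy to misread. First, the message length is a 64-bit bit count split across count[0] and count[1]: the unsigned wraparound test detects the carry out of the low word, and input_len >> 29 contributes the high bits of input_len * 8. Second, the amd64 path does whole-block math with shifts: for input_len - i == 200, block_count == 3 and i advances by 192, leaving 8 bytes to be buffered. A hedged sketch of the counter update (count_sketch is an illustrative name):

	#include <sys/types.h>

	static void
	count_sketch(uint32_t count[2], uint32_t input_len)
	{
		uint32_t bits = input_len << 3;	/* bytes -> bits, low word */

		count[0] += bits;
		if (count[0] < bits)		/* wrapped, so carry out */
			count[1]++;
		count[1] += input_len >> 29;	/* high bits of input_len * 8 */
	}
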
322 344 /*
323 345 * MD5Final()
324 346 *
325 347 * purpose: ends an md5 digest operation, finalizing the message digest and
326 348 * zeroing the context.
327 349 * input: uchar_t * : a buffer to store the digest in
328 350 * : The function actually uses void* because many
329 351 * : callers pass things other than uchar_t here.
330 352 * MD5_CTX * : the context to finalize, save, and zero
331 353 * output: void
332 354 */
333 355
334 356 void
335 357 MD5Final(void *digest, MD5_CTX *ctx)
336 358 {
337 359 uint8_t bitcount_le[sizeof (ctx->count)];
338 360 uint32_t index = (ctx->count[0] >> 3) & 0x3f;
339 361
340 362 /* store bit count, little endian */
341 363 Encode(bitcount_le, ctx->count, sizeof (bitcount_le));
342 364
343 365 /* pad out to 56 mod 64 */
344 366 MD5Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
345 367
346 368 /* append length (before padding) */
347 369 MD5Update(ctx, bitcount_le, sizeof (bitcount_le));
348 370
349 371 /* store state in digest */
350 372 Encode(digest, ctx->state, sizeof (ctx->state));
351 373
352 374 /* zeroize sensitive information */
353 375 bzero(ctx, sizeof (*ctx));
354 376 }
355 377
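The pad length ((index < 56) ? 56 : 120) - index always lands the buffer at 56 mod 64, leaving exactly 8 bytes of the final block for the bit count. A hedged self-check of that expression (pad_len_check is an illustrative name):

	#include <assert.h>
	#include <sys/types.h>

	static void
	pad_len_check(void)
	{
		uint32_t index;

		for (index = 0; index < 64; index++) {
			uint32_t pad = ((index < 56) ? 56 : 120) - index;

			assert(pad >= 1 && pad <= 64);	  /* never empty */
			assert((index + pad) % 64 == 56); /* room for length */
		}
	}
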
356 378 #ifndef _KERNEL
357 379
358 380 void
359 381 md5_calc(unsigned char *output, unsigned char *input, unsigned int inlen)
360 382 {
361 383 MD5_CTX context;
362 384
363 385 MD5Init(&context);
364 386 MD5Update(&context, input, inlen);
365 387 MD5Final(output, &context);
366 388 }
367 389
368 390 #endif /* !_KERNEL */
369 391
392 +#if !defined(__amd64)
370 393 /*
371 394 * sparc register window optimization:
372 395 *
373 396 * `a', `b', `c', and `d' are passed into MD5Transform explicitly
374 397 * since it increases the number of registers available to the
375 398 * compiler. under this scheme, these variables can be held in
376 399 * %i0 - %i3, which leaves more local and out registers available.
377 400 */
378 401
379 402 /*
380 403 * MD5Transform()
381 404 *
382 405 * purpose: md5 transformation -- updates the digest based on `block'
383 406 * input: uint32_t : bytes 1 - 4 of the digest
384 407 * uint32_t : bytes 5 - 8 of the digest
385 408 * uint32_t : bytes 9 - 12 of the digest
386 409 * uint32_t : bytes 13 - 16 of the digest
387 410 * MD5_CTX * : the context to update
388 411 * uint8_t [64]: the block to use to update the digest
389 412 * output: void
390 413 */
391 414
392 415 static void
393 416 MD5Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
394 417 MD5_CTX *ctx, const uint8_t block[64])
395 418 {
396 419 /*
397 420 * general optimization:
398 421 *
399 422 * use individual integers instead of using an array. this is a
400 423 * win, although the amount it wins by seems to vary quite a bit.
401 424 */
402 425
403 426 register uint32_t x_0, x_1, x_2, x_3, x_4, x_5, x_6, x_7;
404 427 register uint32_t x_8, x_9, x_10, x_11, x_12, x_13, x_14, x_15;
405 428 #ifdef sun4v
406 429 unsigned long long *md5_consts64;
407 430
408 431 /* LINTED E_BAD_PTR_CAST_ALIGN */
409 432 md5_consts64 = (unsigned long long *) md5_consts;
410 433 #endif /* sun4v */
411 434
412 435 /*
413 436 * general optimization:
414 437 *
415 438 * the compiler (at least SC4.2/5.x) generates better code if
416 439 * variable use is localized. in this case, swapping the integers in
417 440 * this order allows `x_0' to be swapped nearest to its first use in
418 441 * FF(), and likewise for `x_1' and up. note that the compiler
419 442 * prefers this to doing each swap right before the FF() that
420 443 * uses it.
421 444 */
422 445
423 446 /*
424 447 * sparc v9/v8plus optimization:
425 448 *
426 449 * if `block' is already aligned on a 4-byte boundary, use the
427 450 * optimized load_little_32() directly. otherwise, bcopy()
428 451 * into a buffer that *is* aligned on a 4-byte boundary and
429 452 * then do the load_little_32() on that buffer. benchmarks
430 453 * have shown that using the bcopy() is better than loading
431 454 * the bytes individually and doing the endian-swap by hand.
432 455 *
433 456 * even though it's quite tempting to simply do:
434 457 *
435 458 * blk = bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
436 459 *
437 460 * and only have one set of LOAD_LITTLE_32()'s, the compiler (at least
438 461 * SC4.2/5.x) *does not* like that, so please resist the urge.
439 462 */
440 463
441 464 #ifdef _MD5_CHECK_ALIGNMENT
442 465 if ((uintptr_t)block & 0x3) { /* not 4-byte aligned? */
443 466 bcopy(block, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
444 467
445 468 #ifdef sun4v
446 469 x_15 = LOAD_LITTLE_32_f(ctx->buf_un.buf32);
447 470 x_14 = LOAD_LITTLE_32_e(ctx->buf_un.buf32);
448 471 x_13 = LOAD_LITTLE_32_d(ctx->buf_un.buf32);
449 472 x_12 = LOAD_LITTLE_32_c(ctx->buf_un.buf32);
450 473 x_11 = LOAD_LITTLE_32_b(ctx->buf_un.buf32);
451 474 x_10 = LOAD_LITTLE_32_a(ctx->buf_un.buf32);
452 475 x_9 = LOAD_LITTLE_32_9(ctx->buf_un.buf32);
453 476 x_8 = LOAD_LITTLE_32_8(ctx->buf_un.buf32);
454 477 x_7 = LOAD_LITTLE_32_7(ctx->buf_un.buf32);
455 478 x_6 = LOAD_LITTLE_32_6(ctx->buf_un.buf32);
456 479 x_5 = LOAD_LITTLE_32_5(ctx->buf_un.buf32);
457 480 x_4 = LOAD_LITTLE_32_4(ctx->buf_un.buf32);
458 481 x_3 = LOAD_LITTLE_32_3(ctx->buf_un.buf32);
459 482 x_2 = LOAD_LITTLE_32_2(ctx->buf_un.buf32);
460 483 x_1 = LOAD_LITTLE_32_1(ctx->buf_un.buf32);
461 484 x_0 = LOAD_LITTLE_32_0(ctx->buf_un.buf32);
462 485 #else
463 486 x_15 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 15);
464 487 x_14 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 14);
465 488 x_13 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 13);
466 489 x_12 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 12);
467 490 x_11 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 11);
468 491 x_10 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 10);
469 492 x_9 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 9);
470 493 x_8 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 8);
471 494 x_7 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 7);
472 495 x_6 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 6);
473 496 x_5 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 5);
474 497 x_4 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 4);
475 498 x_3 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 3);
476 499 x_2 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 2);
477 500 x_1 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 1);
478 501 x_0 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 0);
479 502 #endif /* sun4v */
480 503 } else
481 504 #endif
482 505 {
483 506
484 507 #ifdef sun4v
485 508 /* LINTED E_BAD_PTR_CAST_ALIGN */
486 509 x_15 = LOAD_LITTLE_32_f(block);
487 510 /* LINTED E_BAD_PTR_CAST_ALIGN */
488 511 x_14 = LOAD_LITTLE_32_e(block);
489 512 /* LINTED E_BAD_PTR_CAST_ALIGN */
490 513 x_13 = LOAD_LITTLE_32_d(block);
491 514 /* LINTED E_BAD_PTR_CAST_ALIGN */
492 515 x_12 = LOAD_LITTLE_32_c(block);
493 516 /* LINTED E_BAD_PTR_CAST_ALIGN */
494 517 x_11 = LOAD_LITTLE_32_b(block);
495 518 /* LINTED E_BAD_PTR_CAST_ALIGN */
496 519 x_10 = LOAD_LITTLE_32_a(block);
497 520 /* LINTED E_BAD_PTR_CAST_ALIGN */
498 521 x_9 = LOAD_LITTLE_32_9(block);
499 522 /* LINTED E_BAD_PTR_CAST_ALIGN */
500 523 x_8 = LOAD_LITTLE_32_8(block);
501 524 /* LINTED E_BAD_PTR_CAST_ALIGN */
502 525 x_7 = LOAD_LITTLE_32_7(block);
503 526 /* LINTED E_BAD_PTR_CAST_ALIGN */
504 527 x_6 = LOAD_LITTLE_32_6(block);
505 528 /* LINTED E_BAD_PTR_CAST_ALIGN */
506 529 x_5 = LOAD_LITTLE_32_5(block);
507 530 /* LINTED E_BAD_PTR_CAST_ALIGN */
508 531 x_4 = LOAD_LITTLE_32_4(block);
509 532 /* LINTED E_BAD_PTR_CAST_ALIGN */
510 533 x_3 = LOAD_LITTLE_32_3(block);
511 534 /* LINTED E_BAD_PTR_CAST_ALIGN */
512 535 x_2 = LOAD_LITTLE_32_2(block);
513 536 /* LINTED E_BAD_PTR_CAST_ALIGN */
514 537 x_1 = LOAD_LITTLE_32_1(block);
515 538 /* LINTED E_BAD_PTR_CAST_ALIGN */
516 539 x_0 = LOAD_LITTLE_32_0(block);
517 540 #else
518 541 /* LINTED E_BAD_PTR_CAST_ALIGN */
519 542 x_15 = LOAD_LITTLE_32(block + 60);
520 543 /* LINTED E_BAD_PTR_CAST_ALIGN */
521 544 x_14 = LOAD_LITTLE_32(block + 56);
522 545 /* LINTED E_BAD_PTR_CAST_ALIGN */
523 546 x_13 = LOAD_LITTLE_32(block + 52);
524 547 /* LINTED E_BAD_PTR_CAST_ALIGN */
525 548 x_12 = LOAD_LITTLE_32(block + 48);
526 549 /* LINTED E_BAD_PTR_CAST_ALIGN */
527 550 x_11 = LOAD_LITTLE_32(block + 44);
528 551 /* LINTED E_BAD_PTR_CAST_ALIGN */
529 552 x_10 = LOAD_LITTLE_32(block + 40);
530 553 /* LINTED E_BAD_PTR_CAST_ALIGN */
531 554 x_9 = LOAD_LITTLE_32(block + 36);
532 555 /* LINTED E_BAD_PTR_CAST_ALIGN */
533 556 x_8 = LOAD_LITTLE_32(block + 32);
534 557 /* LINTED E_BAD_PTR_CAST_ALIGN */
535 558 x_7 = LOAD_LITTLE_32(block + 28);
536 559 /* LINTED E_BAD_PTR_CAST_ALIGN */
537 560 x_6 = LOAD_LITTLE_32(block + 24);
538 561 /* LINTED E_BAD_PTR_CAST_ALIGN */
539 562 x_5 = LOAD_LITTLE_32(block + 20);
540 563 /* LINTED E_BAD_PTR_CAST_ALIGN */
541 564 x_4 = LOAD_LITTLE_32(block + 16);
542 565 /* LINTED E_BAD_PTR_CAST_ALIGN */
543 566 x_3 = LOAD_LITTLE_32(block + 12);
544 567 /* LINTED E_BAD_PTR_CAST_ALIGN */
545 568 x_2 = LOAD_LITTLE_32(block + 8);
546 569 /* LINTED E_BAD_PTR_CAST_ALIGN */
547 570 x_1 = LOAD_LITTLE_32(block + 4);
548 571 /* LINTED E_BAD_PTR_CAST_ALIGN */
549 572 x_0 = LOAD_LITTLE_32(block + 0);
550 573 #endif /* sun4v */
551 574 }
552 575
553 576 /* round 1 */
554 577 FF(a, b, c, d, x_0, MD5_SHIFT_11, MD5_CONST_e(0)); /* 1 */
555 578 FF(d, a, b, c, x_1, MD5_SHIFT_12, MD5_CONST_o(1)); /* 2 */
556 579 FF(c, d, a, b, x_2, MD5_SHIFT_13, MD5_CONST_e(2)); /* 3 */
557 580 FF(b, c, d, a, x_3, MD5_SHIFT_14, MD5_CONST_o(3)); /* 4 */
558 581 FF(a, b, c, d, x_4, MD5_SHIFT_11, MD5_CONST_e(4)); /* 5 */
559 582 FF(d, a, b, c, x_5, MD5_SHIFT_12, MD5_CONST_o(5)); /* 6 */
560 583 FF(c, d, a, b, x_6, MD5_SHIFT_13, MD5_CONST_e(6)); /* 7 */
561 584 FF(b, c, d, a, x_7, MD5_SHIFT_14, MD5_CONST_o(7)); /* 8 */
562 585 FF(a, b, c, d, x_8, MD5_SHIFT_11, MD5_CONST_e(8)); /* 9 */
563 586 FF(d, a, b, c, x_9, MD5_SHIFT_12, MD5_CONST_o(9)); /* 10 */
564 587 FF(c, d, a, b, x_10, MD5_SHIFT_13, MD5_CONST_e(10)); /* 11 */
565 588 FF(b, c, d, a, x_11, MD5_SHIFT_14, MD5_CONST_o(11)); /* 12 */
566 589 FF(a, b, c, d, x_12, MD5_SHIFT_11, MD5_CONST_e(12)); /* 13 */
567 590 FF(d, a, b, c, x_13, MD5_SHIFT_12, MD5_CONST_o(13)); /* 14 */
568 591 FF(c, d, a, b, x_14, MD5_SHIFT_13, MD5_CONST_e(14)); /* 15 */
569 592 FF(b, c, d, a, x_15, MD5_SHIFT_14, MD5_CONST_o(15)); /* 16 */
570 593
571 594 /* round 2 */
572 595 GG(a, b, c, d, x_1, MD5_SHIFT_21, MD5_CONST_e(16)); /* 17 */
573 596 GG(d, a, b, c, x_6, MD5_SHIFT_22, MD5_CONST_o(17)); /* 18 */
574 597 GG(c, d, a, b, x_11, MD5_SHIFT_23, MD5_CONST_e(18)); /* 19 */
575 598 GG(b, c, d, a, x_0, MD5_SHIFT_24, MD5_CONST_o(19)); /* 20 */
576 599 GG(a, b, c, d, x_5, MD5_SHIFT_21, MD5_CONST_e(20)); /* 21 */
577 600 GG(d, a, b, c, x_10, MD5_SHIFT_22, MD5_CONST_o(21)); /* 22 */
578 601 GG(c, d, a, b, x_15, MD5_SHIFT_23, MD5_CONST_e(22)); /* 23 */
579 602 GG(b, c, d, a, x_4, MD5_SHIFT_24, MD5_CONST_o(23)); /* 24 */
580 603 GG(a, b, c, d, x_9, MD5_SHIFT_21, MD5_CONST_e(24)); /* 25 */
581 604 GG(d, a, b, c, x_14, MD5_SHIFT_22, MD5_CONST_o(25)); /* 26 */
582 605 GG(c, d, a, b, x_3, MD5_SHIFT_23, MD5_CONST_e(26)); /* 27 */
583 606 GG(b, c, d, a, x_8, MD5_SHIFT_24, MD5_CONST_o(27)); /* 28 */
584 607 GG(a, b, c, d, x_13, MD5_SHIFT_21, MD5_CONST_e(28)); /* 29 */
585 608 GG(d, a, b, c, x_2, MD5_SHIFT_22, MD5_CONST_o(29)); /* 30 */
586 609 GG(c, d, a, b, x_7, MD5_SHIFT_23, MD5_CONST_e(30)); /* 31 */
587 610 GG(b, c, d, a, x_12, MD5_SHIFT_24, MD5_CONST_o(31)); /* 32 */
588 611
589 612 /* round 3 */
590 613 HH(a, b, c, d, x_5, MD5_SHIFT_31, MD5_CONST_e(32)); /* 33 */
591 614 HH(d, a, b, c, x_8, MD5_SHIFT_32, MD5_CONST_o(33)); /* 34 */
592 615 HH(c, d, a, b, x_11, MD5_SHIFT_33, MD5_CONST_e(34)); /* 35 */
593 616 HH(b, c, d, a, x_14, MD5_SHIFT_34, MD5_CONST_o(35)); /* 36 */
594 617 HH(a, b, c, d, x_1, MD5_SHIFT_31, MD5_CONST_e(36)); /* 37 */
595 618 HH(d, a, b, c, x_4, MD5_SHIFT_32, MD5_CONST_o(37)); /* 38 */
596 619 HH(c, d, a, b, x_7, MD5_SHIFT_33, MD5_CONST_e(38)); /* 39 */
597 620 HH(b, c, d, a, x_10, MD5_SHIFT_34, MD5_CONST_o(39)); /* 40 */
598 621 HH(a, b, c, d, x_13, MD5_SHIFT_31, MD5_CONST_e(40)); /* 41 */
599 622 HH(d, a, b, c, x_0, MD5_SHIFT_32, MD5_CONST_o(41)); /* 42 */
600 623 HH(c, d, a, b, x_3, MD5_SHIFT_33, MD5_CONST_e(42)); /* 43 */
601 624 HH(b, c, d, a, x_6, MD5_SHIFT_34, MD5_CONST_o(43)); /* 44 */
602 625 HH(a, b, c, d, x_9, MD5_SHIFT_31, MD5_CONST_e(44)); /* 45 */
603 626 HH(d, a, b, c, x_12, MD5_SHIFT_32, MD5_CONST_o(45)); /* 46 */
604 627 HH(c, d, a, b, x_15, MD5_SHIFT_33, MD5_CONST_e(46)); /* 47 */
605 628 HH(b, c, d, a, x_2, MD5_SHIFT_34, MD5_CONST_o(47)); /* 48 */
606 629
607 630 /* round 4 */
608 631 II(a, b, c, d, x_0, MD5_SHIFT_41, MD5_CONST_e(48)); /* 49 */
609 632 II(d, a, b, c, x_7, MD5_SHIFT_42, MD5_CONST_o(49)); /* 50 */
610 633 II(c, d, a, b, x_14, MD5_SHIFT_43, MD5_CONST_e(50)); /* 51 */
611 634 II(b, c, d, a, x_5, MD5_SHIFT_44, MD5_CONST_o(51)); /* 52 */
612 635 II(a, b, c, d, x_12, MD5_SHIFT_41, MD5_CONST_e(52)); /* 53 */
613 636 II(d, a, b, c, x_3, MD5_SHIFT_42, MD5_CONST_o(53)); /* 54 */
614 637 II(c, d, a, b, x_10, MD5_SHIFT_43, MD5_CONST_e(54)); /* 55 */
615 638 II(b, c, d, a, x_1, MD5_SHIFT_44, MD5_CONST_o(55)); /* 56 */
616 639 II(a, b, c, d, x_8, MD5_SHIFT_41, MD5_CONST_e(56)); /* 57 */
617 640 II(d, a, b, c, x_15, MD5_SHIFT_42, MD5_CONST_o(57)); /* 58 */
618 641 II(c, d, a, b, x_6, MD5_SHIFT_43, MD5_CONST_e(58)); /* 59 */
619 642 II(b, c, d, a, x_13, MD5_SHIFT_44, MD5_CONST_o(59)); /* 60 */
620 643 II(a, b, c, d, x_4, MD5_SHIFT_41, MD5_CONST_e(60)); /* 61 */
621 644 II(d, a, b, c, x_11, MD5_SHIFT_42, MD5_CONST_o(61)); /* 62 */
622 645 II(c, d, a, b, x_2, MD5_SHIFT_43, MD5_CONST_e(62)); /* 63 */
623 646 II(b, c, d, a, x_9, MD5_SHIFT_44, MD5_CONST_o(63)); /* 64 */
624 647
625 648 ctx->state[0] += a;
626 649 ctx->state[1] += b;
627 650 ctx->state[2] += c;
628 651 ctx->state[3] += d;
629 652
630 653 /*
631 654 * zeroize sensitive information -- compiler will optimize
632 655 * this out if everything is kept in registers
633 656 */
634 657
635 658 x_0 = x_1 = x_2 = x_3 = x_4 = x_5 = x_6 = x_7 = x_8 = 0;
636 659 x_9 = x_10 = x_11 = x_12 = x_13 = x_14 = x_15 = 0;
637 660 }
661 +#endif /* !defined(__amd64) */
638 662
639 663 /*
640 664 * Encode()
641 665 *
642 666 * purpose: to convert a list of numbers from host byte order to little endian
643 667 * input: uint8_t * : place to store the converted little endian numbers
644 668 * uint32_t * : place to get numbers to convert from
645 669 * size_t : the length of the input in bytes
646 670 * output: void
647 671 */
648 672
649 673 static void
650 674 Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input,
651 675 size_t input_len)
652 676 {
653 677 size_t i, j;
654 678
655 679 for (i = 0, j = 0; j < input_len; i++, j += sizeof (uint32_t)) {
656 680
657 681 #ifdef _LITTLE_ENDIAN
658 682
659 683 #ifdef _MD5_CHECK_ALIGNMENT
660 684 if ((uintptr_t)output & 0x3) /* Not 4-byte aligned */
661 685 bcopy(input + i, output + j, 4);
662 686 else *(uint32_t *)(output + j) = input[i];
663 687 #else
664 688 /*LINTED E_BAD_PTR_CAST_ALIGN*/
665 689 *(uint32_t *)(output + j) = input[i];
666 690 #endif /* _MD5_CHECK_ALIGNMENT */
667 691
668 692 #else /* big endian -- will work on little endian, but slowly */
669 693
670 694 output[j] = input[i] & 0xff;
671 695 output[j + 1] = (input[i] >> 8) & 0xff;
672 696 output[j + 2] = (input[i] >> 16) & 0xff;
673 697 output[j + 3] = (input[i] >> 24) & 0xff;
674 698 #endif
675 699 }
676 700 }
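
The portable branch stores each word least-significant byte first, which is the order RFC 1321 specifies for both the length field and the digest. A hedged example for one word (encode_one is an illustrative name):

	#include <sys/types.h>

	static void
	encode_one(uint8_t out[4], uint32_t w)	/* e.g. w == 0x0a0b0c0d */
	{
		out[0] = w & 0xff;		/* 0x0d */
		out[1] = (w >> 8) & 0xff;	/* 0x0c */
		out[2] = (w >> 16) & 0xff;	/* 0x0b */
		out[3] = (w >> 24) & 0xff;	/* 0x0a */
	}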