1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28
29 #include <sys/asm_linkage.h>
30
31 #if defined(lint) || defined(__lint)
32
33 #include <sys/types.h>
34
/*
 * Lint-only stub for big_mul_set_vec64(); it ignores its arguments and
 * returns 0. This branch is compiled only when lint or __lint is defined;
 * the assembly routine of the same name is in the #else branch.
 */
35 /* ARGSUSED */
36 uint64_t
37 big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
38 { return (0); }
39
/*
 * Lint-only stub for big_mul_add_vec64(); it ignores its arguments and
 * returns 0. This branch is compiled only when lint or __lint is defined;
 * the assembly routine of the same name is in the #else branch.
 */
40 /* ARGSUSED */
41 uint64_t
42 big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
43 { return (0); }
44
/*
 * Lint-only stub for big_sqr_vec64(); it ignores its arguments and has an
 * empty body. This branch is compiled only when lint or __lint is defined;
 * the assembly routine of the same name is in the #else branch.
 */
45 /* ARGSUSED */
46 void
47 big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
48 {}
49
50 #else /* lint */
51
52 / ------------------------------------------------------------------------
53 /
54 / Implementation of big_mul_set_vec which exploits
55 / the 64X64->128 bit unsigned multiply instruction.
56 /
57 / As defined in Sun's bignum library for pkcs11, bignums are
58 / composed of an array of 32-bit "digits" along with descriptive
59 / information. The arrays of digits are only required to be
60 / aligned on 32-bit boundary. This implementation works only
61 / when the two factors and the result happen to be 64 bit aligned
62 / and have an even number of digits.
63 /
64 / ------------------------------------------------------------------------
65
66 / r = a * digit, r and a are vectors of length len
67 / returns the carry digit
68 / r and a are 64 bit aligned.
69 /
70 / uint64_t
71 / big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
72 /
/ Register use (System V AMD64 argument registers):
/   %rdi = r, %rsi = a, %rdx = len (copied to %r8), %rcx = digit
/   %r8  = digits still to process
/   %r9  = running carry cy; .L17 returns it in %rax
/   %r11 = next digit of a, loaded before the multiply that consumes it
/ mulq leaves the 128-bit product in %rdx:%rax, which is why len is
/ moved out of %rdx. Adding cy to the product cannot carry out of %rdx:
/ (2^64 - 1)^2 + (2^64 - 1) < 2^128.
/ NOTE(review): the C prototype declares len as int, but all 64 bits of
/ %rdx are tested and copied; confirm callers pass len extended to 64 bits.
73 ENTRY(big_mul_set_vec64)
74 xorq %r9, %r9 / cy = 0; must precede the jz because .L17 returns cy
75 testq %rdx, %rdx / if (len == 0) return (0)
76 jz .L17
77
78 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
79 xorq %rax, %rax / rax = 0 on loop entry
80
81 .L15:
82 cmpq $8, %r8 / flags from len - 8
83 jb .L16 / fewer than 8 digits left (unsigned compare)
84 movq 0(%rsi), %rax / rax = a[0]
85 movq 8(%rsi), %r11 / prefetch a[1]
86 mulq %rcx / p = a[0] * digit
87 addq %r9, %rax
88 adcq $0, %rdx / p += cy
89 movq %rax, 0(%rdi) / r[0] = lo(p)
90 movq %rdx, %r9 / cy = hi(p)
91
92 movq %r11, %rax
93 movq 16(%rsi), %r11 / prefetch a[2]
202 addq %r9, %rax
203 adcq $0, %rdx / p += cy
204 movq %rax, 40(%rdi) / r[5] = lo(p)
205 movq %rdx, %r9 / cy = hi(p)
206 decq %r8
207 jz .L17
208
209 movq 48(%rsi), %rax
210 mulq %rcx / p = a[6] * digit
211 addq %r9, %rax
212 adcq $0, %rdx / p += cy
213 movq %rax, 48(%rdi) / r[6] = lo(p)
214 movq %rdx, %r9 / cy = hi(p)
215 decq %r8
216 jz .L17
217
218
219 .L17:
220 movq %r9, %rax / return (cy)
221 ret
222 SET_SIZE(big_mul_set_vec64)
223
224 / ------------------------------------------------------------------------
225 /
226 / Implementation of big_mul_add_vec which exploits
227 / the 64X64->128 bit unsigned multiply instruction.
228 /
229 / As defined in Sun's bignum library for pkcs11, bignums are
230 / composed of an array of 32-bit "digits" along with descriptive
231 / information. The arrays of digits are only required to be
232 / aligned on 32-bit boundary. This implementation works only
233 / when the two factors and the result happen to be 64 bit aligned
234 / and have an even number of digits.
235 /
236 / ------------------------------------------------------------------------
237
238 / r += a * digit, r and a are vectors of length len
239 / returns the carry digit
240 / r and a are 64 bit aligned.
241 /
242 / uint64_t
243 / big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
244 /
/ Register use (System V AMD64 argument registers):
/   %rdi = r, %rsi = a, %rdx = len (copied to %r8), %rcx = digit
/   %r8  = digits still to process
/   %r9  = running carry cy; .L27 returns it in %rax
/   %r10 = current digit of r, added into the product
/   %r11 = next digit of a, loaded before the multiply that consumes it
/ mulq leaves the 128-bit product in %rdx:%rax, which is why len is
/ moved out of %rdx. Adding r[i] and cy to the product cannot carry out
/ of %rdx: (2^64 - 1)^2 + 2 * (2^64 - 1) = 2^128 - 1.
/ NOTE(review): the C prototype declares len as int, but all 64 bits of
/ %rdx are tested and copied; confirm callers pass len extended to 64 bits.
245 ENTRY(big_mul_add_vec64)
246 xorq %r9, %r9 / cy = 0; must precede the jz because .L27 returns cy
247 testq %rdx, %rdx / if (len == 0) return (0)
248 jz .L27
249
250 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
251 xorq %rax, %rax / rax = 0 on loop entry
252
253 .L25:
254 cmpq $8, %r8 / flags from len - 8
255 jb .L26 / fewer than 8 digits left (unsigned compare)
256 movq 0(%rsi), %rax / rax = a[0]
257 movq 0(%rdi), %r10 / r10 = r[0]
258 movq 8(%rsi), %r11 / prefetch a[1]
259 mulq %rcx / p = a[0] * digit
260 addq %r10, %rax
261 adcq $0, %rdx / p += r[0]
262 movq 8(%rdi), %r10 / prefetch r[1]
263 addq %r9, %rax
264 adcq $0, %rdx / p += cy
265 movq %rax, 0(%rdi) / r[0] = lo(p)
419 movq %rdx, %r9 / cy = hi(p)
420 decq %r8
421 jz .L27
422
423 movq 48(%rsi), %rax
424 movq 48(%rdi), %r10
425 mulq %rcx / p = a[6] * digit
426 addq %r10, %rax
427 adcq $0, %rdx / p += r[6]
428 addq %r9, %rax
429 adcq $0, %rdx / p += cy
430 movq %rax, 48(%rdi) / r[6] = lo(p)
431 movq %rdx, %r9 / cy = hi(p)
432 decq %r8
433 jz .L27
434
435
436 .L27:
437 movq %r9, %rax / return (cy)
438 ret
439 SET_SIZE(big_mul_add_vec64)
440
441
442 / void
443 / big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
444
/*
 * big_sqr_vec64(r, a, len): on entry %rdi = r, %rsi = a, %rdx = len.
 *
 * First phase (up to .L32): for each digit a[i], the digits that follow
 * it are multiplied by a[i] and written (first call) or accumulated
 * (later calls) into r starting at r[2i + 1]; the carry of each call is
 * stored just past the digits that call produced.
 * Second phase (from .L32): starts from a[0] squared and a doubled r[1],
 * then walks r using row (%r11) and col (%r12).
 *
 * tr, ta and tlen live in %r13, %r14 and %r15, which are callee-saved in
 * the System V AMD64 ABI, so they are expected to survive the two calls.
 * NOTE(review): all 64 bits of %rdx are used for len although the C
 * prototype declares it as int; confirm callers pass it extended.
 */
445 ENTRY(big_sqr_vec64)
446 pushq %rbx
447 pushq %rbp
448 pushq %r12
449 pushq %r13
450 pushq %r14
451 pushq %r15
452 pushq %rdx / save arg3, len
453 pushq %rsi / save arg2, a
454 pushq %rdi / save arg1, r
/*
 * Nine 8-byte pushes plus the return address total 80 bytes, so %rsp is
 * 16-byte aligned at the calls below provided the caller was ABI-aligned.
 */
455
456 leaq 8(%rdi), %r13 / tr = r + 1
457 movq %rsi, %r14 / ta = a
458 movq %rdx, %r15 / tlen = len
459 decq %r15 / tlen = len - 1
460 movq %r13, %rdi / arg1 = tr
461 leaq 8(%r14), %rsi / arg2 = ta + 1
462 movq %r15, %rdx / arg3 = tlen
463 movq 0(%r14), %rcx / arg4 = ta[0]
464 call big_mul_set_vec64
465 movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
/*
 * NOTE(review): the loop below exits only when tlen reaches exactly zero.
 * With len <= 1, tlen is already <= 0 here, the decq yields a negative
 * value and the jz is not taken. Presumably callers guarantee len >= 2;
 * confirm.
 */
466 .L31:
467 decq %r15 / --tlen
468 jz .L32 / while (--tlen != 0)
469
470 addq $16, %r13 / tr += 2
471 addq $8, %r14 / ++ta
472 movq %r13, %rdi / arg1 = tr
473 leaq 8(%r14), %rsi / arg2 = ta + 1
474 movq %r15, %rdx / arg3 = tlen
475 movq 0(%r14), %rcx / arg4 = ta[0]
476 call big_mul_add_vec64
477 movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
478 jmp .L31
479
480 .L32:
481
482 / No more function calls after this.
483 / Restore arguments to registers.
484 / However, don't use %rdx for arg3, len, because it is heavily
485 / used by the hardware MUL instruction. Use %r8, instead.
/* The three saved arguments were pushed last, so r is at 0(%rsp). */
486 movq 0(%rsp), %rdi / %rdi == arg1 == r
487 movq 8(%rsp), %rsi / %rsi == arg2 == a
488 movq 16(%rsp), %r8 / %r8 == arg3 == len
489
490 movq 0(%rsi), %rax / %rax = a[0];
/* 64-bit mulq: the 128-bit product a[0] * a[0] is left in %rdx:%rax. */
491 mulq %rax / s = %edx:%eax = a[0]**2
492 movq %rax, 0(%rdi) / r[0] = lo64(s)
493 movq %rdx, %r9 / cy = hi64(s)
494 xorq %rdx, %rdx
495 movq 8(%rdi), %rax / p = %rdx:%rax = r[1]
/* Doubles the low half of p; the carry flag holds the bit shifted out. */
496 addq %rax, %rax
535 movq %rax, 8(%rdi, %r12, 8) / r[col+1] = lo64(p)
536 movq %rdx, %r9 / cy = hi64(p)
537
538 incq %r11 / ++row
539 addq $2, %r12 / col += 2
540 jmp .L33
541
542 .L34:
543 movq %r9, 8(%rdi, %r12, 8) / r[col+1] = lo64(cy)
544
/* Drop the three saved arguments, then restore callee-saved registers
 * in the reverse of the push order. */
545 addq $24, %rsp / skip %rdi, %rsi, %rdx
546 popq %r15
547 popq %r14
548 popq %r13
549 popq %r12
550 popq %rbp
551 popq %rbx
552
553 ret
554
555 SET_SIZE(big_sqr_vec64)
556
557 #endif /* lint */
|
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/asm_linkage.h>
27
28 #if defined(lint) || defined(__lint)
29
30 #include <sys/types.h>
31
/*
 * Lint-only stub for big_mul_set_vec(); it ignores its arguments and
 * returns 0. This branch is compiled only when lint or __lint is defined;
 * the assembly routine of the same name is in the #else branch.
 */
32 /* ARGSUSED */
33 uint64_t
34 big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
35 { return (0); }
36
/*
 * Lint-only stub for big_mul_add_vec(); it ignores its arguments and
 * returns 0. This branch is compiled only when lint or __lint is defined;
 * the assembly routine of the same name is in the #else branch.
 */
37 /* ARGSUSED */
38 uint64_t
39 big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
40 { return (0); }
41
/*
 * Lint-only stub for big_sqr_vec(); it ignores its arguments and has an
 * empty body. This branch is compiled only when lint or __lint is defined;
 * the assembly routine of the same name is in the #else branch.
 */
42 /* ARGSUSED */
43 void
44 big_sqr_vec(uint64_t *r, uint64_t *a, int len)
45 {}
46
47 #else /* lint */
48
49 / ------------------------------------------------------------------------
50 /
51 / Implementation of big_mul_set_vec which exploits
52 / the 64X64->128 bit unsigned multiply instruction.
53 /
54 / As defined in Sun's bignum library for pkcs11, bignums are
55 / composed of an array of 64-bit "digits" or "chunks" along with
56 / descriptive information.
57 /
58 / ------------------------------------------------------------------------
59
60 / r = a * digit, r and a are vectors of length len
61 / returns the carry digit
62 / r and a are 64 bit aligned.
63 /
64 / uint64_t
65 / big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
66 /
/ Register use (System V AMD64 argument registers):
/   %rdi = r, %rsi = a, %rdx = len (copied to %r8), %rcx = digit
/   %r8  = digits still to process
/   %r9  = running carry cy; .L17 returns it in %rax
/   %r11 = next digit of a, loaded before the multiply that consumes it
/ mulq leaves the 128-bit product in %rdx:%rax, which is why len is
/ moved out of %rdx. Adding cy to the product cannot carry out of %rdx:
/ (2^64 - 1)^2 + (2^64 - 1) < 2^128.
/ NOTE(review): the C prototype declares len as int, but all 64 bits of
/ %rdx are tested and copied; confirm callers pass len extended to 64 bits.
67 ENTRY(big_mul_set_vec)
68 xorq %r9, %r9 / cy = 0; must precede the jz because .L17 returns cy
69 testq %rdx, %rdx / if (len == 0) return (0)
70 jz .L17
71
72 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
73 xorq %rax, %rax / rax = 0 on loop entry
74
75 .L15:
76 cmpq $8, %r8 / flags from len - 8
77 jb .L16 / fewer than 8 digits left (unsigned compare)
78 movq 0(%rsi), %rax / rax = a[0]
79 movq 8(%rsi), %r11 / prefetch a[1]
80 mulq %rcx / p = a[0] * digit
81 addq %r9, %rax
82 adcq $0, %rdx / p += cy
83 movq %rax, 0(%rdi) / r[0] = lo(p)
84 movq %rdx, %r9 / cy = hi(p)
85
86 movq %r11, %rax
87 movq 16(%rsi), %r11 / prefetch a[2]
196 addq %r9, %rax
197 adcq $0, %rdx / p += cy
198 movq %rax, 40(%rdi) / r[5] = lo(p)
199 movq %rdx, %r9 / cy = hi(p)
200 decq %r8
201 jz .L17
202
203 movq 48(%rsi), %rax
204 mulq %rcx / p = a[6] * digit
205 addq %r9, %rax
206 adcq $0, %rdx / p += cy
207 movq %rax, 48(%rdi) / r[6] = lo(p)
208 movq %rdx, %r9 / cy = hi(p)
209 decq %r8
210 jz .L17
211
212
213 .L17:
214 movq %r9, %rax / return (cy)
215 ret
216 SET_SIZE(big_mul_set_vec)
217
218
219 / ------------------------------------------------------------------------
220 /
221 / Implementation of big_mul_add_vec which exploits
222 / the 64X64->128 bit unsigned multiply instruction.
223 /
224 / As defined in Sun's bignum library for pkcs11, bignums are
225 / composed of an array of 64-bit "digits" or "chunks" along with
226 / descriptive information.
227 /
228 / ------------------------------------------------------------------------
229
230 / r += a * digit, r and a are vectors of length len
231 / returns the carry digit
232 / r and a are 64 bit aligned.
233 /
234 / uint64_t
235 / big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
236 /
/ Register use (System V AMD64 argument registers):
/   %rdi = r, %rsi = a, %rdx = len (copied to %r8), %rcx = digit
/   %r8  = digits still to process
/   %r9  = running carry cy; .L27 returns it in %rax
/   %r10 = current digit of r, added into the product
/   %r11 = next digit of a, loaded before the multiply that consumes it
/ mulq leaves the 128-bit product in %rdx:%rax, which is why len is
/ moved out of %rdx. Adding r[i] and cy to the product cannot carry out
/ of %rdx: (2^64 - 1)^2 + 2 * (2^64 - 1) = 2^128 - 1.
/ NOTE(review): the C prototype declares len as int, but all 64 bits of
/ %rdx are tested and copied; confirm callers pass len extended to 64 bits.
237 ENTRY(big_mul_add_vec)
238 xorq %r9, %r9 / cy = 0; must precede the jz because .L27 returns cy
239 testq %rdx, %rdx / if (len == 0) return (0)
240 jz .L27
241
242 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
243 xorq %rax, %rax / rax = 0 on loop entry
244
245 .L25:
246 cmpq $8, %r8 / flags from len - 8
247 jb .L26 / fewer than 8 digits left (unsigned compare)
248 movq 0(%rsi), %rax / rax = a[0]
249 movq 0(%rdi), %r10 / r10 = r[0]
250 movq 8(%rsi), %r11 / prefetch a[1]
251 mulq %rcx / p = a[0] * digit
252 addq %r10, %rax
253 adcq $0, %rdx / p += r[0]
254 movq 8(%rdi), %r10 / prefetch r[1]
255 addq %r9, %rax
256 adcq $0, %rdx / p += cy
257 movq %rax, 0(%rdi) / r[0] = lo(p)
411 movq %rdx, %r9 / cy = hi(p)
412 decq %r8
413 jz .L27
414
415 movq 48(%rsi), %rax
416 movq 48(%rdi), %r10
417 mulq %rcx / p = a[6] * digit
418 addq %r10, %rax
419 adcq $0, %rdx / p += r[6]
420 addq %r9, %rax
421 adcq $0, %rdx / p += cy
422 movq %rax, 48(%rdi) / r[6] = lo(p)
423 movq %rdx, %r9 / cy = hi(p)
424 decq %r8
425 jz .L27
426
427
428 .L27:
429 movq %r9, %rax / return (cy)
430 ret
431 SET_SIZE(big_mul_add_vec)
432
433
434 / void
435 / big_sqr_vec(uint64_t *r, uint64_t *a, int len)
436
/*
 * big_sqr_vec(r, a, len): on entry %rdi = r, %rsi = a, %rdx = len.
 *
 * First phase (up to .L32): for each digit a[i], the digits that follow
 * it are multiplied by a[i] and written (first call) or accumulated
 * (later calls) into r starting at r[2i + 1]; the carry of each call is
 * stored just past the digits that call produced.
 * Second phase (from .L32): starts from a[0] squared and a doubled r[1],
 * then walks r using row (%r11) and col (%r12).
 *
 * tr, ta and tlen live in %r13, %r14 and %r15, which are callee-saved in
 * the System V AMD64 ABI, so they are expected to survive the two calls.
 * NOTE(review): all 64 bits of %rdx are used for len although the C
 * prototype declares it as int; confirm callers pass it extended.
 */
437 ENTRY(big_sqr_vec)
438 pushq %rbx
439 pushq %rbp
440 pushq %r12
441 pushq %r13
442 pushq %r14
443 pushq %r15
444 pushq %rdx / save arg3, len
445 pushq %rsi / save arg2, a
446 pushq %rdi / save arg1, r
/*
 * Nine 8-byte pushes plus the return address total 80 bytes, so %rsp is
 * 16-byte aligned at the calls below provided the caller was ABI-aligned.
 */
447
448 leaq 8(%rdi), %r13 / tr = r + 1
449 movq %rsi, %r14 / ta = a
450 movq %rdx, %r15 / tlen = len
451 decq %r15 / tlen = len - 1
452 movq %r13, %rdi / arg1 = tr
453 leaq 8(%r14), %rsi / arg2 = ta + 1
454 movq %r15, %rdx / arg3 = tlen
455 movq 0(%r14), %rcx / arg4 = ta[0]
456 call big_mul_set_vec
457 movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
/*
 * NOTE(review): the loop below exits only when tlen reaches exactly zero.
 * With len <= 1, tlen is already <= 0 here, the decq yields a negative
 * value and the jz is not taken. Presumably callers guarantee len >= 2;
 * confirm.
 */
458 .L31:
459 decq %r15 / --tlen
460 jz .L32 / while (--tlen != 0)
461
462 addq $16, %r13 / tr += 2
463 addq $8, %r14 / ++ta
464 movq %r13, %rdi / arg1 = tr
465 leaq 8(%r14), %rsi / arg2 = ta + 1
466 movq %r15, %rdx / arg3 = tlen
467 movq 0(%r14), %rcx / arg4 = ta[0]
468 call big_mul_add_vec
469 movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
470 jmp .L31
471
472 .L32:
473
474 / No more function calls after this.
475 / Restore arguments to registers.
476 / However, don't use %rdx for arg3, len, because it is heavily
477 / used by the hardware MUL instruction. Use %r8, instead.
/* The three saved arguments were pushed last, so r is at 0(%rsp). */
478 movq 0(%rsp), %rdi / %rdi == arg1 == r
479 movq 8(%rsp), %rsi / %rsi == arg2 == a
480 movq 16(%rsp), %r8 / %r8 == arg3 == len
481
482 movq 0(%rsi), %rax / %rax = a[0];
/* 64-bit mulq: the 128-bit product a[0] * a[0] is left in %rdx:%rax. */
483 mulq %rax / s = %edx:%eax = a[0]**2
484 movq %rax, 0(%rdi) / r[0] = lo64(s)
485 movq %rdx, %r9 / cy = hi64(s)
486 xorq %rdx, %rdx
487 movq 8(%rdi), %rax / p = %rdx:%rax = r[1]
/* Doubles the low half of p; the carry flag holds the bit shifted out. */
488 addq %rax, %rax
527 movq %rax, 8(%rdi, %r12, 8) / r[col+1] = lo64(p)
528 movq %rdx, %r9 / cy = hi64(p)
529
530 incq %r11 / ++row
531 addq $2, %r12 / col += 2
532 jmp .L33
533
534 .L34:
535 movq %r9, 8(%rdi, %r12, 8) / r[col+1] = lo64(cy)
536
/* Drop the three saved arguments, then restore callee-saved registers
 * in the reverse of the push order. */
537 addq $24, %rsp / skip %rdi, %rsi, %rdx
538 popq %r15
539 popq %r14
540 popq %r13
541 popq %r12
542 popq %rbp
543 popq %rbx
544
545 ret
546
547 SET_SIZE(big_sqr_vec)
548
549 #endif /* lint */
|