1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28
29 #include <sys/asm_linkage.h>
30
31 #if defined(lint) || defined(__lint)
32
33 #include <sys/types.h>
34
/*
 * Lint stub for the assembly implementation below:
 * r[i] = a[i] * digit for each of len 64-bit digits; returns the
 * final carry digit.
 */
/* ARGSUSED */
uint64_t
big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }
39
/*
 * Lint stub for the assembly implementation below:
 * r[i] += a[i] * digit for each of len 64-bit digits; returns the
 * final carry digit.
 */
/* ARGSUSED */
uint64_t
big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }
44
/*
 * Lint stub for the assembly implementation below:
 * r = a * a, where a is a vector of len 64-bit digits and r has room
 * for 2 * len digits.
 */
/* ARGSUSED */
void
big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
{}
49
50 #else /* lint */
51
52 / ------------------------------------------------------------------------
53 /
54 / Implementation of big_mul_set_vec which exploits
55 / the 64X64->128 bit unsigned multiply instruction.
56 /
57 / As defined in Sun's bignum library for pkcs11, bignums are
58 / composed of an array of 32-bit "digits" along with descriptive
59 / information. The arrays of digits are only required to be
60 / aligned on 32-bit boundary. This implementation works only
61 / when the two factors and the result happen to be 64 bit aligned
62 / and have an even number of digits.
63 /
64 / ------------------------------------------------------------------------
65
66 / r = a * digit, r and a are vectors of length len
67 / returns the carry digit
68 / r and a are 64 bit aligned.
69 /
70 / uint64_t
71 / big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
72 /
/ SysV AMD64 ABI: %rdi = r, %rsi = a, %rdx = len, %rcx = digit.
/ Register roles in the body:
/	%r8  = digits remaining (len moved out of %rdx, which mulq clobbers)
/	%r9  = running carry between digits
/	%r11 = next a[i], loaded early to overlap with the multiply
/	mulq leaves the 128-bit product in %rdx:%rax.
/ Only caller-saved registers are used, so no prologue/epilogue is needed.
	ENTRY(big_mul_set_vec64)
	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L17

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

/ Main loop: process 8 digits per iteration while len >= 8.
.L15:
	cmpq	$8, %r8			/ 8 - len
	jb	.L16
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8 digits
	addq	$64, %rdi		/ r += 8 digits
	subq	$8, %r8			/ len -= 8

	jz	.L17
	jmp	.L15

/ Tail: handle the remaining 1..7 digits (reached only when len < 8),
/ one digit at a time, exiting as soon as the count hits zero.
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

/ At most 7 digits reach the tail, so after the 7th the count is zero
/ and control falls through to the return.

.L17:
	movq	%r9, %rax		/ return the final carry digit
	ret
	SET_SIZE(big_mul_set_vec64)
223
224 / ------------------------------------------------------------------------
225 /
226 / Implementation of big_mul_add_vec which exploits
227 / the 64X64->128 bit unsigned multiply instruction.
228 /
229 / As defined in Sun's bignum library for pkcs11, bignums are
230 / composed of an array of 32-bit "digits" along with descriptive
231 / information. The arrays of digits are only required to be
232 / aligned on 32-bit boundary. This implementation works only
233 / when the two factors and the result happen to be 64 bit aligned
234 / and have an even number of digits.
235 /
236 / ------------------------------------------------------------------------
237
238 / r += a * digit, r and a are vectors of length len
239 / returns the carry digit
240 / r and a are 64 bit aligned.
241 /
242 / uint64_t
243 / big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
244 /
/ SysV AMD64 ABI: %rdi = r, %rsi = a, %rdx = len, %rcx = digit.
/ Register roles in the body:
/	%r8  = digits remaining (len moved out of %rdx, which mulq clobbers)
/	%r9  = running carry between digits
/	%r10 = next r[i], loaded early (this is the accumulate variant)
/	%r11 = next a[i], loaded early to overlap with the multiply
/	mulq leaves the 128-bit product in %rdx:%rax.
/ Only caller-saved registers are used, so no prologue/epilogue is needed.
	ENTRY(big_mul_add_vec64)
	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

/ Main loop: process 8 digits per iteration while len >= 8.
.L25:
	cmpq	$8, %r8			/ 8 - len
	jb	.L26
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8 digits
	addq	$64, %rdi		/ r += 8 digits
	subq	$8, %r8			/ len -= 8

	jz	.L27
	jmp	.L25

/ Tail: handle the remaining 1..7 digits (reached only when len < 8),
/ one digit at a time, exiting as soon as the count hits zero.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

/ At most 7 digits reach the tail, so after the 7th the count is zero
/ and control falls through to the return.

.L27:
	movq	%r9, %rax		/ return the final carry digit
	ret
	SET_SIZE(big_mul_add_vec64)
440
441
442 / void
443 / big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
444
/ SysV AMD64 ABI: %rdi = r, %rsi = a, %rdx = len; no return value.
/
/ Strategy (visible in the two phases below):
/   Phase 1: accumulate the cross products a[i] * a[j] (i < j) into
/	r[1 .. 2*len-2], one row at a time, via big_mul_set_vec64 for the
/	first row and big_mul_add_vec64 for the rest.
/   Phase 2: walk the result again, doubling each cross-product entry
/	(each a[i]*a[j], i != j, appears twice in a square) and adding in
/	the diagonal squares a[row]**2, propagating carries throughout.
/
/ Callee-saved registers %rbx, %rbp, %r12-%r15 are used across the helper
/ calls, so they are saved/restored here; the three incoming arguments are
/ also spilled to the stack because the calls clobber the argument registers.
	ENTRY(big_sqr_vec64)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

/ Phase 1: first row — r[1 .. len-1] = a[1 .. len-1] * a[0].
	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec64
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
/ Remaining rows — each adds a[row+1 ..] * a[row] two digits further along.
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec64
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:

/ Phase 2 — no more function calls after this.
/ Restore arguments to registers.
/ However, don't use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction. Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8 == arg3 == len

/ Handle row 0 by hand: r[0] = lo(a[0]**2), r[1] = (r[1] << 1) + hi(a[0]**2).
	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1
/ For each remaining row: r[col] = (r[col] << 1) + a[row]**2 + cy, then
/ r[col+1] = (r[col+1] << 1) + cy, advancing row by 1 and col by 2.
.L33:
	cmpq	%r8, %r11		/ len - row
	jae	.L34			/ while (row < len)

	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

	SET_SIZE(big_sqr_vec64)
556
557 #endif /* lint */