6799218 RSA using Solaris Kernel Crypto framework lagging behind OpenSSL
5016936 bignumimpl:big_mul: potential memory leak
6810280 panic from bignum module: vmem_xalloc(): size == 0
--- old/usr/src/common/bignum/amd64/bignum_amd64_asm.s
+++ new/usr/src/common/bignum/amd64/bignum_amd64_asm.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 - * Common Development and Distribution License, Version 1.0 only
6 - * (the "License"). You may not use this file except in compliance
7 - * with the License.
5 + * Common Development and Distribution License (the "License").
6 + * You may not use this file except in compliance with the License.
8 7 *
9 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 9 * or http://www.opensolaris.org/os/licensing.
11 10 * See the License for the specific language governing permissions
12 11 * and limitations under the License.
13 12 *
14 13 * When distributing Covered Code, include this CDDL HEADER in each
15 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 15 * If applicable, add the following below this CDDL HEADER, with the
17 16 * fields enclosed by brackets "[]" replaced with your own identifying
18 17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 18 *
20 19 * CDDL HEADER END
21 20 */
22 21 /*
23 - * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
22 + * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 23 * Use is subject to license terms.
25 24 */
26 25
27 -#pragma ident "%Z%%M% %I% %E% SMI"
28 -
29 26 #include <sys/asm_linkage.h>
30 27
31 28 #if defined(lint) || defined(__lint)
32 29
33 30 #include <sys/types.h>
34 31
35 32 /* ARGSUSED */
36 33 uint64_t
37 -big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
34 +big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
38 35 { return (0); }
39 36
40 37 /* ARGSUSED */
41 38 uint64_t
42 -big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
39 +big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
43 40 { return (0); }
44 41
45 42 /* ARGSUSED */
46 43 void
47 -big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
44 +big_sqr_vec(uint64_t *r, uint64_t *a, int len)
48 45 {}
49 46
50 47 #else /* lint */
51 48
52 49 / ------------------------------------------------------------------------
53 50 /
54 51 / Implementation of big_mul_set_vec which exploits
55 52 / the 64X64->128 bit unsigned multiply instruction.
56 53 /
57 54 / As defined in Sun's bignum library for pkcs11, bignums are
58 -/ composed of an array of 32-bit "digits" along with descriptive
59 -/ information. The arrays of digits are only required to be
60 -/ aligned on 32-bit boundary. This implementation works only
61 -/ when the two factors and the result happen to be 64 bit aligned
62 -/ and have an even number of digits.
55 +/ composed of an array of 64-bit "digits" or "chunks" along with
56 +/ descriptive information.
63 57 /
64 58 / ------------------------------------------------------------------------
65 59
66 60 / r = a * digit, r and a are vectors of length len
67 61 / returns the carry digit
68 62 / r and a are 64 bit aligned.
69 63 /
70 64 / uint64_t
71 -/ big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
65 +/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
72 66 /
73 - ENTRY(big_mul_set_vec64)
67 + ENTRY(big_mul_set_vec)
74 68 xorq %rax, %rax / if (len == 0) return (0)
75 69 testq %rdx, %rdx
76 70 jz .L17
77 71
78 72 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
79 73 xorq %r9, %r9 / cy = 0
80 74
81 75 .L15:
82 76 cmpq $8, %r8 / 8 - len
83 77 jb .L16
84 78 movq 0(%rsi), %rax / rax = a[0]
85 79 movq 8(%rsi), %r11 / prefetch a[1]
86 80 mulq %rcx / p = a[0] * digit
87 81 addq %r9, %rax
88 82 adcq $0, %rdx / p += cy
89 83 movq %rax, 0(%rdi) / r[0] = lo(p)
90 84 movq %rdx, %r9 / cy = hi(p)
91 85
92 86 movq %r11, %rax
93 87 movq 16(%rsi), %r11 / prefetch a[2]
94 88 mulq %rcx / p = a[1] * digit
95 89 addq %r9, %rax
96 90 adcq $0, %rdx / p += cy
97 91 movq %rax, 8(%rdi) / r[1] = lo(p)
98 92 movq %rdx, %r9 / cy = hi(p)
99 93
100 94 movq %r11, %rax
101 95 movq 24(%rsi), %r11 / prefetch a[3]
102 96 mulq %rcx / p = a[2] * digit
103 97 addq %r9, %rax
104 98 adcq $0, %rdx / p += cy
105 99 movq %rax, 16(%rdi) / r[2] = lo(p)
106 100 movq %rdx, %r9 / cy = hi(p)
107 101
108 102 movq %r11, %rax
109 103 movq 32(%rsi), %r11 / prefetch a[4]
110 104 mulq %rcx / p = a[3] * digit
111 105 addq %r9, %rax
112 106 adcq $0, %rdx / p += cy
113 107 movq %rax, 24(%rdi) / r[3] = lo(p)
114 108 movq %rdx, %r9 / cy = hi(p)
115 109
116 110 movq %r11, %rax
117 111 movq 40(%rsi), %r11 / prefetch a[5]
118 112 mulq %rcx / p = a[4] * digit
119 113 addq %r9, %rax
120 114 adcq $0, %rdx / p += cy
121 115 movq %rax, 32(%rdi) / r[4] = lo(p)
122 116 movq %rdx, %r9 / cy = hi(p)
123 117
124 118 movq %r11, %rax
125 119 movq 48(%rsi), %r11 / prefetch a[6]
126 120 mulq %rcx / p = a[5] * digit
127 121 addq %r9, %rax
128 122 adcq $0, %rdx / p += cy
129 123 movq %rax, 40(%rdi) / r[5] = lo(p)
130 124 movq %rdx, %r9 / cy = hi(p)
131 125
132 126 movq %r11, %rax
133 127 movq 56(%rsi), %r11 / prefetch a[7]
134 128 mulq %rcx / p = a[6] * digit
135 129 addq %r9, %rax
136 130 adcq $0, %rdx / p += cy
137 131 movq %rax, 48(%rdi) / r[6] = lo(p)
138 132 movq %rdx, %r9 / cy = hi(p)
139 133
140 134 movq %r11, %rax
141 135 mulq %rcx / p = a[7] * digit
142 136 addq %r9, %rax
143 137 adcq $0, %rdx / p += cy
144 138 movq %rax, 56(%rdi) / r[7] = lo(p)
145 139 movq %rdx, %r9 / cy = hi(p)
146 140
147 141 addq $64, %rsi
148 142 addq $64, %rdi
149 143 subq $8, %r8
150 144
151 145 jz .L17
152 146 jmp .L15
153 147
154 148 .L16:
155 149 movq 0(%rsi), %rax
156 150 mulq %rcx / p = a[0] * digit
157 151 addq %r9, %rax
158 152 adcq $0, %rdx / p += cy
159 153 movq %rax, 0(%rdi) / r[0] = lo(p)
160 154 movq %rdx, %r9 / cy = hi(p)
161 155 decq %r8
162 156 jz .L17
163 157
164 158 movq 8(%rsi), %rax
165 159 mulq %rcx / p = a[1] * digit
166 160 addq %r9, %rax
167 161 adcq $0, %rdx / p += cy
168 162 movq %rax, 8(%rdi) / r[1] = lo(p)
169 163 movq %rdx, %r9 / cy = hi(p)
170 164 decq %r8
171 165 jz .L17
172 166
173 167 movq 16(%rsi), %rax
174 168 mulq %rcx / p = a[2] * digit
175 169 addq %r9, %rax
176 170 adcq $0, %rdx / p += cy
177 171 movq %rax, 16(%rdi) / r[2] = lo(p)
178 172 movq %rdx, %r9 / cy = hi(p)
179 173 decq %r8
180 174 jz .L17
181 175
182 176 movq 24(%rsi), %rax
183 177 mulq %rcx / p = a[3] * digit
184 178 addq %r9, %rax
185 179 adcq $0, %rdx / p += cy
186 180 movq %rax, 24(%rdi) / r[3] = lo(p)
187 181 movq %rdx, %r9 / cy = hi(p)
188 182 decq %r8
189 183 jz .L17
190 184
191 185 movq 32(%rsi), %rax
192 186 mulq %rcx / p = a[4] * digit
193 187 addq %r9, %rax
194 188 adcq $0, %rdx / p += cy
195 189 movq %rax, 32(%rdi) / r[4] = lo(p)
196 190 movq %rdx, %r9 / cy = hi(p)
197 191 decq %r8
198 192 jz .L17
199 193
200 194 movq 40(%rsi), %rax
201 195 mulq %rcx / p = a[5] * digit
202 196 addq %r9, %rax
203 197 adcq $0, %rdx / p += cy
204 198 movq %rax, 40(%rdi) / r[5] = lo(p)
205 199 movq %rdx, %r9 / cy = hi(p)
206 200 decq %r8
207 201 jz .L17
208 202
209 203 movq 48(%rsi), %rax
210 204 mulq %rcx / p = a[6] * digit
211 205 addq %r9, %rax
128 lines elided
212 206 adcq $0, %rdx / p += cy
213 207 movq %rax, 48(%rdi) / r[6] = lo(p)
214 208 movq %rdx, %r9 / cy = hi(p)
215 209 decq %r8
216 210 jz .L17
217 211
218 212
219 213 .L17:
220 214 movq %r9, %rax
221 215 ret
222 - SET_SIZE(big_mul_set_vec64)
216 + SET_SIZE(big_mul_set_vec)
223 217
218 +
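For review reference, a minimal C sketch of the operation big_mul_set_vec implements, with the compiler's unsigned __int128 standing in for the hardware 64x64->128 bit MUL. The ref_mul_set_vec name and the __int128 usage are illustrative assumptions, not part of this changeset.

#include <stdint.h>

/*
 * Hypothetical reference only: r[i] = lo64(a[i] * digit + cy) for
 * i = 0 .. len-1; the final carry digit is returned.
 */
uint64_t
ref_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	uint64_t cy = 0;
	int i;

	for (i = 0; i < len; i++) {
		unsigned __int128 p = (unsigned __int128)a[i] * digit + cy;
		r[i] = (uint64_t)p;		/* r[i] = lo(p) */
		cy = (uint64_t)(p >> 64);	/* cy = hi(p) */
	}
	return (cy);
}

The unrolled assembly above is this loop processed eight digits per iteration, with the next a[] element loaded into %r11 ahead of each mulq, and a one-digit-at-a-time tail for len % 8.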
224 219 / ------------------------------------------------------------------------
225 220 /
226 221 / Implementation of big_mul_add_vec which exploits
227 222 / the 64X64->128 bit unsigned multiply instruction.
228 223 /
229 224 / As defined in Sun's bignum library for pkcs11, bignums are
230 -/ composed of an array of 32-bit "digits" along with descriptive
231 -/ information. The arrays of digits are only required to be
232 -/ aligned on 32-bit boundary. This implementation works only
233 -/ when the two factors and the result happen to be 64 bit aligned
234 -/ and have an even number of digits.
225 +/ composed of an array of 64-bit "digits" or "chunks" along with
226 +/ descriptive information.
235 227 /
236 228 / ------------------------------------------------------------------------
237 229
238 230 / r += a * digit, r and a are vectors of length len
239 231 / returns the carry digit
240 232 / r and a are 64 bit aligned.
241 233 /
242 234 / uint64_t
243 -/ big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
235 +/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
244 236 /
245 - ENTRY(big_mul_add_vec64)
237 + ENTRY(big_mul_add_vec)
246 238 xorq %rax, %rax / if (len == 0) return (0)
247 239 testq %rdx, %rdx
248 240 jz .L27
249 241
250 242 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
251 243 xorq %r9, %r9 / cy = 0
252 244
253 245 .L25:
254 246 cmpq $8, %r8 / 8 - len
255 247 jb .L26
256 248 movq 0(%rsi), %rax / rax = a[0]
257 249 movq 0(%rdi), %r10 / r10 = r[0]
258 250 movq 8(%rsi), %r11 / prefetch a[1]
259 251 mulq %rcx / p = a[0] * digit
260 252 addq %r10, %rax
261 253 adcq $0, %rdx / p += r[0]
262 254 movq 8(%rdi), %r10 / prefetch r[1]
263 255 addq %r9, %rax
264 256 adcq $0, %rdx / p += cy
265 257 movq %rax, 0(%rdi) / r[0] = lo(p)
266 258 movq %rdx, %r9 / cy = hi(p)
267 259
268 260 movq %r11, %rax
269 261 movq 16(%rsi), %r11 / prefetch a[2]
270 262 mulq %rcx / p = a[1] * digit
271 263 addq %r10, %rax
272 264 adcq $0, %rdx / p += r[1]
273 265 movq 16(%rdi), %r10 / prefetch r[2]
274 266 addq %r9, %rax
275 267 adcq $0, %rdx / p += cy
276 268 movq %rax, 8(%rdi) / r[1] = lo(p)
277 269 movq %rdx, %r9 / cy = hi(p)
278 270
279 271 movq %r11, %rax
280 272 movq 24(%rsi), %r11 / prefetch a[3]
281 273 mulq %rcx / p = a[2] * digit
282 274 addq %r10, %rax
283 275 adcq $0, %rdx / p += r[2]
284 276 movq 24(%rdi), %r10 / prefetch r[3]
285 277 addq %r9, %rax
286 278 adcq $0, %rdx / p += cy
287 279 movq %rax, 16(%rdi) / r[2] = lo(p)
288 280 movq %rdx, %r9 / cy = hi(p)
289 281
290 282 movq %r11, %rax
291 283 movq 32(%rsi), %r11 / prefetch a[4]
292 284 mulq %rcx / p = a[3] * digit
293 285 addq %r10, %rax
294 286 adcq $0, %rdx / p += r[3]
295 287 movq 32(%rdi), %r10 / prefetch r[4]
296 288 addq %r9, %rax
297 289 adcq $0, %rdx / p += cy
298 290 movq %rax, 24(%rdi) / r[3] = lo(p)
299 291 movq %rdx, %r9 / cy = hi(p)
300 292
301 293 movq %r11, %rax
302 294 movq 40(%rsi), %r11 / prefetch a[5]
303 295 mulq %rcx / p = a[4] * digit
304 296 addq %r10, %rax
305 297 adcq $0, %rdx / p += r[4]
306 298 movq 40(%rdi), %r10 / prefetch r[5]
307 299 addq %r9, %rax
308 300 adcq $0, %rdx / p += cy
309 301 movq %rax, 32(%rdi) / r[4] = lo(p)
310 302 movq %rdx, %r9 / cy = hi(p)
311 303
312 304 movq %r11, %rax
313 305 movq 48(%rsi), %r11 / prefetch a[6]
314 306 mulq %rcx / p = a[5] * digit
315 307 addq %r10, %rax
316 308 adcq $0, %rdx / p += r[5]
317 309 movq 48(%rdi), %r10 / prefetch r[6]
318 310 addq %r9, %rax
319 311 adcq $0, %rdx / p += cy
320 312 movq %rax, 40(%rdi) / r[5] = lo(p)
321 313 movq %rdx, %r9 / cy = hi(p)
322 314
323 315 movq %r11, %rax
324 316 movq 56(%rsi), %r11 / prefetch a[7]
325 317 mulq %rcx / p = a[6] * digit
326 318 addq %r10, %rax
327 319 adcq $0, %rdx / p += r[6]
328 320 movq 56(%rdi), %r10 / prefetch r[7]
329 321 addq %r9, %rax
330 322 adcq $0, %rdx / p += cy
331 323 movq %rax, 48(%rdi) / r[6] = lo(p)
332 324 movq %rdx, %r9 / cy = hi(p)
333 325
334 326 movq %r11, %rax
335 327 mulq %rcx / p = a[7] * digit
336 328 addq %r10, %rax
337 329 adcq $0, %rdx / p += r[7]
338 330 addq %r9, %rax
339 331 adcq $0, %rdx / p += cy
340 332 movq %rax, 56(%rdi) / r[7] = lo(p)
341 333 movq %rdx, %r9 / cy = hi(p)
342 334
343 335 addq $64, %rsi
344 336 addq $64, %rdi
345 337 subq $8, %r8
346 338
347 339 jz .L27
348 340 jmp .L25
349 341
350 342 .L26:
351 343 movq 0(%rsi), %rax
352 344 movq 0(%rdi), %r10
353 345 mulq %rcx / p = a[0] * digit
354 346 addq %r10, %rax
355 347 adcq $0, %rdx / p += r[0]
356 348 addq %r9, %rax
357 349 adcq $0, %rdx / p += cy
358 350 movq %rax, 0(%rdi) / r[0] = lo(p)
359 351 movq %rdx, %r9 / cy = hi(p)
360 352 decq %r8
361 353 jz .L27
362 354
363 355 movq 8(%rsi), %rax
364 356 movq 8(%rdi), %r10
365 357 mulq %rcx / p = a[1] * digit
366 358 addq %r10, %rax
367 359 adcq $0, %rdx / p += r[1]
368 360 addq %r9, %rax
369 361 adcq $0, %rdx / p += cy
370 362 movq %rax, 8(%rdi) / r[1] = lo(p)
371 363 movq %rdx, %r9 / cy = hi(p)
372 364 decq %r8
373 365 jz .L27
374 366
375 367 movq 16(%rsi), %rax
376 368 movq 16(%rdi), %r10
377 369 mulq %rcx / p = a[2] * digit
378 370 addq %r10, %rax
379 371 adcq $0, %rdx / p += r[2]
380 372 addq %r9, %rax
381 373 adcq $0, %rdx / p += cy
382 374 movq %rax, 16(%rdi) / r[2] = lo(p)
383 375 movq %rdx, %r9 / cy = hi(p)
384 376 decq %r8
385 377 jz .L27
386 378
387 379 movq 24(%rsi), %rax
388 380 movq 24(%rdi), %r10
389 381 mulq %rcx / p = a[3] * digit
390 382 addq %r10, %rax
391 383 adcq $0, %rdx / p += r[3]
392 384 addq %r9, %rax
393 385 adcq $0, %rdx / p += cy
394 386 movq %rax, 24(%rdi) / r[3] = lo(p)
395 387 movq %rdx, %r9 / cy = hi(p)
396 388 decq %r8
397 389 jz .L27
398 390
399 391 movq 32(%rsi), %rax
400 392 movq 32(%rdi), %r10
401 393 mulq %rcx / p = a[4] * digit
402 394 addq %r10, %rax
403 395 adcq $0, %rdx / p += r[4]
404 396 addq %r9, %rax
405 397 adcq $0, %rdx / p += cy
406 398 movq %rax, 32(%rdi) / r[4] = lo(p)
407 399 movq %rdx, %r9 / cy = hi(p)
408 400 decq %r8
409 401 jz .L27
410 402
411 403 movq 40(%rsi), %rax
412 404 movq 40(%rdi), %r10
413 405 mulq %rcx / p = a[5] * digit
414 406 addq %r10, %rax
415 407 adcq $0, %rdx / p += r[5]
416 408 addq %r9, %rax
417 409 adcq $0, %rdx / p += cy
418 410 movq %rax, 40(%rdi) / r[5] = lo(p)
419 411 movq %rdx, %r9 / cy = hi(p)
420 412 decq %r8
421 413 jz .L27
422 414
423 415 movq 48(%rsi), %rax
424 416 movq 48(%rdi), %r10
425 417 mulq %rcx / p = a[6] * digit
426 418 addq %r10, %rax
427 419 adcq $0, %rdx / p += r[6]
428 420 addq %r9, %rax
173 lines elided
429 421 adcq $0, %rdx / p += cy
430 422 movq %rax, 48(%rdi) / r[6] = lo(p)
431 423 movq %rdx, %r9 / cy = hi(p)
432 424 decq %r8
433 425 jz .L27
434 426
435 427
436 428 .L27:
437 429 movq %r9, %rax
438 430 ret
439 - SET_SIZE(big_mul_add_vec64)
431 + SET_SIZE(big_mul_add_vec)
440 432
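Likewise, a minimal C sketch of big_mul_add_vec, which differs from the previous routine only in that each product also accumulates the existing r[i]. The ref_mul_add_vec name and the __int128 usage are illustrative assumptions, not part of this changeset.

#include <stdint.h>

/*
 * Hypothetical reference only: r[i] = lo64(a[i] * digit + r[i] + cy)
 * for i = 0 .. len-1; the final carry digit is returned.  The sum
 * a[i]*digit + r[i] + cy always fits in 128 bits.
 */
uint64_t
ref_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	uint64_t cy = 0;
	int i;

	for (i = 0; i < len; i++) {
		unsigned __int128 p =
		    (unsigned __int128)a[i] * digit + r[i] + cy;
		r[i] = (uint64_t)p;		/* r[i] = lo(p) */
		cy = (uint64_t)(p >> 64);	/* cy = hi(p) */
	}
	return (cy);
}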
441 433
442 434 / void
443 -/ big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
435 +/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
444 436
445 - ENTRY(big_sqr_vec64)
437 + ENTRY(big_sqr_vec)
446 438 pushq %rbx
447 439 pushq %rbp
448 440 pushq %r12
449 441 pushq %r13
450 442 pushq %r14
451 443 pushq %r15
452 444 pushq %rdx / save arg3, len
453 445 pushq %rsi / save arg2, a
454 446 pushq %rdi / save arg1, r
455 447
456 448 leaq 8(%rdi), %r13 / tr = r + 1
457 449 movq %rsi, %r14 / ta = a
458 450 movq %rdx, %r15 / tlen = len
459 451 decq %r15 / tlen = len - 1
460 452 movq %r13, %rdi / arg1 = tr
461 453 leaq 8(%r14), %rsi / arg2 = ta + 1
462 454 movq %r15, %rdx / arg3 = tlen
463 455 movq 0(%r14), %rcx / arg4 = ta[0]
464 - call big_mul_set_vec64
456 + call big_mul_set_vec
465 457 movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
466 458 .L31:
467 459 decq %r15 / --tlen
468 460 jz .L32 / while (--tlen != 0)
469 461
470 462 addq $16, %r13 / tr += 2
471 463 addq $8, %r14 / ++ta
472 464 movq %r13, %rdi / arg1 = tr
473 465 leaq 8(%r14), %rsi / arg2 = ta + 1
474 466 movq %r15, %rdx / arg3 = tlen
475 467 movq 0(%r14), %rcx / arg4 = ta[0]
476 - call big_mul_add_vec64
468 + call big_mul_add_vec
477 469 movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
478 470 jmp .L31
479 471
480 472 .L32:
481 473
482 474 / No more function calls after this.
483 475 / Restore arguments to registers.
484 476 / However, don't use %rdx for arg3, len, because it is heavily
485 477 / used by the hardware MUL instruction. Use %r8, instead.
486 478 movq 0(%rsp), %rdi / %rdi == arg1 == r
487 479 movq 8(%rsp), %rsi / %rsi == arg2 == a
488 480 movq 16(%rsp), %r8 / %r8 == arg3 == len
489 481
490 482 movq 0(%rsi), %rax / %rax = a[0];
491 483 mulq %rax / s = %edx:%eax = a[0]**2
492 484 movq %rax, 0(%rdi) / r[0] = lo64(s)
493 485 movq %rdx, %r9 / cy = hi64(s)
494 486 xorq %rdx, %rdx
495 487 movq 8(%rdi), %rax / p = %rdx:%rax = r[1]
496 488 addq %rax, %rax
497 489 adcq $0, %rdx / p = p << 1
498 490 addq %r9, %rax
499 491 adcq $0, %rdx / p = (r[1] << 1) + cy
500 492 movq %rax, 8(%rdi) / r[1] = lo64(p)
501 493 movq %rdx, %r9 / cy = hi64(p)
502 494 movq $1, %r11 / row = 1
503 495 movq $2, %r12 / col = 2
504 496 movq %r8, %r15
505 497 decq %r15 / tlen = len - 1
506 498 .L33:
507 499 cmpq %r8, %r11 / len - row
508 500 jae .L34 / while (row < len)
509 501
510 502 movq 0(%rsi, %r11, 8), %rax / s = (uint128_t)a[row]
511 503 mulq %rax / s = s * s
512 504 xorq %rbx, %rbx
513 505 movq 0(%rdi, %r12, 8), %rcx / p = (uint128_t)r[col]
514 506 addq %rcx, %rcx
515 507 adcq $0, %rbx / p = p << 1
516 508 addq %rcx, %rax
517 509 adcq %rbx, %rdx / t = p + s
518 510 xorq %r10, %r10
519 511 movq %rax, %rbp / t2 = 0:lo64(t)
520 512 addq %r9, %rbp
521 513 adcq $0, %r10 / t2 = %r10:%rbp = lo64(t) + cy
522 514 movq %rbp, 0(%rdi, %r12, 8) / r[col] = lo64(t2)
523 515 xorq %rcx, %rcx
524 516 movq %rdx, %r9
525 517 addq %r10, %r9
526 518 adcq $0, %rcx / cy = hi64(t) + hi64(t2)
527 519 cmpq %r11, %r15
528 520 je .L34 / if (row == len - 1) break
529 521 xorq %rdx, %rdx
530 522 movq 8(%rdi, %r12, 8), %rax
531 523 addq %rax, %rax
532 524 adcq $0, %rdx
533 525 addq %r9, %rax
534 526 adcq %rcx, %rdx / p = (lo64(r[col+1]) << 1) + cy
535 527 movq %rax, 8(%rdi, %r12, 8) / r[col+1] = lo64(p)
536 528 movq %rdx, %r9 / cy = hi64(p)
537 529
538 530 incq %r11 / ++row
539 531 addq $2, %r12 / col += 2
540 532 jmp .L33
541 533
542 534 .L34:
543 535 movq %r9, 8(%rdi, %r12, 8) / r[col+1] = lo64(cy)
544 536
58 lines elided
545 537 addq $24, %rsp / skip %rdi, %rsi, %rdx
546 538 popq %r15
547 539 popq %r14
548 540 popq %r13
549 541 popq %r12
550 542 popq %rbp
551 543 popq %rbx
552 544
553 545 ret
554 546
555 - SET_SIZE(big_sqr_vec64)
547 + SET_SIZE(big_sqr_vec)
556 548
557 549 #endif /* lint */
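Finally, a C sketch of the result big_sqr_vec produces: r[0 .. 2*len-1] = a * a. This sketch is plain schoolbook multiplication and is not how the assembly is structured; the routine above saves roughly half the multiplies by forming each off-diagonal product a[i]*a[j] (i < j) once via big_mul_set_vec/big_mul_add_vec, doubling that partial sum, and then adding in the diagonal squares a[i]*a[i]. The ref_sqr_vec name and the __int128 usage are illustrative assumptions.

#include <stdint.h>

/*
 * Hypothetical reference only: compute r = a * a by accumulating every
 * product a[i]*a[j].  r must have room for 2*len digits.
 */
void
ref_sqr_vec(uint64_t *r, uint64_t *a, int len)
{
	int i, j;

	for (i = 0; i < 2 * len; i++)
		r[i] = 0;
	for (i = 0; i < len; i++) {
		uint64_t cy = 0;
		for (j = 0; j < len; j++) {
			unsigned __int128 p =
			    (unsigned __int128)a[i] * a[j] + r[i + j] + cy;
			r[i + j] = (uint64_t)p;
			cy = (uint64_t)(p >> 64);
		}
		r[i + len] = cy;	/* carry out of this row */
	}
}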