/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I%     %E% SMI"

#include <sys/asm_linkage.h>

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/* ARGSUSED */
uint64_t
big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }

/* ARGSUSED */
uint64_t
big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }

/* ARGSUSED */
void
big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
{}

#else   /* lint */

/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_set_vec which exploits
/  the 64x64->128-bit unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 32-bit "digits" along with descriptive
/  information.  The arrays of digits are only required to be
/  aligned on a 32-bit boundary.  This implementation works only
/  when the vector operands and the result happen to be 64-bit
/  aligned and have an even number of digits.
/
/ ------------------------------------------------------------------------

/ r = a * digit, where r and a are vectors of length len
/ returns the carry digit
/ r and a must be 64-bit aligned.
/
/ uint64_t
/ big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
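/ For reference, a C-level sketch of the operation implemented below.  The
/ sketch is illustrative only (it is not part of the build) and assumes a
/ compiler that provides a gcc-style __uint128_t type:
/
/	uint64_t
/	big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/	{
/		uint64_t cy = 0;
/
/		while (len-- > 0) {
/			__uint128_t p = (__uint128_t)*a++ * digit + cy;
/			*r++ = (uint64_t)p;
/			cy = (uint64_t)(p >> 64);
/		}
/		return (cy);
/	}
/
/ The assembly below unrolls this loop eight 64-bit digits at a time and
/ keeps the running carry in %r9.
/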
        ENTRY(big_mul_set_vec64)
        xorq    %rax, %rax              / if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L17

        movq    %rdx, %r8               / Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                / cy = 0

.L15:
        cmpq    $8, %r8                 / len < 8 ?
        jb      .L16
        movq    0(%rsi), %rax           / rax = a[0]
        movq    8(%rsi), %r11           / prefetch a[1]
        mulq    %rcx                    / p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          / prefetch a[2]
        mulq    %rcx                    / p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          / prefetch a[3]
        mulq    %rcx                    / p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          / prefetch a[4]
        mulq    %rcx                    / p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          / prefetch a[5]
        mulq    %rcx                    / p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          / prefetch a[6]
        mulq    %rcx                    / p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          / prefetch a[7]
        mulq    %rcx                    / p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    / p = a[7] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 56(%rdi)          / r[7] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        addq    $64, %rsi
        addq    $64, %rdi
        subq    $8, %r8

        jz      .L17
        jmp     .L15

.L16:
        movq    0(%rsi), %rax
        mulq    %rcx                    / p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    8(%rsi), %rax
        mulq    %rcx                    / p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    16(%rsi), %rax
        mulq    %rcx                    / p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    24(%rsi), %rax
        mulq    %rcx                    / p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    32(%rsi), %rax
        mulq    %rcx                    / p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    40(%rsi), %rax
        mulq    %rcx                    / p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    48(%rsi), %rax
        mulq    %rcx                    / p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

.L17:
        movq    %r9, %rax
        ret
        SET_SIZE(big_mul_set_vec64)

/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_add_vec which exploits
/  the 64x64->128-bit unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 32-bit "digits" along with descriptive
/  information.  The arrays of digits are only required to be
/  aligned on a 32-bit boundary.  This implementation works only
/  when the vector operands and the result happen to be 64-bit
/  aligned and have an even number of digits.
/
/ ------------------------------------------------------------------------

/ r += a * digit, where r and a are vectors of length len
/ returns the carry digit
/ r and a must be 64-bit aligned.
/
/ uint64_t
/ big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
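/ For reference, a C-level sketch of the multiply-accumulate step implemented
/ below; illustrative only (not part of the build), again assuming a gcc-style
/ __uint128_t.  Note that a[i] * digit + r[i] + cy always fits in 128 bits,
/ which is why a 64-bit carry digit suffices:
/
/	uint64_t
/	big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/	{
/		uint64_t cy = 0;
/
/		while (len-- > 0) {
/			__uint128_t p = (__uint128_t)*a++ * digit + *r + cy;
/			*r++ = (uint64_t)p;
/			cy = (uint64_t)(p >> 64);
/		}
/		return (cy);
/	}
/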
        ENTRY(big_mul_add_vec64)
        xorq    %rax, %rax              / if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L27

        movq    %rdx, %r8               / Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                / cy = 0

.L25:
        cmpq    $8, %r8                 / len < 8 ?
        jb      .L26
        movq    0(%rsi), %rax           / rax = a[0]
        movq    0(%rdi), %r10           / r10 = r[0]
        movq    8(%rsi), %r11           / prefetch a[1]
        mulq    %rcx                    / p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[0]
        movq    8(%rdi), %r10           / prefetch r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          / prefetch a[2]
        mulq    %rcx                    / p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[1]
        movq    16(%rdi), %r10          / prefetch r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          / prefetch a[3]
        mulq    %rcx                    / p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[2]
        movq    24(%rdi), %r10          / prefetch r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          / prefetch a[4]
        mulq    %rcx                    / p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[3]
        movq    32(%rdi), %r10          / prefetch r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          / prefetch a[5]
        mulq    %rcx                    / p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[4]
        movq    40(%rdi), %r10          / prefetch r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          / prefetch a[6]
        mulq    %rcx                    / p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[5]
        movq    48(%rdi), %r10          / prefetch r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          / prefetch a[7]
        mulq    %rcx                    / p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[6]
        movq    56(%rdi), %r10          / prefetch r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    / p = a[7] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 56(%rdi)          / r[7] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        addq    $64, %rsi
        addq    $64, %rdi
        subq    $8, %r8

        jz      .L27
        jmp     .L25

.L26:
        movq    0(%rsi), %rax
        movq    0(%rdi), %r10
        mulq    %rcx                    / p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[0]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    8(%rsi), %rax
        movq    8(%rdi), %r10
        mulq    %rcx                    / p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    16(%rsi), %rax
        movq    16(%rdi), %r10
        mulq    %rcx                    / p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    24(%rsi), %rax
        movq    24(%rdi), %r10
        mulq    %rcx                    / p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    32(%rsi), %rax
        movq    32(%rdi), %r10
        mulq    %rcx                    / p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    40(%rsi), %rax
        movq    40(%rdi), %r10
        mulq    %rcx                    / p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    48(%rsi), %rax
        movq    48(%rdi), %r10
        mulq    %rcx                    / p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

.L27:
        movq    %r9, %rax
        ret
        SET_SIZE(big_mul_add_vec64)


/ void
/ big_sqr_vec64(uint64_t *r, uint64_t *a, int len)

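/ For reference, a C-level sketch of the squaring algorithm used below:
/ first the off-diagonal products a[i]*a[j] (i < j) are accumulated with
/ big_mul_set_vec64 and big_mul_add_vec64, then that partial result is
/ doubled and the diagonal squares a[i]*a[i] are added in.  The assembly
/ fuses the doubling and the diagonal pass into a single left-to-right
/ sweep; the sketch keeps them separate for clarity.  Illustrative only
/ (not part of the build); assumes a gcc-style __uint128_t and len >= 2:
/
/	void
/	big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
/	{
/		__uint128_t	p, s;
/		uint64_t	cy;
/		int		i;
/
/		/ accumulate the cross products into r[1 .. 2*len-2]
/		r[len] = big_mul_set_vec64(r + 1, a + 1, len - 1, a[0]);
/		for (i = 1; i < len - 1; i++)
/			r[len + i] = big_mul_add_vec64(r + 2 * i + 1,
/			    a + i + 1, len - 1 - i, a[i]);
/		r[0] = 0;
/		r[2 * len - 1] = 0;
/
/		/ double the cross products: r = 2 * r
/		cy = 0;
/		for (i = 0; i < 2 * len; i++) {
/			p = ((__uint128_t)r[i] << 1) + cy;
/			r[i] = (uint64_t)p;
/			cy = (uint64_t)(p >> 64);
/		}
/
/		/ add a[i]^2 at digit positions 2*i and 2*i+1
/		cy = 0;
/		for (i = 0; i < len; i++) {
/			s = (__uint128_t)a[i] * a[i];
/			p = (__uint128_t)r[2 * i] + (uint64_t)s + cy;
/			r[2 * i] = (uint64_t)p;
/			p = (__uint128_t)r[2 * i + 1] + (uint64_t)(s >> 64) +
/			    (uint64_t)(p >> 64);
/			r[2 * i + 1] = (uint64_t)p;
/			cy = (uint64_t)(p >> 64);
/		}
/	}
/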
        ENTRY(big_sqr_vec64)
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushq   %rdx                    / save arg3, len
        pushq   %rsi                    / save arg2, a
        pushq   %rdi                    / save arg1, r

        leaq    8(%rdi), %r13           / tr = r + 1
        movq    %rsi, %r14              / ta = a
        movq    %rdx, %r15              / tlen = len
        decq    %r15                    / tlen = len - 1
        movq    %r13, %rdi              / arg1 = tr
        leaq    8(%r14), %rsi           / arg2 = ta + 1
        movq    %r15, %rdx              / arg3 = tlen
        movq    0(%r14), %rcx           / arg4 = ta[0]
        call    big_mul_set_vec64
        movq    %rax, 0(%r13, %r15, 8)  / tr[tlen] = cy
.L31:
        decq    %r15                    / --tlen
        jz      .L32                    / while (--tlen != 0)

        addq    $16, %r13               / tr += 2
        addq    $8, %r14                / ++ta
        movq    %r13, %rdi              / arg1 = tr
        leaq    8(%r14), %rsi           / arg2 = ta + 1
        movq    %r15, %rdx              / arg3 = tlen
        movq    0(%r14), %rcx           / arg4 = ta[0]
        call    big_mul_add_vec64
        movq    %rax, 0(%r13, %r15, 8)  / tr[tlen] = cy
        jmp     .L31

.L32:

/ No more function calls after this.
/ Restore the arguments to registers.
/ However, don't use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction.  Use %r8 instead.
        movq    0(%rsp), %rdi           / %rdi == arg1 == r
        movq    8(%rsp), %rsi           / %rsi == arg2 == a
        movq    16(%rsp), %r8           / %r8  == arg3 == len

        movq    0(%rsi), %rax           / %rax = a[0]
        mulq    %rax                    / s = %rdx:%rax = a[0]**2
        movq    %rax, 0(%rdi)           / r[0] = lo64(s)
        movq    %rdx, %r9               / cy = hi64(s)
        xorq    %rdx, %rdx
        movq    8(%rdi), %rax           / p = %rdx:%rax = r[1]
        addq    %rax, %rax
        adcq    $0, %rdx                / p = p << 1
        addq    %r9, %rax
        adcq    $0, %rdx                / p = (r[1] << 1) + cy
        movq    %rax, 8(%rdi)           / r[1] = lo64(p)
        movq    %rdx, %r9               / cy = hi64(p)
        movq    $1, %r11                / row = 1
        movq    $2, %r12                / col = 2
        movq    %r8, %r15
        decq    %r15                    / tlen = len - 1
.L33:
        cmpq    %r8, %r11               / row >= len ?
        jae     .L34                    / while (row < len)

        movq    0(%rsi, %r11, 8), %rax  / s = (uint128_t)a[row]
        mulq    %rax                    / s = s * s
        xorq    %rbx, %rbx
        movq    0(%rdi, %r12, 8), %rcx  / p = (uint128_t)r[col]
        addq    %rcx, %rcx
        adcq    $0, %rbx                / p = p << 1
        addq    %rcx, %rax
        adcq    %rbx, %rdx              / t = p + s
        xorq    %r10, %r10
        movq    %rax, %rbp              / t2 = 0:lo64(t)
        addq    %r9, %rbp
        adcq    $0, %r10                / t2 = %r10:%rbp = lo64(t) + cy
        movq    %rbp, 0(%rdi, %r12, 8)  / r[col] = lo64(t2)
        xorq    %rcx, %rcx
        movq    %rdx, %r9
        addq    %r10, %r9
        adcq    $0, %rcx                / cy = hi64(t) + hi64(t2)
        cmpq    %r11, %r15
        je      .L34                    / if (row == len - 1) break
        xorq    %rdx, %rdx
        movq    8(%rdi, %r12, 8), %rax
        addq    %rax, %rax
        adcq    $0, %rdx
        addq    %r9, %rax
        adcq    %rcx, %rdx              / p = (lo64(r[col+1]) << 1) + cy
        movq    %rax, 8(%rdi, %r12, 8)  / r[col+1] = lo64(p)
        movq    %rdx, %r9               / cy = hi64(p)

        incq    %r11                    / ++row
        addq    $2, %r12                / col += 2
        jmp     .L33

.L34:
        movq    %r9, 8(%rdi, %r12, 8)   / r[col+1] = lo64(cy)

        addq    $24, %rsp               / skip %rdi, %rsi, %rdx
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbp
        popq    %rbx

        ret

        SET_SIZE(big_sqr_vec64)

#endif  /* lint */