1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #include <sys/asm_linkage.h>
  27 
  28 #if defined(lint) || defined(__lint)
  29 
  30 #include <sys/types.h>
  31 
  32 /* ARGSUSED */
  33 uint64_t
  34 big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
  35 { return (0); }
  36 
  37 /* ARGSUSED */
  38 uint64_t
  39 big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
  40 { return (0); }
  41 
  42 /* ARGSUSED */
  43 void
  44 big_sqr_vec(uint64_t *r, uint64_t *a, int len)
  45 {}
  46 
  47 #else   /* lint */
  48 
  49 / ------------------------------------------------------------------------
  50 /
  51 /  Implementation of big_mul_set_vec which exploits
  52 /  the 64X64->128 bit  unsigned multiply instruction.
  53 /
  54 /  As defined in Sun's bignum library for pkcs11, bignums are
  55 /  composed of an array of 64-bit "digits" or "chunks" along with
  56 /  descriptive information.
  57 /
  58 / ------------------------------------------------------------------------
  59 
  60 / r = a * digit, r and a are vectors of length len
  61 / returns the carry digit
  62 / r and a are 64 bit aligned.
  63 /
  64 / uint64_t
  65 / big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
  66 /
  67         ENTRY(big_mul_set_vec)
  68         xorq    %rax, %rax              / if (len == 0) return (0)
  69         testq   %rdx, %rdx
  70         jz      .L17
  71 
  72         movq    %rdx, %r8               / Use r8 for len; %rdx is used by mul
  73         xorq    %r9, %r9                / cy = 0
  74 
  75 .L15:
  76         cmpq    $8, %r8                 / 8 - len
  77         jb      .L16
  78         movq    0(%rsi), %rax           / rax = a[0]
  79         movq    8(%rsi), %r11           / prefetch a[1]
  80         mulq    %rcx                    / p = a[0] * digit
  81         addq    %r9, %rax
  82         adcq    $0, %rdx                / p += cy
  83         movq    %rax, 0(%rdi)           / r[0] = lo(p)
  84         movq    %rdx, %r9               / cy = hi(p)
  85 
  86         movq    %r11, %rax
  87         movq    16(%rsi), %r11          / prefetch a[2]
  88         mulq    %rcx                    / p = a[1] * digit
  89         addq    %r9, %rax
  90         adcq    $0, %rdx                / p += cy
  91         movq    %rax, 8(%rdi)           / r[1] = lo(p)
  92         movq    %rdx, %r9               / cy = hi(p)
  93 
  94         movq    %r11, %rax
  95         movq    24(%rsi), %r11          / prefetch a[3]
  96         mulq    %rcx                    / p = a[2] * digit
  97         addq    %r9, %rax
  98         adcq    $0, %rdx                / p += cy
  99         movq    %rax, 16(%rdi)          / r[2] = lo(p)
 100         movq    %rdx, %r9               / cy = hi(p)
 101 
 102         movq    %r11, %rax
 103         movq    32(%rsi), %r11          / prefetch a[4]
 104         mulq    %rcx                    / p = a[3] * digit
 105         addq    %r9, %rax
 106         adcq    $0, %rdx                / p += cy
 107         movq    %rax, 24(%rdi)          / r[3] = lo(p)
 108         movq    %rdx, %r9               / cy = hi(p)
 109 
 110         movq    %r11, %rax
 111         movq    40(%rsi), %r11          / prefetch a[5]
 112         mulq    %rcx                    / p = a[4] * digit
 113         addq    %r9, %rax
 114         adcq    $0, %rdx                / p += cy
 115         movq    %rax, 32(%rdi)          / r[4] = lo(p)
 116         movq    %rdx, %r9               / cy = hi(p)
 117 
 118         movq    %r11, %rax
 119         movq    48(%rsi), %r11          / prefetch a[6]
 120         mulq    %rcx                    / p = a[5] * digit
 121         addq    %r9, %rax
 122         adcq    $0, %rdx                / p += cy
 123         movq    %rax, 40(%rdi)          / r[5] = lo(p)
 124         movq    %rdx, %r9               / cy = hi(p)
 125 
 126         movq    %r11, %rax
 127         movq    56(%rsi), %r11          / prefetch a[7]
 128         mulq    %rcx                    / p = a[6] * digit
 129         addq    %r9, %rax
 130         adcq    $0, %rdx                / p += cy
 131         movq    %rax, 48(%rdi)          / r[6] = lo(p)
 132         movq    %rdx, %r9               / cy = hi(p)
 133 
 134         movq    %r11, %rax
 135         mulq    %rcx                    / p = a[7] * digit
 136         addq    %r9, %rax
 137         adcq    $0, %rdx                / p += cy
 138         movq    %rax, 56(%rdi)          / r[7] = lo(p)
 139         movq    %rdx, %r9               / cy = hi(p)
 140 
 141         addq    $64, %rsi
 142         addq    $64, %rdi
 143         subq    $8, %r8
 144 
 145         jz      .L17
 146         jmp     .L15
 147 
 148 .L16:
 149         movq    0(%rsi), %rax
 150         mulq    %rcx                    / p = a[0] * digit
 151         addq    %r9, %rax
 152         adcq    $0, %rdx                / p += cy
 153         movq    %rax, 0(%rdi)           / r[0] = lo(p)
 154         movq    %rdx, %r9               / cy = hi(p)
 155         decq    %r8
 156         jz      .L17
 157 
 158         movq    8(%rsi), %rax
 159         mulq    %rcx                    / p = a[1] * digit
 160         addq    %r9, %rax
 161         adcq    $0, %rdx                / p += cy
 162         movq    %rax, 8(%rdi)           / r[1] = lo(p)
 163         movq    %rdx, %r9               / cy = hi(p)
 164         decq    %r8
 165         jz      .L17
 166 
 167         movq    16(%rsi), %rax
 168         mulq    %rcx                    / p = a[2] * digit
 169         addq    %r9, %rax
 170         adcq    $0, %rdx                / p += cy
 171         movq    %rax, 16(%rdi)          / r[2] = lo(p)
 172         movq    %rdx, %r9               / cy = hi(p)
 173         decq    %r8
 174         jz      .L17
 175 
 176         movq    24(%rsi), %rax
 177         mulq    %rcx                    / p = a[3] * digit
 178         addq    %r9, %rax
 179         adcq    $0, %rdx                / p += cy
 180         movq    %rax, 24(%rdi)          / r[3] = lo(p)
 181         movq    %rdx, %r9               / cy = hi(p)
 182         decq    %r8
 183         jz      .L17
 184 
 185         movq    32(%rsi), %rax
 186         mulq    %rcx                    / p = a[4] * digit
 187         addq    %r9, %rax
 188         adcq    $0, %rdx                / p += cy
 189         movq    %rax, 32(%rdi)          / r[4] = lo(p)
 190         movq    %rdx, %r9               / cy = hi(p)
 191         decq    %r8
 192         jz      .L17
 193 
 194         movq    40(%rsi), %rax
 195         mulq    %rcx                    / p = a[5] * digit
 196         addq    %r9, %rax
 197         adcq    $0, %rdx                / p += cy
 198         movq    %rax, 40(%rdi)          / r[5] = lo(p)
 199         movq    %rdx, %r9               / cy = hi(p)
 200         decq    %r8
 201         jz      .L17
 202 
 203         movq    48(%rsi), %rax
 204         mulq    %rcx                    / p = a[6] * digit
 205         addq    %r9, %rax
 206         adcq    $0, %rdx                / p += cy
 207         movq    %rax, 48(%rdi)          / r[6] = lo(p)
 208         movq    %rdx, %r9               / cy = hi(p)
 209         decq    %r8
 210         jz      .L17
 211 
 212 
 213 .L17:
 214         movq    %r9, %rax
 215         ret
 216         SET_SIZE(big_mul_set_vec)
 217 
 218 
 219 / ------------------------------------------------------------------------
 220 /
 221 /  Implementation of big_mul_add_vec which exploits
 222 /  the 64X64->128 bit  unsigned multiply instruction.
 223 /
 224 /  As defined in Sun's bignum library for pkcs11, bignums are
 225 /  composed of an array of 64-bit "digits" or "chunks" along with
 226 /  descriptive information.
 227 /
 228 / ------------------------------------------------------------------------
 229 
 230 / r += a * digit, r and a are vectors of length len
 231 / returns the carry digit
 232 / r and a are 64 bit aligned.
 233 /
 234 / uint64_t
 235 / big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
 236 /
 237         ENTRY(big_mul_add_vec)
 238         xorq    %rax, %rax              / if (len == 0) return (0)
 239         testq   %rdx, %rdx
 240         jz      .L27
 241 
 242         movq    %rdx, %r8               / Use r8 for len; %rdx is used by mul
 243         xorq    %r9, %r9                / cy = 0
 244 
 245 .L25:
 246         cmpq    $8, %r8                 / 8 - len
 247         jb      .L26
 248         movq    0(%rsi), %rax           / rax = a[0]
 249         movq    0(%rdi), %r10           / r10 = r[0]
 250         movq    8(%rsi), %r11           / prefetch a[1]
 251         mulq    %rcx                    / p = a[0] * digit
 252         addq    %r10, %rax
 253         adcq    $0, %rdx                / p += r[0]
 254         movq    8(%rdi), %r10           / prefetch r[1]
 255         addq    %r9, %rax
 256         adcq    $0, %rdx                / p += cy
 257         movq    %rax, 0(%rdi)           / r[0] = lo(p)
 258         movq    %rdx, %r9               / cy = hi(p)
 259 
 260         movq    %r11, %rax
 261         movq    16(%rsi), %r11          / prefetch a[2]
 262         mulq    %rcx                    / p = a[1] * digit
 263         addq    %r10, %rax
 264         adcq    $0, %rdx                / p += r[1]
 265         movq    16(%rdi), %r10          / prefetch r[2]
 266         addq    %r9, %rax
 267         adcq    $0, %rdx                / p += cy
 268         movq    %rax, 8(%rdi)           / r[1] = lo(p)
 269         movq    %rdx, %r9               / cy = hi(p)
 270 
 271         movq    %r11, %rax
 272         movq    24(%rsi), %r11          / prefetch a[3]
 273         mulq    %rcx                    / p = a[2] * digit
 274         addq    %r10, %rax
 275         adcq    $0, %rdx                / p += r[2]
 276         movq    24(%rdi), %r10          / prefetch r[3]
 277         addq    %r9, %rax
 278         adcq    $0, %rdx                / p += cy
 279         movq    %rax, 16(%rdi)          / r[2] = lo(p)
 280         movq    %rdx, %r9               / cy = hi(p)
 281 
 282         movq    %r11, %rax
 283         movq    32(%rsi), %r11          / prefetch a[4]
 284         mulq    %rcx                    / p = a[3] * digit
 285         addq    %r10, %rax
 286         adcq    $0, %rdx                / p += r[3]
 287         movq    32(%rdi), %r10          / prefetch r[4]
 288         addq    %r9, %rax
 289         adcq    $0, %rdx                / p += cy
 290         movq    %rax, 24(%rdi)          / r[3] = lo(p)
 291         movq    %rdx, %r9               / cy = hi(p)
 292 
 293         movq    %r11, %rax
 294         movq    40(%rsi), %r11          / prefetch a[5]
 295         mulq    %rcx                    / p = a[4] * digit
 296         addq    %r10, %rax
 297         adcq    $0, %rdx                / p += r[4]
 298         movq    40(%rdi), %r10          / prefetch r[5]
 299         addq    %r9, %rax
 300         adcq    $0, %rdx                / p += cy
 301         movq    %rax, 32(%rdi)          / r[4] = lo(p)
 302         movq    %rdx, %r9               / cy = hi(p)
 303 
 304         movq    %r11, %rax
 305         movq    48(%rsi), %r11          / prefetch a[6]
 306         mulq    %rcx                    / p = a[5] * digit
 307         addq    %r10, %rax
 308         adcq    $0, %rdx                / p += r[5]
 309         movq    48(%rdi), %r10          / prefetch r[6]
 310         addq    %r9, %rax
 311         adcq    $0, %rdx                / p += cy
 312         movq    %rax, 40(%rdi)          / r[5] = lo(p)
 313         movq    %rdx, %r9               / cy = hi(p)
 314 
 315         movq    %r11, %rax
 316         movq    56(%rsi), %r11          / prefetch a[7]
 317         mulq    %rcx                    / p = a[6] * digit
 318         addq    %r10, %rax
 319         adcq    $0, %rdx                / p += r[6]
 320         movq    56(%rdi), %r10          / prefetch r[7]
 321         addq    %r9, %rax
 322         adcq    $0, %rdx                / p += cy
 323         movq    %rax, 48(%rdi)          / r[6] = lo(p)
 324         movq    %rdx, %r9               / cy = hi(p)
 325 
 326         movq    %r11, %rax
 327         mulq    %rcx                    / p = a[7] * digit
 328         addq    %r10, %rax
 329         adcq    $0, %rdx                / p += r[7]
 330         addq    %r9, %rax
 331         adcq    $0, %rdx                / p += cy
 332         movq    %rax, 56(%rdi)          / r[7] = lo(p)
 333         movq    %rdx, %r9               / cy = hi(p)
 334 
 335         addq    $64, %rsi
 336         addq    $64, %rdi
 337         subq    $8, %r8
 338 
 339         jz      .L27
 340         jmp     .L25
 341 
 342 .L26:
 343         movq    0(%rsi), %rax
 344         movq    0(%rdi), %r10
 345         mulq    %rcx                    / p = a[0] * digit
 346         addq    %r10, %rax
 347         adcq    $0, %rdx                / p += r[0]
 348         addq    %r9, %rax
 349         adcq    $0, %rdx                / p += cy
 350         movq    %rax, 0(%rdi)           / r[0] = lo(p)
 351         movq    %rdx, %r9               / cy = hi(p)
 352         decq    %r8
 353         jz      .L27
 354 
 355         movq    8(%rsi), %rax
 356         movq    8(%rdi), %r10
 357         mulq    %rcx                    / p = a[1] * digit
 358         addq    %r10, %rax
 359         adcq    $0, %rdx                / p += r[1]
 360         addq    %r9, %rax
 361         adcq    $0, %rdx                / p += cy
 362         movq    %rax, 8(%rdi)           / r[1] = lo(p)
 363         movq    %rdx, %r9               / cy = hi(p)
 364         decq    %r8
 365         jz      .L27
 366 
 367         movq    16(%rsi), %rax
 368         movq    16(%rdi), %r10
 369         mulq    %rcx                    / p = a[2] * digit
 370         addq    %r10, %rax
 371         adcq    $0, %rdx                / p += r[2]
 372         addq    %r9, %rax
 373         adcq    $0, %rdx                / p += cy
 374         movq    %rax, 16(%rdi)          / r[2] = lo(p)
 375         movq    %rdx, %r9               / cy = hi(p)
 376         decq    %r8
 377         jz      .L27
 378 
 379         movq    24(%rsi), %rax
 380         movq    24(%rdi), %r10
 381         mulq    %rcx                    / p = a[3] * digit
 382         addq    %r10, %rax
 383         adcq    $0, %rdx                / p += r[3]
 384         addq    %r9, %rax
 385         adcq    $0, %rdx                / p += cy
 386         movq    %rax, 24(%rdi)          / r[3] = lo(p)
 387         movq    %rdx, %r9               / cy = hi(p)
 388         decq    %r8
 389         jz      .L27
 390 
 391         movq    32(%rsi), %rax
 392         movq    32(%rdi), %r10
 393         mulq    %rcx                    / p = a[4] * digit
 394         addq    %r10, %rax
 395         adcq    $0, %rdx                / p += r[4]
 396         addq    %r9, %rax
 397         adcq    $0, %rdx                / p += cy
 398         movq    %rax, 32(%rdi)          / r[4] = lo(p)
 399         movq    %rdx, %r9               / cy = hi(p)
 400         decq    %r8
 401         jz      .L27
 402 
 403         movq    40(%rsi), %rax
 404         movq    40(%rdi), %r10
 405         mulq    %rcx                    / p = a[5] * digit
 406         addq    %r10, %rax
 407         adcq    $0, %rdx                / p += r[5]
 408         addq    %r9, %rax
 409         adcq    $0, %rdx                / p += cy
 410         movq    %rax, 40(%rdi)          / r[5] = lo(p)
 411         movq    %rdx, %r9               / cy = hi(p)
 412         decq    %r8
 413         jz      .L27
 414 
 415         movq    48(%rsi), %rax
 416         movq    48(%rdi), %r10
 417         mulq    %rcx                    / p = a[6] * digit
 418         addq    %r10, %rax
 419         adcq    $0, %rdx                / p += r[6]
 420         addq    %r9, %rax
 421         adcq    $0, %rdx                / p += cy
 422         movq    %rax, 48(%rdi)          / r[6] = lo(p)
 423         movq    %rdx, %r9               / cy = hi(p)
 424         decq    %r8
 425         jz      .L27
 426 
 427 
 428 .L27:
 429         movq    %r9, %rax
 430         ret
 431         SET_SIZE(big_mul_add_vec)
 432 
 433 
 434 / void
 435 / big_sqr_vec(uint64_t *r, uint64_t *a, int len)
 436 
 437         ENTRY(big_sqr_vec)
 438         pushq   %rbx
 439         pushq   %rbp
 440         pushq   %r12
 441         pushq   %r13
 442         pushq   %r14
 443         pushq   %r15
 444         pushq   %rdx                    / save arg3, len
 445         pushq   %rsi                    / save arg2, a
 446         pushq   %rdi                    / save arg1, r
 447 
 448         leaq    8(%rdi), %r13           / tr = r + 1
 449         movq    %rsi, %r14              / ta = a
 450         movq    %rdx, %r15              / tlen = len
 451         decq    %r15                    / tlen = len - 1
 452         movq    %r13, %rdi              / arg1 = tr
 453         leaq    8(%r14), %rsi           / arg2 = ta + 1
 454         movq    %r15, %rdx              / arg3 = tlen
 455         movq    0(%r14), %rcx           / arg4 = ta[0]
 456         call    big_mul_set_vec
 457         movq    %rax, 0(%r13, %r15, 8)  / tr[tlen] = cy
 458 .L31:
 459         decq    %r15                    / --tlen
 460         jz      .L32                    / while (--tlen != 0)
 461 
 462         addq    $16, %r13               / tr += 2
 463         addq    $8, %r14                / ++ta
 464         movq    %r13, %rdi              / arg1 = tr
 465         leaq    8(%r14), %rsi           / arg2 = ta + 1
 466         movq    %r15, %rdx              / arg3 = tlen
 467         movq    0(%r14), %rcx           / arg4 = ta[0]
 468         call    big_mul_add_vec
 469         movq    %rax, 0(%r13, %r15, 8)  / tr[tlen] = cy
 470         jmp     .L31
 471 
 472 .L32:
 473 
 474 / No more function calls after this.
 475 / Restore arguments to registers.
 476 / However, don't use %rdx for arg3, len, because it is heavily
 477 / used by the hardware MUL instruction.  Use %r8, instead.
 478         movq    0(%rsp), %rdi           / %rdi == arg1 == r
 479         movq    8(%rsp), %rsi           / %rsi == arg2 == a
 480         movq    16(%rsp), %r8           / %r8  == arg3 == len
 481 
 482         movq    0(%rsi), %rax           / %rax = a[0];
 483         mulq    %rax                    / s = %edx:%eax = a[0]**2
 484         movq    %rax, 0(%rdi)           / r[0] = lo64(s)
 485         movq    %rdx, %r9               / cy = hi64(s)
 486         xorq    %rdx, %rdx
 487         movq    8(%rdi), %rax           / p = %rdx:%rax = r[1]
 488         addq    %rax, %rax
 489         adcq    $0, %rdx                / p = p << 1
 490         addq    %r9, %rax
 491         adcq    $0, %rdx                / p = (r[1] << 1) + cy
 492         movq    %rax, 8(%rdi)           / r[1] = lo64(p)
 493         movq    %rdx, %r9               / cy = hi64(p)
 494         movq    $1, %r11                / row = 1
 495         movq    $2, %r12                / col = 2
 496         movq    %r8, %r15
 497         decq    %r15                    / tlen = len - 1
 498 .L33:
 499         cmpq    %r8, %r11               / len - row
 500         jae     .L34                    / while (row < len)
 501 
 502         movq    0(%rsi, %r11, 8), %rax  / s = (uint128_t)a[row]
 503         mulq    %rax                    / s = s * s
 504         xorq    %rbx, %rbx
 505         movq    0(%rdi, %r12, 8), %rcx  / p = (uint128_t)r[col]
 506         addq    %rcx, %rcx
 507         adcq    $0, %rbx                / p = p << 1
 508         addq    %rcx, %rax
 509         adcq    %rbx, %rdx              / t = p + s
 510         xorq    %r10, %r10
 511         movq    %rax, %rbp              / t2 = 0:lo64(t)
 512         addq    %r9, %rbp
 513         adcq    $0, %r10                / t2 = %r10:%rbp = lo64(t) + cy
 514         movq    %rbp, 0(%rdi, %r12, 8)  / r[col] = lo64(t2)
 515         xorq    %rcx, %rcx
 516         movq    %rdx, %r9
 517         addq    %r10, %r9
 518         adcq    $0, %rcx                / cy = hi64(t) + hi64(t2)
 519         cmpq    %r11, %r15
 520         je      .L34                    / if (row == len - 1) break
 521         xorq    %rdx, %rdx
 522         movq    8(%rdi, %r12, 8), %rax
 523         addq    %rax, %rax
 524         adcq    $0, %rdx
 525         addq    %r9, %rax
 526         adcq    %rcx, %rdx              / p = (lo64(r[col+1]) << 1) + cy
 527         movq    %rax, 8(%rdi, %r12, 8)  / r[col+1] = lo64(p)
 528         movq    %rdx, %r9               / cy = hi64(p)
 529 
 530         incq    %r11                    / ++row
 531         addq    $2, %r12                / col += 2
 532         jmp     .L33
 533 
 534 .L34:
 535         movq    %r9, 8(%rdi, %r12, 8)   / r[col+1] = lo64(cy)
 536 
 537         addq    $24, %rsp               / skip %rdi, %rsi, %rdx
 538         popq    %r15
 539         popq    %r14
 540         popq    %r13
 541         popq    %r12
 542         popq    %rbp
 543         popq    %rbx
 544 
 545         ret
 546 
 547         SET_SIZE(big_sqr_vec)
 548 
 549 #endif  /* lint */