6799218 RSA using Solaris Kernel Crypto framework lagging behind OpenSSL
5016936 bignumimpl:big_mul: potential memory leak
6810280 panic from bignum module: vmem_xalloc(): size == 0
--- old/usr/src/common/bignum/amd64/bignum_amd64_asm.s
+++ new/usr/src/common/bignum/amd64/bignum_amd64_asm.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 - * Common Development and Distribution License, Version 1.0 only
6 - * (the "License"). You may not use this file except in compliance
7 - * with the License.
5 + * Common Development and Distribution License (the "License").
6 + * You may not use this file except in compliance with the License.
8 7 *
9 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 9 * or http://www.opensolaris.org/os/licensing.
11 10 * See the License for the specific language governing permissions
12 11 * and limitations under the License.
13 12 *
14 13 * When distributing Covered Code, include this CDDL HEADER in each
15 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 15 * If applicable, add the following below this CDDL HEADER, with the
17 16 * fields enclosed by brackets "[]" replaced with your own identifying
18 17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 18 *
20 19 * CDDL HEADER END
21 20 */
22 21 /*
23 - * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
22 + * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 23 * Use is subject to license terms.
25 24 */
26 25
27 -#pragma ident "%Z%%M% %I% %E% SMI"
28 -
29 26 #include <sys/asm_linkage.h>
30 27
31 28 #if defined(lint) || defined(__lint)
32 29
33 30 #include <sys/types.h>
34 31
35 32 /* ARGSUSED */
36 33 uint64_t
37 -big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
34 +big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
38 35 { return (0); }
39 36
40 37 /* ARGSUSED */
41 38 uint64_t
42 -big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
39 +big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
43 40 { return (0); }
44 41
45 42 /* ARGSUSED */
46 43 void
47 -big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
44 +big_sqr_vec(uint64_t *r, uint64_t *a, int len)
48 45 {}
49 46
50 47 #else /* lint */
51 48
52 49 / ------------------------------------------------------------------------
53 50 /
54 51 / Implementation of big_mul_set_vec which exploits
55 52 / the 64X64->128 bit unsigned multiply instruction.
56 53 /
57 54 / As defined in Sun's bignum library for pkcs11, bignums are
58 -/ composed of an array of 32-bit "digits" along with descriptive
59 -/ information. The arrays of digits are only required to be
60 -/ aligned on 32-bit boundary. This implementation works only
61 -/ when the two factors and the result happen to be 64 bit aligned
62 -/ and have an even number of digits.
55 +/ composed of an array of 64-bit "digits" or "chunks" along with
56 +/ descriptive information.
63 57 /
64 58 / ------------------------------------------------------------------------
65 59
66 60 / r = a * digit, r and a are vectors of length len
67 61 / returns the carry digit
68 62 / r and a are 64 bit aligned.
69 63 /
70 64 / uint64_t
71 -/ big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
65 +/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
72 66 /
73 - ENTRY(big_mul_set_vec64)
67 + ENTRY(big_mul_set_vec)
74 68 xorq %rax, %rax / if (len == 0) return (0)
75 69 testq %rdx, %rdx
76 70 jz .L17
77 71
78 72 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
79 73 xorq %r9, %r9 / cy = 0
80 74
81 75 .L15:
82 76 cmpq $8, %r8 / 8 - len
83 77 jb .L16
84 78 movq 0(%rsi), %rax / rax = a[0]
85 79 movq 8(%rsi), %r11 / prefetch a[1]
86 80 mulq %rcx / p = a[0] * digit
87 81 addq %r9, %rax
88 82 adcq $0, %rdx / p += cy
89 83 movq %rax, 0(%rdi) / r[0] = lo(p)
90 84 movq %rdx, %r9 / cy = hi(p)
91 85
92 86 movq %r11, %rax
93 87 movq 16(%rsi), %r11 / prefetch a[2]
94 88 mulq %rcx / p = a[1] * digit
95 89 addq %r9, %rax
96 90 adcq $0, %rdx / p += cy
97 91 movq %rax, 8(%rdi) / r[1] = lo(p)
98 92 movq %rdx, %r9 / cy = hi(p)
99 93
100 94 movq %r11, %rax
101 95 movq 24(%rsi), %r11 / prefetch a[3]
102 96 mulq %rcx / p = a[2] * digit
103 97 addq %r9, %rax
104 98 adcq $0, %rdx / p += cy
105 99 movq %rax, 16(%rdi) / r[2] = lo(p)
106 100 movq %rdx, %r9 / cy = hi(p)
107 101
108 102 movq %r11, %rax
109 103 movq 32(%rsi), %r11 / prefetch a[4]
110 104 mulq %rcx / p = a[3] * digit
111 105 addq %r9, %rax
112 106 adcq $0, %rdx / p += cy
113 107 movq %rax, 24(%rdi) / r[3] = lo(p)
114 108 movq %rdx, %r9 / cy = hi(p)
115 109
116 110 movq %r11, %rax
117 111 movq 40(%rsi), %r11 / prefetch a[5]
118 112 mulq %rcx / p = a[4] * digit
119 113 addq %r9, %rax
120 114 adcq $0, %rdx / p += cy
121 115 movq %rax, 32(%rdi) / r[4] = lo(p)
122 116 movq %rdx, %r9 / cy = hi(p)
123 117
124 118 movq %r11, %rax
125 119 movq 48(%rsi), %r11 / prefetch a[6]
126 120 mulq %rcx / p = a[5] * digit
127 121 addq %r9, %rax
128 122 adcq $0, %rdx / p += cy
129 123 movq %rax, 40(%rdi) / r[5] = lo(p)
130 124 movq %rdx, %r9 / cy = hi(p)
131 125
132 126 movq %r11, %rax
133 127 movq 56(%rsi), %r11 / prefetch a[7]
134 128 mulq %rcx / p = a[6] * digit
135 129 addq %r9, %rax
136 130 adcq $0, %rdx / p += cy
137 131 movq %rax, 48(%rdi) / r[6] = lo(p)
138 132 movq %rdx, %r9 / cy = hi(p)
139 133
140 134 movq %r11, %rax
141 135 mulq %rcx / p = a[7] * digit
142 136 addq %r9, %rax
143 137 adcq $0, %rdx / p += cy
144 138 movq %rax, 56(%rdi) / r[7] = lo(p)
145 139 movq %rdx, %r9 / cy = hi(p)
146 140
147 141 addq $64, %rsi
148 142 addq $64, %rdi
149 143 subq $8, %r8
150 144
151 145 jz .L17
152 146 jmp .L15
153 147
154 148 .L16:
155 149 movq 0(%rsi), %rax
156 150 mulq %rcx / p = a[0] * digit
157 151 addq %r9, %rax
158 152 adcq $0, %rdx / p += cy
159 153 movq %rax, 0(%rdi) / r[0] = lo(p)
160 154 movq %rdx, %r9 / cy = hi(p)
161 155 decq %r8
162 156 jz .L17
163 157
164 158 movq 8(%rsi), %rax
165 159 mulq %rcx / p = a[1] * digit
166 160 addq %r9, %rax
167 161 adcq $0, %rdx / p += cy
168 162 movq %rax, 8(%rdi) / r[1] = lo(p)
169 163 movq %rdx, %r9 / cy = hi(p)
170 164 decq %r8
171 165 jz .L17
172 166
173 167 movq 16(%rsi), %rax
174 168 mulq %rcx / p = a[2] * digit
175 169 addq %r9, %rax
176 170 adcq $0, %rdx / p += cy
177 171 movq %rax, 16(%rdi) / r[2] = lo(p)
178 172 movq %rdx, %r9 / cy = hi(p)
179 173 decq %r8
180 174 jz .L17
181 175
182 176 movq 24(%rsi), %rax
183 177 mulq %rcx / p = a[3] * digit
184 178 addq %r9, %rax
185 179 adcq $0, %rdx / p += cy
186 180 movq %rax, 24(%rdi) / r[3] = lo(p)
187 181 movq %rdx, %r9 / cy = hi(p)
188 182 decq %r8
189 183 jz .L17
190 184
191 185 movq 32(%rsi), %rax
192 186 mulq %rcx / p = a[4] * digit
193 187 addq %r9, %rax
194 188 adcq $0, %rdx / p += cy
195 189 movq %rax, 32(%rdi) / r[4] = lo(p)
196 190 movq %rdx, %r9 / cy = hi(p)
197 191 decq %r8
198 192 jz .L17
199 193
200 194 movq 40(%rsi), %rax
201 195 mulq %rcx / p = a[5] * digit
202 196 addq %r9, %rax
203 197 adcq $0, %rdx / p += cy
204 198 movq %rax, 40(%rdi) / r[5] = lo(p)
205 199 movq %rdx, %r9 / cy = hi(p)
206 200 decq %r8
207 201 jz .L17
208 202
209 203 movq 48(%rsi), %rax
210 204 mulq %rcx / p = a[6] * digit
211 205 addq %r9, %rax
128 lines elided
212 206 adcq $0, %rdx / p += cy
213 207 movq %rax, 48(%rdi) / r[6] = lo(p)
214 208 movq %rdx, %r9 / cy = hi(p)
215 209 decq %r8
216 210 jz .L17
217 211
218 212
219 213 .L17:
220 214 movq %r9, %rax
221 215 ret
222 - SET_SIZE(big_mul_set_vec64)
216 + SET_SIZE(big_mul_set_vec)
223 217
218 +
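For review reference, a minimal C sketch of the operation big_mul_set_vec implements, with the compiler's unsigned __int128 standing in for the hardware 64x64->128 bit MUL. The ref_mul_set_vec name and the __int128 usage are illustrative assumptions, not part of this changeset.

#include <stdint.h>

/*
 * Hypothetical reference only: r[i] = lo64(a[i] * digit + cy) for
 * i = 0 .. len-1; the final carry digit is returned.
 */
uint64_t
ref_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	uint64_t cy = 0;
	int i;

	for (i = 0; i < len; i++) {
		unsigned __int128 p = (unsigned __int128)a[i] * digit + cy;
		r[i] = (uint64_t)p;		/* r[i] = lo(p) */
		cy = (uint64_t)(p >> 64);	/* cy = hi(p) */
	}
	return (cy);
}

The unrolled assembly above is this loop processed eight digits per iteration, with the next a[] element loaded into %r11 ahead of each mulq, and a one-digit-at-a-time tail for len % 8.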
224 219 / ------------------------------------------------------------------------
225 220 /
226 221 / Implementation of big_mul_add_vec which exploits
227 222 / the 64X64->128 bit unsigned multiply instruction.
228 223 /
229 224 / As defined in Sun's bignum library for pkcs11, bignums are
230 -/ composed of an array of 32-bit "digits" along with descriptive
231 -/ information. The arrays of digits are only required to be
232 -/ aligned on 32-bit boundary. This implementation works only
233 -/ when the two factors and the result happen to be 64 bit aligned
234 -/ and have an even number of digits.
225 +/ composed of an array of 64-bit "digits" or "chunks" along with
226 +/ descriptive information.
235 227 /
236 228 / ------------------------------------------------------------------------
237 229
238 230 / r += a * digit, r and a are vectors of length len
239 231 / returns the carry digit
240 232 / r and a are 64 bit aligned.
241 233 /
242 234 / uint64_t
243 -/ big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
235 +/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
244 236 /
245 - ENTRY(big_mul_add_vec64)
237 + ENTRY(big_mul_add_vec)
246 238 xorq %rax, %rax / if (len == 0) return (0)
247 239 testq %rdx, %rdx
248 240 jz .L27
249 241
250 242 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
251 243 xorq %r9, %r9 / cy = 0
252 244
253 245 .L25:
254 246 cmpq $8, %r8 / 8 - len
255 247 jb .L26
256 248 movq 0(%rsi), %rax / rax = a[0]
257 249 movq 0(%rdi), %r10 / r10 = r[0]
258 250 movq 8(%rsi), %r11 / prefetch a[1]
259 251 mulq %rcx / p = a[0] * digit
260 252 addq %r10, %rax
261 253 adcq $0, %rdx / p += r[0]
262 254 movq 8(%rdi), %r10 / prefetch r[1]
263 255 addq %r9, %rax
264 256 adcq $0, %rdx / p += cy
265 257 movq %rax, 0(%rdi) / r[0] = lo(p)
266 258 movq %rdx, %r9 / cy = hi(p)
267 259
268 260 movq %r11, %rax
269 261 movq 16(%rsi), %r11 / prefetch a[2]
270 262 mulq %rcx / p = a[1] * digit
271 263 addq %r10, %rax
272 264 adcq $0, %rdx / p += r[1]
273 265 movq 16(%rdi), %r10 / prefetch r[2]
274 266 addq %r9, %rax
275 267 adcq $0, %rdx / p += cy
276 268 movq %rax, 8(%rdi) / r[1] = lo(p)
277 269 movq %rdx, %r9 / cy = hi(p)
278 270
279 271 movq %r11, %rax
280 272 movq 24(%rsi), %r11 / prefetch a[3]
281 273 mulq %rcx / p = a[2] * digit
282 274 addq %r10, %rax
283 275 adcq $0, %rdx / p += r[2]
284 276 movq 24(%rdi), %r10 / prefetch r[3]
285 277 addq %r9, %rax
286 278 adcq $0, %rdx / p += cy
287 279 movq %rax, 16(%rdi) / r[2] = lo(p)
288 280 movq %rdx, %r9 / cy = hi(p)
289 281
290 282 movq %r11, %rax
291 283 movq 32(%rsi), %r11 / prefetch a[4]
292 284 mulq %rcx / p = a[3] * digit
293 285 addq %r10, %rax
294 286 adcq $0, %rdx / p += r[3]
295 287 movq 32(%rdi), %r10 / prefetch r[4]
296 288 addq %r9, %rax
297 289 adcq $0, %rdx / p += cy
298 290 movq %rax, 24(%rdi) / r[3] = lo(p)
299 291 movq %rdx, %r9 / cy = hi(p)
300 292
301 293 movq %r11, %rax
302 294 movq 40(%rsi), %r11 / prefetch a[5]
303 295 mulq %rcx / p = a[4] * digit
304 296 addq %r10, %rax
305 297 adcq $0, %rdx / p += r[4]
306 298 movq 40(%rdi), %r10 / prefetch r[5]
307 299 addq %r9, %rax
308 300 adcq $0, %rdx / p += cy
309 301 movq %rax, 32(%rdi) / r[4] = lo(p)
310 302 movq %rdx, %r9 / cy = hi(p)
311 303
312 304 movq %r11, %rax
313 305 movq 48(%rsi), %r11 / prefetch a[6]
314 306 mulq %rcx / p = a[5] * digit
315 307 addq %r10, %rax
316 308 adcq $0, %rdx / p += r[5]
317 309 movq 48(%rdi), %r10 / prefetch r[6]
318 310 addq %r9, %rax
319 311 adcq $0, %rdx / p += cy
320 312 movq %rax, 40(%rdi) / r[5] = lo(p)
321 313 movq %rdx, %r9 / cy = hi(p)
322 314
323 315 movq %r11, %rax
324 316 movq 56(%rsi), %r11 / prefetch a[7]
325 317 mulq %rcx / p = a[6] * digit
326 318 addq %r10, %rax
327 319 adcq $0, %rdx / p += r[6]
328 320 movq 56(%rdi), %r10 / prefetch r[7]
329 321 addq %r9, %rax
330 322 adcq $0, %rdx / p += cy
331 323 movq %rax, 48(%rdi) / r[6] = lo(p)
332 324 movq %rdx, %r9 / cy = hi(p)
333 325
334 326 movq %r11, %rax
335 327 mulq %rcx / p = a[7] * digit
336 328 addq %r10, %rax
337 329 adcq $0, %rdx / p += r[7]
338 330 addq %r9, %rax
339 331 adcq $0, %rdx / p += cy
340 332 movq %rax, 56(%rdi) / r[7] = lo(p)
341 333 movq %rdx, %r9 / cy = hi(p)
342 334
343 335 addq $64, %rsi
344 336 addq $64, %rdi
345 337 subq $8, %r8
346 338
347 339 jz .L27
348 340 jmp .L25
349 341
350 342 .L26:
351 343 movq 0(%rsi), %rax
352 344 movq 0(%rdi), %r10
353 345 mulq %rcx / p = a[0] * digit
354 346 addq %r10, %rax
355 347 adcq $0, %rdx / p += r[0]
356 348 addq %r9, %rax
357 349 adcq $0, %rdx / p += cy
358 350 movq %rax, 0(%rdi) / r[0] = lo(p)
359 351 movq %rdx, %r9 / cy = hi(p)
360 352 decq %r8
361 353 jz .L27
362 354
363 355 movq 8(%rsi), %rax
364 356 movq 8(%rdi), %r10
365 357 mulq %rcx / p = a[1] * digit
366 358 addq %r10, %rax
367 359 adcq $0, %rdx / p += r[1]
368 360 addq %r9, %rax
369 361 adcq $0, %rdx / p += cy
370 362 movq %rax, 8(%rdi) / r[1] = lo(p)
371 363 movq %rdx, %r9 / cy = hi(p)
372 364 decq %r8
373 365 jz .L27
374 366
375 367 movq 16(%rsi), %rax
376 368 movq 16(%rdi), %r10
377 369 mulq %rcx / p = a[2] * digit
378 370 addq %r10, %rax
379 371 adcq $0, %rdx / p += r[2]
380 372 addq %r9, %rax
381 373 adcq $0, %rdx / p += cy
382 374 movq %rax, 16(%rdi) / r[2] = lo(p)
383 375 movq %rdx, %r9 / cy = hi(p)
384 376 decq %r8
385 377 jz .L27
386 378
387 379 movq 24(%rsi), %rax
388 380 movq 24(%rdi), %r10
389 381 mulq %rcx / p = a[3] * digit
390 382 addq %r10, %rax
391 383 adcq $0, %rdx / p += r[3]
392 384 addq %r9, %rax
393 385 adcq $0, %rdx / p += cy
394 386 movq %rax, 24(%rdi) / r[3] = lo(p)
395 387 movq %rdx, %r9 / cy = hi(p)
396 388 decq %r8
397 389 jz .L27
398 390
399 391 movq 32(%rsi), %rax
400 392 movq 32(%rdi), %r10
401 393 mulq %rcx / p = a[4] * digit
402 394 addq %r10, %rax
403 395 adcq $0, %rdx / p += r[4]
404 396 addq %r9, %rax
405 397 adcq $0, %rdx / p += cy
406 398 movq %rax, 32(%rdi) / r[4] = lo(p)
407 399 movq %rdx, %r9 / cy = hi(p)
408 400 decq %r8
409 401 jz .L27
410 402
411 403 movq 40(%rsi), %rax
412 404 movq 40(%rdi), %r10
413 405 mulq %rcx / p = a[5] * digit
414 406 addq %r10, %rax
415 407 adcq $0, %rdx / p += r[5]
416 408 addq %r9, %rax
417 409 adcq $0, %rdx / p += cy
418 410 movq %rax, 40(%rdi) / r[5] = lo(p)
419 411 movq %rdx, %r9 / cy = hi(p)
420 412 decq %r8
421 413 jz .L27
422 414
423 415 movq 48(%rsi), %rax
424 416 movq 48(%rdi), %r10
425 417 mulq %rcx / p = a[6] * digit
426 418 addq %r10, %rax
427 419 adcq $0, %rdx / p += r[6]
428 420 addq %r9, %rax
173 lines elided
429 421 adcq $0, %rdx / p += cy
430 422 movq %rax, 48(%rdi) / r[6] = lo(p)
431 423 movq %rdx, %r9 / cy = hi(p)
432 424 decq %r8
433 425 jz .L27
434 426
435 427
436 428 .L27:
437 429 movq %r9, %rax
438 430 ret
439 - SET_SIZE(big_mul_add_vec64)
431 + SET_SIZE(big_mul_add_vec)
440 432
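Likewise, a minimal C sketch of big_mul_add_vec, which differs from the previous routine only in that each product also accumulates the existing r[i]. The ref_mul_add_vec name and the __int128 usage are illustrative assumptions, not part of this changeset.

#include <stdint.h>

/*
 * Hypothetical reference only: r[i] = lo64(a[i] * digit + r[i] + cy)
 * for i = 0 .. len-1; the final carry digit is returned.  The sum
 * a[i]*digit + r[i] + cy always fits in 128 bits.
 */
uint64_t
ref_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	uint64_t cy = 0;
	int i;

	for (i = 0; i < len; i++) {
		unsigned __int128 p =
		    (unsigned __int128)a[i] * digit + r[i] + cy;
		r[i] = (uint64_t)p;		/* r[i] = lo(p) */
		cy = (uint64_t)(p >> 64);	/* cy = hi(p) */
	}
	return (cy);
}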
441 433
442 434 / void
443 -/ big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
435 +/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
444 436
445 - ENTRY(big_sqr_vec64)
437 + ENTRY(big_sqr_vec)
446 438 pushq %rbx
447 439 pushq %rbp
448 440 pushq %r12
449 441 pushq %r13
450 442 pushq %r14
451 443 pushq %r15
452 444 pushq %rdx / save arg3, len
453 445 pushq %rsi / save arg2, a
454 446 pushq %rdi / save arg1, r
455 447
456 448 leaq 8(%rdi), %r13 / tr = r + 1
457 449 movq %rsi, %r14 / ta = a
458 450 movq %rdx, %r15 / tlen = len
459 451 decq %r15 / tlen = len - 1
460 452 movq %r13, %rdi / arg1 = tr
461 453 leaq 8(%r14), %rsi / arg2 = ta + 1
462 454 movq %r15, %rdx / arg3 = tlen
463 455 movq 0(%r14), %rcx / arg4 = ta[0]
464 - call big_mul_set_vec64
456 + call big_mul_set_vec
465 457 movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
466 458 .L31:
467 459 decq %r15 / --tlen
468 460 jz .L32 / while (--tlen != 0)
469 461
470 462 addq $16, %r13 / tr += 2
471 463 addq $8, %r14 / ++ta
472 464 movq %r13, %rdi / arg1 = tr
473 465 leaq 8(%r14), %rsi / arg2 = ta + 1
474 466 movq %r15, %rdx / arg3 = tlen
475 467 movq 0(%r14), %rcx / arg4 = ta[0]
476 - call big_mul_add_vec64
468 + call big_mul_add_vec
477 469 movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
478 470 jmp .L31
479 471
480 472 .L32:
481 473
482 474 / No more function calls after this.
483 475 / Restore arguments to registers.
484 476 / However, don't use %rdx for arg3, len, because it is heavily
485 477 / used by the hardware MUL instruction. Use %r8, instead.
486 478 movq 0(%rsp), %rdi / %rdi == arg1 == r
487 479 movq 8(%rsp), %rsi / %rsi == arg2 == a
488 480 movq 16(%rsp), %r8 / %r8 == arg3 == len
489 481
490 482 movq 0(%rsi), %rax / %rax = a[0];
491 483 mulq %rax / s = %edx:%eax = a[0]**2
492 484 movq %rax, 0(%rdi) / r[0] = lo64(s)
493 485 movq %rdx, %r9 / cy = hi64(s)
494 486 xorq %rdx, %rdx
495 487 movq 8(%rdi), %rax / p = %rdx:%rax = r[1]
496 488 addq %rax, %rax
497 489 adcq $0, %rdx / p = p << 1
498 490 addq %r9, %rax
499 491 adcq $0, %rdx / p = (r[1] << 1) + cy
500 492 movq %rax, 8(%rdi) / r[1] = lo64(p)
501 493 movq %rdx, %r9 / cy = hi64(p)
502 494 movq $1, %r11 / row = 1
503 495 movq $2, %r12 / col = 2
504 496 movq %r8, %r15
505 497 decq %r15 / tlen = len - 1
506 498 .L33:
507 499 cmpq %r8, %r11 / len - row
508 500 jae .L34 / while (row < len)
509 501
510 502 movq 0(%rsi, %r11, 8), %rax / s = (uint128_t)a[row]
511 503 mulq %rax / s = s * s
512 504 xorq %rbx, %rbx
513 505 movq 0(%rdi, %r12, 8), %rcx / p = (uint128_t)r[col]
514 506 addq %rcx, %rcx
515 507 adcq $0, %rbx / p = p << 1
516 508 addq %rcx, %rax
517 509 adcq %rbx, %rdx / t = p + s
518 510 xorq %r10, %r10
519 511 movq %rax, %rbp / t2 = 0:lo64(t)
520 512 addq %r9, %rbp
521 513 adcq $0, %r10 / t2 = %r10:%rbp = lo64(t) + cy
522 514 movq %rbp, 0(%rdi, %r12, 8) / r[col] = lo64(t2)
523 515 xorq %rcx, %rcx
524 516 movq %rdx, %r9
525 517 addq %r10, %r9
526 518 adcq $0, %rcx / cy = hi64(t) + hi64(t2)
527 519 cmpq %r11, %r15
528 520 je .L34 / if (row == len - 1) break
529 521 xorq %rdx, %rdx
530 522 movq 8(%rdi, %r12, 8), %rax
531 523 addq %rax, %rax
532 524 adcq $0, %rdx
533 525 addq %r9, %rax
534 526 adcq %rcx, %rdx / p = (lo64(r[col+1]) << 1) + cy
535 527 movq %rax, 8(%rdi, %r12, 8) / r[col+1] = lo64(p)
536 528 movq %rdx, %r9 / cy = hi64(p)
537 529
538 530 incq %r11 / ++row
539 531 addq $2, %r12 / col += 2
540 532 jmp .L33
541 533
542 534 .L34:
543 535 movq %r9, 8(%rdi, %r12, 8) / r[col+1] = lo64(cy)
544 536
58 lines elided
545 537 addq $24, %rsp / skip %rdi, %rsi, %rdx
546 538 popq %r15
547 539 popq %r14
548 540 popq %r13
549 541 popq %r12
550 542 popq %rbp
551 543 popq %rbx
552 544
553 545 ret
554 546
555 - SET_SIZE(big_sqr_vec64)
547 + SET_SIZE(big_sqr_vec)
556 548
557 549 #endif /* lint */
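Finally, a C sketch of the result big_sqr_vec produces: r[0 .. 2*len-1] = a * a. This sketch is plain schoolbook multiplication and is not how the assembly is structured; the routine above saves roughly half the multiplies by forming each off-diagonal product a[i]*a[j] (i < j) once via big_mul_set_vec/big_mul_add_vec, doubling that partial sum, and then adding in the diagonal squares a[i]*a[i]. The ref_sqr_vec name and the __int128 usage are illustrative assumptions.

#include <stdint.h>

/*
 * Hypothetical reference only: compute r = a * a by accumulating every
 * product a[i]*a[j].  r must have room for 2*len digits.
 */
void
ref_sqr_vec(uint64_t *r, uint64_t *a, int len)
{
	int i, j;

	for (i = 0; i < 2 * len; i++)
		r[i] = 0;
	for (i = 0; i < len; i++) {
		uint64_t cy = 0;
		for (j = 0; j < len; j++) {
			unsigned __int128 p =
			    (unsigned __int128)a[i] * a[j] + r[i + j] + cy;
			r[i + j] = (uint64_t)p;
			cy = (uint64_t)(p >> 64);
		}
		r[i + len] = cy;	/* carry out of this row */
	}
}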