bignum Cdiff usr/src/common/bignum/bignumimpl.c

Print this page

6799218 RSA using Solaris Kernel Crypto framework lagging behind OpenSSL
5016936 bignumimpl:big_mul: potential memory leak
6810280 panic from bignum module: vmem_xalloc(): size == 0


*** 17,36 ****
   * information: Portions Copyright [yyyy] [name of copyright owner]
   *
   * CDDL HEADER END
   */
  /*
!  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
  
- #pragma ident   "%Z%%M% %I%     %E% SMI"
- 
- #define big_div_pos_fast big_div_pos
- 
- #include "bignum.h"
- 
  /*
   * Configuration guide
   * -------------------
   *
   * There are 4 preprocessor symbols used to configure the bignum
--- 17,30 ----
   * information: Portions Copyright [yyyy] [name of copyright owner]
   *
   * CDDL HEADER END
   */
  /*
!  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
  
  /*
   * Configuration guide
   * -------------------
   *
   * There are 4 preprocessor symbols used to configure the bignum
*** 48,58 ****
   *   for all 4 functions.  You cannot pick and choose which subset of these
   *   functions to support; that would lead to a rat's nest of #ifdefs.
   *
   * HWCAP
   *   Meaning: Call multiply support functions through a function pointer.
!  *   On x86, there are multiple implementations for differnt hardware
   *   capabilities, such as MMX, SSE2, etc.  Tests are made at run-time, when
   *   a function is first used.  So, the support functions are called through
   *   a function pointer.  There is no need for that on Sparc, because there
   *   is only one implementation; support functions are called directly.
   *   Later, if there were some new VIS instruction, or something, and a
--- 42,52 ----
   *   for all 4 functions.  You cannot pick and choose which subset of these
   *   functions to support; that would lead to a rat's nest of #ifdefs.
   *
   * HWCAP
   *   Meaning: Call multiply support functions through a function pointer.
!  *   On x86, there are multiple implementations for different hardware
   *   capabilities, such as MMX, SSE2, etc.  Tests are made at run-time, when
   *   a function is first used.  So, the support functions are called through
   *   a function pointer.  There is no need for that on Sparc, because there
   *   is only one implementation; support functions are called directly.
   *   Later, if there were some new VIS instruction, or something, and a
*** 66,85 ****
   *   because it must fall back to using 16 x 16 --> 32 bit multiplication.
   *
   */
  
  
  #ifdef  _KERNEL
  #include <sys/ddi.h>
  #include <sys/mdesc.h>
  #include <sys/crypto/common.h>
  
- #include <sys/types.h>
  #include <sys/kmem.h>
  #include <sys/param.h>
  #include <sys/sunddi.h>
  
  #define big_malloc(size)        kmem_alloc(size, KM_NOSLEEP)
  #define big_free(ptr, size)     kmem_free(ptr, size)
  
  void *
  big_realloc(void *from, size_t oldsize, size_t newsize)
--- 60,96 ----
   *   because it must fall back to using 16 x 16 --> 32 bit multiplication.
   *
   */
  
  
+ #include <sys/types.h>
+ #include "bignum.h"
+ 
  #ifdef  _KERNEL
  #include <sys/ddi.h>
  #include <sys/mdesc.h>
  #include <sys/crypto/common.h>
  
  #include <sys/kmem.h>
  #include <sys/param.h>
  #include <sys/sunddi.h>
  
+ #else
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <assert.h>
+ #define ASSERT  assert
+ #endif  /* _KERNEL */
+ 
+ #ifdef  _LP64 /* truncate 64-bit size_t to 32-bits */
+ #define UI32(ui)        ((uint32_t)ui)
+ #else /* size_t already 32-bits */
+ #define UI32(ui)        (ui)
+ #endif
+ 
+ 
+ #ifdef  _KERNEL
  #define big_malloc(size)        kmem_alloc(size, KM_NOSLEEP)
  #define big_free(ptr, size)     kmem_free(ptr, size)
  
  void *
  big_realloc(void *from, size_t oldsize, size_t newsize)
*** 93,107 ****
          return (rv);
  }
  
  #else   /* _KERNEL */
  
- #include <stdlib.h>
- #include <stdio.h>
- #include <assert.h>
- #define ASSERT  assert
- 
  #ifndef MALLOC_DEBUG
  
  #define big_malloc(size)        malloc(size)
  #define big_free(ptr, size)     free(ptr)
  
--- 104,113 ----
*** 124,149 ****
  }
  #endif /* MALLOC_DEBUG */
  
  #define big_realloc(x, y, z) realloc((x), (z))
  
  void
  printbignum(char *aname, BIGNUM *a)
  {
          int i;
  
          (void) printf("\n%s\n%d\n", aname, a->sign*a->len);
          for (i = a->len - 1; i >= 0; i--) {
  #ifdef BIGNUM_CHUNK_32
                  (void) printf("%08x ", a->value[i]);
!                 if ((i % 8 == 0) && (i != 0)) {
                          (void) printf("\n");
                  }
  #else
                  (void) printf("%08x %08x ", (uint32_t)((a->value[i]) >> 32),
                      (uint32_t)((a->value[i]) & 0xffffffff));
!                 if ((i % 4 == 0) && (i != 0)) {
                          (void) printf("\n");
                  }
  #endif
          }
          (void) printf("\n");
--- 130,160 ----
  }
  #endif /* MALLOC_DEBUG */
  
  #define big_realloc(x, y, z) realloc((x), (z))
  
+ 
+ /*
+  * printbignum()
+  * Print a BIGNUM type to stdout.
+  */
  void
  printbignum(char *aname, BIGNUM *a)
  {
          int i;
  
          (void) printf("\n%s\n%d\n", aname, a->sign*a->len);
          for (i = a->len - 1; i >= 0; i--) {
  #ifdef BIGNUM_CHUNK_32
                  (void) printf("%08x ", a->value[i]);
!                 if (((i & (BITSINBYTE - 1)) == 0) && (i != 0)) {
                          (void) printf("\n");
                  }
  #else
                  (void) printf("%08x %08x ", (uint32_t)((a->value[i]) >> 32),
                      (uint32_t)((a->value[i]) & 0xffffffff));
!                 if (((i & 3) == 0) && (i != 0)) { /* end of this chunk */
                          (void) printf("\n");
                  }
  #endif
          }
          (void) printf("\n");
*** 150,164 ****
  }
  
  #endif  /* _KERNEL */
  
  
! /* size in BIG_CHUNK_SIZE-bit words */
  BIG_ERR_CODE
  big_init(BIGNUM *number, int size)
  {
!         number->value = big_malloc(sizeof (BIG_CHUNK_TYPE) * size);
          if (number->value == NULL) {
                  return (BIG_NO_MEM);
          }
          number->size = size;
          number->len = 0;
--- 161,191 ----
  }
  
  #endif  /* _KERNEL */
  
  
! /*
!  * big_init()
!  * Initialize and allocate memory for a BIGNUM type.
!  *
!  * big_init(number, size) is equivalent to big_init1(number, size, NULL, 0)
!  *
!  * Note: call big_finish() to free memory allocated by big_init().
!  *
!  * Input:
!  * number       Uninitialized memory for BIGNUM
!  * size         Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
!  *
!  * Output:
!  * number       Initialized BIGNUM
!  *
!  * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
!  */
  BIG_ERR_CODE
  big_init(BIGNUM *number, int size)
  {
!         number->value = big_malloc(BIGNUM_WORDSIZE * size);
          if (number->value == NULL) {
                  return (BIG_NO_MEM);
          }
          number->size = size;
          number->len = 0;
*** 165,180 ****
          number->sign = 1;
          number->malloced = 1;
          return (BIG_OK);
  }
  
! /* size in BIG_CHUNK_SIZE-bit words */
  BIG_ERR_CODE
  big_init1(BIGNUM *number, int size, BIG_CHUNK_TYPE *buf, int bufsize)
  {
          if ((buf == NULL) || (size > bufsize)) {
!                 number->value = big_malloc(sizeof (BIG_CHUNK_TYPE) * size);
                  if (number->value == NULL) {
                          return (BIG_NO_MEM);
                  }
                  number->size = size;
                  number->malloced = 1;
--- 192,227 ----
          number->sign = 1;
          number->malloced = 1;
          return (BIG_OK);
  }
  
! 
! /*
!  * big_init1()
!  * Initialize and, if needed, allocate memory for a BIGNUM type.
!  * Use the buffer passed, buf, if any, instead of allocating memory
!  * if it holds at least "size" BIG_CHUNK_SIZE-bit words.
!  *
!  * Note: call big_finish() to free memory allocated by big_init1().
!  *
!  * Input:
!  * number       Uninitialized memory for BIGNUM
!  * size         Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
!  * buf          Buffer for storing a BIGNUM.
!  *              If NULL, big_init1() will allocate a buffer
!  * bufsize      Size, in BIG_CHUNK_SIZE-bit words, of buf
!  *
!  * Output:
!  * number       Initialized BIGNUM
!  *
!  * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
!  */
  BIG_ERR_CODE
  big_init1(BIGNUM *number, int size, BIG_CHUNK_TYPE *buf, int bufsize)
  {
          if ((buf == NULL) || (size > bufsize)) {
!                 number->value = big_malloc(BIGNUM_WORDSIZE * size);
                  if (number->value == NULL) {
                          return (BIG_NO_MEM);
                  }
                  number->size = size;
                  number->malloced = 1;
*** 187,306 ****
                  number->sign = 1;
  
          return (BIG_OK);
  }
  
  void
  big_finish(BIGNUM *number)
  {
          if (number->malloced == 1) {
!                 big_free(number->value,
!                     sizeof (BIG_CHUNK_TYPE) * number->size);
                  number->malloced = 0;
          }
  }
  
  
  /*
   *  bn->size should be at least
!  * (len + sizeof (BIG_CHUNK_TYPE) - 1) / sizeof (BIG_CHUNK_TYPE) bytes
   * converts from byte-big-endian format to bignum format (words in
   * little endian order, but bytes within the words big endian)
   */
  void
  bytestring2bignum(BIGNUM *bn, uchar_t *kn, size_t len)
  {
!         int             i, j, offs;
          BIG_CHUNK_TYPE  word;
          uchar_t         *knwordp;
  
! #ifdef  _LP64
!         offs = (uint32_t)len % sizeof (BIG_CHUNK_TYPE);
!         bn->len = (uint32_t)len / sizeof (BIG_CHUNK_TYPE);
  
!         for (i = 0; i < (uint32_t)len / sizeof (BIG_CHUNK_TYPE); i++) {
! #else   /* !_LP64 */
!         offs = len % sizeof (BIG_CHUNK_TYPE);
!         bn->len = len / sizeof (BIG_CHUNK_TYPE);
!         for (i = 0; i < len / sizeof (BIG_CHUNK_TYPE); i++) {
! #endif  /* _LP64 */
!                 knwordp = &(kn[len - sizeof (BIG_CHUNK_TYPE) * (i + 1)]);
                  word = knwordp[0];
!                 for (j = 1; j < sizeof (BIG_CHUNK_TYPE); j++) {
!                         word = (word << 8)+ knwordp[j];
                  }
                  bn->value[i] = word;
          }
          if (offs > 0) {
                  word = kn[0];
!                 for (i = 1; i < offs; i++) word = (word << 8) + kn[i];
                  bn->value[bn->len++] = word;
          }
!         while ((bn->len > 1) && (bn->value[bn->len-1] == 0)) {
                  bn->len --;
          }
  }
  
  /*
   * copies the least significant len bytes if
!  * len < bn->len * sizeof (BIG_CHUNK_TYPE)
   * converts from bignum format to byte-big-endian format.
   * bignum format is words of type  BIG_CHUNK_TYPE in little endian order.
   */
  void
  bignum2bytestring(uchar_t *kn, BIGNUM *bn, size_t len)
  {
!         int             i, j, offs;
          BIG_CHUNK_TYPE  word;
  
!         if (len < sizeof (BIG_CHUNK_TYPE) * bn->len) {
! #ifdef  _LP64
!                 for (i = 0; i < (uint32_t)len / sizeof (BIG_CHUNK_TYPE); i++) {
! #else   /* !_LP64 */
!                 for (i = 0; i < len / sizeof (BIG_CHUNK_TYPE); i++) {
! #endif  /* _LP64 */
                          word = bn->value[i];
!                         for (j = 0; j < sizeof (BIG_CHUNK_TYPE); j++) {
!                                 kn[len - sizeof (BIG_CHUNK_TYPE) * i - j - 1] =
                                      word & 0xff;
!                                 word = word >> 8;
                          }
                  }
! #ifdef  _LP64
!                 offs = (uint32_t)len % sizeof (BIG_CHUNK_TYPE);
! #else   /* !_LP64 */
!                 offs = len % sizeof (BIG_CHUNK_TYPE);
! #endif  /* _LP64 */
                  if (offs > 0) {
!                         word = bn->value[len / sizeof (BIG_CHUNK_TYPE)];
! #ifdef  _LP64
!                         for (i =  (uint32_t)len % sizeof (BIG_CHUNK_TYPE);
!                             i > 0; i --) {
! #else   /* !_LP64 */
!                         for (i = len % sizeof (BIG_CHUNK_TYPE);
!                             i > 0; i --) {
! #endif  /* _LP64 */
                                  kn[i - 1] = word & 0xff;
!                                 word = word >> 8;
                          }
                  }
          } else {
                  for (i = 0; i < bn->len; i++) {
                          word = bn->value[i];
!                         for (j = 0; j < sizeof (BIG_CHUNK_TYPE); j++) {
!                                 kn[len - sizeof (BIG_CHUNK_TYPE) * i - j - 1] =
                                      word & 0xff;
!                                 word = word >> 8;
                          }
                  }
! #ifdef  _LP64
!                 for (i = 0;
!                     i < (uint32_t)len - sizeof (BIG_CHUNK_TYPE) * bn->len;
!                     i++) {
! #else   /* !_LP64 */
!                 for (i = 0; i < len - sizeof (BIG_CHUNK_TYPE) * bn->len; i++) {
! #endif  /* _LP64 */
                          kn[i] = 0;
                  }
          }
  }
  
--- 234,342 ----
          number->sign = 1;
  
          return (BIG_OK);
  }
  
+ 
+ /*
+  * big_finish()
+  * Free memory, if any, allocated by big_init() or big_init1().
+  */
  void
  big_finish(BIGNUM *number)
  {
          if (number->malloced == 1) {
!                 big_free(number->value, BIGNUM_WORDSIZE * number->size);
                  number->malloced = 0;
          }
  }
  
  
  /*
   * bn->size should be at least
!  * (len + BIGNUM_WORDSIZE - 1) / BIGNUM_WORDSIZE words
   * converts from byte-big-endian format to bignum format (words in
   * little endian order, but bytes within the words big endian)
   */
  void
  bytestring2bignum(BIGNUM *bn, uchar_t *kn, size_t len)
  {
!         int             i, j;
!         uint32_t        offs;
!         const uint32_t  slen = UI32(len);
          BIG_CHUNK_TYPE  word;
          uchar_t         *knwordp;
  
!         if (slen == 0) {
!                 bn->len = 1;
!                 bn->value[0] = 0;
!                 return;
!         }
  
!         offs = slen % BIGNUM_WORDSIZE;
!         bn->len = slen / BIGNUM_WORDSIZE;
! 
!         for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
!                 knwordp = &(kn[slen - BIGNUM_WORDSIZE * (i + 1)]);
                  word = knwordp[0];
!                 for (j = 1; j < BIGNUM_WORDSIZE; j++) {
!                         word = (word << BITSINBYTE) + knwordp[j];
                  }
                  bn->value[i] = word;
          }
          if (offs > 0) {
                  word = kn[0];
!                 for (i = 1; i < offs; i++) word = (word << BITSINBYTE) + kn[i];
                  bn->value[bn->len++] = word;
          }
!         while ((bn->len > 1) && (bn->value[bn->len - 1] == 0)) {
                  bn->len --;
          }
  }
  
+ 
  /*
   * copies the least significant len bytes if
!  * len < bn->len * BIGNUM_WORDSIZE
   * converts from bignum format to byte-big-endian format.
   * bignum format is words of type  BIG_CHUNK_TYPE in little endian order.
   */
  void
  bignum2bytestring(uchar_t *kn, BIGNUM *bn, size_t len)
  {
!         int             i, j;
!         uint32_t        offs;
!         const uint32_t  slen = UI32(len);
          BIG_CHUNK_TYPE  word;
  
!         if (len < BIGNUM_WORDSIZE * bn->len) {
!                 for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
                          word = bn->value[i];
!                         for (j = 0; j < BIGNUM_WORDSIZE; j++) {
!                                 kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
                                      word & 0xff;
!                                 word = word >> BITSINBYTE;
                          }
                  }
!                 offs = slen % BIGNUM_WORDSIZE;
                  if (offs > 0) {
!                         word = bn->value[slen / BIGNUM_WORDSIZE];
!                         for (i =  slen % BIGNUM_WORDSIZE; i > 0; i --) {
                                  kn[i - 1] = word & 0xff;
!                                 word = word >> BITSINBYTE;
                          }
                  }
          } else {
                  for (i = 0; i < bn->len; i++) {
                          word = bn->value[i];
!                         for (j = 0; j < BIGNUM_WORDSIZE; j++) {
!                                 kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
                                      word & 0xff;
!                                 word = word >> BITSINBYTE;
                          }
                  }
!                 for (i = 0; i < slen - BIGNUM_WORDSIZE * bn->len; i++) {
                          kn[i] = 0;
                  }
          }
  }
  
*** 313,330 ****
  
          l = a->len - 1;
          while ((l > 0) && (a->value[l] == 0)) {
                  l--;
          }
!         b = sizeof (BIG_CHUNK_TYPE) * BITSINBYTE;
          c = a->value[l];
          while ((b > 1) && ((c & BIG_CHUNK_HIGHBIT) == 0)) {
                  c = c << 1;
                  b--;
          }
  
!         return (l * sizeof (BIG_CHUNK_TYPE) * BITSINBYTE + b);
  }
  
  
  BIG_ERR_CODE
  big_copy(BIGNUM *dest, BIGNUM *src)
--- 349,366 ----
  
          l = a->len - 1;
          while ((l > 0) && (a->value[l] == 0)) {
                  l--;
          }
!         b = BIG_CHUNK_SIZE;
          c = a->value[l];
          while ((b > 1) && ((c & BIG_CHUNK_HIGHBIT) == 0)) {
                  c = c << 1;
                  b--;
          }
  
!         return (l * BIG_CHUNK_SIZE + b);
  }
  
  
  BIG_ERR_CODE
  big_copy(BIGNUM *dest, BIGNUM *src)
*** 338,352 ****
          }
          src->len = len;
          if (dest->size < len) {
                  if (dest->malloced == 1) {
                          newptr = (BIG_CHUNK_TYPE *)big_realloc(dest->value,
!                             sizeof (BIG_CHUNK_TYPE) * dest->size,
!                             sizeof (BIG_CHUNK_TYPE) * len);
                  } else {
                          newptr = (BIG_CHUNK_TYPE *)
!                             big_malloc(sizeof (BIG_CHUNK_TYPE) * len);
                          if (newptr != NULL) {
                                  dest->malloced = 1;
                          }
                  }
                  if (newptr == NULL) {
--- 374,388 ----
          }
          src->len = len;
          if (dest->size < len) {
                  if (dest->malloced == 1) {
                          newptr = (BIG_CHUNK_TYPE *)big_realloc(dest->value,
!                             BIGNUM_WORDSIZE * dest->size,
!                             BIGNUM_WORDSIZE * len);
                  } else {
                          newptr = (BIG_CHUNK_TYPE *)
!                             big_malloc(BIGNUM_WORDSIZE * len);
                          if (newptr != NULL) {
                                  dest->malloced = 1;
                          }
                  }
                  if (newptr == NULL) {
*** 373,386 ****
  
          if (number->size >= size)
                  return (BIG_OK);
          if (number->malloced) {
                  number->value = big_realloc(number->value,
!                     sizeof (BIG_CHUNK_TYPE) * number->size,
!                     sizeof (BIG_CHUNK_TYPE) * size);
          } else {
!                 newptr = big_malloc(sizeof (BIG_CHUNK_TYPE) * size);
                  if (newptr != NULL) {
                          for (i = 0; i < number->size; i++) {
                                  newptr[i] = number->value[i];
                          }
                  }
--- 409,422 ----
  
          if (number->size >= size)
                  return (BIG_OK);
          if (number->malloced) {
                  number->value = big_realloc(number->value,
!                     BIGNUM_WORDSIZE * number->size,
!                     BIGNUM_WORDSIZE * size);
          } else {
!                 newptr = big_malloc(BIGNUM_WORDSIZE * size);
                  if (newptr != NULL) {
                          for (i = 0; i < number->size; i++) {
                                  newptr[i] = number->value[i];
                          }
                  }
*** 559,569 ****
                          if (bb->value[i] > 0) {
                                  return (-1);
                          }
                  }
          } else {
!                 i = aa->len-1;
          }
          for (; i >= 0; i--) {
                  if (aa->value[i] > bb->value[i]) {
                          return (1);
                  } else if (aa->value[i] < bb->value[i]) {
--- 595,605 ----
                          if (bb->value[i] > 0) {
                                  return (-1);
                          }
                  }
          } else {
!                 i = aa->len - 1;
          }
          for (; i >= 0; i--) {
                  if (aa->value[i] > bb->value[i]) {
                          return (1);
                  } else if (aa->value[i] < bb->value[i]) {
*** 910,920 ****
                  return;
          }
          cy = aa->value[0] >> offs;
          for (i = 1; i < aa->len; i++) {
                  ai = aa->value[i];
!                 result->value[i-1] = (ai << (BIG_CHUNK_SIZE - offs)) | cy;
                  cy = ai >> offs;
          }
          result->len = aa->len;
          result->value[result->len - 1] = cy;
          result->sign = aa->sign;
--- 946,956 ----
                  return;
          }
          cy = aa->value[0] >> offs;
          for (i = 1; i < aa->len; i++) {
                  ai = aa->value[i];
!                 result->value[i - 1] = (ai << (BIG_CHUNK_SIZE - offs)) | cy;
                  cy = ai >> offs;
          }
          result->len = aa->len;
          result->value[result->len - 1] = cy;
          result->sign = aa->sign;
*** 924,934 ****
  /*
   * result = aa/bb   remainder = aa mod bb
   * it is assumed that aa and bb are positive
   */
  BIG_ERR_CODE
! big_div_pos_fast(BIGNUM *result, BIGNUM *remainder, BIGNUM *aa, BIGNUM *bb)
  {
          BIG_ERR_CODE    err = BIG_OK;
          int             i, alen, blen, tlen, rlen, offs;
          BIG_CHUNK_TYPE  higha, highb, coeff;
          BIG_CHUNK_TYPE  *a, *b;
--- 960,970 ----
  /*
   * result = aa/bb   remainder = aa mod bb
   * it is assumed that aa and bb are positive
   */
  BIG_ERR_CODE
! big_div_pos(BIGNUM *result, BIGNUM *remainder, BIGNUM *aa, BIGNUM *bb)
  {
          BIG_ERR_CODE    err = BIG_OK;
          int             i, alen, blen, tlen, rlen, offs;
          BIG_CHUNK_TYPE  higha, highb, coeff;
          BIG_CHUNK_TYPE  *a, *b;
*** 1075,1084 ****
--- 1111,1121 ----
  ret1:
          big_finish(&bblow);
          return (err);
  }
  
+ 
  /*
   * If there is no processor-specific integer implementation of
   * the lower level multiply functions, then this code is provided
   * for big_mul_set_vec(), big_mul_add_vec(), big_mul_vec() and
   * big_sqr_vec().
*** 1104,1114 ****
  
  #define UNROLL8
  
  #define MUL_SET_VEC_ROUND_PREFETCH(R) \
          p = pf * d; \
!         pf = (uint64_t)a[R+1]; \
          t = p + cy; \
          r[R] = (uint32_t)t; \
          cy = t >> 32
  
  #define MUL_SET_VEC_ROUND_NOPREFETCH(R) \
--- 1141,1151 ----
  
  #define UNROLL8
  
  #define MUL_SET_VEC_ROUND_PREFETCH(R) \
          p = pf * d; \
!         pf = (uint64_t)a[R + 1]; \
          t = p + cy; \
          r[R] = (uint32_t)t; \
          cy = t >> 32
  
  #define MUL_SET_VEC_ROUND_NOPREFETCH(R) \
*** 1118,1128 ****
          cy = t >> 32
  
  #define MUL_ADD_VEC_ROUND_PREFETCH(R) \
          t = (uint64_t)r[R]; \
          p = pf * d; \
!         pf = (uint64_t)a[R+1]; \
          t = p + t + cy; \
          r[R] = (uint32_t)t; \
          cy = t >> 32
  
  #define MUL_ADD_VEC_ROUND_NOPREFETCH(R) \
--- 1155,1165 ----
          cy = t >> 32
  
  #define MUL_ADD_VEC_ROUND_PREFETCH(R) \
          t = (uint64_t)r[R]; \
          p = pf * d; \
!         pf = (uint64_t)a[R + 1]; \
          t = p + t + cy; \
          r[R] = (uint32_t)t; \
          cy = t >> 32
  
  #define MUL_ADD_VEC_ROUND_NOPREFETCH(R) \
*** 1274,1290 ****
                  t2 = (uint64_t)d + cy;
                  r[col] = (uint32_t)t2;
                  cy = (t >> 32) + (t2 >> 32);
                  if (row == len - 1)
                          break;
!                 p = ((uint64_t)r[col+1] << 1) + cy;
!                 r[col+1] = (uint32_t)p;
                  cy = p >> 32;
                  ++row;
                  col += 2;
          }
!         r[col+1] = (uint32_t)cy;
  }
  
  #else /* BIG_CHUNK_SIZE == 64 */
  
  /*
--- 1311,1327 ----
                  t2 = (uint64_t)d + cy;
                  r[col] = (uint32_t)t2;
                  cy = (t >> 32) + (t2 >> 32);
                  if (row == len - 1)
                          break;
!                 p = ((uint64_t)r[col + 1] << 1) + cy;
!                 r[col + 1] = (uint32_t)p;
                  cy = p >> 32;
                  ++row;
                  col += 2;
          }
!         r[col + 1] = (uint32_t)cy;
  }
  
  #else /* BIG_CHUNK_SIZE == 64 */
  
  /*
*** 1359,1373 ****
          int i;
  
          ASSERT(r != a);
          r[len] = big_mul_set_vec(r, a, len, a[0]);
          for (i = 1; i < len; ++i)
!                 r[len + i] = big_mul_add_vec(r+i, a, len, a[i]);
  }
  
  #endif /* BIG_CHUNK_SIZE == 32/64 */
  
  #else /* ! UMUL64 */
  
  #if (BIG_CHUNK_SIZE != 32)
  #error Don't use 64-bit chunks without defining UMUL64
  #endif
--- 1396,1411 ----
          int i;
  
          ASSERT(r != a);
          r[len] = big_mul_set_vec(r, a, len, a[0]);
          for (i = 1; i < len; ++i)
!                 r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
  }
  
  #endif /* BIG_CHUNK_SIZE == 32/64 */
  
+ 
  #else /* ! UMUL64 */
  
  #if (BIG_CHUNK_SIZE != 32)
  #error Don't use 64-bit chunks without defining UMUL64
  #endif
*** 1430,1440 ****
          int i;
  
          ASSERT(r != a);
          r[len] = big_mul_set_vec(r, a, len, a[0]);
          for (i = 1; i < len; ++i)
!                 r[len + i] = big_mul_add_vec(r+i, a, len, a[i]);
  }
  
  #endif /* UMUL64 */
  
  void
--- 1468,1478 ----
          int i;
  
          ASSERT(r != a);
          r[len] = big_mul_set_vec(r, a, len, a[0]);
          for (i = 1; i < len; ++i)
!                 r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
  }
  
  #endif /* UMUL64 */
  
  void
*** 1443,1453 ****
  {
          int i;
  
          r[alen] = big_mul_set_vec(r, a, alen, b[0]);
          for (i = 1; i < blen; ++i)
!                 r[alen + i] = big_mul_add_vec(r+i, a, alen, b[i]);
  }
  
  
  #endif /* ! PSR_MUL */
  
--- 1481,1491 ----
  {
          int i;
  
          r[alen] = big_mul_set_vec(r, a, alen, b[0]);
          for (i = 1; i < blen; ++i)
!                 r[alen + i] = big_mul_add_vec(r + i, a, alen, b[i]);
  }
  
  
  #endif /* ! PSR_MUL */
  
*** 1490,1499 ****
--- 1528,1538 ----
                  blen--;
          }
          bb->len = blen;
  
          rsize = alen + blen;
+         ASSERT(rsize > 0);
          if (result->size < rsize) {
                  err = big_extend(result, rsize);
                  if (err != BIG_OK) {
                          return (err);
                  }
*** 1547,1564 ****
          if (t[rsize - 1] == 0) {
                  tmp1.len = rsize - 1;
          } else {
                  tmp1.len = rsize;
          }
!         if ((err = big_copy(result, &tmp1)) != BIG_OK) {
!                 return (err);
!         }
          result->sign = sign;
  
          big_finish(&tmp1);
  
!         return (BIG_OK);
  }
  
  
  /*
   * caller must ensure that  a < n,  b < n  and  ret->size >=  2 * n->len + 1
--- 1586,1603 ----
          if (t[rsize - 1] == 0) {
                  tmp1.len = rsize - 1;
          } else {
                  tmp1.len = rsize;
          }
! 
!         err = big_copy(result, &tmp1);
! 
          result->sign = sign;
  
          big_finish(&tmp1);
  
!         return (err);
  }
  
  
  /*
   * caller must ensure that  a < n,  b < n  and  ret->size >=  2 * n->len + 1
*** 1617,1629 ****
          else {
                  for (i = 0; i < nlen; i++) {
                          rr[i] = rr[i + nlen];
                  }
          }
!         for (i = nlen - 1; (i >= 0) && (rr[i] == 0); i--)
                  ;
!         ret->len = i+1;
  
          return (BIG_OK);
  }
  
  
--- 1656,1670 ----
          else {
                  for (i = 0; i < nlen; i++) {
                          rr[i] = rr[i + nlen];
                  }
          }
! 
!         /* Remove leading zeros, but keep at least 1 digit: */
!         for (i = nlen - 1; (i > 0) && (rr[i] == 0); i--)
                  ;
!         ret->len = i + 1;
  
          return (BIG_OK);
  }
  
  
*** 1762,1773 ****
  {
          BIGNUM          apowers[APOWERS_MAX_SIZE];
          BIGNUM          tmp1;
          BIG_CHUNK_TYPE  tmp1value[BIGTMPSIZE];
          int             i, j, k, l, m, p;
!         int             bit, bitind, bitcount, groupbits, apowerssize;
!         int             nbits;
          BIG_ERR_CODE    err;
  
          nbits = big_numbits(e);
          if (nbits < 50) {
                  groupbits = 1;
--- 1803,1814 ----
  {
          BIGNUM          apowers[APOWERS_MAX_SIZE];
          BIGNUM          tmp1;
          BIG_CHUNK_TYPE  tmp1value[BIGTMPSIZE];
          int             i, j, k, l, m, p;
!         uint32_t        bit, bitind, bitcount, groupbits, apowerssize;
!         uint32_t        nbits;
          BIG_ERR_CODE    err;
  
          nbits = big_numbits(e);
          if (nbits < 50) {
                  groupbits = 1;
*** 1781,1794 ****
          if ((err = big_init1(&tmp1, 2 * n->len + 1,
              tmp1value, arraysize(tmp1value))) != BIG_OK) {
                  return (err);
          }
  
!         /* set the malloced bit to help cleanup */
          for (i = 0; i < apowerssize; i++) {
                  apowers[i].malloced = 0;
          }
          for (i = 0; i < apowerssize; i++) {
                  if ((err = big_init1(&(apowers[i]), n->len, NULL, 0)) !=
                      BIG_OK) {
                          goto ret;
                  }
--- 1822,1836 ----
          if ((err = big_init1(&tmp1, 2 * n->len + 1,
              tmp1value, arraysize(tmp1value))) != BIG_OK) {
                  return (err);
          }
  
!         /* clear the malloced bit to help cleanup */
          for (i = 0; i < apowerssize; i++) {
                  apowers[i].malloced = 0;
          }
+ 
          for (i = 0; i < apowerssize; i++) {
                  if ((err = big_init1(&(apowers[i]), n->len, NULL, 0)) !=
                      BIG_OK) {
                          goto ret;
                  }
*** 1801,1811 ****
          }
          (void) big_copy(ma, &tmp1);
  
          for (i = 1; i < apowerssize; i++) {
                  if ((err = big_mont_mul(&tmp1, ma,
!                     &(apowers[i-1]), n, n0)) != BIG_OK) {
                          goto ret;
                  }
                  (void) big_copy(&apowers[i], &tmp1);
          }
  
--- 1843,1853 ----
          }
          (void) big_copy(ma, &tmp1);
  
          for (i = 1; i < apowerssize; i++) {
                  if ((err = big_mont_mul(&tmp1, ma,
!                     &(apowers[i - 1]), n, n0)) != BIG_OK) {
                          goto ret;
                  }
                  (void) big_copy(&apowers[i], &tmp1);
          }
  
*** 1910,1925 ****
  static BIG_ERR_CODE
  big_modexp_ncp_float(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
      BIGNUM *tmp, BIG_CHUNK_TYPE n0)
  {
  
!         int             i, j, k, l, m, p, bit, bitind, bitcount, nlen;
          double          dn0;
          double          *dn, *dt, *d16r, *d32r;
          uint32_t        *nint, *prod;
          double          *apowers[APOWERS_MAX_SIZE];
!         int             nbits, groupbits, apowerssize;
          BIG_ERR_CODE    err = BIG_OK;
  
  #ifdef _KERNEL
          uint8_t fpua[sizeof (kfpu_t) + FPR_ALIGN];
          kfpu_t *fpu;
--- 1952,1968 ----
  static BIG_ERR_CODE
  big_modexp_ncp_float(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
      BIGNUM *tmp, BIG_CHUNK_TYPE n0)
  {
  
!         int             i, j, k, l, m, p;
!         uint32_t        bit, bitind, bitcount, nlen;
          double          dn0;
          double          *dn, *dt, *d16r, *d32r;
          uint32_t        *nint, *prod;
          double          *apowers[APOWERS_MAX_SIZE];
!         uint32_t        nbits, groupbits, apowerssize;
          BIG_ERR_CODE    err = BIG_OK;
  
  #ifdef _KERNEL
          uint8_t fpua[sizeof (kfpu_t) + FPR_ALIGN];
          kfpu_t *fpu;
*** 2075,2085 ****
                                                      d16r, dt, dn, nint,
                                                      nlen, dn0);
                                          }
                                          conv_i32_to_d32(d32r, prod, nlen);
                                          mont_mulf_noconv(prod, d32r,
!                                             apowers[p >> (l+1)],
                                              dt, dn, nint, nlen, dn0);
                                          for (m = 0; m < l; m++) {
                                                  conv_i32_to_d32_and_d16(d32r,
                                                      d16r, prod, nlen);
                                                  mont_mulf_noconv(prod, d32r,
--- 2118,2128 ----
                                                      d16r, dt, dn, nint,
                                                      nlen, dn0);
                                          }
                                          conv_i32_to_d32(d32r, prod, nlen);
                                          mont_mulf_noconv(prod, d32r,
!                                             apowers[p >> (l + 1)],
                                              dt, dn, nint, nlen, dn0);
                                          for (m = 0; m < l; m++) {
                                                  conv_i32_to_d32_and_d16(d32r,
                                                      d16r, prod, nlen);
                                                  mont_mulf_noconv(prod, d32r,
*** 2181,2192 ****
          if ((err = big_init1(&tmp, 2 * n->len + 1,
              tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
                  goto ret1;
          }
  
!         /* set the malloced bit to help cleanup */
          rr.malloced = 0;
          if (n_rr == NULL) {
                  if ((err = big_init1(&rr, 2 * n->len + 1,
                      rrvalue, arraysize(rrvalue))) != BIG_OK) {
                          goto ret2;
                  }
--- 2224,2236 ----
          if ((err = big_init1(&tmp, 2 * n->len + 1,
              tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
                  goto ret1;
          }
  
!         /* clear the malloced bit to help cleanup */
          rr.malloced = 0;
+ 
          if (n_rr == NULL) {
                  if ((err = big_init1(&rr, 2 * n->len + 1,
                      rrvalue, arraysize(rrvalue))) != BIG_OK) {
                          goto ret2;
                  }
*** 2379,2389 ****
          BIGNUM          t1, t2, t3, prod;
          BIG_CHUNK_TYPE  t1value[BIGTMPSIZE];
          BIG_CHUNK_TYPE  t2value[BIGTMPSIZE];
          BIG_CHUNK_TYPE  t3value[BIGTMPSIZE];
          BIG_CHUNK_TYPE  prodvalue[BIGTMPSIZE];
!         int             i, nbits, diff, nrootbits, highbits;
          BIG_ERR_CODE    err;
  
          nbits = big_numbits(n);
  
          if ((err = big_init1(&t1, n->len + 1,
--- 2423,2434 ----
          BIGNUM          t1, t2, t3, prod;
          BIG_CHUNK_TYPE  t1value[BIGTMPSIZE];
          BIG_CHUNK_TYPE  t2value[BIGTMPSIZE];
          BIG_CHUNK_TYPE  t3value[BIGTMPSIZE];
          BIG_CHUNK_TYPE  prodvalue[BIGTMPSIZE];
!         int             i, diff;
!         uint32_t        nbits, nrootbits, highbits;
          BIG_ERR_CODE    err;
  
          nbits = big_numbits(n);
  
          if ((err = big_init1(&t1, n->len + 1,
*** 2549,2559 ****
  
  
  BIG_ERR_CODE
  big_Lucas(BIGNUM *Lkminus1, BIGNUM *Lk, BIGNUM *p, BIGNUM *k, BIGNUM *n)
  {
!         int             m, w, i;
          BIG_CHUNK_TYPE  bit;
          BIGNUM          ki, tmp, tmp2;
          BIG_CHUNK_TYPE  kivalue[BIGTMPSIZE];
          BIG_CHUNK_TYPE  tmpvalue[BIGTMPSIZE];
          BIG_CHUNK_TYPE  tmp2value[BIGTMPSIZE];
--- 2594,2605 ----
  
  
  BIG_ERR_CODE
  big_Lucas(BIGNUM *Lkminus1, BIGNUM *Lk, BIGNUM *p, BIGNUM *k, BIGNUM *n)
  {
!         int             i;
!         uint32_t        m, w;
          BIG_CHUNK_TYPE  bit;
          BIGNUM          ki, tmp, tmp2;
          BIG_CHUNK_TYPE  kivalue[BIGTMPSIZE];
          BIG_CHUNK_TYPE  tmpvalue[BIGTMPSIZE];
          BIG_CHUNK_TYPE  tmp2value[BIGTMPSIZE];
*** 2567,2577 ****
  
          if ((err = big_init1(&ki, k->len + 1,
              kivalue, arraysize(kivalue))) != BIG_OK)
                  return (err);
  
!         if ((err = big_init1(&tmp, 2 * n->len +1,
              tmpvalue, arraysize(tmpvalue))) != BIG_OK)
                  goto ret1;
  
          if ((err = big_init1(&tmp2, n->len,
              tmp2value, arraysize(tmp2value))) != BIG_OK)
--- 2613,2623 ----
  
          if ((err = big_init1(&ki, k->len + 1,
              kivalue, arraysize(kivalue))) != BIG_OK)
                  return (err);
  
!         if ((err = big_init1(&tmp, 2 * n->len + 1,
              tmpvalue, arraysize(tmpvalue))) != BIG_OK)
                  goto ret1;
  
          if ((err = big_init1(&tmp2, n->len,
              tmp2value, arraysize(tmp2value))) != BIG_OK)
*** 2779,2798 ****
  }
  
  
  #define SIEVESIZE 1000
  
- uint32_t smallprimes[] =
- {
- 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
- 51, 53, 59, 61, 67, 71, 73, 79, 83, 89, 91, 97
- };
  
- 
  BIG_ERR_CODE
  big_nextprime_pos_ext(BIGNUM *result, BIGNUM *n, big_modexp_ncp_info_t *info)
  {
          BIG_ERR_CODE    err;
          int             sieve[SIEVESIZE];
          int             i;
          uint32_t        off, p;
  
--- 2825,2841 ----
  }
  
  
  #define SIEVESIZE 1000
  
  
  BIG_ERR_CODE
  big_nextprime_pos_ext(BIGNUM *result, BIGNUM *n, big_modexp_ncp_info_t *info)
  {
+         static const uint32_t smallprimes[] = {
+             3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
+             51, 53, 59, 61, 67, 71, 73, 79, 83, 89, 91, 97 };
          BIG_ERR_CODE    err;
          int             sieve[SIEVESIZE];
          int             i;
          uint32_t        off, p;