1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * If compiled without -DRF_INLINE_MACROS then needs -lm at link time
  28  * If compiled with -DRF_INLINE_MACROS then needs conv.il at compile time
  29  * (i.e. cc <compiler_flags> -DRF_INLINE_MACROS conv.il mont_mulf.c )
  30  */
  31 
  32 #include <sys/types.h>
  33 #include <math.h>
  34 
  35 static const double TwoTo16 = 65536.0;
  36 static const double TwoToMinus16 = 1.0/65536.0;
  37 static const double Zero = 0.0;
  38 static const double TwoTo32 = 65536.0 * 65536.0;
  39 static const double TwoToMinus32 = 1.0 / (65536.0 * 65536.0);
  40 
  41 #ifdef RF_INLINE_MACROS
  42 
  43 double upper32(double);
  44 double lower32(double, double);
  45 double mod(double, double, double);
  46 
  47 #else
  48 
  49 static double
  50 upper32(double x)
  51 {
  52         return (floor(x * TwoToMinus32));
  53 }
  54 
  55 
  56 static double
  57 lower32(double x, double y)
  58 {
  59         return (x - TwoTo32 * floor(x * TwoToMinus32));
  60 }
  61 
  62 static double
  63 mod(double x, double oneoverm, double m)
  64 {
  65         return (x - m * floor(x * oneoverm));
  66 }
  67 
  68 #endif
  69 
  70 
  71 static void
  72 cleanup(double *dt, int from, int tlen)
  73 {
  74         int i;
  75         double tmp, tmp1, x, x1;
  76 
  77         tmp = tmp1 = Zero;
  78 
  79         for (i = 2 * from; i < 2 * tlen; i += 2) {
  80                 x = dt[i];
  81                 x1 = dt[i + 1];
  82                 dt[i] = lower32(x, Zero) + tmp;
  83                 dt[i + 1] = lower32(x1, Zero) + tmp1;
  84                 tmp = upper32(x);
  85                 tmp1 = upper32(x1);
  86         }
  87 }
  88 
  89 
  90 void
  91 conv_d16_to_i32(uint32_t *i32, double *d16, int64_t *tmp, int ilen)
  92 {
  93         int i;
  94         int64_t t, t1,          /* Using int64_t and not uint64_t */
  95             a, b, c, d;         /* because more efficient code is */
  96                                 /* generated this way, and there  */
  97                                 /* is no overflow.  */
  98         t1 = 0;
  99         a = (int64_t)d16[0];
 100         b = (int64_t)d16[1];
 101         for (i = 0; i < ilen - 1; i++) {
 102                 c = (int64_t)d16[2 * i + 2];
 103                 t1 += a & 0xffffffff;
 104                 t = (a >> 32);
 105                 d = (int64_t)d16[2 * i + 3];
 106                 t1 += (b & 0xffff) << 16;
 107                 t += (b >> 16) + (t1 >> 32);
 108                 i32[i] = t1 & 0xffffffff;
 109                 t1 = t;
 110                 a = c;
 111                 b = d;
 112         }
 113         t1 += a & 0xffffffff;
 114         t = (a >> 32);
 115         t1 += (b & 0xffff) << 16;
 116         i32[i] = t1 & 0xffffffff;
 117 }
 118 
 119 void
 120 conv_i32_to_d32(double *d32, uint32_t *i32, int len)
 121 {
 122         int i;
 123 
 124 #pragma pipeloop(0)
 125         for (i = 0; i < len; i++)
 126                 d32[i] = (double)(i32[i]);
 127 }
 128 
 129 
 130 void
 131 conv_i32_to_d16(double *d16, uint32_t *i32, int len)
 132 {
 133         int i;
 134         uint32_t a;
 135 
 136 #pragma pipeloop(0)
 137         for (i = 0; i < len; i++) {
 138                 a = i32[i];
 139                 d16[2 * i] = (double)(a & 0xffff);
 140                 d16[2 * i + 1] = (double)(a >> 16);
 141         }
 142 }
 143 
 144 #ifdef RF_INLINE_MACROS
 145 
 146 void
 147 i16_to_d16_and_d32x4(const double *,    /* 1/(2^16) */
 148                         const double *, /* 2^16 */
 149                         const double *, /* 0 */
 150                         double *,       /* result16 */
 151                         double *,       /* result32 */
 152                         float *);       /* source - should be unsigned int* */
 153                                         /* converted to float* */
 154 
 155 #else
 156 
 157 
 158 static void
 159 i16_to_d16_and_d32x4(const double *dummy1,      /* 1/(2^16) */
 160                         const double *dummy2,   /* 2^16 */
 161                         const double *dummy3,   /* 0 */
 162                         double *result16,
 163                         double *result32,
 164                         float *src)     /* source - should be unsigned int* */
 165                                         /* converted to float* */
 166 {
 167         uint32_t *i32;
 168         uint32_t a, b, c, d;
 169 
 170         i32 = (uint32_t *)src;
 171         a = i32[0];
 172         b = i32[1];
 173         c = i32[2];
 174         d = i32[3];
 175         result16[0] = (double)(a & 0xffff);
 176         result16[1] = (double)(a >> 16);
 177         result32[0] = (double)a;
 178         result16[2] = (double)(b & 0xffff);
 179         result16[3] = (double)(b >> 16);
 180         result32[1] = (double)b;
 181         result16[4] = (double)(c & 0xffff);
 182         result16[5] = (double)(c >> 16);
 183         result32[2] = (double)c;
 184         result16[6] = (double)(d & 0xffff);
 185         result16[7] = (double)(d >> 16);
 186         result32[3] = (double)d;
 187 }
 188 
 189 #endif
 190 
 191 
 192 void
 193 conv_i32_to_d32_and_d16(double *d32, double *d16, uint32_t *i32, int len)
 194 {
 195         int i;
 196         uint32_t a;
 197 
 198 #pragma pipeloop(0)
 199         for (i = 0; i < len - 3; i += 4) {
 200                 i16_to_d16_and_d32x4(&TwoToMinus16, &TwoTo16, &Zero,
 201                     &(d16[2*i]), &(d32[i]), (float *)(&(i32[i])));
 202         }
 203         for (; i < len; i++) {
 204                 a = i32[i];
 205                 d32[i] = (double)(i32[i]);
 206                 d16[2 * i] = (double)(a & 0xffff);
 207                 d16[2 * i + 1] = (double)(a >> 16);
 208         }
 209 }
 210 
 211 
 212 static void
 213 adjust_montf_result(uint32_t *i32, uint32_t *nint, int len)
 214 {
 215         int64_t acc;
 216         int i;
 217 
 218         if (i32[len] > 0)
 219                 i = -1;
 220         else {
 221                 for (i = len - 1; i >= 0; i--) {
 222                         if (i32[i] != nint[i]) break;
 223                 }
 224         }
 225         if ((i < 0) || (i32[i] > nint[i])) {
 226                 acc = 0;
 227                 for (i = 0; i < len; i++) {
 228                         acc = acc + (uint64_t)(i32[i]) - (uint64_t)(nint[i]);
 229                         i32[i] = acc & 0xffffffff;
 230                         acc = acc >> 32;
 231                 }
 232         }
 233 }
 234 
 235 
 236 /*
 237  * the lengths of the input arrays should be at least the following:
 238  * result[nlen+1], dm1[nlen], dm2[2*nlen+1], dt[4*nlen+2], dn[nlen], nint[nlen]
 239  * all of them should be different from one another
 240  */
 241 void mont_mulf_noconv(uint32_t *result,
 242                         double *dm1, double *dm2, double *dt,
 243                         double *dn, uint32_t *nint,
 244                         int nlen, double dn0)
 245 {
 246         int i, j, jj;
 247         double digit, m2j, a, b;
 248         double *pdm1, *pdm2, *pdn, *pdtj, pdn_0, pdm1_0;
 249 
 250         pdm1 = &(dm1[0]);
 251         pdm2 = &(dm2[0]);
 252         pdn = &(dn[0]);
 253         pdm2[2 * nlen] = Zero;
 254 
 255         if (nlen != 16) {
 256                 for (i = 0; i < 4 * nlen + 2; i++)
 257                         dt[i] = Zero;
 258                 a = dt[0] = pdm1[0] * pdm2[0];
 259                 digit = mod(lower32(a, Zero) * dn0, TwoToMinus16, TwoTo16);
 260 
 261                 pdtj = &(dt[0]);
 262                 for (j = jj = 0; j < 2 * nlen; j++, jj++, pdtj++) {
 263                         m2j = pdm2[j];
 264                         a = pdtj[0] + pdn[0] * digit;
 265                         b = pdtj[1] + pdm1[0] * pdm2[j + 1] + a * TwoToMinus16;
 266                         pdtj[1] = b;
 267 
 268 #pragma pipeloop(0)
 269                         for (i = 1; i < nlen; i++) {
 270                                 pdtj[2 * i] += pdm1[i] * m2j + pdn[i] * digit;
 271                         }
 272                         if (jj == 30) {
 273                                 cleanup(dt, j / 2 + 1, 2 * nlen + 1);
 274                                 jj = 0;
 275                         }
 276 
 277                         digit = mod(lower32(b, Zero) * dn0,
 278                             TwoToMinus16, TwoTo16);
 279                 }
 280         } else {
 281                 a = dt[0] = pdm1[0] * pdm2[0];
 282 
 283                 dt[65] = dt[64] = dt[63] = dt[62] = dt[61] = dt[60] =
 284                     dt[59] = dt[58] = dt[57] = dt[56] = dt[55] =
 285                     dt[54] = dt[53] = dt[52] = dt[51] = dt[50] =
 286                     dt[49] = dt[48] = dt[47] = dt[46] = dt[45] =
 287                     dt[44] = dt[43] = dt[42] = dt[41] = dt[40] =
 288                     dt[39] = dt[38] = dt[37] = dt[36] = dt[35] =
 289                     dt[34] = dt[33] = dt[32] = dt[31] = dt[30] =
 290                     dt[29] = dt[28] = dt[27] = dt[26] = dt[25] =
 291                     dt[24] = dt[23] = dt[22] = dt[21] = dt[20] =
 292                     dt[19] = dt[18] = dt[17] = dt[16] = dt[15] =
 293                     dt[14] = dt[13] = dt[12] = dt[11] = dt[10] =
 294                     dt[9] = dt[8] = dt[7] = dt[6] = dt[5] = dt[4] =
 295                     dt[3] = dt[2] = dt[1] = Zero;
 296 
 297                 pdn_0 = pdn[0];
 298                 pdm1_0 = pdm1[0];
 299 
 300                 digit = mod(lower32(a, Zero) * dn0, TwoToMinus16, TwoTo16);
 301                 pdtj = &(dt[0]);
 302 
 303                 for (j = 0; j < 32; j++, pdtj++) {
 304 
 305                         m2j = pdm2[j];
 306                         a = pdtj[0] + pdn_0 * digit;
 307                         b = pdtj[1] + pdm1_0 * pdm2[j + 1] + a * TwoToMinus16;
 308                         pdtj[1] = b;
 309 
 310                         pdtj[2] += pdm1[1] *m2j + pdn[1] * digit;
 311                         pdtj[4] += pdm1[2] *m2j + pdn[2] * digit;
 312                         pdtj[6] += pdm1[3] *m2j + pdn[3] * digit;
 313                         pdtj[8] += pdm1[4] *m2j + pdn[4] * digit;
 314                         pdtj[10] += pdm1[5] *m2j + pdn[5] * digit;
 315                         pdtj[12] += pdm1[6] *m2j + pdn[6] * digit;
 316                         pdtj[14] += pdm1[7] *m2j + pdn[7] * digit;
 317                         pdtj[16] += pdm1[8] *m2j + pdn[8] * digit;
 318                         pdtj[18] += pdm1[9] *m2j + pdn[9] * digit;
 319                         pdtj[20] += pdm1[10] *m2j + pdn[10] * digit;
 320                         pdtj[22] += pdm1[11] *m2j + pdn[11] * digit;
 321                         pdtj[24] += pdm1[12] *m2j + pdn[12] * digit;
 322                         pdtj[26] += pdm1[13] *m2j + pdn[13] * digit;
 323                         pdtj[28] += pdm1[14] *m2j + pdn[14] * digit;
 324                         pdtj[30] += pdm1[15] *m2j + pdn[15] * digit;
 325                         /* no need for cleanup, cannot overflow */
 326                         digit = mod(lower32(b, Zero) * dn0,
 327                             TwoToMinus16, TwoTo16);
 328                 }
 329         }
 330 
 331         conv_d16_to_i32(result, dt + 2 * nlen, (int64_t *)dt, nlen + 1);
 332         adjust_montf_result(result, nint, nlen);
 333 }