1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #ifndef _MD5_BYTESWAP_H
  28 #define _MD5_BYTESWAP_H
  29 
  30 #pragma ident   "%Z%%M% %I%     %E% SMI"
  31 
  32 /*
  33  * definitions for inline functions for little-endian loads.
  34  *
  35  * This file has special definitions for UltraSPARC architectures,
  36  * which have a special address space identifier for loading 32 and 16 bit
  37  * integers in little-endian byte order.
  38  *
  39  * This file and common/crypto/md5/sparc/sun4[uv]/byteswap.il implement the
  40  * same thing and must be changed together.
  41  */
  42 
  43 #if defined(__sparc)
  44 #include <v9/sys/asi.h>
  45 #endif
  46 
  47 #ifdef  __cplusplus
  48 extern "C" {
  49 #endif
  50 
  51 #if defined(_LITTLE_ENDIAN)
  52 
/*
 * Little-endian optimization:  no byte swapping is needed.  Some
 * little-endian machines still require alignment checks, but those are
 * handled below.
 */
  58 
  59 #if !defined(__i386) && !defined(__amd64)
  60 /*
  61  * i386 and amd64 don't require aligned 4-byte loads.  The symbol
  62  * _MD5_CHECK_ALIGNMENT indicates below whether the MD5Transform function
  63  * requires alignment checking.
  64  */
  65 #define _MD5_CHECK_ALIGNMENT
  66 #endif /* !__i386 && !__amd64 */
  67 
  68 #define LOAD_LITTLE_32(addr)    (*(uint32_t *)(addr))
  69 
  70 #else   /* !_LITTLE_ENDIAN */
  71 
/*
 * sparc v9/v8plus optimization:
 *
 * On the sparc v9/v8plus, we can load data little endian.  However, since
 * the compiler doesn't have direct support for little endian, we
 * link to an assembly-language routine `load_little_32' to do
 * the magic.  Note that special care must be taken to ensure the
 * address is 32-bit aligned -- in the interest of speed, we don't
 * check to make sure, since careful programming can guarantee this
 * for us.
 */
  83 #if defined(sun4u)
  84 
  85 /* Define alignment check because we can 4-byte load as little endian. */
  86 #define _MD5_CHECK_ALIGNMENT
  87 #define LOAD_LITTLE_32(addr)    load_little_32((uint32_t *)(addr))
  88 
  89 #if !defined(__lint) && defined(__GNUC__)
  90 
  91 static __inline__ uint32_t
  92 load_little_32(uint32_t *addr)
  93 {
  94         uint32_t value;
  95 
  96         __asm__(
  97             "lduwa      [%1] %2, %0\n\t"
  98         : "=r" (value)
  99         : "r" (addr), "i" (ASI_PL));
 100 
 101         return (value);
 102 }
 103 
 104 static __inline__ uint16_t
 105 load_little_16(uint16_t *addr)
 106 {
 107         uint16_t value;
 108 
 109         __asm__(
 110             "lduha      [%1] %2, %0\n\t"
 111         : "=r" (value)
 112         : "r" (addr), "i" (ASI_PL));
 113 
 114         return (value);
 115 }
 116 
 117 #endif  /* !__lint && __GNUC__ */
 118 
 119 #if !defined(__GNUC__)
 120 extern  uint32_t load_little_32(uint32_t *);
 121 #endif  /* !__GNUC__ */
 122 
 123 /* Placate lint */
 124 #if defined(__lint)
 125 uint32_t
 126 load_little_32(uint32_t *addr)
 127 {
 128         return (*addr);
 129 }
 130 #endif  /* __lint */
 131 
 132 #else   /* !sun4u */
 133 
 134 /* big endian -- will work on little endian, but slowly */
 135 /* Since we do byte operations, we don't have to check for alignment. */
 136 #define LOAD_LITTLE_32(addr)    \
 137         ((addr)[0] | ((addr)[1] << 8) | ((addr)[2] << 16) | ((addr)[3] << 24))
 138 
 139 #endif  /* sun4u */
 140 
 141 #if defined(sun4v)
 142 
/*
 * For N1 we want to minimize the number of arithmetic operations.  This is
 * best achieved by using the %asi register to specify the ASI for the lduwa
 * operations.  There is also a separate inline template for each word, so
 * that the immediate offset in lduwa can be used without relying on the
 * compiler to do the right thing.
 *
 * Moving to 64-bit loads might also be beneficial.
 */
 152 #define LOAD_LITTLE_32_0(addr)  load_little_32_0((uint32_t *)(addr))
 153 #define LOAD_LITTLE_32_1(addr)  load_little_32_1((uint32_t *)(addr))
 154 #define LOAD_LITTLE_32_2(addr)  load_little_32_2((uint32_t *)(addr))
 155 #define LOAD_LITTLE_32_3(addr)  load_little_32_3((uint32_t *)(addr))
 156 #define LOAD_LITTLE_32_4(addr)  load_little_32_4((uint32_t *)(addr))
 157 #define LOAD_LITTLE_32_5(addr)  load_little_32_5((uint32_t *)(addr))
 158 #define LOAD_LITTLE_32_6(addr)  load_little_32_6((uint32_t *)(addr))
 159 #define LOAD_LITTLE_32_7(addr)  load_little_32_7((uint32_t *)(addr))
 160 #define LOAD_LITTLE_32_8(addr)  load_little_32_8((uint32_t *)(addr))
 161 #define LOAD_LITTLE_32_9(addr)  load_little_32_9((uint32_t *)(addr))
 162 #define LOAD_LITTLE_32_a(addr)  load_little_32_a((uint32_t *)(addr))
 163 #define LOAD_LITTLE_32_b(addr)  load_little_32_b((uint32_t *)(addr))
 164 #define LOAD_LITTLE_32_c(addr)  load_little_32_c((uint32_t *)(addr))
 165 #define LOAD_LITTLE_32_d(addr)  load_little_32_d((uint32_t *)(addr))
 166 #define LOAD_LITTLE_32_e(addr)  load_little_32_e((uint32_t *)(addr))
 167 #define LOAD_LITTLE_32_f(addr)  load_little_32_f((uint32_t *)(addr))
 168 
 169 #if !defined(__lint) && defined(__GNUC__)
 170 
 171 /*
 172  * This actually sets the ASI register, not necessarily to ASI_PL.
 173  */
 174 static __inline__ void
 175 set_little(uint8_t asi)
 176 {
 177         __asm__ __volatile__(
 178                 "wr     %%g0, %0, %%asi\n\t"
 179         : /* Nothing */
 180         : "r" (asi));
 181 }
 182 
 183 static __inline__ uint8_t
 184 get_little(void)
 185 {
 186         uint8_t asi;
 187 
 188         __asm__ __volatile__(
 189                 "rd     %%asi, %0\n\t"
 190         : "=r" (asi));
 191 
 192         return (asi);
 193 }
 194 
 195 /*
 196  * We have 16 functions which differ only in the offset from which they
 197  * load.  Use this preprocessor template to simplify maintenance.  Its
 198  * argument is the offset in hex, without the 0x.
 199  */
 200 #define LL_TEMPLATE(__off)                      \
 201 static __inline__ uint32_t                      \
 202 load_little_32_##__off(uint32_t *addr)          \
 203 {                                               \
 204         uint32_t value;                         \
 205         __asm__(                                \
 206                 "lduwa  [%1 + %2]%%asi, %0\n\t" \
 207         : "=r" (value)                          \
 208         : "r" (addr), "i" ((0x##__off) << 2));    \
 209         return (value);                         \
 210 }
 211 
 212 LL_TEMPLATE(0)
 213 LL_TEMPLATE(1)
 214 LL_TEMPLATE(2)
 215 LL_TEMPLATE(3)
 216 LL_TEMPLATE(4)
 217 LL_TEMPLATE(5)
 218 LL_TEMPLATE(6)
 219 LL_TEMPLATE(7)
 220 LL_TEMPLATE(8)
 221 LL_TEMPLATE(9)
 222 LL_TEMPLATE(a)
 223 LL_TEMPLATE(b)
 224 LL_TEMPLATE(c)
 225 LL_TEMPLATE(d)
 226 LL_TEMPLATE(e)
 227 LL_TEMPLATE(f)
 228 #undef  LL_TEMPLATE
 229 
 230 #endif  /* !__lint && __GNUC__ */
 231 
 232 #if !defined(__GNUC__)
/*
 * Using the %asi register to achieve little endian loads - the register
 * is set using an inline template.
 *
 * This saves a few arithmetic ops, because an immediate offset can now be
 * used with the lduwa instructions.
 */
 240 extern void set_little(uint32_t);
 241 extern uint32_t get_little(void);
 242 
 243 extern  uint32_t load_little_32_0(uint32_t *);
 244 extern  uint32_t load_little_32_1(uint32_t *);
 245 extern  uint32_t load_little_32_2(uint32_t *);
 246 extern  uint32_t load_little_32_3(uint32_t *);
 247 extern  uint32_t load_little_32_4(uint32_t *);
 248 extern  uint32_t load_little_32_5(uint32_t *);
 249 extern  uint32_t load_little_32_6(uint32_t *);
 250 extern  uint32_t load_little_32_7(uint32_t *);
 251 extern  uint32_t load_little_32_8(uint32_t *);
 252 extern  uint32_t load_little_32_9(uint32_t *);
 253 extern  uint32_t load_little_32_a(uint32_t *);
 254 extern  uint32_t load_little_32_b(uint32_t *);
 255 extern  uint32_t load_little_32_c(uint32_t *);
 256 extern  uint32_t load_little_32_d(uint32_t *);
 257 extern  uint32_t load_little_32_e(uint32_t *);
 258 extern  uint32_t load_little_32_f(uint32_t *);
 259 #endif  /* !__GNUC__ */
 260 #endif  /* sun4v */
 261 
 262 #endif  /* _LITTLE_ENDIAN */
 263 
 264 #ifdef  __cplusplus
 265 }
 266 #endif
 267 
 268 #endif  /* !_MD5_BYTESWAP_H */