Mercurial > vba-linux
diff src/SFMT/SFMT-sse2.h @ 1:f9f4f1b99eed
importing src directory
author | Robert McIntyre <rlm@mit.edu> |
---|---|
date | Sat, 03 Mar 2012 10:31:27 -0600 |
parents | |
children |
line wrap: on
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/SFMT/SFMT-sse2.h Sat Mar 03 10:31:27 2012 -0600 1.3 @@ -0,0 +1,121 @@ 1.4 +/** 1.5 + * @file SFMT-sse2.h 1.6 + * @brief SIMD oriented Fast Mersenne Twister(SFMT) for Intel SSE2 1.7 + * 1.8 + * @author Mutsuo Saito (Hiroshima University) 1.9 + * @author Makoto Matsumoto (Hiroshima University) 1.10 + * 1.11 + * @note We assume LITTLE ENDIAN in this file 1.12 + * 1.13 + * Copyright (C) 2006, 2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima 1.14 + * University. All rights reserved. 1.15 + * 1.16 + * The new BSD License is applied to this software, see LICENSE.txt 1.17 + */ 1.18 + 1.19 +#ifndef SFMT_SSE2_H 1.20 +#define SFMT_SSE2_H 1.21 + 1.22 +PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b, __m128i c, 1.23 + __m128i d, __m128i mask) ALWAYSINLINE; 1.24 + 1.25 +/** 1.26 + * This function represents the recursion formula. 1.27 + * @param a a 128-bit part of the interal state array 1.28 + * @param b a 128-bit part of the interal state array 1.29 + * @param c a 128-bit part of the interal state array 1.30 + * @param d a 128-bit part of the interal state array 1.31 + * @param mask 128-bit mask 1.32 + * @return output 1.33 + */ 1.34 +PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b, 1.35 + __m128i c, __m128i d, __m128i mask) { 1.36 + __m128i v, x, y, z; 1.37 + 1.38 + x = _mm_load_si128(a); 1.39 + y = _mm_srli_epi32(*b, SR1); 1.40 + z = _mm_srli_si128(c, SR2); 1.41 + v = _mm_slli_epi32(d, SL1); 1.42 + z = _mm_xor_si128(z, x); 1.43 + z = _mm_xor_si128(z, v); 1.44 + x = _mm_slli_si128(x, SL2); 1.45 + y = _mm_and_si128(y, mask); 1.46 + z = _mm_xor_si128(z, x); 1.47 + z = _mm_xor_si128(z, y); 1.48 + return z; 1.49 +} 1.50 + 1.51 +/** 1.52 + * This function fills the internal state array with pseudorandom 1.53 + * integers. 1.54 + */ 1.55 +inline static void gen_rand_all(void) { 1.56 + int i; 1.57 + __m128i r, r1, r2, mask; 1.58 + mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1); 1.59 + 1.60 + r1 = _mm_load_si128(&sfmt[N - 2].si); 1.61 + r2 = _mm_load_si128(&sfmt[N - 1].si); 1.62 + for (i = 0; i < N - POS1; i++) { 1.63 + r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask); 1.64 + _mm_store_si128(&sfmt[i].si, r); 1.65 + r1 = r2; 1.66 + r2 = r; 1.67 + } 1.68 + for (; i < N; i++) { 1.69 + r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1 - N].si, r1, r2, mask); 1.70 + _mm_store_si128(&sfmt[i].si, r); 1.71 + r1 = r2; 1.72 + r2 = r; 1.73 + } 1.74 +} 1.75 + 1.76 +/** 1.77 + * This function fills the user-specified array with pseudorandom 1.78 + * integers. 1.79 + * 1.80 + * @param array an 128-bit array to be filled by pseudorandom numbers. 1.81 + * @param size number of 128-bit pesudorandom numbers to be generated. 1.82 + */ 1.83 +inline static void gen_rand_array(w128_t *array, int size) { 1.84 + int i, j; 1.85 + __m128i r, r1, r2, mask; 1.86 + mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1); 1.87 + 1.88 + r1 = _mm_load_si128(&sfmt[N - 2].si); 1.89 + r2 = _mm_load_si128(&sfmt[N - 1].si); 1.90 + for (i = 0; i < N - POS1; i++) { 1.91 + r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask); 1.92 + _mm_store_si128(&array[i].si, r); 1.93 + r1 = r2; 1.94 + r2 = r; 1.95 + } 1.96 + for (; i < N; i++) { 1.97 + r = mm_recursion(&sfmt[i].si, &array[i + POS1 - N].si, r1, r2, mask); 1.98 + _mm_store_si128(&array[i].si, r); 1.99 + r1 = r2; 1.100 + r2 = r; 1.101 + } 1.102 + /* main loop */ 1.103 + for (; i < size - N; i++) { 1.104 + r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2, 1.105 + mask); 1.106 + _mm_store_si128(&array[i].si, r); 1.107 + r1 = r2; 1.108 + r2 = r; 1.109 + } 1.110 + for (j = 0; j < 2 * N - size; j++) { 1.111 + r = _mm_load_si128(&array[j + size - N].si); 1.112 + _mm_store_si128(&sfmt[j].si, r); 1.113 + } 1.114 + for (; i < size; i++) { 1.115 + r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2, 1.116 + mask); 1.117 + _mm_store_si128(&array[i].si, r); 1.118 + _mm_store_si128(&sfmt[j++].si, r); 1.119 + r1 = r2; 1.120 + r2 = r; 1.121 + } 1.122 +} 1.123 + 1.124 +#endif