rlm@1
|
1 /**
|
rlm@1
|
2 * @file SFMT-sse2.h
|
rlm@1
|
3 * @brief SIMD oriented Fast Mersenne Twister(SFMT) for Intel SSE2
|
rlm@1
|
4 *
|
rlm@1
|
5 * @author Mutsuo Saito (Hiroshima University)
|
rlm@1
|
6 * @author Makoto Matsumoto (Hiroshima University)
|
rlm@1
|
7 *
|
rlm@1
|
8 * @note We assume LITTLE ENDIAN in this file
|
rlm@1
|
9 *
|
rlm@1
|
10 * Copyright (C) 2006, 2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
|
rlm@1
|
11 * University. All rights reserved.
|
rlm@1
|
12 *
|
rlm@1
|
13 * The new BSD License is applied to this software, see LICENSE.txt
|
rlm@1
|
14 */
|
rlm@1
|
15
|
rlm@1
|
16 #ifndef SFMT_SSE2_H
|
rlm@1
|
17 #define SFMT_SSE2_H
|
rlm@1
|
18
|
rlm@1
|
19 PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b, __m128i c,
|
rlm@1
|
20 __m128i d, __m128i mask) ALWAYSINLINE;
|
rlm@1
|
21
|
rlm@1
|
22 /**
|
rlm@1
|
23 * This function represents the recursion formula.
|
rlm@1
|
24 * @param a a 128-bit part of the interal state array
|
rlm@1
|
25 * @param b a 128-bit part of the interal state array
|
rlm@1
|
26 * @param c a 128-bit part of the interal state array
|
rlm@1
|
27 * @param d a 128-bit part of the interal state array
|
rlm@1
|
28 * @param mask 128-bit mask
|
rlm@1
|
29 * @return output
|
rlm@1
|
30 */
|
rlm@1
|
31 PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b,
|
rlm@1
|
32 __m128i c, __m128i d, __m128i mask) {
|
rlm@1
|
33 __m128i v, x, y, z;
|
rlm@1
|
34
|
rlm@1
|
35 x = _mm_load_si128(a);
|
rlm@1
|
36 y = _mm_srli_epi32(*b, SR1);
|
rlm@1
|
37 z = _mm_srli_si128(c, SR2);
|
rlm@1
|
38 v = _mm_slli_epi32(d, SL1);
|
rlm@1
|
39 z = _mm_xor_si128(z, x);
|
rlm@1
|
40 z = _mm_xor_si128(z, v);
|
rlm@1
|
41 x = _mm_slli_si128(x, SL2);
|
rlm@1
|
42 y = _mm_and_si128(y, mask);
|
rlm@1
|
43 z = _mm_xor_si128(z, x);
|
rlm@1
|
44 z = _mm_xor_si128(z, y);
|
rlm@1
|
45 return z;
|
rlm@1
|
46 }
|
rlm@1
|
47
|
rlm@1
|
48 /**
|
rlm@1
|
49 * This function fills the internal state array with pseudorandom
|
rlm@1
|
50 * integers.
|
rlm@1
|
51 */
|
rlm@1
|
52 inline static void gen_rand_all(void) {
|
rlm@1
|
53 int i;
|
rlm@1
|
54 __m128i r, r1, r2, mask;
|
rlm@1
|
55 mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
|
rlm@1
|
56
|
rlm@1
|
57 r1 = _mm_load_si128(&sfmt[N - 2].si);
|
rlm@1
|
58 r2 = _mm_load_si128(&sfmt[N - 1].si);
|
rlm@1
|
59 for (i = 0; i < N - POS1; i++) {
|
rlm@1
|
60 r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask);
|
rlm@1
|
61 _mm_store_si128(&sfmt[i].si, r);
|
rlm@1
|
62 r1 = r2;
|
rlm@1
|
63 r2 = r;
|
rlm@1
|
64 }
|
rlm@1
|
65 for (; i < N; i++) {
|
rlm@1
|
66 r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1 - N].si, r1, r2, mask);
|
rlm@1
|
67 _mm_store_si128(&sfmt[i].si, r);
|
rlm@1
|
68 r1 = r2;
|
rlm@1
|
69 r2 = r;
|
rlm@1
|
70 }
|
rlm@1
|
71 }
|
rlm@1
|
72
|
rlm@1
|
73 /**
|
rlm@1
|
74 * This function fills the user-specified array with pseudorandom
|
rlm@1
|
75 * integers.
|
rlm@1
|
76 *
|
rlm@1
|
77 * @param array an 128-bit array to be filled by pseudorandom numbers.
|
rlm@1
|
78 * @param size number of 128-bit pesudorandom numbers to be generated.
|
rlm@1
|
79 */
|
rlm@1
|
80 inline static void gen_rand_array(w128_t *array, int size) {
|
rlm@1
|
81 int i, j;
|
rlm@1
|
82 __m128i r, r1, r2, mask;
|
rlm@1
|
83 mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
|
rlm@1
|
84
|
rlm@1
|
85 r1 = _mm_load_si128(&sfmt[N - 2].si);
|
rlm@1
|
86 r2 = _mm_load_si128(&sfmt[N - 1].si);
|
rlm@1
|
87 for (i = 0; i < N - POS1; i++) {
|
rlm@1
|
88 r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask);
|
rlm@1
|
89 _mm_store_si128(&array[i].si, r);
|
rlm@1
|
90 r1 = r2;
|
rlm@1
|
91 r2 = r;
|
rlm@1
|
92 }
|
rlm@1
|
93 for (; i < N; i++) {
|
rlm@1
|
94 r = mm_recursion(&sfmt[i].si, &array[i + POS1 - N].si, r1, r2, mask);
|
rlm@1
|
95 _mm_store_si128(&array[i].si, r);
|
rlm@1
|
96 r1 = r2;
|
rlm@1
|
97 r2 = r;
|
rlm@1
|
98 }
|
rlm@1
|
99 /* main loop */
|
rlm@1
|
100 for (; i < size - N; i++) {
|
rlm@1
|
101 r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2,
|
rlm@1
|
102 mask);
|
rlm@1
|
103 _mm_store_si128(&array[i].si, r);
|
rlm@1
|
104 r1 = r2;
|
rlm@1
|
105 r2 = r;
|
rlm@1
|
106 }
|
rlm@1
|
107 for (j = 0; j < 2 * N - size; j++) {
|
rlm@1
|
108 r = _mm_load_si128(&array[j + size - N].si);
|
rlm@1
|
109 _mm_store_si128(&sfmt[j].si, r);
|
rlm@1
|
110 }
|
rlm@1
|
111 for (; i < size; i++) {
|
rlm@1
|
112 r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2,
|
rlm@1
|
113 mask);
|
rlm@1
|
114 _mm_store_si128(&array[i].si, r);
|
rlm@1
|
115 _mm_store_si128(&sfmt[j++].si, r);
|
rlm@1
|
116 r1 = r2;
|
rlm@1
|
117 r2 = r;
|
rlm@1
|
118 }
|
rlm@1
|
119 }
|
rlm@1
|
120
|
rlm@1
|
121 #endif
|