diff src/SFMT/SFMT-sse2.h @ 23:bf9169ad4222

add SMID-oriented fast mersenne twister
author Robert McIntyre <rlm@mit.edu>
date Sun, 04 Mar 2012 17:38:32 -0600
parents f9f4f1b99eed
children
line wrap: on
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/SFMT/SFMT-sse2.h	Sun Mar 04 17:38:32 2012 -0600
     1.3 @@ -0,0 +1,121 @@
     1.4 +/** 
     1.5 + * @file  SFMT-sse2.h
     1.6 + * @brief SIMD oriented Fast Mersenne Twister(SFMT) for Intel SSE2
     1.7 + *
     1.8 + * @author Mutsuo Saito (Hiroshima University)
     1.9 + * @author Makoto Matsumoto (Hiroshima University)
    1.10 + *
    1.11 + * @note We assume LITTLE ENDIAN in this file
    1.12 + *
    1.13 + * Copyright (C) 2006, 2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
    1.14 + * University. All rights reserved.
    1.15 + *
    1.16 + * The new BSD License is applied to this software, see LICENSE.txt
    1.17 + */
    1.18 +
    1.19 +#ifndef SFMT_SSE2_H
    1.20 +#define SFMT_SSE2_H
    1.21 +
    1.22 +PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b, __m128i c,
    1.23 +				   __m128i d, __m128i mask) ALWAYSINLINE;
    1.24 +
    1.25 +/**
    1.26 + * This function represents the recursion formula.
    1.27 + * @param a a 128-bit part of the interal state array
    1.28 + * @param b a 128-bit part of the interal state array
    1.29 + * @param c a 128-bit part of the interal state array
    1.30 + * @param d a 128-bit part of the interal state array
    1.31 + * @param mask 128-bit mask
    1.32 + * @return output
    1.33 + */
    1.34 +PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b, 
    1.35 +				   __m128i c, __m128i d, __m128i mask) {
    1.36 +    __m128i v, x, y, z;
    1.37 +    
    1.38 +    x = _mm_load_si128(a);
    1.39 +    y = _mm_srli_epi32(*b, SR1);
    1.40 +    z = _mm_srli_si128(c, SR2);
    1.41 +    v = _mm_slli_epi32(d, SL1);
    1.42 +    z = _mm_xor_si128(z, x);
    1.43 +    z = _mm_xor_si128(z, v);
    1.44 +    x = _mm_slli_si128(x, SL2);
    1.45 +    y = _mm_and_si128(y, mask);
    1.46 +    z = _mm_xor_si128(z, x);
    1.47 +    z = _mm_xor_si128(z, y);
    1.48 +    return z;
    1.49 +}
    1.50 +
    1.51 +/**
    1.52 + * This function fills the internal state array with pseudorandom
    1.53 + * integers.
    1.54 + */
    1.55 +inline static void gen_rand_all(void) {
    1.56 +    int i;
    1.57 +    __m128i r, r1, r2, mask;
    1.58 +    mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
    1.59 +
    1.60 +    r1 = _mm_load_si128(&sfmt[N - 2].si);
    1.61 +    r2 = _mm_load_si128(&sfmt[N - 1].si);
    1.62 +    for (i = 0; i < N - POS1; i++) {
    1.63 +	r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask);
    1.64 +	_mm_store_si128(&sfmt[i].si, r);
    1.65 +	r1 = r2;
    1.66 +	r2 = r;
    1.67 +    }
    1.68 +    for (; i < N; i++) {
    1.69 +	r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1 - N].si, r1, r2, mask);
    1.70 +	_mm_store_si128(&sfmt[i].si, r);
    1.71 +	r1 = r2;
    1.72 +	r2 = r;
    1.73 +    }
    1.74 +}
    1.75 +
    1.76 +/**
    1.77 + * This function fills the user-specified array with pseudorandom
    1.78 + * integers.
    1.79 + *
    1.80 + * @param array an 128-bit array to be filled by pseudorandom numbers.  
    1.81 + * @param size number of 128-bit pesudorandom numbers to be generated.
    1.82 + */
    1.83 +inline static void gen_rand_array(w128_t *array, int size) {
    1.84 +    int i, j;
    1.85 +    __m128i r, r1, r2, mask;
    1.86 +    mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
    1.87 +
    1.88 +    r1 = _mm_load_si128(&sfmt[N - 2].si);
    1.89 +    r2 = _mm_load_si128(&sfmt[N - 1].si);
    1.90 +    for (i = 0; i < N - POS1; i++) {
    1.91 +	r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask);
    1.92 +	_mm_store_si128(&array[i].si, r);
    1.93 +	r1 = r2;
    1.94 +	r2 = r;
    1.95 +    }
    1.96 +    for (; i < N; i++) {
    1.97 +	r = mm_recursion(&sfmt[i].si, &array[i + POS1 - N].si, r1, r2, mask);
    1.98 +	_mm_store_si128(&array[i].si, r);
    1.99 +	r1 = r2;
   1.100 +	r2 = r;
   1.101 +    }
   1.102 +    /* main loop */
   1.103 +    for (; i < size - N; i++) {
   1.104 +	r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2,
   1.105 +			 mask);
   1.106 +	_mm_store_si128(&array[i].si, r);
   1.107 +	r1 = r2;
   1.108 +	r2 = r;
   1.109 +    }
   1.110 +    for (j = 0; j < 2 * N - size; j++) {
   1.111 +	r = _mm_load_si128(&array[j + size - N].si);
   1.112 +	_mm_store_si128(&sfmt[j].si, r);
   1.113 +    }
   1.114 +    for (; i < size; i++) {
   1.115 +	r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2,
   1.116 +			 mask);
   1.117 +	_mm_store_si128(&array[i].si, r);
   1.118 +	_mm_store_si128(&sfmt[j++].si, r);
   1.119 +	r1 = r2;
   1.120 +	r2 = r;
   1.121 +    }
   1.122 +}
   1.123 +
   1.124 +#endif