1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
| long long int sum_simd_unrolled(unsigned int vals[NUM_ELEMS]) { clock_t start = clock(); __m128i _127 = _mm_set1_epi32(127); long long int result = 0; for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) { __m128i result_itr = _mm_setzero_si128(); for (unsigned int i = 0; i < NUM_ELEMS / 16 * 16; i += 16) { __m128i vals_m128i = _mm_loadu_si128((__m128i *) &vals[i]); result_itr = _mm_add_epi32(result_itr ,_mm_and_si128(vals_m128i, _mm_cmpgt_epi32(vals_m128i, _127))); vals_m128i = _mm_loadu_si128((__m128i *) &vals[i + 4]); result_itr = _mm_add_epi32(result_itr ,_mm_and_si128(vals_m128i, _mm_cmpgt_epi32(vals_m128i, _127))); vals_m128i = _mm_loadu_si128((__m128i *) &vals[i + 8]); result_itr = _mm_add_epi32(result_itr ,_mm_and_si128(vals_m128i, _mm_cmpgt_epi32(vals_m128i, _127))); vals_m128i = _mm_loadu_si128((__m128i *) &vals[i + 12]); result_itr = _mm_add_epi32(result_itr ,_mm_and_si128(vals_m128i, _mm_cmpgt_epi32(vals_m128i, _127))); } unsigned int results[4] = {0,0,0,0}; _mm_storeu_si128((__m128i *)results, result_itr); for (int i = 0; i < 4; i ++) { result += results[i]; } for (int i = NUM_ELEMS / 16 * 16; i < NUM_ELEMS; i ++) { if(vals[i] >= 128) result += vals[i]; } } clock_t end = clock(); printf("Time taken: %Lf s\n", (long double)(end - start) / CLOCKS_PER_SEC); return result; }
|