wys的个人博客

你有很多事放不下?做人要潇洒一点~

0%

cs61c_lab9

cs61c_lab9

Exercise 1 - Familiarize Yourself with the SIMD Functions

  1. __m128 _mm_div_ps (__m128 a, __m128 b)

  2. __m128i _mm_max_epi8 (__m128i a, __m128i b)

  3. __m128i _mm_sra_epi16 (__m128i a, __m128i count)

Exercise 2 - Writing SIMD Code

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/*
 * Sums every element of vals that is >= 128 using SSE2 intrinsics, repeats
 * that OUTER_ITERATIONS times, and returns the accumulated total. The elapsed
 * clock() time is printed to stdout before returning.
 *
 * vals: array of NUM_ELEMS unsigned 32-bit values (read only).
 *
 * Elements are compared as unsigned values and partial sums are kept in
 * 64-bit lanes, so the vector path agrees with the scalar tail
 * (vals[i] >= 128) for every possible unsigned int input, including values
 * with the top bit set.
 */
long long int sum_simd(unsigned int vals[NUM_ELEMS]) {
    clock_t start = clock();
    __m128i _127 = _mm_set1_epi32(127); // This is a vector with 127s in it... Why might you need this?
    long long int result = 0; // This is where you should put your final result!
    /* DO NOT DO NOT DO NOT DO NOT WRITE ANYTHING ABOVE THIS LINE. */

    const __m128i zero = _mm_setzero_si128();
    /* Largest multiple of 4 not exceeding NUM_ELEMS: the part handled 4 lanes at a time. */
    const unsigned int vec_end = NUM_ELEMS / 4 * 4;

    for (unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
        /* YOUR CODE GOES HERE */
        /* Two accumulators of two 64-bit lanes each (elements 0-1 and 2-3 of every vector). */
        __m128i acc_lo = zero;
        __m128i acc_hi = zero;

        for (unsigned int i = 0; i < vec_end; i += 4) {
            __m128i v = _mm_loadu_si128((const __m128i *) &vals[i]);
            /*
             * SSE2 only has signed 32-bit compares, which would misclassify
             * values >= 2^31. Instead test the bits above the low seven:
             * (v & ~127) == 0 exactly when the unsigned value is <= 127.
             */
            __m128i is_small = _mm_cmpeq_epi32(_mm_andnot_si128(_127, v), zero);
            /* Zero out the lanes that are <= 127, keep the others unchanged. */
            __m128i kept = _mm_andnot_si128(is_small, v);
            /* Zero-extend each 32-bit lane to 64 bits so the running sums cannot wrap. */
            acc_lo = _mm_add_epi64(acc_lo, _mm_unpacklo_epi32(kept, zero));
            acc_hi = _mm_add_epi64(acc_hi, _mm_unpackhi_epi32(kept, zero));
        }

        /* Horizontal reduction of the four partial sums. */
        unsigned long long lanes[2] = {0, 0};
        _mm_storeu_si128((__m128i *) lanes, _mm_add_epi64(acc_lo, acc_hi));
        result += (long long) (lanes[0] + lanes[1]);

        /* You'll need a tail case. */
        for (unsigned int i = vec_end; i < NUM_ELEMS; i++) {
            if (vals[i] >= 128) {
                result += vals[i];
            }
        }
    }
    clock_t end = clock();
    printf("Time taken: %Lf s\n", (long double)(end - start) / CLOCKS_PER_SEC);
    return result;
}

Exercise 3 - Loop Unrolling

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/*
 * Adds the lanes of v whose unsigned value is >= 128 to the running 64-bit
 * sums: elements 0-1 go to *acc_lo, elements 2-3 go to *acc_hi.
 *
 * low7 must hold 127 in every 32-bit lane; the test relies on 127 being a
 * low-bit mask, so (v & ~low7) == 0 exactly when the lane is <= 127.
 */
static inline void sum_simd_unrolled_step(__m128i v, __m128i low7,
                                          __m128i *acc_lo, __m128i *acc_hi) {
    const __m128i zero = _mm_setzero_si128();
    /* SSE2 compares are signed; the bit test stays correct for values >= 2^31. */
    __m128i is_small = _mm_cmpeq_epi32(_mm_andnot_si128(low7, v), zero);
    __m128i kept = _mm_andnot_si128(is_small, v);
    /* Zero-extend to 64 bits before adding so the partial sums cannot wrap. */
    *acc_lo = _mm_add_epi64(*acc_lo, _mm_unpacklo_epi32(kept, zero));
    *acc_hi = _mm_add_epi64(*acc_hi, _mm_unpackhi_epi32(kept, zero));
}

/*
 * Unrolled SSE2 version of the ">= 128" sum: adds every element of vals that
 * is >= 128, repeats that OUTER_ITERATIONS times, and returns the accumulated
 * total. The elapsed clock() time is printed to stdout before returning.
 *
 * vals: array of NUM_ELEMS unsigned 32-bit values (read only).
 *
 * The main loop consumes 16 elements per iteration (four vectors); leftovers
 * are handled first four at a time and then one at a time. Elements are
 * compared as unsigned values and summed in 64-bit lanes, so the result
 * matches the scalar test (vals[i] >= 128) for every unsigned int input.
 */
long long int sum_simd_unrolled(unsigned int vals[NUM_ELEMS]) {
    clock_t start = clock();
    __m128i _127 = _mm_set1_epi32(127);
    long long int result = 0;

    /* Largest multiples of 16 and of 4 that do not exceed NUM_ELEMS. */
    const unsigned int unrolled_end = NUM_ELEMS / 16 * 16;
    const unsigned int vec_end = NUM_ELEMS / 4 * 4;

    for (unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
        /* COPY AND PASTE YOUR sum_simd() HERE */
        /* MODIFY IT BY UNROLLING IT */
        __m128i acc_lo = _mm_setzero_si128();
        __m128i acc_hi = _mm_setzero_si128();
        unsigned int i = 0;

        /* Unrolled body: four vectors (16 elements) per iteration. */
        for (; i < unrolled_end; i += 16) {
            sum_simd_unrolled_step(_mm_loadu_si128((const __m128i *) &vals[i]),
                                   _127, &acc_lo, &acc_hi);
            sum_simd_unrolled_step(_mm_loadu_si128((const __m128i *) &vals[i + 4]),
                                   _127, &acc_lo, &acc_hi);
            sum_simd_unrolled_step(_mm_loadu_si128((const __m128i *) &vals[i + 8]),
                                   _127, &acc_lo, &acc_hi);
            sum_simd_unrolled_step(_mm_loadu_si128((const __m128i *) &vals[i + 12]),
                                   _127, &acc_lo, &acc_hi);
        }

        /* You'll need 1 or maybe 2 tail cases here. */
        /* First tail: remaining whole vectors of four elements. */
        for (; i < vec_end; i += 4) {
            sum_simd_unrolled_step(_mm_loadu_si128((const __m128i *) &vals[i]),
                                   _127, &acc_lo, &acc_hi);
        }

        /* Horizontal reduction of the four partial sums. */
        unsigned long long lanes[2] = {0, 0};
        _mm_storeu_si128((__m128i *) lanes, _mm_add_epi64(acc_lo, acc_hi));
        result += (long long) (lanes[0] + lanes[1]);

        /* Second tail: the last NUM_ELEMS % 4 elements, one at a time. */
        for (; i < NUM_ELEMS; i++) {
            if (vals[i] >= 128) {
                result += vals[i];
            }
        }
    }
    clock_t end = clock();
    printf("Time taken: %Lf s\n", (long double)(end - start) / CLOCKS_PER_SEC);
    return result;
}