SIMD(SSE2)로 unsigned char 배열 요소 값의 합과 제곱의 합을 구하는 코드

/*
 * Returns the sum of the n bytes starting at a.
 *
 * Full 16-byte chunks are processed with SSE2; any remaining bytes are
 * added by a scalar loop. The buffer does not have to be 16-byte aligned.
 * Accumulation is done in 32-bit integers, so the true total must fit in
 * an int for the result to be meaningful. For n <= 0 the result is 0.
 */
int sum_array(const unsigned char* a, int n)
{
    const __m128i vk0 = _mm_set1_epi8(0);   // zeros: high halves when widening 8 -> 16 bit
    const __m128i vk1 = _mm_set1_epi16(1);  // ones: turns _mm_madd_epi16 into a pairwise add
    __m128i vsum = _mm_set1_epi32(0);       // four 32-bit partial sums
    int sum;
    int i = 0;

    // "n - i >= 16" keeps the last full chunk in the vector loop and cannot
    // overflow, unlike "i < n - 16" for very negative n.
    for (; n - i >= 16; i += 16) {
        // Unaligned load: callers may pass a pointer with any alignment.
        const __m128i v = _mm_loadu_si128((const __m128i *)&a[i]);

        // Zero-extend the 16 bytes into two vectors of eight 16-bit lanes.
        const __m128i vl = _mm_unpacklo_epi8(v, vk0);
        const __m128i vh = _mm_unpackhi_epi8(v, vk0);

        // madd with 1s adds adjacent 16-bit lanes into 32-bit lanes.
        vsum = _mm_add_epi32(vsum, _mm_madd_epi16(vl, vk1));
        vsum = _mm_add_epi32(vsum, _mm_madd_epi16(vh, vk1));
    }

    // Horizontal add of the four 32-bit partial sums into lane 0.
    vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
    vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
    sum = _mm_cvtsi128_si32(vsum);

    // Scalar tail for the final (n % 16) bytes.
    for (; i < n; i++) {
        sum += a[i];
    }
    return sum;
}

/*
 * Returns the sum of the squares of the n bytes starting at a.
 *
 * Full 16-byte chunks are processed with SSE2; any remaining bytes are
 * handled by a scalar loop. The buffer does not have to be 16-byte aligned.
 * Accumulation is done in 32-bit integers, so the true total must fit in
 * an int for the result to be meaningful. For n <= 0 the result is 0.
 */
int sum_square_array(const unsigned char* a, int n)
{
    const __m128i vk0 = _mm_set1_epi8(0);   // zeros: high halves when widening 8 -> 16 bit
    __m128i vsum = _mm_set1_epi32(0);       // four 32-bit partial sums
    int sum;
    int i = 0;

    // "n - i >= 16" keeps the last full chunk in the vector loop and cannot
    // overflow, unlike "i < n - 16" for very negative n.
    for (; n - i >= 16; i += 16) {
        // Unaligned load: callers may pass a pointer with any alignment.
        const __m128i v = _mm_loadu_si128((const __m128i *)&a[i]);

        // Zero-extend the 16 bytes into two vectors of eight 16-bit lanes.
        const __m128i vl = _mm_unpacklo_epi8(v, vk0);
        const __m128i vh = _mm_unpackhi_epi8(v, vk0);

        // madd of a vector with itself squares each 16-bit lane and adds
        // adjacent products into 32-bit lanes (at most 2 * 255 * 255 each).
        vsum = _mm_add_epi32(vsum, _mm_madd_epi16(vl, vl));
        vsum = _mm_add_epi32(vsum, _mm_madd_epi16(vh, vh));
    }

    // Horizontal add of the four 32-bit partial sums into lane 0.
    vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
    vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
    sum = _mm_cvtsi128_si32(vsum);

    // Scalar tail for the final (n % 16) bytes.
    for (; i < n; i++) {
        sum += (a[i] * a[i]);
    }
    return sum;
}

덧글

  • Fan 2012/11/13 17:40 # 삭제 답글

    FMTTM님 쩌시네요
    힘내세요
  • question 2017/02/15 03:51 # 삭제 답글

    좋은글 잘 보았습니다!
    질문이 한가지 있는데 혹시 n은 무엇을 나타내는지 알수 있을까요?
    예를 들어 이미지를 받아와서 배열로 저장했을 때 __m128i에 64비트씩 저장하려고 하는데 n이 무엇인지 제대로 확인이 안됩니다 ㅠㅠ
댓글 입력 영역


Google Analysis