diff options
author | Tom Rondeau <tom@trondeau.com> | 2014-03-06 13:30:23 -0500 |
---|---|---|
committer | Tom Rondeau <tom@trondeau.com> | 2014-03-06 14:47:26 -0500 |
commit | 0b5950d3297212264777b6502a232d9e8458636e (patch) | |
tree | 8a8dde43321b3f6a9c15997a6a24dd2f4e2fbc3c | |
parent | 2308ba9568cf3392530345d61288580b4997adc7 (diff) |
volk: adds unaligned sse 32fc_32f_dot_prod_32fc, 16i_32fc_dot_prod_32fc, and 32f_x2_dot_prod_16i proto-kernels.
-rw-r--r-- | volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h | 90 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_dot_prod_16i.h | 75 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h | 76 |
3 files changed, 234 insertions, 7 deletions
diff --git a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
index 8bc1569f61..8c66892fd9 100644
--- a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
-#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
 
 #include <volk/volk_common.h>
 #include<stdio.h>
@@ -37,6 +37,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const
 
 #endif /*LV_HAVE_GENERIC*/
 
+
+#if LV_HAVE_SSE && LV_HAVE_MMX
+/* Unaligned SSE/MMX kernel: *result = sum over num_points of input[i] * taps[i] (16-bit samples times float-pair taps). */
+static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+  /* The vector loop below consumes 8 points per pass; the scalar loop at the end handles the remaining num_points % 8. */
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const short* aPtr = input;
+  const float* bPtr = (const float*)taps; /* taps are read as consecutive float pairs: first into realpt, second into imagpt */
+
+  __m64 m0, m1;
+  __m128 f0, f1, f2, f3;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < eighthPoints; number++){
+    /* Convert 8 shorts to float, then duplicate each one (unpack) so it multiplies both floats of its tap. */
+    m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+    m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+    f0 = _mm_cvtpi16_ps(m0);
+    f1 = _mm_cvtpi16_ps(m0);
+    f2 = _mm_cvtpi16_ps(m1);
+    f3 = _mm_cvtpi16_ps(m1);
+
+    a0Val = _mm_unpacklo_ps(f0, f1);
+    a1Val = _mm_unpackhi_ps(f0, f1);
+    a2Val = _mm_unpacklo_ps(f2, f3);
+    a3Val = _mm_unpackhi_ps(f2, f3);
+
+    b0Val = _mm_loadu_ps(bPtr);
+    b1Val = _mm_loadu_ps(bPtr+4);
+    b2Val = _mm_loadu_ps(bPtr+8);
+    b3Val = _mm_loadu_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 8;
+    bPtr += 16;
+  }
+  _mm_empty(); /* m0/m1 used the MMX registers; clear that state before the scalar float code below */
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+
+  number = eighthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr) * (*bPtr++));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+
+
+
+
 #if LV_HAVE_SSE && LV_HAVE_MMX
 
 
@@ -119,4 +203,4 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const
 
 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
 
-#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
index 8fcc7deaed..b637f17777 100644
--- a/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H
-#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_H
 
 #include <volk/volk_common.h>
 #include<stdio.h>
@@ -27,7 +27,6 @@ static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float
 
 
 #ifdef LV_HAVE_SSE
-
 static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
 
   unsigned int number = 0;
@@ -90,9 +89,77 @@ static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float*
   }
 
   *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_SSE
+/* Unaligned SSE kernel: *result = (short) of the sum over num_points of input[i] * taps[i]. */
+static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+  /* 16 floats per vector pass (four 4-wide accumulators); the remaining num_points % 16 go through the scalar loop at the end. */
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+    /* _mm_loadu_ps is the unaligned load, so neither buffer is required to be 16-byte aligned. */
+    a0Val = _mm_loadu_ps(aPtr);
+    a1Val = _mm_loadu_ps(aPtr+4);
+    a2Val = _mm_loadu_ps(aPtr+8);
+    a3Val = _mm_loadu_ps(aPtr+12);
+    b0Val = _mm_loadu_ps(bPtr);
+    b1Val = _mm_loadu_ps(bPtr+4);
+    b2Val = _mm_loadu_ps(bPtr+8);
+    b3Val = _mm_loadu_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = (short)dotProduct;
 
 }
 
 #endif /*LV_HAVE_SSE*/
 
-#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
diff --git a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
index 8341129fef..f567ede516 100644
--- a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -284,5 +284,81 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const l
 
 #endif /*LV_HAVE_AVX*/
 
+#ifdef LV_HAVE_SSE
+/* Unaligned SSE kernel: *result = sum over num_points of input[i] * taps[i] (float-pair samples times real float taps). */
+static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+  /* The vector loop below consumes 8 points per pass; the scalar loop at the end handles the remaining num_points % 8. */
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const float* aPtr = (const float*)input; /* input is read as consecutive float pairs: first into realpt, second into imagpt */
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 x0Val, x1Val, x2Val, x3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < eighthPoints; number++){
+
+    a0Val = _mm_loadu_ps(aPtr);
+    a1Val = _mm_loadu_ps(aPtr+4);
+    a2Val = _mm_loadu_ps(aPtr+8);
+    a3Val = _mm_loadu_ps(aPtr+12);
+    /* Load 4 taps at a time and duplicate each one (unpack) so it multiplies both floats of its sample. */
+    x0Val = _mm_loadu_ps(bPtr);
+    x1Val = _mm_loadu_ps(bPtr);
+    x2Val = _mm_loadu_ps(bPtr+4);
+    x3Val = _mm_loadu_ps(bPtr+4);
+    b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+    b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+    b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+    b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 8;
+  }
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+
+  number = eighthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr++) * (*bPtr));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
 
 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/