GNU Radio 3.6.5 C++ API
|
00001 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H 00002 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H 00003 00004 #include <volk/volk_common.h> 00005 #include<stdio.h> 00006 00007 00008 #ifdef LV_HAVE_GENERIC 00009 00010 00011 static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { 00012 00013 static const int N_UNROLL = 4; 00014 00015 lv_32fc_t acc0 = 0; 00016 lv_32fc_t acc1 = 0; 00017 lv_32fc_t acc2 = 0; 00018 lv_32fc_t acc3 = 0; 00019 00020 unsigned i = 0; 00021 unsigned n = (num_points / N_UNROLL) * N_UNROLL; 00022 00023 for(i = 0; i < n; i += N_UNROLL) { 00024 acc0 += taps[i + 0] * (float)input[i + 0]; 00025 acc1 += taps[i + 1] * (float)input[i + 1]; 00026 acc2 += taps[i + 2] * (float)input[i + 2]; 00027 acc3 += taps[i + 3] * (float)input[i + 3]; 00028 } 00029 00030 for(; i < num_points; i++) { 00031 acc0 += taps[i] * (float)input[i]; 00032 } 00033 00034 *result = acc0 + acc1 + acc2 + acc3; 00035 } 00036 00037 #endif /*LV_HAVE_GENERIC*/ 00038 00039 00040 #if LV_HAVE_SSE && LV_HAVE_MMX 00041 00042 00043 static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { 00044 00045 unsigned int number = 0; 00046 const unsigned int sixteenthPoints = num_points / 8; 00047 00048 float res[2]; 00049 float *realpt = &res[0], *imagpt = &res[1]; 00050 const short* aPtr = input; 00051 const float* bPtr = (float*)taps; 00052 00053 __m64 m0, m1; 00054 __m128 f0, f1, f2, f3; 00055 __m128 a0Val, a1Val, a2Val, a3Val; 00056 __m128 b0Val, b1Val, b2Val, b3Val; 00057 __m128 c0Val, c1Val, c2Val, c3Val; 00058 00059 __m128 dotProdVal0 = _mm_setzero_ps(); 00060 __m128 dotProdVal1 = _mm_setzero_ps(); 00061 __m128 dotProdVal2 = _mm_setzero_ps(); 00062 __m128 dotProdVal3 = _mm_setzero_ps(); 00063 00064 for(;number < sixteenthPoints; number++){ 00065 00066 m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0)); 00067 m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4)); 00068 f0 = _mm_cvtpi16_ps(m0); 00069 f1 = _mm_cvtpi16_ps(m0); 00070 f2 = _mm_cvtpi16_ps(m1); 00071 f3 = _mm_cvtpi16_ps(m1); 00072 00073 a0Val = _mm_unpacklo_ps(f0, f1); 00074 a1Val = _mm_unpackhi_ps(f0, f1); 00075 a2Val = _mm_unpacklo_ps(f2, f3); 00076 a3Val = _mm_unpackhi_ps(f2, f3); 00077 00078 b0Val = _mm_load_ps(bPtr); 00079 b1Val = _mm_load_ps(bPtr+4); 00080 b2Val = _mm_load_ps(bPtr+8); 00081 b3Val = _mm_load_ps(bPtr+12); 00082 00083 c0Val = _mm_mul_ps(a0Val, b0Val); 00084 c1Val = _mm_mul_ps(a1Val, b1Val); 00085 c2Val = _mm_mul_ps(a2Val, b2Val); 00086 c3Val = _mm_mul_ps(a3Val, b3Val); 00087 00088 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); 00089 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); 00090 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); 00091 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); 00092 00093 aPtr += 8; 00094 bPtr += 16; 00095 } 00096 00097 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); 00098 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); 00099 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); 00100 00101 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00102 00103 _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector 00104 00105 *realpt = dotProductVector[0]; 00106 *imagpt = dotProductVector[1]; 00107 *realpt += dotProductVector[2]; 00108 *imagpt += dotProductVector[3]; 00109 00110 number = sixteenthPoints*8; 00111 for(;number < num_points; number++){ 00112 *realpt += ((*aPtr) * (*bPtr++)); 00113 *imagpt += ((*aPtr++) * (*bPtr++)); 00114 } 00115 00116 *result = *(lv_32fc_t*)(&res[0]); 00117 } 00118 00119 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ 00120 00121 00122 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/