GNU Radio 3.6.5 C++ API
|
00001 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H 00002 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H 00003 00004 #include <volk/volk_common.h> 00005 #include<stdio.h> 00006 00007 00008 #ifdef LV_HAVE_GENERIC 00009 00010 00011 static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) { 00012 00013 float res[2]; 00014 float *realpt = &res[0], *imagpt = &res[1]; 00015 const float* aPtr = (float*)input; 00016 const float* bPtr= taps; 00017 unsigned int number = 0; 00018 00019 *realpt = 0; 00020 *imagpt = 0; 00021 00022 for(number = 0; number < num_points; number++){ 00023 *realpt += ((*aPtr++) * (*bPtr)); 00024 *imagpt += ((*aPtr++) * (*bPtr++)); 00025 } 00026 00027 *result = *(lv_32fc_t*)(&res[0]); 00028 } 00029 00030 #endif /*LV_HAVE_GENERIC*/ 00031 00032 00033 #ifdef LV_HAVE_SSE 00034 00035 00036 static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { 00037 00038 unsigned int number = 0; 00039 const unsigned int sixteenthPoints = num_points / 8; 00040 00041 float res[2]; 00042 float *realpt = &res[0], *imagpt = &res[1]; 00043 const float* aPtr = (float*)input; 00044 const float* bPtr = taps; 00045 00046 __m128 a0Val, a1Val, a2Val, a3Val; 00047 __m128 b0Val, b1Val, b2Val, b3Val; 00048 __m128 x0Val, x1Val, x2Val, x3Val; 00049 __m128 c0Val, c1Val, c2Val, c3Val; 00050 00051 __m128 dotProdVal0 = _mm_setzero_ps(); 00052 __m128 dotProdVal1 = _mm_setzero_ps(); 00053 __m128 dotProdVal2 = _mm_setzero_ps(); 00054 __m128 dotProdVal3 = _mm_setzero_ps(); 00055 00056 for(;number < sixteenthPoints; number++){ 00057 00058 a0Val = _mm_load_ps(aPtr); 00059 a1Val = _mm_load_ps(aPtr+4); 00060 a2Val = _mm_load_ps(aPtr+8); 00061 a3Val = _mm_load_ps(aPtr+12); 00062 00063 x0Val = _mm_load_ps(bPtr); 00064 x1Val = _mm_load_ps(bPtr); 00065 x2Val = _mm_load_ps(bPtr+4); 00066 x3Val = _mm_load_ps(bPtr+4); 00067 b0Val = _mm_unpacklo_ps(x0Val, x1Val); 00068 b1Val = _mm_unpackhi_ps(x0Val, x1Val); 00069 b2Val = _mm_unpacklo_ps(x2Val, x3Val); 00070 b3Val = _mm_unpackhi_ps(x2Val, x3Val); 00071 00072 c0Val = _mm_mul_ps(a0Val, b0Val); 00073 c1Val = _mm_mul_ps(a1Val, b1Val); 00074 c2Val = _mm_mul_ps(a2Val, b2Val); 00075 c3Val = _mm_mul_ps(a3Val, b3Val); 00076 00077 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); 00078 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); 00079 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); 00080 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); 00081 00082 aPtr += 16; 00083 bPtr += 8; 00084 } 00085 00086 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); 00087 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); 00088 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); 00089 00090 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00091 00092 _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector 00093 00094 *realpt = dotProductVector[0]; 00095 *imagpt = dotProductVector[1]; 00096 *realpt += dotProductVector[2]; 00097 *imagpt += dotProductVector[3]; 00098 00099 number = sixteenthPoints*8; 00100 for(;number < num_points; number++){ 00101 *realpt += ((*aPtr++) * (*bPtr)); 00102 *imagpt += ((*aPtr++) * (*bPtr++)); 00103 } 00104 00105 *result = *(lv_32fc_t*)(&res[0]); 00106 } 00107 00108 #endif /*LV_HAVE_SSE*/ 00109 00110 00111 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/