GNU Radio 3.6.5 C++ API
|
00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H 00002 #define INCLUDED_volk_32f_x2_dot_prod_16i_a_H 00003 00004 #include <volk/volk_common.h> 00005 #include<stdio.h> 00006 00007 00008 #ifdef LV_HAVE_GENERIC 00009 00010 00011 static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { 00012 00013 float dotProduct = 0; 00014 const float* aPtr = input; 00015 const float* bPtr= taps; 00016 unsigned int number = 0; 00017 00018 for(number = 0; number < num_points; number++){ 00019 dotProduct += ((*aPtr++) * (*bPtr++)); 00020 } 00021 00022 *result = (int16_t)dotProduct; 00023 } 00024 00025 #endif /*LV_HAVE_GENERIC*/ 00026 00027 00028 #ifdef LV_HAVE_SSE 00029 00030 00031 static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { 00032 00033 unsigned int number = 0; 00034 const unsigned int sixteenthPoints = num_points / 16; 00035 00036 float dotProduct = 0; 00037 const float* aPtr = input; 00038 const float* bPtr = taps; 00039 00040 __m128 a0Val, a1Val, a2Val, a3Val; 00041 __m128 b0Val, b1Val, b2Val, b3Val; 00042 __m128 c0Val, c1Val, c2Val, c3Val; 00043 00044 __m128 dotProdVal0 = _mm_setzero_ps(); 00045 __m128 dotProdVal1 = _mm_setzero_ps(); 00046 __m128 dotProdVal2 = _mm_setzero_ps(); 00047 __m128 dotProdVal3 = _mm_setzero_ps(); 00048 00049 for(;number < sixteenthPoints; number++){ 00050 00051 a0Val = _mm_load_ps(aPtr); 00052 a1Val = _mm_load_ps(aPtr+4); 00053 a2Val = _mm_load_ps(aPtr+8); 00054 a3Val = _mm_load_ps(aPtr+12); 00055 b0Val = _mm_load_ps(bPtr); 00056 b1Val = _mm_load_ps(bPtr+4); 00057 b2Val = _mm_load_ps(bPtr+8); 00058 b3Val = _mm_load_ps(bPtr+12); 00059 00060 c0Val = _mm_mul_ps(a0Val, b0Val); 00061 c1Val = _mm_mul_ps(a1Val, b1Val); 00062 c2Val = _mm_mul_ps(a2Val, b2Val); 00063 c3Val = _mm_mul_ps(a3Val, b3Val); 00064 00065 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); 00066 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); 00067 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); 00068 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); 00069 00070 aPtr += 16; 00071 bPtr += 16; 00072 } 00073 00074 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); 00075 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); 00076 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); 00077 00078 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00079 00080 _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector 00081 00082 dotProduct = dotProductVector[0]; 00083 dotProduct += dotProductVector[1]; 00084 dotProduct += dotProductVector[2]; 00085 dotProduct += dotProductVector[3]; 00086 00087 number = sixteenthPoints*16; 00088 for(;number < num_points; number++){ 00089 dotProduct += ((*aPtr++) * (*bPtr++)); 00090 } 00091 00092 *result = (short)dotProduct; 00093 00094 } 00095 00096 #endif /*LV_HAVE_SSE*/ 00097 00098 #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/