GNU Radio 3.6.5 C++ API

volk_32f_x2_dot_prod_16i_a.h

Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H
00002 #define INCLUDED_volk_32f_x2_dot_prod_16i_a_H
00003 
00004 #include <volk/volk_common.h>
00005 #include<stdio.h>
00006 
00007 
00008 #ifdef LV_HAVE_GENERIC
00009 
00010 
/*!
  \brief Computes the dot product of two float vectors and stores it as a 16-bit integer (generic C version)
  \param result Pointer to the single int16_t that receives the dot product
  \param input First float vector, read for num_points elements
  \param taps Second float vector, read for num_points elements
  \param num_points Number of elements to multiply and accumulate; 0 yields a result of 0

  The products are accumulated in single precision in index order. The sum is
  then truncated toward zero and saturated to the int16_t range
  [-32768, 32767]; a NaN sum is stored as 0.
*/
static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  /* Converting a float to an integer type is undefined when the truncated
     value does not fit, so only cast inside the open interval whose
     truncation lands in [-32768, 32767] and saturate everything else. */
  if(dotProduct > -32769.0f && dotProduct < 32768.0f){
    *result = (int16_t)dotProduct;
  }
  else if(dotProduct > 0.0f){
    *result = (int16_t)32767;
  }
  else if(dotProduct < 0.0f){
    *result = (int16_t)(-32768);
  }
  else{
    /* Only NaN fails every comparison above. */
    *result = 0;
  }
}
00024 
00025 #endif /*LV_HAVE_GENERIC*/
00026 
00027 
00028 #ifdef LV_HAVE_SSE
00029 
00030 
/*!
  \brief Computes the dot product of two float vectors and stores it as a 16-bit integer (SSE version)
  \param result Pointer to the single int16_t that receives the dot product
  \param input First float vector, read for num_points elements; must be 16-byte aligned because it is read with _mm_load_ps
  \param taps Second float vector, read for num_points elements; must be 16-byte aligned because it is read with _mm_load_ps
  \param num_points Number of elements to multiply and accumulate; 0 yields a result of 0

  Sixteen points are processed per loop iteration using four independent
  vector accumulators; the remaining (num_points % 16) points are handled by a
  scalar loop. The sum is then truncated toward zero and saturated to the
  int16_t range [-32768, 32767]; a NaN sum is stored as 0.
*/
static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {

  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  /* Four separate accumulators, one per 4-float lane group of the 16-point stride. */
  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  float dotProductVector[4];

  for(;number < sixteenthPoints; number++){

    a0Val = _mm_load_ps(aPtr);
    a1Val = _mm_load_ps(aPtr+4);
    a2Val = _mm_load_ps(aPtr+8);
    a3Val = _mm_load_ps(aPtr+12);
    b0Val = _mm_load_ps(bPtr);
    b1Val = _mm_load_ps(bPtr+4);
    b2Val = _mm_load_ps(bPtr+8);
    b3Val = _mm_load_ps(bPtr+12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four vector accumulators into one. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store: this runs once per call, so the scratch array needs no
     alignment attribute. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail for the points that do not fill a whole 16-point stride. */
  number = sixteenthPoints*16;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  /* Converting a float to an integer type is undefined when the truncated
     value does not fit, so only cast inside the open interval whose
     truncation lands in [-32768, 32767] and saturate everything else. */
  if(dotProduct > -32769.0f && dotProduct < 32768.0f){
    *result = (int16_t)dotProduct;
  }
  else if(dotProduct > 0.0f){
    *result = (int16_t)32767;
  }
  else if(dotProduct < 0.0f){
    *result = (int16_t)(-32768);
  }
  else{
    /* Only NaN fails every comparison above. */
    *result = 0;
  }

}
00095 
00096 #endif /*LV_HAVE_SSE*/
00097 
00098 #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/