GNU Radio 3.5.3.2 C++ API
volk_32f_x2_dot_prod_32f_u.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
00002 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
00003 
00004 #include<stdio.h>
00005 
00006 
00007 #ifdef LV_HAVE_GENERIC
00008 
00009 
/*
 * Dot product of two float vectors, plain C implementation.
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input      - first vector, num_points floats are read
 * taps       - second vector, num_points floats are read
 * num_points - number of element pairs to multiply and accumulate
 *
 * Products are accumulated in index order into a single float. When
 * num_points is 0 neither vector is read and *result is set to 0.
 */
static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float acc = 0;
  unsigned int i;

  for (i = 0; i < num_points; ++i) {
    acc += input[i] * taps[i];
  }

  *result = acc;
}
00023 
00024 #endif /*LV_HAVE_GENERIC*/
00025 
00026 
00027 #ifdef LV_HAVE_SSE
00028 
00029 
00030 static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
00031   
00032   unsigned int number = 0;
00033   const unsigned int quarterPoints = num_points / 4;
00034 
00035   float dotProduct = 0;
00036   const float* aPtr = input;
00037   const float* bPtr = taps;
00038 
00039   __m128 aVal, bVal, cVal;
00040 
00041   __m128 dotProdVal = _mm_setzero_ps();
00042 
00043   for(;number < quarterPoints; number++){
00044       
00045     aVal = _mm_loadu_ps(aPtr); 
00046     bVal = _mm_loadu_ps(bPtr);
00047       
00048     cVal = _mm_mul_ps(aVal, bVal); 
00049 
00050     dotProdVal = _mm_add_ps(cVal, dotProdVal);
00051 
00052     aPtr += 4;
00053     bPtr += 4;
00054   }
00055 
00056   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00057 
00058   _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
00059 
00060   dotProduct = dotProductVector[0];
00061   dotProduct += dotProductVector[1];
00062   dotProduct += dotProductVector[2];
00063   dotProduct += dotProductVector[3];
00064 
00065   number = quarterPoints * 4;
00066   for(;number < num_points; number++){
00067     dotProduct += ((*aPtr++) * (*bPtr++));
00068   }
00069 
00070   *result = dotProduct;
00071   
00072 }
00073 
00074 #endif /*LV_HAVE_SSE*/  
00075 
00076 #ifdef LV_HAVE_SSE3
00077 
00078 #include <pmmintrin.h>
00079 
00080 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
00081   unsigned int number = 0;
00082   const unsigned int quarterPoints = num_points / 4;
00083 
00084   float dotProduct = 0;
00085   const float* aPtr = input;
00086   const float* bPtr = taps;
00087 
00088   __m128 aVal, bVal, cVal;
00089 
00090   __m128 dotProdVal = _mm_setzero_ps();
00091 
00092   for(;number < quarterPoints; number++){
00093       
00094     aVal = _mm_loadu_ps(aPtr); 
00095     bVal = _mm_loadu_ps(bPtr);
00096       
00097     cVal = _mm_mul_ps(aVal, bVal); 
00098 
00099     dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
00100 
00101     aPtr += 4;
00102     bPtr += 4;
00103   }
00104 
00105   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00106   dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
00107 
00108   _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
00109 
00110   dotProduct = dotProductVector[0];
00111   dotProduct += dotProductVector[1];
00112 
00113   number = quarterPoints * 4;
00114   for(;number < num_points; number++){
00115     dotProduct += ((*aPtr++) * (*bPtr++));
00116   }
00117 
00118   *result = dotProduct;
00119 }  
00120 
00121 #endif /*LV_HAVE_SSE3*/
00122 
00123 #ifdef LV_HAVE_SSE4_1
00124 
00125 #include <smmintrin.h>
00126 
00127 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
00128   unsigned int number = 0;
00129   const unsigned int sixteenthPoints = num_points / 16;
00130 
00131   float dotProduct = 0;
00132   const float* aPtr = input;
00133   const float* bPtr = taps;
00134 
00135   __m128 aVal1, bVal1, cVal1;
00136   __m128 aVal2, bVal2, cVal2;
00137   __m128 aVal3, bVal3, cVal3;
00138   __m128 aVal4, bVal4, cVal4;
00139 
00140   __m128 dotProdVal = _mm_setzero_ps();
00141 
00142   for(;number < sixteenthPoints; number++){
00143       
00144     aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
00145     aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
00146     aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
00147     aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
00148 
00149     bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
00150     bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
00151     bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
00152     bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
00153     
00154     cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
00155     cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
00156     cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
00157     cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
00158 
00159     cVal1 = _mm_or_ps(cVal1, cVal2);
00160     cVal3 = _mm_or_ps(cVal3, cVal4);
00161     cVal1 = _mm_or_ps(cVal1, cVal3);
00162 
00163     dotProdVal = _mm_add_ps(dotProdVal, cVal1);
00164   }
00165 
00166   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00167   _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
00168 
00169   dotProduct = dotProductVector[0];
00170   dotProduct += dotProductVector[1];
00171   dotProduct += dotProductVector[2];
00172   dotProduct += dotProductVector[3];
00173 
00174   number = sixteenthPoints * 16;
00175   for(;number < num_points; number++){
00176     dotProduct += ((*aPtr++) * (*bPtr++));
00177   }
00178 
00179   *result = dotProduct;
00180 }  
00181 
00182 #endif /*LV_HAVE_SSE4_1*/
00183 
00184 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/