GNU Radio 3.6.5 C++ API

volk_32f_x2_dot_prod_32f_a.h

Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
00002 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
00003 
00004 #include <volk/volk_common.h>
00005 #include<stdio.h>
00006 
00007 
00008 #ifdef LV_HAVE_GENERIC
00009 
00010 
/*
 * Scalar dot product of two float vectors.
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input      - first operand vector
 * taps       - second operand vector
 * num_points - number of elements read from each vector; 0 yields 0
 *
 * Products are accumulated in index order into a single float accumulator.
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float acc = 0;
  unsigned int i;

  for (i = 0; i < num_points; i++) {
    acc += (input[i] * taps[i]);
  }

  *result = acc;
}
00024 
00025 #endif /*LV_HAVE_GENERIC*/
00026 
00027 
00028 #ifdef LV_HAVE_SSE
00029 
00030 
/*
 * SSE dot product of two float vectors.
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input      - first operand vector; read with _mm_load_ps, so it must be
 *              16-byte aligned
 * taps       - second operand vector; same alignment requirement as input
 * num_points - number of elements read from each vector
 *
 * Full groups of 16 floats go through four independent 4-lane accumulators;
 * the remaining (num_points % 16) elements are handled by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {

  const unsigned int blockCount = num_points / 16;
  const float* in = input;
  const float* tp = taps;
  unsigned int i;
  float sum;
  float lanes[4];

  __m128 acc0 = _mm_setzero_ps();
  __m128 acc1 = _mm_setzero_ps();
  __m128 acc2 = _mm_setzero_ps();
  __m128 acc3 = _mm_setzero_ps();

  for (i = 0; i < blockCount; i++) {
    /* One block = 16 floats = four 4-lane multiply/accumulate steps. */
    acc0 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(in),      _mm_load_ps(tp)),      acc0);
    acc1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(in + 4),  _mm_load_ps(tp + 4)),  acc1);
    acc2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(in + 8),  _mm_load_ps(tp + 8)),  acc2);
    acc3 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(in + 12), _mm_load_ps(tp + 12)), acc3);

    in += 16;
    tp += 16;
  }

  /* Fold the four accumulators into acc0, in the order 0+1, +2, +3. */
  acc0 = _mm_add_ps(acc0, acc1);
  acc0 = _mm_add_ps(acc0, acc2);
  acc0 = _mm_add_ps(acc0, acc3);

  /* The scratch array carries no alignment guarantee, hence the unaligned store. */
  _mm_storeu_ps(lanes, acc0);

  /* Horizontal sum, lane 0 through lane 3. */
  sum = lanes[0];
  sum += lanes[1];
  sum += lanes[2];
  sum += lanes[3];

  /* Scalar tail for the elements that did not fill a 16-float block. */
  for (i = blockCount * 16; i < num_points; i++) {
    sum += ((*in++) * (*tp++));
  }

  *result = sum;

}
00095 
00096 #endif /*LV_HAVE_SSE*/
00097 
00098 #ifdef LV_HAVE_SSE3
00099 
00100 #include <pmmintrin.h>
00101 
/*
 * Dot product of two float vectors (SSE3 build variant).
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input      - first operand vector; read with _mm_load_ps, so it must be
 *              16-byte aligned
 * taps       - second operand vector; same alignment requirement as input
 * num_points - number of elements read from each vector
 *
 * The vector part consumes 16 floats per iteration into four separate
 * accumulators; whatever is left over is added by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
  const float* aCur = input;
  const float* bCur = taps;
  /* First element past the last complete 16-float group, and past all data. */
  const float* const vecEnd = input + (num_points / 16) * 16;
  const float* const allEnd = input + num_points;

  __m128 sum0 = _mm_setzero_ps();
  __m128 sum1 = _mm_setzero_ps();
  __m128 sum2 = _mm_setzero_ps();
  __m128 sum3 = _mm_setzero_ps();
  __m128 folded;

  float partial[4];
  float total;

  while (aCur != vecEnd) {
    const __m128 p0 = _mm_mul_ps(_mm_load_ps(aCur),      _mm_load_ps(bCur));
    const __m128 p1 = _mm_mul_ps(_mm_load_ps(aCur + 4),  _mm_load_ps(bCur + 4));
    const __m128 p2 = _mm_mul_ps(_mm_load_ps(aCur + 8),  _mm_load_ps(bCur + 8));
    const __m128 p3 = _mm_mul_ps(_mm_load_ps(aCur + 12), _mm_load_ps(bCur + 12));

    sum0 = _mm_add_ps(sum0, p0);
    sum1 = _mm_add_ps(sum1, p1);
    sum2 = _mm_add_ps(sum2, p2);
    sum3 = _mm_add_ps(sum3, p3);

    aCur += 16;
    bCur += 16;
  }

  /* Merge accumulators left to right: ((sum0 + sum1) + sum2) + sum3. */
  folded = _mm_add_ps(sum0, sum1);
  folded = _mm_add_ps(folded, sum2);
  folded = _mm_add_ps(folded, sum3);

  /* Spill the four lanes; the scratch array is not guaranteed aligned. */
  _mm_storeu_ps(partial, folded);

  total = partial[0];
  total += partial[1];
  total += partial[2];
  total += partial[3];

  /* Leftover elements (fewer than 16) are accumulated one at a time. */
  while (aCur != allEnd) {
    total += ((*aCur++) * (*bCur++));
  }

  *result = total;
}
00163 
00164 #endif /*LV_HAVE_SSE3*/
00165 
00166 #ifdef LV_HAVE_SSE4_1
00167 
00168 #include <smmintrin.h>
00169 
00170 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
00171   unsigned int number = 0;
00172   const unsigned int sixteenthPoints = num_points / 16;
00173 
00174   float dotProduct = 0;
00175   const float* aPtr = input;
00176   const float* bPtr = taps;
00177 
00178   __m128 aVal1, bVal1, cVal1;
00179   __m128 aVal2, bVal2, cVal2;
00180   __m128 aVal3, bVal3, cVal3;
00181   __m128 aVal4, bVal4, cVal4;
00182 
00183   __m128 dotProdVal = _mm_setzero_ps();
00184 
00185   for(;number < sixteenthPoints; number++){
00186 
00187     aVal1 = _mm_load_ps(aPtr); aPtr += 4;
00188     aVal2 = _mm_load_ps(aPtr); aPtr += 4;
00189     aVal3 = _mm_load_ps(aPtr); aPtr += 4;
00190     aVal4 = _mm_load_ps(aPtr); aPtr += 4;
00191 
00192     bVal1 = _mm_load_ps(bPtr); bPtr += 4;
00193     bVal2 = _mm_load_ps(bPtr); bPtr += 4;
00194     bVal3 = _mm_load_ps(bPtr); bPtr += 4;
00195     bVal4 = _mm_load_ps(bPtr); bPtr += 4;
00196 
00197     cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
00198     cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
00199     cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
00200     cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
00201 
00202     cVal1 = _mm_or_ps(cVal1, cVal2);
00203     cVal3 = _mm_or_ps(cVal3, cVal4);
00204     cVal1 = _mm_or_ps(cVal1, cVal3);
00205 
00206     dotProdVal = _mm_add_ps(dotProdVal, cVal1);
00207   }
00208 
00209   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00210   _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
00211 
00212   dotProduct = dotProductVector[0];
00213   dotProduct += dotProductVector[1];
00214   dotProduct += dotProductVector[2];
00215   dotProduct += dotProductVector[3];
00216 
00217   number = sixteenthPoints * 16;
00218   for(;number < num_points; number++){
00219     dotProduct += ((*aPtr++) * (*bPtr++));
00220   }
00221 
00222   *result = dotProduct;
00223 }
00224 
00225 #endif /*LV_HAVE_SSE4_1*/
00226 
00227 #ifdef LV_HAVE_AVX
00228 
00229 #include <immintrin.h>
00230 
00231 static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {
00232 
00233   unsigned int number = 0;
00234   const unsigned int sixteenthPoints = num_points / 16;
00235 
00236   float dotProduct = 0;
00237   const float* aPtr = input;
00238   const float* bPtr = taps;
00239 
00240   __m256 a0Val, a1Val;
00241   __m256 b0Val, b1Val;
00242   __m256 c0Val, c1Val;
00243 
00244   __m256 dotProdVal0 = _mm256_setzero_ps();
00245   __m256 dotProdVal1 = _mm256_setzero_ps();
00246 
00247   for(;number < sixteenthPoints; number++){
00248 
00249     a0Val = _mm256_load_ps(aPtr);
00250     a1Val = _mm256_load_ps(aPtr+8);
00251     b0Val = _mm256_load_ps(bPtr);
00252     b1Val = _mm256_load_ps(bPtr+8);
00253 
00254     c0Val = _mm256_mul_ps(a0Val, b0Val);
00255     c1Val = _mm256_mul_ps(a1Val, b1Val);
00256 
00257     dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
00258     dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
00259 
00260     aPtr += 16;
00261     bPtr += 16;
00262   }
00263 
00264   dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
00265 
00266   __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
00267 
00268   _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
00269 
00270   dotProduct = dotProductVector[0];
00271   dotProduct += dotProductVector[1];
00272   dotProduct += dotProductVector[2];
00273   dotProduct += dotProductVector[3];
00274   dotProduct += dotProductVector[4];
00275   dotProduct += dotProductVector[5];
00276   dotProduct += dotProductVector[6];
00277   dotProduct += dotProductVector[7];
00278 
00279   number = sixteenthPoints*16;
00280   for(;number < num_points; number++){
00281     dotProduct += ((*aPtr++) * (*bPtr++));
00282   }
00283 
00284   *result = dotProduct;
00285 
00286 }
00287 
00288 #endif /*LV_HAVE_AVX*/
00289 
00290 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/