doc/doxygen-3.6/volk__32f__x2__dot__prod__32f__u_8h_source.html

00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
00002 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
00003
00004 #include <volk/volk_common.h>
00005 #include<stdio.h>
00006
00007
00008 #ifdef LV_HAVE_GENERIC
00009
00010
/*
 * Portable scalar dot product.
 *
 * Multiplies input[i] by taps[i] for i in [0, num_points), accumulates the
 * products in a single float in index order, and writes the sum to *result.
 * When num_points is 0 the value written is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float acc = 0;
  unsigned int i;

  for (i = 0; i < num_points; ++i) {
    acc += input[i] * taps[i];
  }

  *result = acc;
}
00024
00025 #endif /*LV_HAVE_GENERIC*/
00026
00027
00028 #ifdef LV_HAVE_SSE
00029
00030
/*
 * SSE dot product using unaligned loads.
 *
 * Processes the inputs in blocks of 16 floats, keeping four independent
 * 4-lane accumulators (one per group of 4 floats inside a block). After the
 * vector loop the accumulators are added together, the four lanes are summed
 * in lane order, and the remaining num_points % 16 elements are folded in
 * with scalar arithmetic. The final sum is written to *result.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {

  const unsigned int blockCount = num_points / 16;
  const float* in = input;
  const float* tp = taps;
  unsigned int blk;
  unsigned int idx;
  unsigned int lane;
  float sum;
  float lanes[4];

  __m128 acc0 = _mm_setzero_ps();
  __m128 acc1 = _mm_setzero_ps();
  __m128 acc2 = _mm_setzero_ps();
  __m128 acc3 = _mm_setzero_ps();

  for (blk = 0; blk < blockCount; ++blk) {
    /* product first, running accumulator second */
    acc0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in),      _mm_loadu_ps(tp)),      acc0);
    acc1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in + 4),  _mm_loadu_ps(tp + 4)),  acc1);
    acc2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in + 8),  _mm_loadu_ps(tp + 8)),  acc2);
    acc3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in + 12), _mm_loadu_ps(tp + 12)), acc3);

    in += 16;
    tp += 16;
  }

  /* Collapse the four accumulators into one, in order 0, 1, 2, 3. */
  acc0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(acc0, acc1), acc2), acc3);

  /* Unaligned store: places no alignment requirement on the scratch array. */
  _mm_storeu_ps(lanes, acc0);

  sum = lanes[0];
  for (lane = 1; lane < 4; ++lane) {
    sum += lanes[lane];
  }

  /* Scalar tail for the elements that did not fill a whole block. */
  for (idx = blockCount * 16; idx < num_points; ++idx) {
    sum += input[idx] * taps[idx];
  }

  *result = sum;

}
00095
00096 #endif /*LV_HAVE_SSE*/
00097
00098 #ifdef LV_HAVE_SSE3
00099
00100 #include <pmmintrin.h>
00101
/*
 * Dot product kernel selected under LV_HAVE_SSE3, using unaligned loads.
 *
 * Works on blocks of 16 floats with four independent 4-lane accumulators.
 * Once the vector loop is done the accumulators are added together, the
 * lanes are summed in lane order, and the leftover num_points % 16 elements
 * are handled one at a time. The total is written to *result.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
  const unsigned int blockCount = num_points / 16;
  const float* in = input;
  const float* tp = taps;
  unsigned int blk;
  unsigned int idx;
  unsigned int lane;
  float sum;
  float lanes[4];

  __m128 acc0 = _mm_setzero_ps();
  __m128 acc1 = _mm_setzero_ps();
  __m128 acc2 = _mm_setzero_ps();
  __m128 acc3 = _mm_setzero_ps();

  for (blk = 0; blk < blockCount; ++blk) {
    /* running accumulator first, product second */
    acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_loadu_ps(in),      _mm_loadu_ps(tp)));
    acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_loadu_ps(in + 4),  _mm_loadu_ps(tp + 4)));
    acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_loadu_ps(in + 8),  _mm_loadu_ps(tp + 8)));
    acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_loadu_ps(in + 12), _mm_loadu_ps(tp + 12)));

    in += 16;
    tp += 16;
  }

  /* Merge the partial sums in order 0, 1, 2, 3. */
  acc0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(acc0, acc1), acc2), acc3);

  /* Unaligned store: places no alignment requirement on the scratch array. */
  _mm_storeu_ps(lanes, acc0);

  sum = lanes[0];
  for (lane = 1; lane < 4; ++lane) {
    sum += lanes[lane];
  }

  /* Scalar tail for the elements past the last full block. */
  for (idx = blockCount * 16; idx < num_points; ++idx) {
    sum += input[idx] * taps[idx];
  }

  *result = sum;
}
00163
00164 #endif /*LV_HAVE_SSE3*/
00165
00166 #ifdef LV_HAVE_SSE4_1
00167
00168 #include <smmintrin.h>
00169
/*
 * SSE4.1 dot product built on _mm_dp_ps, using unaligned loads.
 *
 * Each iteration consumes 16 floats as four groups of 4. Every group is
 * reduced with _mm_dp_ps, and the output mask (0x1, 0x2, 0x4, 0x8) routes the
 * four group results into distinct lanes with zeros elsewhere, so a bitwise
 * OR packs them into one vector that is added to the running accumulator.
 * After the loop the four lanes are summed in lane order, the remaining
 * num_points % 16 elements are added with scalar code, and the total is
 * written to *result.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
  const unsigned int blockCount = num_points / 16;
  unsigned int blk;
  unsigned int idx;
  unsigned int lane;
  float sum;
  float lanes[4];

  __m128 acc = _mm_setzero_ps();

  for (blk = 0; blk < blockCount; ++blk) {
    const float* in = input + 16 * blk;
    const float* tp = taps + 16 * blk;

    /* One 4-element dot product per group, each landing in its own lane. */
    const __m128 dp0 = _mm_dp_ps(_mm_loadu_ps(in),      _mm_loadu_ps(tp),      0xF1);
    const __m128 dp1 = _mm_dp_ps(_mm_loadu_ps(in + 4),  _mm_loadu_ps(tp + 4),  0xF2);
    const __m128 dp2 = _mm_dp_ps(_mm_loadu_ps(in + 8),  _mm_loadu_ps(tp + 8),  0xF4);
    const __m128 dp3 = _mm_dp_ps(_mm_loadu_ps(in + 12), _mm_loadu_ps(tp + 12), 0xF8);

    /* Lanes are disjoint, so OR merges the four results without mixing them. */
    const __m128 packed = _mm_or_ps(_mm_or_ps(dp0, dp1), _mm_or_ps(dp2, dp3));

    acc = _mm_add_ps(acc, packed);
  }

  /* Unaligned store: places no alignment requirement on the scratch array. */
  _mm_storeu_ps(lanes, acc);

  sum = lanes[0];
  for (lane = 1; lane < 4; ++lane) {
    sum += lanes[lane];
  }

  /* Scalar tail for the elements past the last full block. */
  for (idx = blockCount * 16; idx < num_points; ++idx) {
    sum += input[idx] * taps[idx];
  }

  *result = sum;
}
00224
00225 #endif /*LV_HAVE_SSE4_1*/
00226
00227 #ifdef LV_HAVE_AVX
00228
00229 #include <immintrin.h>
00230
/*
 * AVX dot product using unaligned 256-bit loads.
 *
 * Processes 16 floats per iteration with two independent 8-lane
 * accumulators. After the vector loop the two accumulators are added, the
 * eight lanes are summed in lane order, and the remaining num_points % 16
 * elements are folded in with scalar arithmetic. The final sum is written to
 * *result.
 */
static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {

  const unsigned int blockCount = num_points / 16;
  const float* in = input;
  const float* tp = taps;
  unsigned int blk;
  unsigned int idx;
  unsigned int lane;
  float sum;
  float lanes[8];

  __m256 acc0 = _mm256_setzero_ps();
  __m256 acc1 = _mm256_setzero_ps();

  for (blk = 0; blk < blockCount; ++blk) {
    /* product first, running accumulator second */
    acc0 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(in),     _mm256_loadu_ps(tp)),     acc0);
    acc1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(in + 8), _mm256_loadu_ps(tp + 8)), acc1);

    in += 16;
    tp += 16;
  }

  acc0 = _mm256_add_ps(acc0, acc1);

  /* Unaligned store: places no alignment requirement on the scratch array. */
  _mm256_storeu_ps(lanes, acc0);

  sum = lanes[0];
  for (lane = 1; lane < 8; ++lane) {
    sum += lanes[lane];
  }

  /* Scalar tail for the elements that did not fill a whole block. */
  for (idx = blockCount * 16; idx < num_points; ++idx) {
    sum += input[idx] * taps[idx];
  }

  *result = sum;

}
00287
00288 #endif /*LV_HAVE_AVX*/
00289
00290 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/