doc/doxygen-3.5/volk__32f__x2__dot__prod__32f__u_8h_source.html

00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
00002 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
00003
00004 #include<stdio.h>
00005
00006
00007 #ifdef LV_HAVE_GENERIC
00008
00009
/*!
  \brief Scalar dot product of two float vectors
  \param result Receives the sum of input[i] * taps[i] over i in [0, num_points)
  \param input First operand vector; num_points elements are read
  \param taps Second operand vector; num_points elements are read
  \param num_points Number of element pairs to multiply and accumulate

  When num_points is 0 no element is read and *result is set to 0.
*/
static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float acc = 0;
  unsigned int i;

  /* Products are added strictly in index order. */
  for (i = 0; i < num_points; ++i) {
    acc += input[i] * taps[i];
  }

  *result = acc;
}
00023
00024 #endif /*LV_HAVE_GENERIC*/
00025
00026
00027 #ifdef LV_HAVE_SSE
00028
00029
00030 static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
00031
00032   unsigned int number = 0;
00033   const unsigned int quarterPoints = num_points / 4;
00034
00035   float dotProduct = 0;
00036   const float* aPtr = input;
00037   const float* bPtr = taps;
00038
00039   __m128 aVal, bVal, cVal;
00040
00041   __m128 dotProdVal = _mm_setzero_ps();
00042
00043   for(;number < quarterPoints; number++){
00044
00045     aVal = _mm_loadu_ps(aPtr);
00046     bVal = _mm_loadu_ps(bPtr);
00047
00048     cVal = _mm_mul_ps(aVal, bVal);
00049
00050     dotProdVal = _mm_add_ps(cVal, dotProdVal);
00051
00052     aPtr += 4;
00053     bPtr += 4;
00054   }
00055
00056   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00057
00058   _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
00059
00060   dotProduct = dotProductVector[0];
00061   dotProduct += dotProductVector[1];
00062   dotProduct += dotProductVector[2];
00063   dotProduct += dotProductVector[3];
00064
00065   number = quarterPoints * 4;
00066   for(;number < num_points; number++){
00067     dotProduct += ((*aPtr++) * (*bPtr++));
00068   }
00069
00070   *result = dotProduct;
00071
00072 }
00073
00074 #endif /*LV_HAVE_SSE*/
00075
00076 #ifdef LV_HAVE_SSE3
00077
00078 #include <pmmintrin.h>
00079
00080 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
00081   unsigned int number = 0;
00082   const unsigned int quarterPoints = num_points / 4;
00083
00084   float dotProduct = 0;
00085   const float* aPtr = input;
00086   const float* bPtr = taps;
00087
00088   __m128 aVal, bVal, cVal;
00089
00090   __m128 dotProdVal = _mm_setzero_ps();
00091
00092   for(;number < quarterPoints; number++){
00093
00094     aVal = _mm_loadu_ps(aPtr);
00095     bVal = _mm_loadu_ps(bPtr);
00096
00097     cVal = _mm_mul_ps(aVal, bVal);
00098
00099     dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
00100
00101     aPtr += 4;
00102     bPtr += 4;
00103   }
00104
00105   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00106   dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
00107
00108   _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
00109
00110   dotProduct = dotProductVector[0];
00111   dotProduct += dotProductVector[1];
00112
00113   number = quarterPoints * 4;
00114   for(;number < num_points; number++){
00115     dotProduct += ((*aPtr++) * (*bPtr++));
00116   }
00117
00118   *result = dotProduct;
00119 }
00120
00121 #endif /*LV_HAVE_SSE3*/
00122
00123 #ifdef LV_HAVE_SSE4_1
00124
00125 #include <smmintrin.h>
00126
00127 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
00128   unsigned int number = 0;
00129   const unsigned int sixteenthPoints = num_points / 16;
00130
00131   float dotProduct = 0;
00132   const float* aPtr = input;
00133   const float* bPtr = taps;
00134
00135   __m128 aVal1, bVal1, cVal1;
00136   __m128 aVal2, bVal2, cVal2;
00137   __m128 aVal3, bVal3, cVal3;
00138   __m128 aVal4, bVal4, cVal4;
00139
00140   __m128 dotProdVal = _mm_setzero_ps();
00141
00142   for(;number < sixteenthPoints; number++){
00143
00144     aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
00145     aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
00146     aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
00147     aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
00148
00149     bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
00150     bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
00151     bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
00152     bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
00153
00154     cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
00155     cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
00156     cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
00157     cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
00158
00159     cVal1 = _mm_or_ps(cVal1, cVal2);
00160     cVal3 = _mm_or_ps(cVal3, cVal4);
00161     cVal1 = _mm_or_ps(cVal1, cVal3);
00162
00163     dotProdVal = _mm_add_ps(dotProdVal, cVal1);
00164   }
00165
00166   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00167   _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
00168
00169   dotProduct = dotProductVector[0];
00170   dotProduct += dotProductVector[1];
00171   dotProduct += dotProductVector[2];
00172   dotProduct += dotProductVector[3];
00173
00174   number = sixteenthPoints * 16;
00175   for(;number < num_points; number++){
00176     dotProduct += ((*aPtr++) * (*bPtr++));
00177   }
00178
00179   *result = dotProduct;
00180 }
00181
00182 #endif /*LV_HAVE_SSE4_1*/
00183
00184 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/