/* GNU Radio 3.5.3.2 C++ API */
#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H

#include<stdio.h>

/*
 * Dot-product kernels for two float vectors.
 *
 * Every kernel in this header has the same contract:
 *   *result = input[0]*taps[0] + ... + input[num_points-1]*taps[num_points-1]
 * and writes 0 to *result when num_points is 0.  The SIMD variants read their
 * operands with unaligned loads, so neither input nor taps needs any particular
 * alignment.  The kernels differ only in the order in which the partial
 * products are summed.
 */

#ifdef LV_HAVE_GENERIC

/*!
 * \brief Scalar reference implementation of the float dot product.
 * \param result     Receives the accumulated dot product.
 * \param input      First operand vector (num_points floats).
 * \param taps       Second operand vector (num_points floats).
 * \param num_points Number of elements read from each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}

#endif /*LV_HAVE_GENERIC*/


#ifdef LV_HAVE_SSE

/* Provides __m128 and the _mm_*_ps intrinsics used below. */
#include <xmmintrin.h>

/*!
 * \brief SSE implementation of the float dot product, four elements per step.
 * \param result     Receives the accumulated dot product.
 * \param input      First operand vector (num_points floats, any alignment).
 * \param taps       Second operand vector (num_points floats, any alignment).
 * \param num_points Number of elements read from each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 aVal, bVal, cVal;

  /* Four independent running sums, one per vector lane. */
  __m128 dotProdVal = _mm_setzero_ps();

  for(;number < quarterPoints; number++){

    aVal = _mm_loadu_ps(aPtr);
    bVal = _mm_loadu_ps(bPtr);

    cVal = _mm_mul_ps(aVal, bVal);

    dotProdVal = _mm_add_ps(cVal, dotProdVal);

    aPtr += 4;
    bPtr += 4;
  }

  /* Unaligned store: the scratch array then needs no alignment attribute. */
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector, dotProdVal);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail for the last num_points % 4 elements. */
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;

}

#endif /*LV_HAVE_SSE*/

#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

/*!
 * \brief SSE3 implementation of the float dot product using horizontal adds.
 * \param result     Receives the accumulated dot product.
 * \param input      First operand vector (num_points floats, any alignment).
 * \param taps       Second operand vector (num_points floats, any alignment).
 * \param num_points Number of elements read from each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 aVal, bVal, cVal;

  __m128 dotProdVal = _mm_setzero_ps();

  for(;number < quarterPoints; number++){

    aVal = _mm_loadu_ps(aPtr);
    bVal = _mm_loadu_ps(bPtr);

    cVal = _mm_mul_ps(aVal, bVal);

    /*
     * hadd(acc, c) = [acc0+acc1, acc2+acc3, c0+c1, c2+c3].
     * The sum over all four lanes of the accumulator therefore always equals
     * the sum of every product seen so far.
     */
    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);

    aPtr += 4;
    bPtr += 4;
  }

  /* One more hadd folds the four lanes into [v0+v1, v2+v3, v0+v1, v2+v3]. */
  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);

  /* Unaligned store: the scratch array then needs no alignment attribute. */
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector, dotProdVal);

  /* Lanes 0 and 1 together hold the full vector sum after the fold above. */
  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];

  /* Scalar tail for the last num_points % 4 elements. */
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_SSE4_1

#include <smmintrin.h>

/*!
 * \brief SSE4.1 implementation of the float dot product, sixteen elements per step.
 * \param result     Receives the accumulated dot product.
 * \param input      First operand vector (num_points floats, any alignment).
 * \param taps       Second operand vector (num_points floats, any alignment).
 * \param num_points Number of elements read from each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 aVal1, bVal1, cVal1;
  __m128 aVal2, bVal2, cVal2;
  __m128 aVal3, bVal3, cVal3;
  __m128 aVal4, bVal4, cVal4;

  __m128 dotProdVal = _mm_setzero_ps();

  for(;number < sixteenthPoints; number++){

    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;

    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;

    /*
     * Mask high nibble 0xF: multiply all four lanes.
     * Mask low nibble: the single output lane that receives the 4-element
     * sum; every other output lane is set to 0.0.  Each of the four partial
     * dot products is thus parked in its own lane.
     */
    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

    /* The unused lanes are all-zero bits, so a bitwise OR merges the four
       results into one vector without disturbing any value. */
    cVal1 = _mm_or_ps(cVal1, cVal2);
    cVal3 = _mm_or_ps(cVal3, cVal4);
    cVal1 = _mm_or_ps(cVal1, cVal3);

    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
  }

  /* Unaligned store: the scratch array then needs no alignment attribute. */
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector, dotProdVal);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail for the last num_points % 16 elements. */
  number = sixteenthPoints * 16;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}

#endif /*LV_HAVE_SSE4_1*/

#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/