GNU Radio 3.6.5 C++ API
|
00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H 00002 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H 00003 00004 #include <volk/volk_common.h> 00005 #include<stdio.h> 00006 00007 00008 #ifdef LV_HAVE_GENERIC 00009 00010 00011 static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { 00012 00013 float dotProduct = 0; 00014 const float* aPtr = input; 00015 const float* bPtr= taps; 00016 unsigned int number = 0; 00017 00018 for(number = 0; number < num_points; number++){ 00019 dotProduct += ((*aPtr++) * (*bPtr++)); 00020 } 00021 00022 *result = dotProduct; 00023 } 00024 00025 #endif /*LV_HAVE_GENERIC*/ 00026 00027 00028 #ifdef LV_HAVE_SSE 00029 00030 00031 static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { 00032 00033 unsigned int number = 0; 00034 const unsigned int sixteenthPoints = num_points / 16; 00035 00036 float dotProduct = 0; 00037 const float* aPtr = input; 00038 const float* bPtr = taps; 00039 00040 __m128 a0Val, a1Val, a2Val, a3Val; 00041 __m128 b0Val, b1Val, b2Val, b3Val; 00042 __m128 c0Val, c1Val, c2Val, c3Val; 00043 00044 __m128 dotProdVal0 = _mm_setzero_ps(); 00045 __m128 dotProdVal1 = _mm_setzero_ps(); 00046 __m128 dotProdVal2 = _mm_setzero_ps(); 00047 __m128 dotProdVal3 = _mm_setzero_ps(); 00048 00049 for(;number < sixteenthPoints; number++){ 00050 00051 a0Val = _mm_loadu_ps(aPtr); 00052 a1Val = _mm_loadu_ps(aPtr+4); 00053 a2Val = _mm_loadu_ps(aPtr+8); 00054 a3Val = _mm_loadu_ps(aPtr+12); 00055 b0Val = _mm_loadu_ps(bPtr); 00056 b1Val = _mm_loadu_ps(bPtr+4); 00057 b2Val = _mm_loadu_ps(bPtr+8); 00058 b3Val = _mm_loadu_ps(bPtr+12); 00059 00060 c0Val = _mm_mul_ps(a0Val, b0Val); 00061 c1Val = _mm_mul_ps(a1Val, b1Val); 00062 c2Val = _mm_mul_ps(a2Val, b2Val); 00063 c3Val = _mm_mul_ps(a3Val, b3Val); 00064 00065 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); 00066 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); 00067 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); 00068 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); 00069 00070 aPtr += 16; 00071 bPtr += 16; 00072 } 00073 00074 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); 00075 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); 00076 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); 00077 00078 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00079 00080 _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector 00081 00082 dotProduct = dotProductVector[0]; 00083 dotProduct += dotProductVector[1]; 00084 dotProduct += dotProductVector[2]; 00085 dotProduct += dotProductVector[3]; 00086 00087 number = sixteenthPoints*16; 00088 for(;number < num_points; number++){ 00089 dotProduct += ((*aPtr++) * (*bPtr++)); 00090 } 00091 00092 *result = dotProduct; 00093 00094 } 00095 00096 #endif /*LV_HAVE_SSE*/ 00097 00098 #ifdef LV_HAVE_SSE3 00099 00100 #include <pmmintrin.h> 00101 00102 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { 00103 unsigned int number = 0; 00104 const unsigned int sixteenthPoints = num_points / 16; 00105 00106 float dotProduct = 0; 00107 const float* aPtr = input; 00108 const float* bPtr = taps; 00109 00110 __m128 a0Val, a1Val, a2Val, a3Val; 00111 __m128 b0Val, b1Val, b2Val, b3Val; 00112 __m128 c0Val, c1Val, c2Val, c3Val; 00113 00114 __m128 dotProdVal0 = _mm_setzero_ps(); 00115 __m128 dotProdVal1 = _mm_setzero_ps(); 00116 __m128 dotProdVal2 = _mm_setzero_ps(); 00117 __m128 dotProdVal3 = _mm_setzero_ps(); 00118 00119 for(;number < sixteenthPoints; number++){ 00120 00121 a0Val = _mm_loadu_ps(aPtr); 00122 a1Val = _mm_loadu_ps(aPtr+4); 00123 a2Val = _mm_loadu_ps(aPtr+8); 00124 a3Val = _mm_loadu_ps(aPtr+12); 00125 b0Val = _mm_loadu_ps(bPtr); 00126 b1Val = _mm_loadu_ps(bPtr+4); 00127 b2Val = _mm_loadu_ps(bPtr+8); 00128 b3Val = _mm_loadu_ps(bPtr+12); 00129 00130 c0Val = _mm_mul_ps(a0Val, b0Val); 00131 c1Val = _mm_mul_ps(a1Val, b1Val); 00132 c2Val = _mm_mul_ps(a2Val, b2Val); 00133 c3Val = _mm_mul_ps(a3Val, b3Val); 00134 00135 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); 00136 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); 00137 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); 00138 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); 00139 00140 aPtr += 16; 00141 bPtr += 16; 00142 } 00143 00144 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); 00145 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); 00146 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); 00147 00148 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00149 _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector 00150 00151 dotProduct = dotProductVector[0]; 00152 dotProduct += dotProductVector[1]; 00153 dotProduct += dotProductVector[2]; 00154 dotProduct += dotProductVector[3]; 00155 00156 number = sixteenthPoints*16; 00157 for(;number < num_points; number++){ 00158 dotProduct += ((*aPtr++) * (*bPtr++)); 00159 } 00160 00161 *result = dotProduct; 00162 } 00163 00164 #endif /*LV_HAVE_SSE3*/ 00165 00166 #ifdef LV_HAVE_SSE4_1 00167 00168 #include <smmintrin.h> 00169 00170 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { 00171 unsigned int number = 0; 00172 const unsigned int sixteenthPoints = num_points / 16; 00173 00174 float dotProduct = 0; 00175 const float* aPtr = input; 00176 const float* bPtr = taps; 00177 00178 __m128 aVal1, bVal1, cVal1; 00179 __m128 aVal2, bVal2, cVal2; 00180 __m128 aVal3, bVal3, cVal3; 00181 __m128 aVal4, bVal4, cVal4; 00182 00183 __m128 dotProdVal = _mm_setzero_ps(); 00184 00185 for(;number < sixteenthPoints; number++){ 00186 00187 aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; 00188 aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; 00189 aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; 00190 aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; 00191 00192 bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; 00193 bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; 00194 bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; 00195 bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; 00196 00197 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); 00198 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); 00199 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); 00200 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); 00201 00202 cVal1 = _mm_or_ps(cVal1, cVal2); 00203 cVal3 = _mm_or_ps(cVal3, cVal4); 00204 cVal1 = _mm_or_ps(cVal1, cVal3); 00205 00206 dotProdVal = _mm_add_ps(dotProdVal, cVal1); 00207 } 00208 00209 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00210 _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector 00211 00212 dotProduct = dotProductVector[0]; 00213 dotProduct += dotProductVector[1]; 00214 dotProduct += dotProductVector[2]; 00215 dotProduct += dotProductVector[3]; 00216 00217 number = sixteenthPoints * 16; 00218 for(;number < num_points; number++){ 00219 dotProduct += ((*aPtr++) * (*bPtr++)); 00220 } 00221 00222 *result = dotProduct; 00223 } 00224 00225 #endif /*LV_HAVE_SSE4_1*/ 00226 00227 #ifdef LV_HAVE_AVX 00228 00229 #include <immintrin.h> 00230 00231 static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { 00232 00233 unsigned int number = 0; 00234 const unsigned int sixteenthPoints = num_points / 16; 00235 00236 float dotProduct = 0; 00237 const float* aPtr = input; 00238 const float* bPtr = taps; 00239 00240 __m256 a0Val, a1Val; 00241 __m256 b0Val, b1Val; 00242 __m256 c0Val, c1Val; 00243 00244 __m256 dotProdVal0 = _mm256_setzero_ps(); 00245 __m256 dotProdVal1 = _mm256_setzero_ps(); 00246 00247 for(;number < sixteenthPoints; number++){ 00248 00249 a0Val = _mm256_loadu_ps(aPtr); 00250 a1Val = _mm256_loadu_ps(aPtr+8); 00251 b0Val = _mm256_loadu_ps(bPtr); 00252 b1Val = _mm256_loadu_ps(bPtr+8); 00253 00254 c0Val = _mm256_mul_ps(a0Val, b0Val); 00255 c1Val = _mm256_mul_ps(a1Val, b1Val); 00256 00257 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); 00258 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); 00259 00260 aPtr += 16; 00261 bPtr += 16; 00262 } 00263 00264 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); 00265 00266 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; 00267 00268 _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector 00269 00270 dotProduct = dotProductVector[0]; 00271 dotProduct += dotProductVector[1]; 00272 dotProduct += dotProductVector[2]; 00273 dotProduct += dotProductVector[3]; 00274 dotProduct += dotProductVector[4]; 00275 dotProduct += dotProductVector[5]; 00276 dotProduct += dotProductVector[6]; 00277 dotProduct += dotProductVector[7]; 00278 00279 number = sixteenthPoints*16; 00280 for(;number < num_points; number++){ 00281 dotProduct += ((*aPtr++) * (*bPtr++)); 00282 } 00283 00284 *result = dotProduct; 00285 00286 } 00287 00288 #endif /*LV_HAVE_AVX*/ 00289 00290 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/