1 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
2 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
11 static inline void volk_32fc_32f_dot_prod_32fc_generic(
lv_32fc_t* result,
const lv_32fc_t* input,
const float *
taps,
unsigned int num_points) {
14 float *realpt = &res[0], *imagpt = &res[1];
15 const float* aPtr = (
float*)input;
16 const float* bPtr=
taps;
17 unsigned int number = 0;
22 for(number = 0; number < num_points; number++){
23 *realpt += ((*aPtr++) * (*bPtr));
24 *imagpt += ((*aPtr++) * (*bPtr++));
35 #include <immintrin.h>
37 static inline void volk_32fc_32f_dot_prod_32fc_a_avx(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
39 unsigned int number = 0;
40 const unsigned int sixteenthPoints = num_points / 16;
43 float *realpt = &res[0], *imagpt = &res[1];
44 const float* aPtr = (
float*)input;
45 const float* bPtr =
taps;
47 __m256 a0Val, a1Val, a2Val, a3Val;
48 __m256 b0Val, b1Val, b2Val, b3Val;
49 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
50 __m256 c0Val, c1Val, c2Val, c3Val;
52 __m256 dotProdVal0 = _mm256_setzero_ps();
53 __m256 dotProdVal1 = _mm256_setzero_ps();
54 __m256 dotProdVal2 = _mm256_setzero_ps();
55 __m256 dotProdVal3 = _mm256_setzero_ps();
57 for(;number < sixteenthPoints; number++){
59 a0Val = _mm256_load_ps(aPtr);
60 a1Val = _mm256_load_ps(aPtr+8);
61 a2Val = _mm256_load_ps(aPtr+16);
62 a3Val = _mm256_load_ps(aPtr+24);
64 x0Val = _mm256_load_ps(bPtr);
65 x1Val = _mm256_load_ps(bPtr+8);
66 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
67 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
68 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
69 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
72 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
73 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
74 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
75 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
77 c0Val = _mm256_mul_ps(a0Val, b0Val);
78 c1Val = _mm256_mul_ps(a1Val, b1Val);
79 c2Val = _mm256_mul_ps(a2Val, b2Val);
80 c3Val = _mm256_mul_ps(a3Val, b3Val);
82 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
83 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
84 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
85 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
91 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
92 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
93 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
97 _mm256_store_ps(dotProductVector,dotProdVal0);
99 *realpt = dotProductVector[0];
100 *imagpt = dotProductVector[1];
101 *realpt += dotProductVector[2];
102 *imagpt += dotProductVector[3];
103 *realpt += dotProductVector[4];
104 *imagpt += dotProductVector[5];
105 *realpt += dotProductVector[6];
106 *imagpt += dotProductVector[7];
108 number = sixteenthPoints*16;
109 for(;number < num_points; number++){
110 *realpt += ((*aPtr++) * (*bPtr));
111 *imagpt += ((*aPtr++) * (*bPtr++));
125 static inline void volk_32fc_32f_dot_prod_32fc_a_sse(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
127 unsigned int number = 0;
128 const unsigned int sixteenthPoints = num_points / 8;
131 float *realpt = &res[0], *imagpt = &res[1];
132 const float* aPtr = (
float*)input;
133 const float* bPtr =
taps;
135 __m128 a0Val, a1Val, a2Val, a3Val;
136 __m128 b0Val, b1Val, b2Val, b3Val;
137 __m128 x0Val, x1Val, x2Val, x3Val;
138 __m128 c0Val, c1Val, c2Val, c3Val;
140 __m128 dotProdVal0 = _mm_setzero_ps();
141 __m128 dotProdVal1 = _mm_setzero_ps();
142 __m128 dotProdVal2 = _mm_setzero_ps();
143 __m128 dotProdVal3 = _mm_setzero_ps();
145 for(;number < sixteenthPoints; number++){
147 a0Val = _mm_load_ps(aPtr);
148 a1Val = _mm_load_ps(aPtr+4);
149 a2Val = _mm_load_ps(aPtr+8);
150 a3Val = _mm_load_ps(aPtr+12);
152 x0Val = _mm_load_ps(bPtr);
153 x1Val = _mm_load_ps(bPtr);
154 x2Val = _mm_load_ps(bPtr+4);
155 x3Val = _mm_load_ps(bPtr+4);
156 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
157 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
158 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
159 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
161 c0Val = _mm_mul_ps(a0Val, b0Val);
162 c1Val = _mm_mul_ps(a1Val, b1Val);
163 c2Val = _mm_mul_ps(a2Val, b2Val);
164 c3Val = _mm_mul_ps(a3Val, b3Val);
166 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
167 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
168 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
169 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
175 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
176 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
177 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
181 _mm_store_ps(dotProductVector,dotProdVal0);
183 *realpt = dotProductVector[0];
184 *imagpt = dotProductVector[1];
185 *realpt += dotProductVector[2];
186 *imagpt += dotProductVector[3];
188 number = sixteenthPoints*8;
189 for(;number < num_points; number++){
190 *realpt += ((*aPtr++) * (*bPtr));
191 *imagpt += ((*aPtr++) * (*bPtr++));
203 #include <immintrin.h>
205 static inline void volk_32fc_32f_dot_prod_32fc_u_avx(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
207 unsigned int number = 0;
208 const unsigned int sixteenthPoints = num_points / 16;
211 float *realpt = &res[0], *imagpt = &res[1];
212 const float* aPtr = (
float*)input;
213 const float* bPtr =
taps;
215 __m256 a0Val, a1Val, a2Val, a3Val;
216 __m256 b0Val, b1Val, b2Val, b3Val;
217 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
218 __m256 c0Val, c1Val, c2Val, c3Val;
220 __m256 dotProdVal0 = _mm256_setzero_ps();
221 __m256 dotProdVal1 = _mm256_setzero_ps();
222 __m256 dotProdVal2 = _mm256_setzero_ps();
223 __m256 dotProdVal3 = _mm256_setzero_ps();
225 for(;number < sixteenthPoints; number++){
227 a0Val = _mm256_loadu_ps(aPtr);
228 a1Val = _mm256_loadu_ps(aPtr+8);
229 a2Val = _mm256_loadu_ps(aPtr+16);
230 a3Val = _mm256_loadu_ps(aPtr+24);
232 x0Val = _mm256_loadu_ps(bPtr);
233 x1Val = _mm256_loadu_ps(bPtr+8);
234 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
235 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
236 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
237 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
240 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
241 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
242 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
243 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
245 c0Val = _mm256_mul_ps(a0Val, b0Val);
246 c1Val = _mm256_mul_ps(a1Val, b1Val);
247 c2Val = _mm256_mul_ps(a2Val, b2Val);
248 c3Val = _mm256_mul_ps(a3Val, b3Val);
250 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
251 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
252 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
253 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
259 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
260 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
261 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
265 _mm256_store_ps(dotProductVector,dotProdVal0);
267 *realpt = dotProductVector[0];
268 *imagpt = dotProductVector[1];
269 *realpt += dotProductVector[2];
270 *imagpt += dotProductVector[3];
271 *realpt += dotProductVector[4];
272 *imagpt += dotProductVector[5];
273 *realpt += dotProductVector[6];
274 *imagpt += dotProductVector[7];
276 number = sixteenthPoints*16;
277 for(;number < num_points; number++){
278 *realpt += ((*aPtr++) * (*bPtr));
279 *imagpt += ((*aPtr++) * (*bPtr++));
287 #include <arm_neon.h>
289 static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll (
lv_32fc_t* __restrict result,
const lv_32fc_t* __restrict input,
const float* __restrict taps,
unsigned int num_points) {
292 const unsigned int quarterPoints = num_points / 8;
295 float *realpt = &res[0], *imagpt = &res[1];
296 const float* inputPtr = (
float*)input;
297 const float* tapsPtr =
taps;
298 float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
299 float accVector_real[4];
300 float accVector_imag[4];
302 float32x4x2_t inputVector0, inputVector1;
303 float32x4_t tapsVector0, tapsVector1;
304 float32x4_t tmp_real0, tmp_imag0;
305 float32x4_t tmp_real1, tmp_imag1;
306 float32x4_t real_accumulator0, imag_accumulator0;
307 float32x4_t real_accumulator1, imag_accumulator1;
311 real_accumulator0 = vld1q_f32( zero );
312 imag_accumulator0 = vld1q_f32( zero );
313 real_accumulator1 = vld1q_f32( zero );
314 imag_accumulator1 = vld1q_f32( zero );
316 for(number=0 ;number < quarterPoints; number++){
318 tapsVector0 = vld1q_f32(tapsPtr );
319 tapsVector1 = vld1q_f32(tapsPtr+4 );
322 inputVector0 = vld2q_f32(inputPtr );
323 inputVector1 = vld2q_f32(inputPtr+8 );
326 tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
327 tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
329 tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
330 tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
332 real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
333 imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
335 real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
336 imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
342 real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
343 imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
346 vst1q_f32(accVector_real, real_accumulator0);
347 vst1q_f32(accVector_imag, imag_accumulator0);
348 *realpt = accVector_real[0] + accVector_real[1] +
349 accVector_real[2] + accVector_real[3] ;
351 *imagpt = accVector_imag[0] + accVector_imag[1] +
352 accVector_imag[2] + accVector_imag[3] ;
355 for(number=quarterPoints*8; number < num_points; number++){
356 *realpt += ((*inputPtr++) * (*tapsPtr));
357 *imagpt += ((*inputPtr++) * (*tapsPtr++));
366 #include <arm_neon.h>
368 static inline void volk_32fc_32f_dot_prod_32fc_a_neon (
lv_32fc_t* __restrict result,
const lv_32fc_t* __restrict input,
const float* __restrict taps,
unsigned int num_points) {
371 const unsigned int quarterPoints = num_points / 4;
374 float *realpt = &res[0], *imagpt = &res[1];
375 const float* inputPtr = (
float*)input;
376 const float* tapsPtr =
taps;
377 float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
378 float accVector_real[4];
379 float accVector_imag[4];
381 float32x4x2_t inputVector;
382 float32x4_t tapsVector;
383 float32x4_t tmp_real, tmp_imag;
384 float32x4_t real_accumulator, imag_accumulator;
389 real_accumulator = vld1q_f32( zero );
390 imag_accumulator = vld1q_f32( zero );
392 for(number=0 ;number < quarterPoints; number++){
395 tapsVector = vld1q_f32(tapsPtr );
398 inputVector = vld2q_f32(inputPtr );
400 tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
401 tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
403 real_accumulator = vaddq_f32(real_accumulator, tmp_real);
404 imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
413 vst1q_f32(accVector_real, real_accumulator);
414 vst1q_f32(accVector_imag, imag_accumulator);
415 *realpt = accVector_real[0] + accVector_real[1] +
416 accVector_real[2] + accVector_real[3] ;
418 *imagpt = accVector_imag[0] + accVector_imag[1] +
419 accVector_imag[2] + accVector_imag[3] ;
422 for(number=quarterPoints*4; number < num_points; number++){
423 *realpt += ((*inputPtr++) * (*tapsPtr));
424 *imagpt += ((*inputPtr++) * (*tapsPtr++));
433 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm (
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points);
437 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline (
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points);
442 static inline void volk_32fc_32f_dot_prod_32fc_u_sse(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
444 unsigned int number = 0;
445 const unsigned int sixteenthPoints = num_points / 8;
448 float *realpt = &res[0], *imagpt = &res[1];
449 const float* aPtr = (
float*)input;
450 const float* bPtr =
taps;
452 __m128 a0Val, a1Val, a2Val, a3Val;
453 __m128 b0Val, b1Val, b2Val, b3Val;
454 __m128 x0Val, x1Val, x2Val, x3Val;
455 __m128 c0Val, c1Val, c2Val, c3Val;
457 __m128 dotProdVal0 = _mm_setzero_ps();
458 __m128 dotProdVal1 = _mm_setzero_ps();
459 __m128 dotProdVal2 = _mm_setzero_ps();
460 __m128 dotProdVal3 = _mm_setzero_ps();
462 for(;number < sixteenthPoints; number++){
464 a0Val = _mm_loadu_ps(aPtr);
465 a1Val = _mm_loadu_ps(aPtr+4);
466 a2Val = _mm_loadu_ps(aPtr+8);
467 a3Val = _mm_loadu_ps(aPtr+12);
469 x0Val = _mm_loadu_ps(bPtr);
470 x1Val = _mm_loadu_ps(bPtr);
471 x2Val = _mm_loadu_ps(bPtr+4);
472 x3Val = _mm_loadu_ps(bPtr+4);
473 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
474 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
475 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
476 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
478 c0Val = _mm_mul_ps(a0Val, b0Val);
479 c1Val = _mm_mul_ps(a1Val, b1Val);
480 c2Val = _mm_mul_ps(a2Val, b2Val);
481 c3Val = _mm_mul_ps(a3Val, b3Val);
483 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
484 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
485 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
486 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
492 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
493 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
494 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
498 _mm_store_ps(dotProductVector,dotProdVal0);
500 *realpt = dotProductVector[0];
501 *imagpt = dotProductVector[1];
502 *realpt += dotProductVector[2];
503 *imagpt += dotProductVector[3];
505 number = sixteenthPoints*8;
506 for(;number < num_points; number++){
507 *realpt += ((*aPtr++) * (*bPtr));
508 *imagpt += ((*aPtr++) * (*bPtr++));
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56