1 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
2 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
/*!
 * \brief Computes the dot product of two float vectors with a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; num_points floats are read.
 * \param taps       Second operand; num_points floats are read.
 * \param num_points Number of element pairs to accumulate; with 0 the
 *                   stored result is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float *result, const float *input, const float *taps, unsigned int num_points) {
  float dotProduct = 0;
  const float *aPtr = input;
  const float *bPtr = taps;
  unsigned int number = 0;

  for (number = 0; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
/*!
 * \brief Dot product of two float vectors using SSE with unaligned loads.
 *
 * Processes 16 element pairs per iteration with four independent vector
 * accumulators, then finishes the remaining (num_points % 16) pairs with
 * a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; num_points floats are read.
 * \param taps       Second operand; num_points floats are read.
 * \param num_points Number of element pairs to accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse(float *result, const float *input, const float *taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float *aPtr = input;
  const float *bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr + 4);
    a2Val = _mm_loadu_ps(aPtr + 8);
    a3Val = _mm_loadu_ps(aPtr + 12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr + 4);
    b2Val = _mm_loadu_ps(bPtr + 8);
    b3Val = _mm_loadu_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    /* Step to the next 16 pairs; the scalar tail below continues from here. */
    aPtr += 16;
    bPtr += 16;
  }

  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store, so the scratch buffer needs no alignment attribute. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
100 #include <pmmintrin.h>
/*!
 * \brief Dot product of two float vectors, SSE3 build variant, unaligned loads.
 *
 * Processes 16 element pairs per iteration with four independent vector
 * accumulators, then finishes the remaining (num_points % 16) pairs with
 * a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; num_points floats are read.
 * \param taps       Second operand; num_points floats are read.
 * \param num_points Number of element pairs to accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse3(float *result, const float *input, const float *taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float *aPtr = input;
  const float *bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr + 4);
    a2Val = _mm_loadu_ps(aPtr + 8);
    a3Val = _mm_loadu_ps(aPtr + 12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr + 4);
    b2Val = _mm_loadu_ps(bPtr + 8);
    b3Val = _mm_loadu_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

    /* Step to the next 16 pairs; the scalar tail below continues from here. */
    aPtr += 16;
    bPtr += 16;
  }

  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store, so the scratch buffer needs no alignment attribute. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
166 #ifdef LV_HAVE_SSE4_1
168 #include <smmintrin.h>
/*!
 * \brief Dot product of two float vectors using the SSE4.1 DPPS
 *        instruction with unaligned loads.
 *
 * Each iteration consumes 16 element pairs as four groups of four. Every
 * group is reduced by _mm_dp_ps into a different lane of its result, so
 * the four results can be merged with a bitwise OR before being added to
 * the running vector sum. The remaining (num_points % 16) pairs are
 * handled by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; num_points floats are read.
 * \param taps       Second operand; num_points floats are read.
 * \param num_points Number of element pairs to accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float *result, const float *input, const float *taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float *aPtr = input;
  const float *bPtr = taps;

  __m128 aVal1, bVal1, cVal1;
  __m128 aVal2, bVal2, cVal2;
  __m128 aVal3, bVal3, cVal3;
  __m128 aVal4, bVal4, cVal4;

  __m128 dotProdVal = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;

    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;

    /* High nibble 0xF: multiply all four lanes. Low nibble: the single
     * output lane that receives the sum (the other lanes are zeroed). */
    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

    /* The four results occupy disjoint lanes, so OR merges them. */
    cVal1 = _mm_or_ps(cVal1, cVal2);
    cVal3 = _mm_or_ps(cVal3, cVal4);
    cVal1 = _mm_or_ps(cVal1, cVal3);

    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
  }

  /* Unaligned store, so the scratch buffer needs no alignment attribute. */
  _mm_storeu_ps(dotProductVector, dotProdVal);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
229 #include <immintrin.h>
/*!
 * \brief Dot product of two float vectors using AVX with unaligned loads.
 *
 * Processes 16 element pairs per iteration with two independent 8-lane
 * accumulators, then finishes the remaining (num_points % 16) pairs with
 * a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; num_points floats are read.
 * \param taps       Second operand; num_points floats are read.
 * \param num_points Number of element pairs to accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_u_avx(float *result, const float *input, const float *taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[8];
  const float *aPtr = input;
  const float *bPtr = taps;

  __m256 a0Val, a1Val;
  __m256 b0Val, b1Val;
  __m256 c0Val, c1Val;

  __m256 dotProdVal0 = _mm256_setzero_ps();
  __m256 dotProdVal1 = _mm256_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm256_loadu_ps(aPtr);
    a1Val = _mm256_loadu_ps(aPtr + 8);
    b0Val = _mm256_loadu_ps(bPtr);
    b1Val = _mm256_loadu_ps(bPtr + 8);

    c0Val = _mm256_mul_ps(a0Val, b0Val);
    c1Val = _mm256_mul_ps(a1Val, b1Val);

    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);

    /* Step to the next 16 pairs; the scalar tail below continues from here. */
    aPtr += 16;
    bPtr += 16;
  }

  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);

  _mm256_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];
  dotProduct += dotProductVector[4];
  dotProduct += dotProductVector[5];
  dotProduct += dotProductVector[6];
  dotProduct += dotProductVector[7];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
291 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
292 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
298 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Computes the dot product of two float vectors with a scalar loop
 *        (entry of the aligned kernel family; the loop itself places no
 *        alignment requirement on its inputs).
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; num_points floats are read.
 * \param taps       Second operand; num_points floats are read.
 * \param num_points Number of element pairs to accumulate; with 0 the
 *                   stored result is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float *result, const float *input, const float *taps, unsigned int num_points) {
  float dotProduct = 0;
  const float *aPtr = input;
  const float *bPtr = taps;
  unsigned int number = 0;

  for (number = 0; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
/*!
 * \brief Dot product of two float vectors using SSE with aligned loads.
 *
 * Processes 16 element pairs per iteration with four independent vector
 * accumulators, then finishes the remaining (num_points % 16) pairs with
 * a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; must be 16-byte aligned because it is
 *                   read with _mm_load_ps.
 * \param taps       Second operand; must be 16-byte aligned because it is
 *                   read with _mm_load_ps.
 * \param num_points Number of element pairs to accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse(float *result, const float *input, const float *taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float *aPtr = input;
  const float *bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_load_ps(aPtr);
    a1Val = _mm_load_ps(aPtr + 4);
    a2Val = _mm_load_ps(aPtr + 8);
    a3Val = _mm_load_ps(aPtr + 12);
    b0Val = _mm_load_ps(bPtr);
    b1Val = _mm_load_ps(bPtr + 4);
    b2Val = _mm_load_ps(bPtr + 8);
    b3Val = _mm_load_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    /* Step to the next 16 pairs; the scalar tail below continues from here. */
    aPtr += 16;
    bPtr += 16;
  }

  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store, so the local scratch buffer needs no alignment
   * attribute; only the caller's inputs carry the alignment contract. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
390 #include <pmmintrin.h>
/*!
 * \brief Dot product of two float vectors, SSE3 build variant, aligned loads.
 *
 * Processes 16 element pairs per iteration with four independent vector
 * accumulators, then finishes the remaining (num_points % 16) pairs with
 * a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; must be 16-byte aligned because it is
 *                   read with _mm_load_ps.
 * \param taps       Second operand; must be 16-byte aligned because it is
 *                   read with _mm_load_ps.
 * \param num_points Number of element pairs to accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse3(float *result, const float *input, const float *taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float *aPtr = input;
  const float *bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_load_ps(aPtr);
    a1Val = _mm_load_ps(aPtr + 4);
    a2Val = _mm_load_ps(aPtr + 8);
    a3Val = _mm_load_ps(aPtr + 12);
    b0Val = _mm_load_ps(bPtr);
    b1Val = _mm_load_ps(bPtr + 4);
    b2Val = _mm_load_ps(bPtr + 8);
    b3Val = _mm_load_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

    /* Step to the next 16 pairs; the scalar tail below continues from here. */
    aPtr += 16;
    bPtr += 16;
  }

  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store, so the local scratch buffer needs no alignment
   * attribute; only the caller's inputs carry the alignment contract. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
456 #ifdef LV_HAVE_SSE4_1
458 #include <smmintrin.h>
/*!
 * \brief Dot product of two float vectors using the SSE4.1 DPPS
 *        instruction with aligned loads.
 *
 * Each iteration consumes 16 element pairs as four groups of four. Every
 * group is reduced by _mm_dp_ps into a different lane of its result, so
 * the four results can be merged with a bitwise OR before being added to
 * the running vector sum. The remaining (num_points % 16) pairs are
 * handled by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; must be 16-byte aligned because it is
 *                   read with _mm_load_ps.
 * \param taps       Second operand; must be 16-byte aligned because it is
 *                   read with _mm_load_ps.
 * \param num_points Number of element pairs to accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float *result, const float *input, const float *taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float *aPtr = input;
  const float *bPtr = taps;

  __m128 aVal1, bVal1, cVal1;
  __m128 aVal2, bVal2, cVal2;
  __m128 aVal3, bVal3, cVal3;
  __m128 aVal4, bVal4, cVal4;

  __m128 dotProdVal = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
    aVal4 = _mm_load_ps(aPtr); aPtr += 4;

    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
    bVal4 = _mm_load_ps(bPtr); bPtr += 4;

    /* High nibble 0xF: multiply all four lanes. Low nibble: the single
     * output lane that receives the sum (the other lanes are zeroed). */
    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

    /* The four results occupy disjoint lanes, so OR merges them. */
    cVal1 = _mm_or_ps(cVal1, cVal2);
    cVal3 = _mm_or_ps(cVal3, cVal4);
    cVal1 = _mm_or_ps(cVal1, cVal3);

    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
  }

  /* Unaligned store, so the local scratch buffer needs no alignment
   * attribute; only the caller's inputs carry the alignment contract. */
  _mm_storeu_ps(dotProductVector, dotProdVal);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
519 #include <immintrin.h>
/*!
 * \brief Dot product of two float vectors using AVX with aligned loads.
 *
 * Processes 16 element pairs per iteration with two independent 8-lane
 * accumulators, then finishes the remaining (num_points % 16) pairs with
 * a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; must be 32-byte aligned because it is
 *                   read with _mm256_load_ps.
 * \param taps       Second operand; must be 32-byte aligned because it is
 *                   read with _mm256_load_ps.
 * \param num_points Number of element pairs to accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_a_avx(float *result, const float *input, const float *taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[8];
  const float *aPtr = input;
  const float *bPtr = taps;

  __m256 a0Val, a1Val;
  __m256 b0Val, b1Val;
  __m256 c0Val, c1Val;

  __m256 dotProdVal0 = _mm256_setzero_ps();
  __m256 dotProdVal1 = _mm256_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm256_load_ps(aPtr);
    a1Val = _mm256_load_ps(aPtr + 8);
    b0Val = _mm256_load_ps(bPtr);
    b1Val = _mm256_load_ps(bPtr + 8);

    c0Val = _mm256_mul_ps(a0Val, b0Val);
    c1Val = _mm256_mul_ps(a1Val, b1Val);

    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);

    /* Step to the next 16 pairs; the scalar tail below continues from here. */
    aPtr += 16;
    bPtr += 16;
  }

  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);

  /* Unaligned store, so the local scratch buffer needs no alignment
   * attribute; only the caller's inputs carry the alignment contract. */
  _mm256_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];
  dotProduct += dotProductVector[4];
  dotProduct += dotProductVector[5];
  dotProduct += dotProductVector[6];
  dotProduct += dotProductVector[7];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
581 #include <arm_neon.h>
/*!
 * \brief Dot product of two float vectors using NEON, 16 pairs per iteration.
 *
 * vld4q_f32 loads 16 floats de-interleaved into four vectors. Both
 * operands are de-interleaved the same way, so element pairing is
 * preserved. Four independent accumulators are updated with
 * multiply-accumulate and summed at the end; the remaining
 * (num_points % 16) pairs are handled by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; num_points floats are read.
 * \param taps       Second operand; num_points floats are read.
 * \param num_points Number of element pairs to accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_neonopts(float *result, const float *input, const float *taps, unsigned int num_points) {
  const unsigned int sixteenth_points = num_points / 16;
  float dotProduct = 0;
  float accumulator[4];
  const float *aPtr = input;
  const float *bPtr = taps;
  unsigned int number = 0;

  float32x4x4_t a_val, b_val, accumulator0;
  accumulator0.val[0] = vdupq_n_f32(0);
  accumulator0.val[1] = vdupq_n_f32(0);
  accumulator0.val[2] = vdupq_n_f32(0);
  accumulator0.val[3] = vdupq_n_f32(0);

  for (number = 0; number < sixteenth_points; ++number) {
    a_val = vld4q_f32(aPtr);
    b_val = vld4q_f32(bPtr);
    accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
    accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
    accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
    accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);

    /* Step to the next 16 pairs; the scalar tail below continues from here. */
    aPtr += 16;
    bPtr += 16;
  }

  accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
  accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
  accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);

  vst1q_f32(accumulator, accumulator0.val[0]);
  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];

  for (number = sixteenth_points * 16; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
/*!
 * \brief Dot product of two float vectors using NEON, 8 pairs per iteration.
 *
 * vld2q_f32 loads 8 floats de-interleaved into two vectors. Both operands
 * are de-interleaved the same way, so element pairing is preserved. Two
 * independent accumulators are updated with multiply-accumulate and
 * summed at the end; the remaining (num_points % 8) pairs are handled by
 * a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i] over
 *                   i in [0, num_points).
 * \param input      First operand; num_points floats are read.
 * \param taps       Second operand; num_points floats are read.
 * \param num_points Number of element pairs to accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_neon(float *result, const float *input, const float *taps, unsigned int num_points) {
  const unsigned int eighth_points = num_points / 8;
  float dotProduct = 0;
  float accumulator[4];
  const float *aPtr = input;
  const float *bPtr = taps;
  unsigned int number = 0;

  float32x4x2_t a_val, b_val, accumulator_val;
  accumulator_val.val[0] = vdupq_n_f32(0);
  accumulator_val.val[1] = vdupq_n_f32(0);

  for (number = 0; number < eighth_points; ++number) {
    a_val = vld2q_f32(aPtr);
    b_val = vld2q_f32(bPtr);
    accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
    accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);

    /* Step to the next 8 pairs; the scalar tail below continues from here. */
    aPtr += 8;
    bPtr += 8;
  }

  accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);

  vst1q_f32(accumulator, accumulator_val.val[0]);
  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];

  for (number = eighth_points * 8; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
/*!
 * \brief Declaration of a dot-product kernel whose definition lives
 *        outside this file.
 *
 * NOTE(review): by analogy with the other kernels in this file, cVector
 * presumably receives the scalar result and aVector/bVector are the two
 * operands of num_points floats each — confirm against the implementation.
 */
extern void volk_32f_x2_dot_prod_32f_neonasm(float *cVector, const float *aVector, const float *bVector, unsigned int num_points);
/*!
 * \brief Declaration of a second dot-product kernel whose definition
 *        lives outside this file.
 *
 * NOTE(review): by analogy with the other kernels in this file, cVector
 * presumably receives the scalar result and aVector/bVector are the two
 * operands of num_points floats each — confirm against the implementation.
 */
extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float *cVector, const float *aVector, const float *bVector, unsigned int num_points);
/*
 * Cross-references carried over from the generated documentation:
 *   #define __VOLK_ATTR_ALIGNED(x)
 *       Definition: volk_common.h:27
 *   static const float taps[NSTEPS+1][NTAPS]
 *       Definition: interpolator_taps.h:9
 */