GNU Radio Manual and C++ API Reference  3.7.5.1
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
2 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
3 
4 #include <volk/volk_common.h>
5 #include<stdio.h>
6 
7 
8 #ifdef LV_HAVE_GENERIC
9 
10 
/*
 * Scalar reference kernel for the float dot product.
 *
 * Adds input[i] * taps[i] for i = 0 .. num_points-1, in index order, into a
 * single float accumulator and writes the total to *result. When num_points
 * is 0 the value written is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float sum = 0;
  unsigned int i;

  for(i = 0; i < num_points; i++){
    sum += (input[i] * taps[i]);
  }

  *result = sum;
}
24 
25 #endif /*LV_HAVE_GENERIC*/
26 
27 
28 #ifdef LV_HAVE_SSE
29 
30 
/*
 * SSE dot product of two float arrays; neither input needs any particular
 * alignment (all vector loads are _mm_loadu_ps).
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input      - first operand array, at least num_points floats
 * taps       - second operand array, at least num_points floats
 * num_points - number of elements to process
 *
 * The bulk of the data is processed 16 floats per iteration using four
 * independent vector accumulators; the remaining (num_points % 16) elements
 * are handled by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {

  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for(;number < sixteenthPoints; number++){

    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr+4);
    a2Val = _mm_loadu_ps(aPtr+8);
    a3Val = _mm_loadu_ps(aPtr+12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr+4);
    b2Val = _mm_loadu_ps(bPtr+8);
    b3Val = _mm_loadu_ps(bPtr+12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four partial accumulators into one vector. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /*
   * Spill the vector with an unaligned store, as the AVX unaligned kernel
   * does. _mm_storeu_ps has no alignment requirement, so the scratch buffer
   * is valid even where no alignment attribute is available for locals.
   */
  float dotProductVector[4];

  _mm_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail: elements that did not fill a whole 16-float block. */
  number = sixteenthPoints*16;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;

}
95 
96 #endif /*LV_HAVE_SSE*/
97 
98 #ifdef LV_HAVE_SSE3
99 
100 #include <pmmintrin.h>
101 
/*
 * Dot product of two float arrays for the SSE3 build; neither input needs
 * any particular alignment (all vector loads are _mm_loadu_ps).
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input      - first operand array, at least num_points floats
 * taps       - second operand array, at least num_points floats
 * num_points - number of elements to process
 *
 * Whole 16-float blocks go through four independent vector accumulators;
 * the remaining (num_points % 16) elements are added by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for(;number < sixteenthPoints; number++){

    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr+4);
    a2Val = _mm_loadu_ps(aPtr+8);
    a3Val = _mm_loadu_ps(aPtr+12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr+4);
    b2Val = _mm_loadu_ps(bPtr+8);
    b3Val = _mm_loadu_ps(bPtr+12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four partial accumulators into one vector. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /*
   * Unaligned store, matching the AVX unaligned kernel: the scratch buffer
   * then does not depend on an alignment attribute being honoured for
   * local variables.
   */
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail: elements that did not fill a whole 16-float block. */
  number = sixteenthPoints*16;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
163 
164 #endif /*LV_HAVE_SSE3*/
165 
166 #ifdef LV_HAVE_SSE4_1
167 
168 #include <smmintrin.h>
169 
170 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
171  unsigned int number = 0;
172  const unsigned int sixteenthPoints = num_points / 16;
173 
174  float dotProduct = 0;
175  const float* aPtr = input;
176  const float* bPtr = taps;
177 
178  __m128 aVal1, bVal1, cVal1;
179  __m128 aVal2, bVal2, cVal2;
180  __m128 aVal3, bVal3, cVal3;
181  __m128 aVal4, bVal4, cVal4;
182 
183  __m128 dotProdVal = _mm_setzero_ps();
184 
185  for(;number < sixteenthPoints; number++){
186 
187  aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
188  aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
189  aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
190  aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
191 
192  bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
193  bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
194  bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
195  bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
196 
197  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
198  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
199  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
200  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
201 
202  cVal1 = _mm_or_ps(cVal1, cVal2);
203  cVal3 = _mm_or_ps(cVal3, cVal4);
204  cVal1 = _mm_or_ps(cVal1, cVal3);
205 
206  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
207  }
208 
209  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
210  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
211 
212  dotProduct = dotProductVector[0];
213  dotProduct += dotProductVector[1];
214  dotProduct += dotProductVector[2];
215  dotProduct += dotProductVector[3];
216 
217  number = sixteenthPoints * 16;
218  for(;number < num_points; number++){
219  dotProduct += ((*aPtr++) * (*bPtr++));
220  }
221 
222  *result = dotProduct;
223 }
224 
225 #endif /*LV_HAVE_SSE4_1*/
226 
227 #ifdef LV_HAVE_AVX
228 
229 #include <immintrin.h>
230 
231 static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
232 
233  unsigned int number = 0;
234  const unsigned int sixteenthPoints = num_points / 16;
235 
236  float dotProduct = 0;
237  const float* aPtr = input;
238  const float* bPtr = taps;
239 
240  __m256 a0Val, a1Val;
241  __m256 b0Val, b1Val;
242  __m256 c0Val, c1Val;
243 
244  __m256 dotProdVal0 = _mm256_setzero_ps();
245  __m256 dotProdVal1 = _mm256_setzero_ps();
246 
247  for(;number < sixteenthPoints; number++){
248 
249  a0Val = _mm256_loadu_ps(aPtr);
250  a1Val = _mm256_loadu_ps(aPtr+8);
251  b0Val = _mm256_loadu_ps(bPtr);
252  b1Val = _mm256_loadu_ps(bPtr+8);
253 
254  c0Val = _mm256_mul_ps(a0Val, b0Val);
255  c1Val = _mm256_mul_ps(a1Val, b1Val);
256 
257  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
258  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
259 
260  aPtr += 16;
261  bPtr += 16;
262  }
263 
264  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
265 
266  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
267 
268  _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
269 
270  dotProduct = dotProductVector[0];
271  dotProduct += dotProductVector[1];
272  dotProduct += dotProductVector[2];
273  dotProduct += dotProductVector[3];
274  dotProduct += dotProductVector[4];
275  dotProduct += dotProductVector[5];
276  dotProduct += dotProductVector[6];
277  dotProduct += dotProductVector[7];
278 
279  number = sixteenthPoints*16;
280  for(;number < num_points; number++){
281  dotProduct += ((*aPtr++) * (*bPtr++));
282  }
283 
284  *result = dotProduct;
285 
286 }
287 
288 #endif /*LV_HAVE_AVX*/
289 
290 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
291 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
292 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
293 
294 #include <volk/volk_common.h>
295 #include<stdio.h>
296 
297 
298 #ifdef LV_HAVE_GENERIC
299 
300 
/*
 * Scalar dot product: walks both arrays front to back, adding each
 * input[i] * taps[i] product to a float accumulator, and stores the total
 * in *result. Nothing is read when num_points is 0 and *result becomes 0.
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  const float* in = input;
  const float* tp = taps;
  const float* const inEnd = input + num_points;
  float sum = 0;

  while(in != inEnd){
    sum += ((*in) * (*tp));
    ++in;
    ++tp;
  }

  *result = sum;
}
314 
315 #endif /*LV_HAVE_GENERIC*/
316 
317 
318 #ifdef LV_HAVE_SSE
319 
320 
321 static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
322 
323  unsigned int number = 0;
324  const unsigned int sixteenthPoints = num_points / 16;
325 
326  float dotProduct = 0;
327  const float* aPtr = input;
328  const float* bPtr = taps;
329 
330  __m128 a0Val, a1Val, a2Val, a3Val;
331  __m128 b0Val, b1Val, b2Val, b3Val;
332  __m128 c0Val, c1Val, c2Val, c3Val;
333 
334  __m128 dotProdVal0 = _mm_setzero_ps();
335  __m128 dotProdVal1 = _mm_setzero_ps();
336  __m128 dotProdVal2 = _mm_setzero_ps();
337  __m128 dotProdVal3 = _mm_setzero_ps();
338 
339  for(;number < sixteenthPoints; number++){
340 
341  a0Val = _mm_load_ps(aPtr);
342  a1Val = _mm_load_ps(aPtr+4);
343  a2Val = _mm_load_ps(aPtr+8);
344  a3Val = _mm_load_ps(aPtr+12);
345  b0Val = _mm_load_ps(bPtr);
346  b1Val = _mm_load_ps(bPtr+4);
347  b2Val = _mm_load_ps(bPtr+8);
348  b3Val = _mm_load_ps(bPtr+12);
349 
350  c0Val = _mm_mul_ps(a0Val, b0Val);
351  c1Val = _mm_mul_ps(a1Val, b1Val);
352  c2Val = _mm_mul_ps(a2Val, b2Val);
353  c3Val = _mm_mul_ps(a3Val, b3Val);
354 
355  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
356  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
357  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
358  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
359 
360  aPtr += 16;
361  bPtr += 16;
362  }
363 
364  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
365  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
366  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
367 
368  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
369 
370  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
371 
372  dotProduct = dotProductVector[0];
373  dotProduct += dotProductVector[1];
374  dotProduct += dotProductVector[2];
375  dotProduct += dotProductVector[3];
376 
377  number = sixteenthPoints*16;
378  for(;number < num_points; number++){
379  dotProduct += ((*aPtr++) * (*bPtr++));
380  }
381 
382  *result = dotProduct;
383 
384 }
385 
386 #endif /*LV_HAVE_SSE*/
387 
388 #ifdef LV_HAVE_SSE3
389 
390 #include <pmmintrin.h>
391 
392 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
393  unsigned int number = 0;
394  const unsigned int sixteenthPoints = num_points / 16;
395 
396  float dotProduct = 0;
397  const float* aPtr = input;
398  const float* bPtr = taps;
399 
400  __m128 a0Val, a1Val, a2Val, a3Val;
401  __m128 b0Val, b1Val, b2Val, b3Val;
402  __m128 c0Val, c1Val, c2Val, c3Val;
403 
404  __m128 dotProdVal0 = _mm_setzero_ps();
405  __m128 dotProdVal1 = _mm_setzero_ps();
406  __m128 dotProdVal2 = _mm_setzero_ps();
407  __m128 dotProdVal3 = _mm_setzero_ps();
408 
409  for(;number < sixteenthPoints; number++){
410 
411  a0Val = _mm_load_ps(aPtr);
412  a1Val = _mm_load_ps(aPtr+4);
413  a2Val = _mm_load_ps(aPtr+8);
414  a3Val = _mm_load_ps(aPtr+12);
415  b0Val = _mm_load_ps(bPtr);
416  b1Val = _mm_load_ps(bPtr+4);
417  b2Val = _mm_load_ps(bPtr+8);
418  b3Val = _mm_load_ps(bPtr+12);
419 
420  c0Val = _mm_mul_ps(a0Val, b0Val);
421  c1Val = _mm_mul_ps(a1Val, b1Val);
422  c2Val = _mm_mul_ps(a2Val, b2Val);
423  c3Val = _mm_mul_ps(a3Val, b3Val);
424 
425  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
426  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
427  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
428  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
429 
430  aPtr += 16;
431  bPtr += 16;
432  }
433 
434  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
435  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
436  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
437 
438  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
439  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
440 
441  dotProduct = dotProductVector[0];
442  dotProduct += dotProductVector[1];
443  dotProduct += dotProductVector[2];
444  dotProduct += dotProductVector[3];
445 
446  number = sixteenthPoints*16;
447  for(;number < num_points; number++){
448  dotProduct += ((*aPtr++) * (*bPtr++));
449  }
450 
451  *result = dotProduct;
452 }
453 
454 #endif /*LV_HAVE_SSE3*/
455 
456 #ifdef LV_HAVE_SSE4_1
457 
458 #include <smmintrin.h>
459 
460 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
461  unsigned int number = 0;
462  const unsigned int sixteenthPoints = num_points / 16;
463 
464  float dotProduct = 0;
465  const float* aPtr = input;
466  const float* bPtr = taps;
467 
468  __m128 aVal1, bVal1, cVal1;
469  __m128 aVal2, bVal2, cVal2;
470  __m128 aVal3, bVal3, cVal3;
471  __m128 aVal4, bVal4, cVal4;
472 
473  __m128 dotProdVal = _mm_setzero_ps();
474 
475  for(;number < sixteenthPoints; number++){
476 
477  aVal1 = _mm_load_ps(aPtr); aPtr += 4;
478  aVal2 = _mm_load_ps(aPtr); aPtr += 4;
479  aVal3 = _mm_load_ps(aPtr); aPtr += 4;
480  aVal4 = _mm_load_ps(aPtr); aPtr += 4;
481 
482  bVal1 = _mm_load_ps(bPtr); bPtr += 4;
483  bVal2 = _mm_load_ps(bPtr); bPtr += 4;
484  bVal3 = _mm_load_ps(bPtr); bPtr += 4;
485  bVal4 = _mm_load_ps(bPtr); bPtr += 4;
486 
487  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
488  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
489  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
490  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
491 
492  cVal1 = _mm_or_ps(cVal1, cVal2);
493  cVal3 = _mm_or_ps(cVal3, cVal4);
494  cVal1 = _mm_or_ps(cVal1, cVal3);
495 
496  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
497  }
498 
499  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
500  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
501 
502  dotProduct = dotProductVector[0];
503  dotProduct += dotProductVector[1];
504  dotProduct += dotProductVector[2];
505  dotProduct += dotProductVector[3];
506 
507  number = sixteenthPoints * 16;
508  for(;number < num_points; number++){
509  dotProduct += ((*aPtr++) * (*bPtr++));
510  }
511 
512  *result = dotProduct;
513 }
514 
515 #endif /*LV_HAVE_SSE4_1*/
516 
517 #ifdef LV_HAVE_AVX
518 
519 #include <immintrin.h>
520 
521 static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
522 
523  unsigned int number = 0;
524  const unsigned int sixteenthPoints = num_points / 16;
525 
526  float dotProduct = 0;
527  const float* aPtr = input;
528  const float* bPtr = taps;
529 
530  __m256 a0Val, a1Val;
531  __m256 b0Val, b1Val;
532  __m256 c0Val, c1Val;
533 
534  __m256 dotProdVal0 = _mm256_setzero_ps();
535  __m256 dotProdVal1 = _mm256_setzero_ps();
536 
537  for(;number < sixteenthPoints; number++){
538 
539  a0Val = _mm256_load_ps(aPtr);
540  a1Val = _mm256_load_ps(aPtr+8);
541  b0Val = _mm256_load_ps(bPtr);
542  b1Val = _mm256_load_ps(bPtr+8);
543 
544  c0Val = _mm256_mul_ps(a0Val, b0Val);
545  c1Val = _mm256_mul_ps(a1Val, b1Val);
546 
547  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
548  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
549 
550  aPtr += 16;
551  bPtr += 16;
552  }
553 
554  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
555 
556  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
557 
558  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
559 
560  dotProduct = dotProductVector[0];
561  dotProduct += dotProductVector[1];
562  dotProduct += dotProductVector[2];
563  dotProduct += dotProductVector[3];
564  dotProduct += dotProductVector[4];
565  dotProduct += dotProductVector[5];
566  dotProduct += dotProductVector[6];
567  dotProduct += dotProductVector[7];
568 
569  number = sixteenthPoints*16;
570  for(;number < num_points; number++){
571  dotProduct += ((*aPtr++) * (*bPtr++));
572  }
573 
574  *result = dotProduct;
575 
576 }
577 
578 #endif /*LV_HAVE_AVX*/
579 
580 #ifdef LV_HAVE_NEON
581 #include <arm_neon.h>
582 
/*
 * NEON dot product of two float arrays, 16 floats per loop iteration.
 *
 * Writes the sum of input[i] * taps[i] for i in [0, num_points) to *result.
 * Each iteration loads one 16-float block from each array with vld4q_f32
 * and multiply-accumulates the four resulting vector pairs into four
 * independent accumulators. Both arrays are loaded the same way, so
 * matching elements still meet in the same lane. The remaining
 * (num_points % 16) elements are added by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) {
  const unsigned int blockCount = num_points / 16;
  const float* in = input;
  const float* tp = taps;
  unsigned int blk, i;
  float sum;

  float32x4x4_t x, y;
  float32x4_t acc0 = vdupq_n_f32(0);
  float32x4_t acc1 = vdupq_n_f32(0);
  float32x4_t acc2 = vdupq_n_f32(0);
  float32x4_t acc3 = vdupq_n_f32(0);

  for(blk = 0; blk < blockCount; ++blk, in += 16, tp += 16) {
    x = vld4q_f32(in);
    y = vld4q_f32(tp);
    acc0 = vmlaq_f32(acc0, x.val[0], y.val[0]);
    acc1 = vmlaq_f32(acc1, x.val[1], y.val[1]);
    acc2 = vmlaq_f32(acc2, x.val[2], y.val[2]);
    acc3 = vmlaq_f32(acc3, x.val[3], y.val[3]);
  }

  /* Pairwise merge of the four accumulators into acc0. */
  acc0 = vaddq_f32(acc0, acc1);
  acc2 = vaddq_f32(acc2, acc3);
  acc0 = vaddq_f32(acc2, acc0);

  __VOLK_ATTR_ALIGNED(32) float lanes[4];
  vst1q_f32(lanes, acc0);
  sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];

  /* Scalar tail for the elements past the last full block. */
  for(i = blockCount * 16; i < num_points; i++){
    sum += (input[i] * taps[i]);
  }

  *result = sum;
}
621 
622 #endif
623 
624 
625 
626 
627 #ifdef LV_HAVE_NEON
/*
 * NEON dot product of two float arrays, 8 floats per loop iteration.
 *
 * Writes the sum of input[i] * taps[i] for i in [0, num_points) to *result.
 * Each iteration loads one 8-float block from each array with vld2q_f32 and
 * multiply-accumulates the two resulting vector pairs into two independent
 * accumulators. Both arrays are loaded the same way, so matching elements
 * still meet in the same lane. The remaining (num_points % 8) elements are
 * added by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) {
  const unsigned int blockCount = num_points / 8;
  const float* in = input;
  const float* tp = taps;
  unsigned int blk, i;
  float sum;

  float32x4x2_t x, y;
  float32x4_t acc0 = vdupq_n_f32(0);
  float32x4_t acc1 = vdupq_n_f32(0);

  for(blk = 0; blk < blockCount; ++blk, in += 8, tp += 8) {
    x = vld2q_f32(in);
    y = vld2q_f32(tp);
    acc0 = vmlaq_f32(acc0, x.val[0], y.val[0]);
    acc1 = vmlaq_f32(acc1, x.val[1], y.val[1]);
  }

  /* Merge the two accumulators before the horizontal reduction. */
  acc0 = vaddq_f32(acc0, acc1);

  __VOLK_ATTR_ALIGNED(32) float lanes[4];
  vst1q_f32(lanes, acc0);
  sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];

  /* Scalar tail for the elements past the last full block. */
  for(i = blockCount * 8; i < num_points; i++){
    sum += (input[i] * taps[i]);
  }

  *result = sum;
}
659 
660 #endif /* LV_HAVE_NEON */
661 
662 #ifdef LV_HAVE_NEON
663 extern void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
664 #endif /* LV_HAVE_NEON */
665 
666 #ifdef LV_HAVE_NEON
667 extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
668 #endif /* LV_HAVE_NEON */
669 
670 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9