1 #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
2 #define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
10 #ifdef LV_HAVE_GENERIC
15 float * res = (
float*) result;
16 float * in = (
float*) input;
17 float * tp = (
float*) taps;
18 unsigned int n_2_ccomplex_blocks = num_points/2;
19 unsigned int isodd = num_points & 1;
21 float sum0[2] = {0,0};
22 float sum1[2] = {0,0};
25 for(i = 0; i < n_2_ccomplex_blocks; ++i) {
26 sum0[0] += in[0] * tp[0] - in[1] * tp[1];
27 sum0[1] += in[0] * tp[1] + in[1] * tp[0];
28 sum1[0] += in[2] * tp[2] - in[3] * tp[3];
29 sum1[1] += in[2] * tp[3] + in[3] * tp[2];
35 res[0] = sum0[0] + sum1[0];
36 res[1] = sum0[1] + sum1[1];
39 for(i = 0; i < isodd; ++i) {
40 *result += input[num_points - 1] * taps[num_points - 1];
48 #if LV_HAVE_SSE && LV_HAVE_64
50 static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
52 const unsigned int num_bytes = num_points*8;
53 unsigned int isodd = num_points & 1;
57 "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
58 "# const float *taps, unsigned num_bytes)\n\t"
59 "# float sum0 = 0;\n\t"
60 "# float sum1 = 0;\n\t"
61 "# float sum2 = 0;\n\t"
62 "# float sum3 = 0;\n\t"
64 "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
65 "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
66 "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
67 "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
70 "# } while (--n_2_ccomplex_blocks != 0);\n\t"
71 "# result[0] = sum0 + sum2;\n\t"
72 "# result[1] = sum1 + sum3;\n\t"
73 "# TODO: prefetch and better scheduling\n\t"
75 " xor %%r10, %%r10\n\t"
76 " movq %%rcx, %%rax\n\t"
77 " movq %%rcx, %%r8\n\t"
78 " movq %[rsi], %%r9\n\t"
79 " movq %[rdx], %%r10\n\t"
80 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
81 " movups 0(%%r9), %%xmm0\n\t"
82 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
83 " movups 0(%%r10), %%xmm2\n\t"
84 " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
87 " # 4 taps / loop\n\t"
88 " # something like ?? cycles / loop\n\t"
90 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
91 "# movups (%%r9), %%xmmA\n\t"
92 "# movups (%%r10), %%xmmB\n\t"
93 "# movups %%xmmA, %%xmmZ\n\t"
94 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
95 "# mulps %%xmmB, %%xmmA\n\t"
96 "# mulps %%xmmZ, %%xmmB\n\t"
97 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
98 "# xorps %%xmmPN, %%xmmA\n\t"
99 "# movups %%xmmA, %%xmmZ\n\t"
100 "# unpcklps %%xmmB, %%xmmA\n\t"
101 "# unpckhps %%xmmB, %%xmmZ\n\t"
102 "# movups %%xmmZ, %%xmmY\n\t"
103 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
104 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
105 "# addps %%xmmZ, %%xmmA\n\t"
106 "# addps %%xmmA, %%xmmC\n\t"
107 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
108 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
109 " movups 16(%%r9), %%xmm1\n\t"
110 " movups %%xmm0, %%xmm4\n\t"
111 " mulps %%xmm2, %%xmm0\n\t"
112 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
113 " movups 16(%%r10), %%xmm3\n\t"
114 " movups %%xmm1, %%xmm5\n\t"
115 " addps %%xmm0, %%xmm6\n\t"
116 " mulps %%xmm3, %%xmm1\n\t"
117 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
118 " addps %%xmm1, %%xmm6\n\t"
119 " mulps %%xmm4, %%xmm2\n\t"
120 " movups 32(%%r9), %%xmm0\n\t"
121 " addps %%xmm2, %%xmm7\n\t"
122 " mulps %%xmm5, %%xmm3\n\t"
124 " movups 32(%%r10), %%xmm2\n\t"
125 " addps %%xmm3, %%xmm7\n\t"
126 " add $32, %%r10\n\t"
130 " # We've handled the bulk of multiplies up to here.\n\t"
131 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
132 " # If so, we've got 2 more taps to do.\n\t"
135 " # The count was odd, do 2 more taps.\n\t"
136 " # Note that we've already got mm0/mm2 preloaded\n\t"
137 " # from the main loop.\n\t"
138 " movups %%xmm0, %%xmm4\n\t"
139 " mulps %%xmm2, %%xmm0\n\t"
140 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
141 " addps %%xmm0, %%xmm6\n\t"
142 " mulps %%xmm4, %%xmm2\n\t"
143 " addps %%xmm2, %%xmm7\n\t"
145 " # neg inversor\n\t"
146 " xorps %%xmm1, %%xmm1\n\t"
147 " mov $0x80000000, %%r9\n\t"
148 " movd %%r9, %%xmm1\n\t"
149 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
151 " xorps %%xmm1, %%xmm6\n\t"
152 " movups %%xmm6, %%xmm2\n\t"
153 " unpcklps %%xmm7, %%xmm6\n\t"
154 " unpckhps %%xmm7, %%xmm2\n\t"
155 " movups %%xmm2, %%xmm3\n\t"
156 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
157 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
158 " addps %%xmm2, %%xmm6\n\t"
159 " # xmm6 = r1 i2 r3 i4\n\t"
160 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
161 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
162 " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
164 :[rsi]
"r" (input), [rdx]
"r" (taps),
"c" (num_bytes), [rdi]
"r" (result)
165 :
"rax",
"r8",
"r9",
"r10"
170 *result += input[num_points - 1] * taps[num_points - 1];
184 #include <pmmintrin.h>
186 static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
189 memset(&dotProduct, 0x0, 2*
sizeof(
float));
191 unsigned int number = 0;
192 const unsigned int halfPoints = num_points/2;
193 unsigned int isodd = num_points & 1;
195 __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
200 dotProdVal = _mm_setzero_ps();
202 for(;number < halfPoints; number++){
204 x = _mm_loadu_ps((
float*)a);
205 y = _mm_loadu_ps((
float*)b);
207 yl = _mm_moveldup_ps(y);
208 yh = _mm_movehdup_ps(y);
210 tmp1 = _mm_mul_ps(x,yl);
212 x = _mm_shuffle_ps(x,x,0xB1);
214 tmp2 = _mm_mul_ps(x,yh);
216 z = _mm_addsub_ps(tmp1,tmp2);
218 dotProdVal = _mm_add_ps(dotProdVal, z);
226 _mm_storeu_ps((
float*)dotProductVector,dotProdVal);
228 dotProduct += ( dotProductVector[0] + dotProductVector[1] );
231 dotProduct += input[num_points - 1] * taps[num_points - 1];
234 *result = dotProduct;
239 #ifdef LV_HAVE_SSE4_1
241 #include <smmintrin.h>
243 static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
246 const unsigned int qtr_points = num_points/4;
247 const unsigned int isodd = num_points & 3;
249 __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
250 float *p_input, *p_taps;
253 p_result = (__m64*)result;
254 p_input = (
float*)input;
255 p_taps = (
float*)taps;
257 static const __m128i neg = {0x000000000000000080000000};
259 real0 = _mm_setzero_ps();
260 real1 = _mm_setzero_ps();
261 im0 = _mm_setzero_ps();
262 im1 = _mm_setzero_ps();
264 for(; i < qtr_points; ++i) {
265 xmm0 = _mm_loadu_ps(p_input);
266 xmm1 = _mm_loadu_ps(p_taps);
271 xmm2 = _mm_loadu_ps(p_input);
272 xmm3 = _mm_loadu_ps(p_taps);
277 xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
278 xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
279 xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
280 xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
283 xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
285 xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
287 xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
289 xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
291 xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
292 xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
294 xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
295 xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
297 real0 = _mm_add_ps(xmm4, real0);
298 real1 = _mm_add_ps(xmm5, real1);
299 im0 = _mm_add_ps(xmm6, im0);
300 im1 = _mm_add_ps(xmm7, im1);
303 real1 = _mm_xor_ps(real1,
bit128_p(&neg)->float_vec);
305 im0 = _mm_add_ps(im0, im1);
306 real0 = _mm_add_ps(real0, real1);
308 im0 = _mm_add_ps(im0, real0);
310 _mm_storel_pi(p_result, im0);
312 for(i = num_points-isodd; i < num_points; i++) {
313 *result += input[i] * taps[i];
323 #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
324 #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
332 #ifdef LV_HAVE_GENERIC
335 static inline void volk_32fc_x2_dot_prod_32fc_a_generic(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
337 const unsigned int num_bytes = num_points*8;
339 float * res = (
float*) result;
340 float * in = (
float*) input;
341 float * tp = (
float*) taps;
342 unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
343 unsigned int isodd = num_points & 1;
345 float sum0[2] = {0,0};
346 float sum1[2] = {0,0};
349 for(i = 0; i < n_2_ccomplex_blocks; ++i) {
350 sum0[0] += in[0] * tp[0] - in[1] * tp[1];
351 sum0[1] += in[0] * tp[1] + in[1] * tp[0];
352 sum1[0] += in[2] * tp[2] - in[3] * tp[3];
353 sum1[1] += in[2] * tp[3] + in[3] * tp[2];
359 res[0] = sum0[0] + sum1[0];
360 res[1] = sum0[1] + sum1[1];
362 for(i = 0; i < isodd; ++i) {
363 *result += input[num_points - 1] * taps[num_points - 1];
370 #if LV_HAVE_SSE && LV_HAVE_64
373 static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
375 const unsigned int num_bytes = num_points*8;
376 unsigned int isodd = num_points & 1;
380 "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
381 "# const float *taps, unsigned num_bytes)\n\t"
382 "# float sum0 = 0;\n\t"
383 "# float sum1 = 0;\n\t"
384 "# float sum2 = 0;\n\t"
385 "# float sum3 = 0;\n\t"
387 "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
388 "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
389 "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
390 "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
393 "# } while (--n_2_ccomplex_blocks != 0);\n\t"
394 "# result[0] = sum0 + sum2;\n\t"
395 "# result[1] = sum1 + sum3;\n\t"
396 "# TODO: prefetch and better scheduling\n\t"
397 " xor %%r9, %%r9\n\t"
398 " xor %%r10, %%r10\n\t"
399 " movq %%rcx, %%rax\n\t"
400 " movq %%rcx, %%r8\n\t"
401 " movq %[rsi], %%r9\n\t"
402 " movq %[rdx], %%r10\n\t"
403 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
404 " movaps 0(%%r9), %%xmm0\n\t"
405 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
406 " movaps 0(%%r10), %%xmm2\n\t"
407 " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
409 " jmp .%=L1_test\n\t"
410 " # 4 taps / loop\n\t"
411 " # something like ?? cycles / loop\n\t"
413 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
414 "# movaps (%%r9), %%xmmA\n\t"
415 "# movaps (%%r10), %%xmmB\n\t"
416 "# movaps %%xmmA, %%xmmZ\n\t"
417 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
418 "# mulps %%xmmB, %%xmmA\n\t"
419 "# mulps %%xmmZ, %%xmmB\n\t"
420 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
421 "# xorps %%xmmPN, %%xmmA\n\t"
422 "# movaps %%xmmA, %%xmmZ\n\t"
423 "# unpcklps %%xmmB, %%xmmA\n\t"
424 "# unpckhps %%xmmB, %%xmmZ\n\t"
425 "# movaps %%xmmZ, %%xmmY\n\t"
426 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
427 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
428 "# addps %%xmmZ, %%xmmA\n\t"
429 "# addps %%xmmA, %%xmmC\n\t"
430 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
431 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
432 " movaps 16(%%r9), %%xmm1\n\t"
433 " movaps %%xmm0, %%xmm4\n\t"
434 " mulps %%xmm2, %%xmm0\n\t"
435 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
436 " movaps 16(%%r10), %%xmm3\n\t"
437 " movaps %%xmm1, %%xmm5\n\t"
438 " addps %%xmm0, %%xmm6\n\t"
439 " mulps %%xmm3, %%xmm1\n\t"
440 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
441 " addps %%xmm1, %%xmm6\n\t"
442 " mulps %%xmm4, %%xmm2\n\t"
443 " movaps 32(%%r9), %%xmm0\n\t"
444 " addps %%xmm2, %%xmm7\n\t"
445 " mulps %%xmm5, %%xmm3\n\t"
447 " movaps 32(%%r10), %%xmm2\n\t"
448 " addps %%xmm3, %%xmm7\n\t"
449 " add $32, %%r10\n\t"
453 " # We've handled the bulk of multiplies up to here.\n\t"
454 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
455 " # If so, we've got 2 more taps to do.\n\t"
458 " # The count was odd, do 2 more taps.\n\t"
459 " # Note that we've already got mm0/mm2 preloaded\n\t"
460 " # from the main loop.\n\t"
461 " movaps %%xmm0, %%xmm4\n\t"
462 " mulps %%xmm2, %%xmm0\n\t"
463 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
464 " addps %%xmm0, %%xmm6\n\t"
465 " mulps %%xmm4, %%xmm2\n\t"
466 " addps %%xmm2, %%xmm7\n\t"
468 " # neg inversor\n\t"
469 " xorps %%xmm1, %%xmm1\n\t"
470 " mov $0x80000000, %%r9\n\t"
471 " movd %%r9, %%xmm1\n\t"
472 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
474 " xorps %%xmm1, %%xmm6\n\t"
475 " movaps %%xmm6, %%xmm2\n\t"
476 " unpcklps %%xmm7, %%xmm6\n\t"
477 " unpckhps %%xmm7, %%xmm2\n\t"
478 " movaps %%xmm2, %%xmm3\n\t"
479 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
480 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
481 " addps %%xmm2, %%xmm6\n\t"
482 " # xmm6 = r1 i2 r3 i4\n\t"
483 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
484 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
485 " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
487 :[rsi]
"r" (input), [rdx]
"r" (taps),
"c" (num_bytes), [rdi]
"r" (result)
488 :
"rax",
"r8",
"r9",
"r10"
493 *result += input[num_points - 1] * taps[num_points - 1];
502 #if LV_HAVE_SSE && LV_HAVE_32
504 static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
506 volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
509 const unsigned int num_bytes = num_points*8;
510 unsigned int isodd = num_points & 1;
515 " #movl %%esp, %%ebp\n\t"
516 " movl 12(%%ebp), %%eax # input\n\t"
517 " movl 16(%%ebp), %%edx # taps\n\t"
518 " movl 20(%%ebp), %%ecx # n_bytes\n\t"
519 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
520 " movaps 0(%%eax), %%xmm0\n\t"
521 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
522 " movaps 0(%%edx), %%xmm2\n\t"
523 " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
524 " jmp .%=L1_test\n\t"
525 " # 4 taps / loop\n\t"
526 " # something like ?? cycles / loop\n\t"
528 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
529 "# movaps (%%eax), %%xmmA\n\t"
530 "# movaps (%%edx), %%xmmB\n\t"
531 "# movaps %%xmmA, %%xmmZ\n\t"
532 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
533 "# mulps %%xmmB, %%xmmA\n\t"
534 "# mulps %%xmmZ, %%xmmB\n\t"
535 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
536 "# xorps %%xmmPN, %%xmmA\n\t"
537 "# movaps %%xmmA, %%xmmZ\n\t"
538 "# unpcklps %%xmmB, %%xmmA\n\t"
539 "# unpckhps %%xmmB, %%xmmZ\n\t"
540 "# movaps %%xmmZ, %%xmmY\n\t"
541 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
542 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
543 "# addps %%xmmZ, %%xmmA\n\t"
544 "# addps %%xmmA, %%xmmC\n\t"
545 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
546 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
547 " movaps 16(%%eax), %%xmm1\n\t"
548 " movaps %%xmm0, %%xmm4\n\t"
549 " mulps %%xmm2, %%xmm0\n\t"
550 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
551 " movaps 16(%%edx), %%xmm3\n\t"
552 " movaps %%xmm1, %%xmm5\n\t"
553 " addps %%xmm0, %%xmm6\n\t"
554 " mulps %%xmm3, %%xmm1\n\t"
555 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
556 " addps %%xmm1, %%xmm6\n\t"
557 " mulps %%xmm4, %%xmm2\n\t"
558 " movaps 32(%%eax), %%xmm0\n\t"
559 " addps %%xmm2, %%xmm7\n\t"
560 " mulps %%xmm5, %%xmm3\n\t"
561 " addl $32, %%eax\n\t"
562 " movaps 32(%%edx), %%xmm2\n\t"
563 " addps %%xmm3, %%xmm7\n\t"
564 " addl $32, %%edx\n\t"
568 " # We've handled the bulk of multiplies up to here.\n\t"
569 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
570 " # If so, we've got 2 more taps to do.\n\t"
571 " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
572 " shrl $4, %%ecx\n\t"
573 " andl $1, %%ecx\n\t"
575 " # The count was odd, do 2 more taps.\n\t"
576 " # Note that we've already got mm0/mm2 preloaded\n\t"
577 " # from the main loop.\n\t"
578 " movaps %%xmm0, %%xmm4\n\t"
579 " mulps %%xmm2, %%xmm0\n\t"
580 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
581 " addps %%xmm0, %%xmm6\n\t"
582 " mulps %%xmm4, %%xmm2\n\t"
583 " addps %%xmm2, %%xmm7\n\t"
585 " # neg inversor\n\t"
586 " movl 8(%%ebp), %%eax \n\t"
587 " xorps %%xmm1, %%xmm1\n\t"
588 " movl $0x80000000, (%%eax)\n\t"
589 " movss (%%eax), %%xmm1\n\t"
590 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
592 " xorps %%xmm1, %%xmm6\n\t"
593 " movaps %%xmm6, %%xmm2\n\t"
594 " unpcklps %%xmm7, %%xmm6\n\t"
595 " unpckhps %%xmm7, %%xmm2\n\t"
596 " movaps %%xmm2, %%xmm3\n\t"
597 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
598 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
599 " addps %%xmm2, %%xmm6\n\t"
600 " # xmm6 = r1 i2 r3 i4\n\t"
601 " #movl 8(%%ebp), %%eax # @result\n\t"
602 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
603 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
604 " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
608 :
"eax",
"ecx",
"edx"
612 int getem = num_bytes % 16;
615 *result += (input[num_points - 1] * taps[num_points - 1]);
626 #include <pmmintrin.h>
628 static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
630 const unsigned int num_bytes = num_points*8;
631 unsigned int isodd = num_points & 1;
634 memset(&dotProduct, 0x0, 2*
sizeof(
float));
636 unsigned int number = 0;
637 const unsigned int halfPoints = num_bytes >> 4;
639 __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
644 dotProdVal = _mm_setzero_ps();
646 for(;number < halfPoints; number++){
648 x = _mm_load_ps((
float*)a);
649 y = _mm_load_ps((
float*)b);
651 yl = _mm_moveldup_ps(y);
652 yh = _mm_movehdup_ps(y);
654 tmp1 = _mm_mul_ps(x,yl);
656 x = _mm_shuffle_ps(x,x,0xB1);
658 tmp2 = _mm_mul_ps(x,yh);
660 z = _mm_addsub_ps(tmp1,tmp2);
662 dotProdVal = _mm_add_ps(dotProdVal, z);
670 _mm_store_ps((
float*)dotProductVector,dotProdVal);
672 dotProduct += ( dotProductVector[0] + dotProductVector[1] );
675 dotProduct += input[num_points - 1] * taps[num_points - 1];
678 *result = dotProduct;
683 #ifdef LV_HAVE_SSE4_1
685 #include <smmintrin.h>
687 static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
690 const unsigned int qtr_points = num_points/4;
691 const unsigned int isodd = num_points & 3;
693 __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
694 float *p_input, *p_taps;
697 static const __m128i neg = {0x000000000000000080000000};
699 p_result = (__m64*)result;
700 p_input = (
float*)input;
701 p_taps = (
float*)taps;
703 real0 = _mm_setzero_ps();
704 real1 = _mm_setzero_ps();
705 im0 = _mm_setzero_ps();
706 im1 = _mm_setzero_ps();
708 for(; i < qtr_points; ++i) {
709 xmm0 = _mm_load_ps(p_input);
710 xmm1 = _mm_load_ps(p_taps);
715 xmm2 = _mm_load_ps(p_input);
716 xmm3 = _mm_load_ps(p_taps);
721 xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
722 xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
723 xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
724 xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
727 xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
729 xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
731 xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
733 xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
735 xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
736 xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
738 xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
739 xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
741 real0 = _mm_add_ps(xmm4, real0);
742 real1 = _mm_add_ps(xmm5, real1);
743 im0 = _mm_add_ps(xmm6, im0);
744 im1 = _mm_add_ps(xmm7, im1);
747 real1 = _mm_xor_ps(real1,
bit128_p(&neg)->float_vec);
749 im0 = _mm_add_ps(im0, im1);
750 real0 = _mm_add_ps(real0, real1);
752 im0 = _mm_add_ps(im0, real0);
754 _mm_storel_pi(p_result, im0);
756 for(i = num_points-isodd; i < num_points; i++) {
757 *result += input[i] * taps[i];
764 #include <arm_neon.h>
766 static inline void volk_32fc_x2_dot_prod_32fc_neon(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
768 unsigned int quarter_points = num_points / 4;
775 float32x4x2_t a_val, b_val, c_val, accumulator;
776 float32x4x2_t tmp_real, tmp_imag;
777 accumulator.val[0] = vdupq_n_f32(0);
778 accumulator.val[1] = vdupq_n_f32(0);
780 for(number = 0; number < quarter_points; ++number) {
781 a_val = vld2q_f32((
float*)a_ptr);
782 b_val = vld2q_f32((
float*)b_ptr);
783 __builtin_prefetch(a_ptr+8);
784 __builtin_prefetch(b_ptr+8);
788 tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
790 tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
794 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
796 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
798 c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
799 c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
801 accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
802 accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);
808 vst2q_f32((
float*)accum_result, accumulator);
809 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
812 for(number = quarter_points*4; number < num_points; ++number) {
813 *result += (*a_ptr++) * (*b_ptr++);
821 static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
823 unsigned int quarter_points = num_points / 4;
830 float32x4x2_t a_val, b_val, accumulator;
831 float32x4x2_t tmp_imag;
832 accumulator.val[0] = vdupq_n_f32(0);
833 accumulator.val[1] = vdupq_n_f32(0);
835 for(number = 0; number < quarter_points; ++number) {
836 a_val = vld2q_f32((
float*)a_ptr);
837 b_val = vld2q_f32((
float*)b_ptr);
838 __builtin_prefetch(a_ptr+8);
839 __builtin_prefetch(b_ptr+8);
842 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
843 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
846 tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
847 tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
849 accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
850 accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
857 vst2q_f32((
float*)accum_result, accumulator);
858 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
861 for(number = quarter_points*4; number < num_points; ++number) {
862 *result += (*a_ptr++) * (*b_ptr++);
869 static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
871 unsigned int quarter_points = num_points / 4;
878 float32x4x2_t a_val, b_val, accumulator1, accumulator2;
879 accumulator1.val[0] = vdupq_n_f32(0);
880 accumulator1.val[1] = vdupq_n_f32(0);
881 accumulator2.val[0] = vdupq_n_f32(0);
882 accumulator2.val[1] = vdupq_n_f32(0);
884 for(number = 0; number < quarter_points; ++number) {
885 a_val = vld2q_f32((
float*)a_ptr);
886 b_val = vld2q_f32((
float*)b_ptr);
887 __builtin_prefetch(a_ptr+8);
888 __builtin_prefetch(b_ptr+8);
891 accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
892 accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
893 accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
894 accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
899 accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
900 accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
902 vst2q_f32((
float*)accum_result, accumulator1);
903 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
906 for(number = quarter_points*4; number < num_points; ++number) {
907 *result += (*a_ptr++) * (*b_ptr++);
914 static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
917 unsigned int quarter_points = num_points / 8;
924 float32x4x4_t a_val, b_val, accumulator1, accumulator2;
925 float32x4x2_t reduced_accumulator;
926 accumulator1.val[0] = vdupq_n_f32(0);
927 accumulator1.val[1] = vdupq_n_f32(0);
928 accumulator1.val[2] = vdupq_n_f32(0);
929 accumulator1.val[3] = vdupq_n_f32(0);
930 accumulator2.val[0] = vdupq_n_f32(0);
931 accumulator2.val[1] = vdupq_n_f32(0);
932 accumulator2.val[2] = vdupq_n_f32(0);
933 accumulator2.val[3] = vdupq_n_f32(0);
936 for(number = 0; number < quarter_points; ++number) {
937 a_val = vld4q_f32((
float*)a_ptr);
938 b_val = vld4q_f32((
float*)b_ptr);
939 __builtin_prefetch(a_ptr+8);
940 __builtin_prefetch(b_ptr+8);
943 accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
944 accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
946 accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
947 accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);
949 accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
950 accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
952 accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
953 accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
959 accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
960 accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
961 accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
962 accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
963 reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
964 reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
967 vst2q_f32((
float*)accum_result, reduced_accumulator);
968 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
971 for(number = quarter_points*8; number < num_points; ++number) {
972 *result += (*a_ptr++) * (*b_ptr++);
#define bit128_p(x)
Definition: volk_common.h:94
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56