1 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
2 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
/*
 * Generic (scalar) dot product of a complex-float input with real float taps.
 *
 * input is walked as a flat float array (two floats consumed per point) and
 * taps as one float per point. In each iteration the first float of the pair
 * is multiplied by the current tap and added to *realpt, the second float is
 * multiplied by the same tap and added to *imagpt, and only then does the tap
 * pointer advance.
 *
 * result     - output location. NOTE(review): it is never written in the
 *              lines visible here; confirm against the full file.
 * input      - num_points elements, read through a float* cast (presumably
 *              interleaved real/imag pairs; confirm against the lv_32fc_t
 *              definition).
 * taps       - num_points real coefficients, one per input point.
 * num_points - number of loop iterations, i.e. points processed.
 *
 * NOTE(review): the integers that start some lines (11, 14, ... 24) look like
 * line numbers from a rendered listing, and they skip (11 -> 14, 17 -> 22).
 * The declaration of res, any zeroing of the two accumulators before the
 * "+=" loop, and the closing braces are not visible here. TODO: restore them
 * from the original file before compiling.
 */
11 static inline void volk_32fc_32f_dot_prod_32fc_generic(
lv_32fc_t* result,
const lv_32fc_t* input,
const float *
taps,
unsigned int num_points) {
/* Accumulator pointers into res: slot 0 collects the first float of every
 * pair, slot 1 the second. res itself is not declared in the visible lines. */
14 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a plain float stream. */
15 const float* aPtr = (
float*)input;
16 const float* bPtr=
taps;
17 unsigned int number = 0;
/* One pass per point: two consecutive input floats share a single tap. */
22 for(number = 0; number < num_points; number++){
23 *realpt += ((*aPtr++) * (*bPtr));
/* bPtr advances only here, after both halves of the pair have used the tap. */
24 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * SSE dot product of a complex-float input with real float taps.
 *
 * The vector loop handles 8 points per iteration (16 floats from input,
 * 8 floats from taps) using four independent __m128 accumulators. The
 * accumulators are then summed, folded into *realpt / *imagpt, and a scalar
 * loop processes the remaining num_points % 8 points.
 *
 * result     - output location. NOTE(review): it is never written in the
 *              lines visible here; confirm against the full file.
 * input      - num_points elements, read through a float* cast (presumably
 *              interleaved real/imag pairs; confirm against lv_32fc_t).
 * taps       - num_points real coefficients, one per input point.
 * num_points - number of points to process.
 *
 * All loads and the store use _mm_load_ps / _mm_store_ps, the aligned forms,
 * so input and taps are presumably required to be 16-byte aligned; confirm
 * with the callers.
 *
 * NOTE(review): the integers that start some lines (36, 38, ... 102) look
 * like line numbers from a rendered listing, and they skip (39 -> 42,
 * 80 -> 86, 88 -> 92). The declarations of res and dotProductVector, any
 * advance of aPtr / bPtr inside the vector loop, and the closing braces are
 * not visible here. TODO: restore them from the original file.
 */
36 static inline void volk_32fc_32f_dot_prod_32fc_a_sse(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
38 unsigned int number = 0;
/* Number of full 8-point groups. NOTE(review): the name says "sixteenth"
 * but the divisor is 8; each group does consume 16 input floats. */
39 const unsigned int sixteenthPoints = num_points / 8;
/* Accumulator pointers into res (not declared in the visible lines). */
42 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a plain float stream. */
43 const float* aPtr = (
float*)input;
44 const float* bPtr =
taps;
/* a*: input floats, x*: raw taps, b*: taps duplicated per pair, c*: products. */
46 __m128 a0Val, a1Val, a2Val, a3Val;
47 __m128 b0Val, b1Val, b2Val, b3Val;
48 __m128 x0Val, x1Val, x2Val, x3Val;
49 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four separate running sums, one per 4-float slice of the 16-float group. */
51 __m128 dotProdVal0 = _mm_setzero_ps();
52 __m128 dotProdVal1 = _mm_setzero_ps();
53 __m128 dotProdVal2 = _mm_setzero_ps();
54 __m128 dotProdVal3 = _mm_setzero_ps();
56 for(;number < sixteenthPoints; number++){
/* 16 input floats = 8 points, as four vectors of two float pairs each. */
58 a0Val = _mm_load_ps(aPtr);
59 a1Val = _mm_load_ps(aPtr+4);
60 a2Val = _mm_load_ps(aPtr+8);
61 a3Val = _mm_load_ps(aPtr+12);
/* Each group of four taps is loaded twice on purpose: unpacking a vector
 * with a copy of itself yields every tap duplicated in adjacent lanes. */
63 x0Val = _mm_load_ps(bPtr);
64 x1Val = _mm_load_ps(bPtr);
65 x2Val = _mm_load_ps(bPtr+4);
66 x3Val = _mm_load_ps(bPtr+4);
/* b0 = (t0,t0,t1,t1), b1 = (t2,t2,t3,t3), b2 = (t4,t4,t5,t5),
 * b3 = (t6,t6,t7,t7), so both floats of a pair meet the same tap. */
67 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
68 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
69 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
70 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
/* Lane-wise products of input floats and duplicated taps. */
72 c0Val = _mm_mul_ps(a0Val, b0Val);
73 c1Val = _mm_mul_ps(a1Val, b1Val);
74 c2Val = _mm_mul_ps(a2Val, b2Val);
75 c3Val = _mm_mul_ps(a3Val, b3Val);
77 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
78 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
79 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
80 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* NOTE(review): the listing jumps from 80 to 86 here. No advance of aPtr or
 * bPtr and no closing brace for the loop are visible; the scalar tail loop
 * further down relies on both pointers already sitting past the vectorised
 * part. Confirm against the original file. */
/* Collapse the four running sums into dotProdVal0. */
86 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
87 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
88 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* Spill the vector to memory. dotProductVector is not declared in the
 * visible lines; _mm_store_ps needs a 16-byte aligned destination of at
 * least four floats. */
92 _mm_store_ps(dotProductVector,dotProdVal0);
/* Lanes 0 and 2 hold first-of-pair sums, lanes 1 and 3 second-of-pair sums.
 * The plain assignments overwrite whatever *realpt / *imagpt held before. */
94 *realpt = dotProductVector[0];
95 *imagpt = dotProductVector[1];
96 *realpt += dotProductVector[2];
97 *imagpt += dotProductVector[3];
/* Scalar tail: resume at the first point not covered by a full 8-point group. */
99 number = sixteenthPoints*8;
100 for(;number < num_points; number++){
101 *realpt += ((*aPtr++) * (*bPtr));
/* bPtr advances only here, after both halves of the pair have used the tap. */
102 *imagpt += ((*aPtr++) * (*bPtr++));
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56