1 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
2 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
11 static inline void volk_16i_32fc_dot_prod_32fc_generic(
lv_32fc_t* result,
const short* input,
const lv_32fc_t *
taps,
unsigned int num_points) {
13 static const int N_UNROLL = 4;
21 unsigned n = (num_points / N_UNROLL) * N_UNROLL;
23 for(i = 0; i < n; i += N_UNROLL) {
24 acc0 += taps[i + 0] * (float)input[i + 0];
25 acc1 += taps[i + 1] * (float)input[i + 1];
26 acc2 += taps[i + 2] * (float)input[i + 2];
27 acc3 += taps[i + 3] * (float)input[i + 3];
30 for(; i < num_points; i++) {
31 acc0 += taps[i] * (float)input[i];
34 *result = acc0 + acc1 + acc2 + acc3;
40 #if LV_HAVE_SSE && LV_HAVE_MMX
43 static inline void volk_16i_32fc_dot_prod_32fc_a_sse(
lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points) {
45 unsigned int number = 0;
46 const unsigned int sixteenthPoints = num_points / 8;
49 float *realpt = &res[0], *imagpt = &res[1];
50 const short* aPtr = input;
51 const float* bPtr = (
float*)taps;
54 __m128 f0, f1, f2, f3;
55 __m128 a0Val, a1Val, a2Val, a3Val;
56 __m128 b0Val, b1Val, b2Val, b3Val;
57 __m128 c0Val, c1Val, c2Val, c3Val;
59 __m128 dotProdVal0 = _mm_setzero_ps();
60 __m128 dotProdVal1 = _mm_setzero_ps();
61 __m128 dotProdVal2 = _mm_setzero_ps();
62 __m128 dotProdVal3 = _mm_setzero_ps();
64 for(;number < sixteenthPoints; number++){
66 m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
67 m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
68 f0 = _mm_cvtpi16_ps(m0);
69 f1 = _mm_cvtpi16_ps(m0);
70 f2 = _mm_cvtpi16_ps(m1);
71 f3 = _mm_cvtpi16_ps(m1);
73 a0Val = _mm_unpacklo_ps(f0, f1);
74 a1Val = _mm_unpackhi_ps(f0, f1);
75 a2Val = _mm_unpacklo_ps(f2, f3);
76 a3Val = _mm_unpackhi_ps(f2, f3);
78 b0Val = _mm_load_ps(bPtr);
79 b1Val = _mm_load_ps(bPtr+4);
80 b2Val = _mm_load_ps(bPtr+8);
81 b3Val = _mm_load_ps(bPtr+12);
83 c0Val = _mm_mul_ps(a0Val, b0Val);
84 c1Val = _mm_mul_ps(a1Val, b1Val);
85 c2Val = _mm_mul_ps(a2Val, b2Val);
86 c3Val = _mm_mul_ps(a3Val, b3Val);
88 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
89 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
90 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
91 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
97 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
98 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
99 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
103 _mm_store_ps(dotProductVector,dotProdVal0);
105 *realpt = dotProductVector[0];
106 *imagpt = dotProductVector[1];
107 *realpt += dotProductVector[2];
108 *imagpt += dotProductVector[3];
110 number = sixteenthPoints*8;
111 for(;number < num_points; number++){
112 *realpt += ((*aPtr) * (*bPtr++));
113 *imagpt += ((*aPtr++) * (*bPtr++));
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56