1 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
2 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
11 static inline void volk_32f_x2_dot_prod_16i_generic(
int16_t* result,
const float* input,
const float*
taps,
unsigned int num_points) {
14 const float* aPtr = input;
15 const float* bPtr=
taps;
16 unsigned int number = 0;
18 for(number = 0; number < num_points; number++){
19 dotProduct += ((*aPtr++) * (*bPtr++));
30 static inline void volk_32f_x2_dot_prod_16i_a_sse(
int16_t* result,
const float* input,
const float* taps,
unsigned int num_points) {
32 unsigned int number = 0;
33 const unsigned int sixteenthPoints = num_points / 16;
36 const float* aPtr = input;
37 const float* bPtr =
taps;
39 __m128 a0Val, a1Val, a2Val, a3Val;
40 __m128 b0Val, b1Val, b2Val, b3Val;
41 __m128 c0Val, c1Val, c2Val, c3Val;
43 __m128 dotProdVal0 = _mm_setzero_ps();
44 __m128 dotProdVal1 = _mm_setzero_ps();
45 __m128 dotProdVal2 = _mm_setzero_ps();
46 __m128 dotProdVal3 = _mm_setzero_ps();
48 for(;number < sixteenthPoints; number++){
50 a0Val = _mm_load_ps(aPtr);
51 a1Val = _mm_load_ps(aPtr+4);
52 a2Val = _mm_load_ps(aPtr+8);
53 a3Val = _mm_load_ps(aPtr+12);
54 b0Val = _mm_load_ps(bPtr);
55 b1Val = _mm_load_ps(bPtr+4);
56 b2Val = _mm_load_ps(bPtr+8);
57 b3Val = _mm_load_ps(bPtr+12);
59 c0Val = _mm_mul_ps(a0Val, b0Val);
60 c1Val = _mm_mul_ps(a1Val, b1Val);
61 c2Val = _mm_mul_ps(a2Val, b2Val);
62 c3Val = _mm_mul_ps(a3Val, b3Val);
64 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
65 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
66 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
67 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
73 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
74 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
75 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
79 _mm_store_ps(dotProductVector,dotProdVal0);
81 dotProduct = dotProductVector[0];
82 dotProduct += dotProductVector[1];
83 dotProduct += dotProductVector[2];
84 dotProduct += dotProductVector[3];
86 number = sixteenthPoints*16;
87 for(;number < num_points; number++){
88 dotProduct += ((*aPtr++) * (*bPtr++));
91 *result = (short)dotProduct;
99 static inline void volk_32f_x2_dot_prod_16i_u_sse(
int16_t* result,
const float* input,
const float* taps,
unsigned int num_points) {
101 unsigned int number = 0;
102 const unsigned int sixteenthPoints = num_points / 16;
104 float dotProduct = 0;
105 const float* aPtr = input;
106 const float* bPtr =
taps;
108 __m128 a0Val, a1Val, a2Val, a3Val;
109 __m128 b0Val, b1Val, b2Val, b3Val;
110 __m128 c0Val, c1Val, c2Val, c3Val;
112 __m128 dotProdVal0 = _mm_setzero_ps();
113 __m128 dotProdVal1 = _mm_setzero_ps();
114 __m128 dotProdVal2 = _mm_setzero_ps();
115 __m128 dotProdVal3 = _mm_setzero_ps();
117 for(;number < sixteenthPoints; number++){
119 a0Val = _mm_loadu_ps(aPtr);
120 a1Val = _mm_loadu_ps(aPtr+4);
121 a2Val = _mm_loadu_ps(aPtr+8);
122 a3Val = _mm_loadu_ps(aPtr+12);
123 b0Val = _mm_loadu_ps(bPtr);
124 b1Val = _mm_loadu_ps(bPtr+4);
125 b2Val = _mm_loadu_ps(bPtr+8);
126 b3Val = _mm_loadu_ps(bPtr+12);
128 c0Val = _mm_mul_ps(a0Val, b0Val);
129 c1Val = _mm_mul_ps(a1Val, b1Val);
130 c2Val = _mm_mul_ps(a2Val, b2Val);
131 c3Val = _mm_mul_ps(a3Val, b3Val);
133 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
134 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
135 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
136 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
142 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
143 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
144 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
148 _mm_store_ps(dotProductVector,dotProdVal0);
150 dotProduct = dotProductVector[0];
151 dotProduct += dotProductVector[1];
152 dotProduct += dotProductVector[2];
153 dotProduct += dotProductVector[3];
155 number = sixteenthPoints*16;
156 for(;number < num_points; number++){
157 dotProduct += ((*aPtr++) * (*bPtr++));
160 *result = (short)dotProduct;
signed short int16_t
Definition: stdint.h:76
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9