1 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
2 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
11 static inline void volk_32fc_32f_dot_prod_32fc_generic(
lv_32fc_t* result,
const lv_32fc_t* input,
const float *
taps,
unsigned int num_points) {
14 float *realpt = &res[0], *imagpt = &res[1];
15 const float* aPtr = (
float*)input;
16 const float* bPtr=
taps;
17 unsigned int number = 0;
22 for(number = 0; number < num_points; number++){
23 *realpt += ((*aPtr++) * (*bPtr));
24 *imagpt += ((*aPtr++) * (*bPtr++));
35 #include <immintrin.h>
37 static inline void volk_32fc_32f_dot_prod_32fc_a_avx(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
39 unsigned int number = 0;
40 const unsigned int sixteenthPoints = num_points / 16;
43 float *realpt = &res[0], *imagpt = &res[1];
44 const float* aPtr = (
float*)input;
45 const float* bPtr =
taps;
47 __m256 a0Val, a1Val, a2Val, a3Val;
48 __m256 b0Val, b1Val, b2Val, b3Val;
49 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
50 __m256 c0Val, c1Val, c2Val, c3Val;
52 __m256 dotProdVal0 = _mm256_setzero_ps();
53 __m256 dotProdVal1 = _mm256_setzero_ps();
54 __m256 dotProdVal2 = _mm256_setzero_ps();
55 __m256 dotProdVal3 = _mm256_setzero_ps();
57 for(;number < sixteenthPoints; number++){
59 a0Val = _mm256_load_ps(aPtr);
60 a1Val = _mm256_load_ps(aPtr+8);
61 a2Val = _mm256_load_ps(aPtr+16);
62 a3Val = _mm256_load_ps(aPtr+24);
64 x0Val = _mm256_load_ps(bPtr);
65 x1Val = _mm256_load_ps(bPtr+8);
66 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
67 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
68 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
69 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
72 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
73 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
74 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
75 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
77 c0Val = _mm256_mul_ps(a0Val, b0Val);
78 c1Val = _mm256_mul_ps(a1Val, b1Val);
79 c2Val = _mm256_mul_ps(a2Val, b2Val);
80 c3Val = _mm256_mul_ps(a3Val, b3Val);
82 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
83 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
84 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
85 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
91 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
92 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
93 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
97 _mm256_store_ps(dotProductVector,dotProdVal0);
99 *realpt = dotProductVector[0];
100 *imagpt = dotProductVector[1];
101 *realpt += dotProductVector[2];
102 *imagpt += dotProductVector[3];
103 *realpt += dotProductVector[4];
104 *imagpt += dotProductVector[5];
105 *realpt += dotProductVector[6];
106 *imagpt += dotProductVector[7];
108 number = sixteenthPoints*16;
109 for(;number < num_points; number++){
110 *realpt += ((*aPtr++) * (*bPtr));
111 *imagpt += ((*aPtr++) * (*bPtr++));
125 static inline void volk_32fc_32f_dot_prod_32fc_a_sse(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
127 unsigned int number = 0;
128 const unsigned int sixteenthPoints = num_points / 8;
131 float *realpt = &res[0], *imagpt = &res[1];
132 const float* aPtr = (
float*)input;
133 const float* bPtr =
taps;
135 __m128 a0Val, a1Val, a2Val, a3Val;
136 __m128 b0Val, b1Val, b2Val, b3Val;
137 __m128 x0Val, x1Val, x2Val, x3Val;
138 __m128 c0Val, c1Val, c2Val, c3Val;
140 __m128 dotProdVal0 = _mm_setzero_ps();
141 __m128 dotProdVal1 = _mm_setzero_ps();
142 __m128 dotProdVal2 = _mm_setzero_ps();
143 __m128 dotProdVal3 = _mm_setzero_ps();
145 for(;number < sixteenthPoints; number++){
147 a0Val = _mm_load_ps(aPtr);
148 a1Val = _mm_load_ps(aPtr+4);
149 a2Val = _mm_load_ps(aPtr+8);
150 a3Val = _mm_load_ps(aPtr+12);
152 x0Val = _mm_load_ps(bPtr);
153 x1Val = _mm_load_ps(bPtr);
154 x2Val = _mm_load_ps(bPtr+4);
155 x3Val = _mm_load_ps(bPtr+4);
156 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
157 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
158 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
159 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
161 c0Val = _mm_mul_ps(a0Val, b0Val);
162 c1Val = _mm_mul_ps(a1Val, b1Val);
163 c2Val = _mm_mul_ps(a2Val, b2Val);
164 c3Val = _mm_mul_ps(a3Val, b3Val);
166 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
167 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
168 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
169 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
175 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
176 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
177 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
181 _mm_store_ps(dotProductVector,dotProdVal0);
183 *realpt = dotProductVector[0];
184 *imagpt = dotProductVector[1];
185 *realpt += dotProductVector[2];
186 *imagpt += dotProductVector[3];
188 number = sixteenthPoints*8;
189 for(;number < num_points; number++){
190 *realpt += ((*aPtr++) * (*bPtr));
191 *imagpt += ((*aPtr++) * (*bPtr++));
203 #include <immintrin.h>
205 static inline void volk_32fc_32f_dot_prod_32fc_u_avx(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
207 unsigned int number = 0;
208 const unsigned int sixteenthPoints = num_points / 16;
211 float *realpt = &res[0], *imagpt = &res[1];
212 const float* aPtr = (
float*)input;
213 const float* bPtr =
taps;
215 __m256 a0Val, a1Val, a2Val, a3Val;
216 __m256 b0Val, b1Val, b2Val, b3Val;
217 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
218 __m256 c0Val, c1Val, c2Val, c3Val;
220 __m256 dotProdVal0 = _mm256_setzero_ps();
221 __m256 dotProdVal1 = _mm256_setzero_ps();
222 __m256 dotProdVal2 = _mm256_setzero_ps();
223 __m256 dotProdVal3 = _mm256_setzero_ps();
225 for(;number < sixteenthPoints; number++){
227 a0Val = _mm256_loadu_ps(aPtr);
228 a1Val = _mm256_loadu_ps(aPtr+8);
229 a2Val = _mm256_loadu_ps(aPtr+16);
230 a3Val = _mm256_loadu_ps(aPtr+24);
232 x0Val = _mm256_loadu_ps(bPtr);
233 x1Val = _mm256_loadu_ps(bPtr+8);
234 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
235 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
236 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
237 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
240 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
241 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
242 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
243 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
245 c0Val = _mm256_mul_ps(a0Val, b0Val);
246 c1Val = _mm256_mul_ps(a1Val, b1Val);
247 c2Val = _mm256_mul_ps(a2Val, b2Val);
248 c3Val = _mm256_mul_ps(a3Val, b3Val);
250 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
251 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
252 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
253 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
259 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
260 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
261 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
265 _mm256_store_ps(dotProductVector,dotProdVal0);
267 *realpt = dotProductVector[0];
268 *imagpt = dotProductVector[1];
269 *realpt += dotProductVector[2];
270 *imagpt += dotProductVector[3];
271 *realpt += dotProductVector[4];
272 *imagpt += dotProductVector[5];
273 *realpt += dotProductVector[6];
274 *imagpt += dotProductVector[7];
276 number = sixteenthPoints*16;
277 for(;number < num_points; number++){
278 *realpt += ((*aPtr++) * (*bPtr));
279 *imagpt += ((*aPtr++) * (*bPtr++));
289 static inline void volk_32fc_32f_dot_prod_32fc_u_sse(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
291 unsigned int number = 0;
292 const unsigned int sixteenthPoints = num_points / 8;
295 float *realpt = &res[0], *imagpt = &res[1];
296 const float* aPtr = (
float*)input;
297 const float* bPtr =
taps;
299 __m128 a0Val, a1Val, a2Val, a3Val;
300 __m128 b0Val, b1Val, b2Val, b3Val;
301 __m128 x0Val, x1Val, x2Val, x3Val;
302 __m128 c0Val, c1Val, c2Val, c3Val;
304 __m128 dotProdVal0 = _mm_setzero_ps();
305 __m128 dotProdVal1 = _mm_setzero_ps();
306 __m128 dotProdVal2 = _mm_setzero_ps();
307 __m128 dotProdVal3 = _mm_setzero_ps();
309 for(;number < sixteenthPoints; number++){
311 a0Val = _mm_loadu_ps(aPtr);
312 a1Val = _mm_loadu_ps(aPtr+4);
313 a2Val = _mm_loadu_ps(aPtr+8);
314 a3Val = _mm_loadu_ps(aPtr+12);
316 x0Val = _mm_loadu_ps(bPtr);
317 x1Val = _mm_loadu_ps(bPtr);
318 x2Val = _mm_loadu_ps(bPtr+4);
319 x3Val = _mm_loadu_ps(bPtr+4);
320 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
321 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
322 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
323 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
325 c0Val = _mm_mul_ps(a0Val, b0Val);
326 c1Val = _mm_mul_ps(a1Val, b1Val);
327 c2Val = _mm_mul_ps(a2Val, b2Val);
328 c3Val = _mm_mul_ps(a3Val, b3Val);
330 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
331 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
332 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
333 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
339 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
340 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
341 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
345 _mm_store_ps(dotProductVector,dotProdVal0);
347 *realpt = dotProductVector[0];
348 *imagpt = dotProductVector[1];
349 *realpt += dotProductVector[2];
350 *imagpt += dotProductVector[3];
352 number = sixteenthPoints*8;
353 for(;number < num_points; number++){
354 *realpt += ((*aPtr++) * (*bPtr));
355 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * Cross-references carried over from the generated documentation:
 *   __VOLK_ATTR_ALIGNED(x)                    - macro; definition: volk_common.h:27
 *   static const float taps[NSTEPS+1][NTAPS]  - definition: interpolator_taps.h:9
 *   float complex lv_32fc_t                   - definition: volk_complex.h:56
 */