#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H

#include <volk/volk_complex.h>

#define ROTATOR_RELOAD 512


#ifdef LV_HAVE_GENERIC
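
/*!
  \brief rotate input vector at fixed rate per sample from initial phase offset
  \param outVector The vector where the results will be stored
  \param inVector Vector to be rotated
  \param phase_inc rotational velocity (unit-magnitude complex per-sample rotation)
  \param phase initial phase offset; updated to the phase following the last sample
  \param num_points The number of values in inVector to be rotated and stored into outVector
*/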
static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
                                                           const lv_32fc_t* inVector,
                                                           const lv_32fc_t phase_inc,
                                                           lv_32fc_t* phase,
                                                           unsigned int num_points){
    unsigned int i = 0;
    int j = 0;
    for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }
        /* Renormalize the phase to unit magnitude every ROTATOR_RELOAD samples. */
#ifdef __cplusplus
        (*phase) /= std::abs((*phase));
#else
        (*phase) /= cabsf((*phase));
#endif
    }
    for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
}

#endif /* LV_HAVE_GENERIC */
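
/* Usage sketch (illustrative only; `in`, `out`, and `num_points` are
   hypothetical caller-side names): rotate a buffer by 0.1 rad per sample.

       lv_32fc_t phase = lv_cmake(1.f, 0.f);
       const lv_32fc_t phase_inc = lv_cmake(cosf(0.1f), sinf(0.1f));
       volk_32fc_s32fc_x2_rotator_32fc_generic(out, in, phase_inc, &phase, num_points);

   Both `phase` and `phase_inc` should have unit magnitude; `phase` is
   advanced in place, so consecutive calls continue seamlessly. */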
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
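
/*!
  \brief rotate input vector at fixed rate per sample from initial phase offset (aligned, SSE4.1)
*/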
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    /* Pre-rotate the two lanes: lane k starts at (*phase) * phase_inc^k. */
    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }
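
    /* Each __m128 packs two complex floats as [re0 im0 re1 im1]. The
       per-lane complex product (a+bi)(c+di) = (ac - bd) + (bc + ad)i is
       built from moveldup/movehdup (duplicate the real resp. imaginary
       parts of the multiplier), a shuffle with 0xB1 (swap re/im of the
       multiplicand), and addsub (subtract in even slots, add in odd).
       The same recipe advances phase_Val by inc_Val in parallel. */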
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    /* incr now holds phase_inc^2: the per-iteration advance of both lanes. */
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;

    for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_load_ps((float*)aPtr);
            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
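        /* Once per ROTATOR_RELOAD block, pull phase_Val back onto the unit
           circle: squaring plus hadd gives |p|^2 for each lane, shuffle 0xD8
           pairs each |p|^2 with its own re/im slots, and the sqrt/div
           rescales. This bounds amplitude drift at a small periodic cost. */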
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    /* Handle an odd final sample in scalar code and save the phase. */
    for(i = 0; i < num_points%2; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
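
/*!
  \brief rotate input vector at fixed rate per sample from initial phase offset (unaligned, SSE4.1)
*/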
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;

    for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_loadu_ps((float*)aPtr);
            /* Same complex-multiply scheme as the aligned kernel above. */
            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_storeu_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points%2; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
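
/*!
  \brief rotate input vector at fixed rate per sample from initial phase offset (aligned, AVX)
*/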
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    /* Pre-rotate the four lanes: lane k starts at (*phase) * phase_inc^k. */
    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }
    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    /* incr now holds phase_inc^4, broadcast to all four complex lanes. */
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);
            /* Four complex samples per iteration; same multiply scheme as
               the SSE4.1 kernels. */
            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = _mm256_mul_ps(aVal, yl);
            tmp1p = _mm256_mul_ps(phase_Val, ylp);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_addsub_ps(tmp1, tmp2);
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = _mm256_mul_ps(aVal, yl);
        tmp1p = _mm256_mul_ps(phase_Val, ylp);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_addsub_ps(tmp1, tmp2);
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    /* Handle up to three leftover samples in scalar code. */
    for(i = 0; i < num_points%4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
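
/*!
  \brief rotate input vector at fixed rate per sample from initial phase offset (unaligned, AVX)
*/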
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }
    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);
            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = _mm256_mul_ps(aVal, yl);
            tmp1p = _mm256_mul_ps(phase_Val, ylp);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_addsub_ps(tmp1, tmp2);
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = _mm256_mul_ps(aVal, yl);
        tmp1p = _mm256_mul_ps(phase_Val, ylp);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_addsub_ps(tmp1, tmp2);
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points%4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX */


#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */