#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H

#include <volk/volk_complex.h>
#include <math.h>

/* Renormalize the phase accumulator after this many iterations so that
   floating-point rounding error does not let its magnitude drift from 1. */
#define ROTATOR_RELOAD 512
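/*
 * Why a reload interval: every sample multiplies the phase accumulator by
 * phase_inc, so rounding error compounds and |phase| slowly drifts away
 * from 1. Renormalizing once every ROTATOR_RELOAD samples bounds that drift
 * at negligible cost; 512 is a tuning choice, not a hard requirement.
 */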
#ifdef LV_HAVE_GENERIC

/* Rotate each input sample by a fixed per-sample increment of phase_inc,
   starting from *phase; *phase is updated on return so successive calls
   continue the rotation seamlessly. */
static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(
    lv_32fc_t* outVector, const lv_32fc_t* inVector,
    const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
    unsigned int i = 0, j = 0;
    for(i = 0; i < num_points / ROTATOR_RELOAD; ++i) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }
        /* Renormalize by the complex magnitude so |*phase| stays at 1. */
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    for(i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
}

#endif /* LV_HAVE_GENERIC */
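/*
 * Usage sketch (illustrative only, not part of the VOLK API): mixing a block
 * of samples down by 10 kHz at a 100 kHz sample rate. The buffer size, rates,
 * and variable names here are hypothetical.
 *
 *   #include <math.h>
 *   lv_32fc_t in[1024], out[1024];
 *   float cycles_per_sample = -10000.0f / 100000.0f;
 *   lv_32fc_t incr = lv_cmake(cosf(2.0f * (float)M_PI * cycles_per_sample),
 *                             sinf(2.0f * (float)M_PI * cycles_per_sample));
 *   lv_32fc_t phase = lv_cmake(1.0f, 0.0f);
 *   volk_32fc_s32fc_x2_rotator_32fc_generic(out, in, incr, &phase, 1024);
 *   // phase now carries the rotator state into the next block
 */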
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(
    lv_32fc_t* outVector, const lv_32fc_t* inVector,
    const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    /* Pre-rotate the two lanes: lane k starts at (*phase) * phase_inc^k,
       and incr ends up as phase_inc^2, the per-iteration advance. */
    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;

    for(i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_load_ps((float*)aPtr);

            /* Duplicate real (yl) and imaginary (yh) parts of the phases. */
            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            /* Swap re/im within each complex (0xB1 = 2,3,0,1), then combine
               with addsub to finish the complex multiplies. */
            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);           /* in * phase   */
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p); /* phase * incr */

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        /* Renormalize the phase lanes: divide by sqrt(re^2 + im^2). */
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp1 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp1);
    }
    for(i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }

    /* Handle the odd leftover point and save the phase state. */
    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points % 2; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#endif /* LV_HAVE_SSE4_1 */
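/*
 * Scalar equivalent of one SIMD complex multiply above, for reference. With
 * a = ar + j*ai loaded in aVal and p = pr + j*pi in phase_Val, the
 * moveldup/movehdup duplication, the 0xB1 re/im swap, and _mm_addsub_ps
 * reduce to the textbook complex product (illustrative names only):
 *
 *   float tmp1_re = ar * pr;          // aVal * yl, even lane
 *   float tmp1_im = ai * pr;          // aVal * yl, odd lane
 *   float tmp2_re = ai * pi;          // swapped aVal * yh, even lane
 *   float tmp2_im = ar * pi;          // swapped aVal * yh, odd lane
 *   float z_re = tmp1_re - tmp2_re;   // addsub subtracts in even lanes
 *   float z_im = tmp1_im + tmp2_im;   // addsub adds in odd lanes
 *   // z = (ar*pr - ai*pi) + j*(ai*pr + ar*pi) = a * p
 */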
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Identical to the aligned version above, but uses unaligned loads and
   stores so the buffers need no particular alignment. */
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(
    lv_32fc_t* outVector, const lv_32fc_t* inVector,
    const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;

    for(i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_loadu_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_storeu_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        /* Renormalize the phase lanes to unit magnitude. */
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp1 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp1);
    }
    for(i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points % 2; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* AVX version: four complex points per iteration, using the same
   duplicate/shuffle/addsub complex-multiply scheme as the SSE4.1 kernels. */
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(
    lv_32fc_t* outVector, const lv_32fc_t* inVector,
    const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    /* Lane k starts at (*phase) * phase_inc^k; incr becomes phase_inc^4. */
    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for(i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = _mm256_mul_ps(aVal, yl);
            tmp1p = _mm256_mul_ps(phase_Val, ylp);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_addsub_ps(tmp1, tmp2);
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        /* Renormalize all four phase lanes; hadd and the 0xD8 shuffle
           operate within each 128-bit lane, so the per-point pairing of
           re^2 + im^2 still holds. */
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp1 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp1);
    }
    for(i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = _mm256_mul_ps(aVal, yl);
        tmp1p = _mm256_mul_ps(phase_Val, ylp);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_addsub_ps(tmp1, tmp2);
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#endif /* LV_HAVE_AVX */
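/*
 * Note on the _a variants: they require inVector and outVector to be aligned
 * to the SIMD width (16 bytes for SSE, 32 bytes for AVX). A hedged allocation
 * sketch; volk_malloc/volk_free and volk_get_alignment() are the usual
 * helpers in newer VOLK releases, and posix_memalign works anywhere:
 *
 *   lv_32fc_t* buf = (lv_32fc_t*)volk_malloc(1024 * sizeof(lv_32fc_t),
 *                                            volk_get_alignment());
 *   // ... call the _a kernel on buf ...
 *   volk_free(buf);
 */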
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* Unaligned-buffer AVX version; identical math, loadu/storeu accesses. */
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(
    lv_32fc_t* outVector, const lv_32fc_t* inVector,
    const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for(i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = _mm256_mul_ps(aVal, yl);
            tmp1p = _mm256_mul_ps(phase_Val, ylp);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_addsub_ps(tmp1, tmp2);
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        /* Renormalize the phase lanes to unit magnitude. */
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp1 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp1);
    }
    for(i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = _mm256_mul_ps(aVal, yl);
        tmp1p = _mm256_mul_ps(phase_Val, ylp);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_addsub_ps(tmp1, tmp2);
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */