1 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
2 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
10 #include <pmmintrin.h>
18 static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
19 unsigned int number = 0;
20 const unsigned int halfPoints = num_points / 2;
22 __m128 x, y, yl, yh, z, tmp1, tmp2;
27 __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
29 for(;number < halfPoints; number++){
31 x = _mm_loadu_ps((
float*)a);
32 y = _mm_loadu_ps((
float*)b);
34 y = _mm_xor_ps(y, conjugator);
36 yl = _mm_moveldup_ps(y);
37 yh = _mm_movehdup_ps(y);
39 tmp1 = _mm_mul_ps(x,yl);
41 x = _mm_shuffle_ps(x,x,0xB1);
43 tmp2 = _mm_mul_ps(x,yh);
45 z = _mm_addsub_ps(tmp1,tmp2);
47 _mm_storeu_ps((
float*)c,z);
54 if((num_points % 2) != 0) {
60 #ifdef LV_HAVE_GENERIC
68 static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
72 unsigned int number = 0;
74 for(number = 0; number < num_points; number++){
75 *cPtr++ = (*aPtr++) *
lv_conj(*bPtr++);
82 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
83 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
91 #include <pmmintrin.h>
99 static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
100 unsigned int number = 0;
101 const unsigned int halfPoints = num_points / 2;
103 __m128 x, y, yl, yh, z, tmp1, tmp2;
108 __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
110 for(;number < halfPoints; number++){
112 x = _mm_load_ps((
float*)a);
113 y = _mm_load_ps((
float*)b);
115 y = _mm_xor_ps(y, conjugator);
117 yl = _mm_moveldup_ps(y);
118 yh = _mm_movehdup_ps(y);
120 tmp1 = _mm_mul_ps(x,yl);
122 x = _mm_shuffle_ps(x,x,0xB1);
124 tmp2 = _mm_mul_ps(x,yh);
126 z = _mm_addsub_ps(tmp1,tmp2);
128 _mm_store_ps((
float*)c,z);
135 if((num_points % 2) != 0) {
142 #include <arm_neon.h>
/**
 * Multiply each element of aVector by the complex conjugate of the matching
 * element of bVector and store the product in cVector (ARM NEON).
 *
 * Four complex values are processed per vector iteration; any remaining
 * elements are finished with scalar arithmetic.
 *
 * NOTE(review): the listing this was taken from lacked the declarations of
 * `a_ptr` / `b_ptr` and the per-iteration pointer advance. They are
 * reconstructed here from how the pointers are used -- confirm against the
 * upstream header. The scalar tail now uses lv_conj, as the other kernels
 * in this file do, instead of the double-precision conj().
 *
 * @param cVector    destination buffer, written for num_points elements
 * @param aVector    first input vector
 * @param bVector    second input vector; each element is conjugated
 * @param num_points number of complex elements to process
 */
static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(
    lv_32fc_t* cVector,
    const lv_32fc_t* aVector,
    const lv_32fc_t* bVector,
    unsigned int num_points)
{
    const lv_32fc_t* a_ptr = aVector;
    const lv_32fc_t* b_ptr = bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val, c_val;
    float32x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        /* De-interleaving loads: val[0] holds real parts, val[1] imaginary. */
        a_val = vld2q_f32((const float*)a_ptr);
        b_val = vld2q_f32((const float*)b_ptr);
        b_val.val[1] = vnegq_f32(b_val.val[1]);  /* conjugate b */
        __builtin_prefetch(a_ptr + 4);
        __builtin_prefetch(b_ptr + 4);

        /* real*real and imag*imag terms feed the real part of the result */
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        /* cross terms feed the imaginary part of the result */
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
        vst2q_f32((float*)cVector, c_val);       /* re-interleave on store */

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    /* Scalar tail for the last num_points % 4 elements. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * lv_conj(*b_ptr++);
    }
}
194 #ifdef LV_HAVE_GENERIC
202 static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
206 unsigned int number = 0;
208 for(number = 0; number < num_points; number++){
209 *cPtr++ = (*aPtr++) *
lv_conj(*bPtr++);
/*
 * Cross-references:
 *   #define lv_conj(x)       -- Definition: volk_complex.h:80
 *   float complex lv_32fc_t  -- Definition: volk_complex.h:56
 */