1 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
2 #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
10 #include <pmmintrin.h>
/*
 * SSE3 kernel (unaligned loads/stores): element-wise complex multiply,
 * processing two complex values (four floats) per loop iteration.
 *
 * cVector    - output buffer for the products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of complex elements
 *
 * NOTE(review): this listing omits lines of the original function (the
 * declarations of a, b and c, their per-iteration advancement, the body of
 * the odd-element tail and the closing braces). a/b/c presumably walk
 * aVector/bVector/cVector -- confirm against the full source.
 */
18 static inline void volk_32fc_x2_multiply_32fc_u_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
19 unsigned int number = 0;
/* One __m128 holds four floats, i.e. two interleaved (re, im) pairs. */
20 const unsigned int halfPoints = num_points / 2;
22 __m128 x, y, yl, yh, z, tmp1, tmp2;
27 for(;number < halfPoints; number++){
/* x = ar, ai, br, bi  (two complex values from the first input) */
29 x = _mm_loadu_ps((
float*)a);
/* y = cr, ci, dr, di  (two complex values from the second input) */
30 y = _mm_loadu_ps((
float*)b);
/* yl = cr, cr, dr, dr  (even lanes duplicated: real parts) */
32 yl = _mm_moveldup_ps(y);
/* yh = ci, ci, di, di  (odd lanes duplicated: imaginary parts) */
33 yh = _mm_movehdup_ps(y);
/* tmp1 = ar*cr, ai*cr, br*dr, bi*dr */
35 tmp1 = _mm_mul_ps(x,yl);
/* 0xB1 swaps the two floats inside each pair: x = ai, ar, bi, br */
37 x = _mm_shuffle_ps(x,x,0xB1);
/* tmp2 = ai*ci, ar*ci, bi*di, br*di */
39 tmp2 = _mm_mul_ps(x,yh);
/* addsub subtracts in even lanes and adds in odd lanes:
 * z = ar*cr - ai*ci, ai*cr + ar*ci, br*dr - bi*di, bi*dr + br*di */
41 z = _mm_addsub_ps(tmp1,tmp2);
/* Store the two complex products (no alignment requirement). */
43 _mm_storeu_ps((
float*)c,z);
/* An odd num_points leaves one element that the pairwise loop did not cover;
 * the handling code itself is not visible in this listing. */
50 if((num_points % 2) != 0) {
56 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: multiplies the two inputs element by element
 * using the language's complex multiplication and writes each product out.
 *
 * cVector    - output buffer for the products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of complex elements to process
 *
 * NOTE(review): the declarations of cPtr, aPtr and bPtr and the closing
 * braces are not present in this listing; the pointers presumably start at
 * cVector, aVector and bVector -- confirm against the full source.
 */
64 static inline void volk_32fc_x2_multiply_32fc_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
68 unsigned int number = 0;
70 for(number = 0; number < num_points; number++){
/* Multiply one pair and advance all three pointers by one element. */
71 *cPtr++ = (*aPtr++) * (*bPtr++);
78 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
79 #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
87 #include <pmmintrin.h>
/*
 * SSE3 kernel (aligned loads/stores): element-wise complex multiply,
 * processing two complex values (four floats) per loop iteration.
 * _mm_load_ps/_mm_store_ps require 16-byte aligned addresses.
 *
 * cVector    - output buffer for the products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of complex elements
 *
 * NOTE(review): this listing omits lines of the original function (the
 * declarations of a, b and c, their per-iteration advancement, the body of
 * the odd-element tail and the closing braces). a/b/c presumably walk
 * aVector/bVector/cVector -- confirm against the full source.
 */
95 static inline void volk_32fc_x2_multiply_32fc_a_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
96 unsigned int number = 0;
/* One __m128 holds four floats, i.e. two interleaved (re, im) pairs. */
97 const unsigned int halfPoints = num_points / 2;
99 __m128 x, y, yl, yh, z, tmp1, tmp2;
103 for(;number < halfPoints; number++){
/* x = ar, ai, br, bi  (two complex values from the first input) */
105 x = _mm_load_ps((
float*)a);
/* y = cr, ci, dr, di  (two complex values from the second input) */
106 y = _mm_load_ps((
float*)b);
/* yl = cr, cr, dr, dr  (even lanes duplicated: real parts) */
108 yl = _mm_moveldup_ps(y);
/* yh = ci, ci, di, di  (odd lanes duplicated: imaginary parts) */
109 yh = _mm_movehdup_ps(y);
/* tmp1 = ar*cr, ai*cr, br*dr, bi*dr */
111 tmp1 = _mm_mul_ps(x,yl);
/* 0xB1 swaps the two floats inside each pair: x = ai, ar, bi, br */
113 x = _mm_shuffle_ps(x,x,0xB1);
/* tmp2 = ai*ci, ar*ci, bi*di, br*di */
115 tmp2 = _mm_mul_ps(x,yh);
/* addsub subtracts in even lanes and adds in odd lanes:
 * z = ar*cr - ai*ci, ai*cr + ar*ci, br*dr - bi*di, bi*dr + br*di */
117 z = _mm_addsub_ps(tmp1,tmp2);
/* Store the two complex products to the aligned destination. */
119 _mm_store_ps((
float*)c,z);
/* An odd num_points leaves one element that the pairwise loop did not cover;
 * the handling code itself is not visible in this listing. */
126 if((num_points % 2) != 0) {
132 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel registered under the aligned ("_a") name:
 * multiplies the two inputs element by element using the language's
 * complex multiplication and writes each product out. The visible
 * statements are the same as in volk_32fc_x2_multiply_32fc_generic.
 *
 * cVector    - output buffer for the products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of complex elements to process
 *
 * NOTE(review): the declarations of cPtr, aPtr and bPtr and the closing
 * braces are not present in this listing; the pointers presumably start at
 * cVector, aVector and bVector -- confirm against the full source.
 */
140 static inline void volk_32fc_x2_multiply_32fc_a_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
144 unsigned int number = 0;
146 for(number = 0; number < num_points; number++){
/* Multiply one pair and advance all three pointers by one element. */
147 *cPtr++ = (*aPtr++) * (*bPtr++);
153 #include <arm_neon.h>
/*
 * NEON kernel: element-wise complex multiply, four complex values per
 * vector iteration, followed by a scalar loop for the remaining
 * num_points % 4 elements.
 *
 * cVector    - output buffer for the products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of complex elements
 *
 * NOTE(review): the declarations of a_ptr and b_ptr, the per-iteration
 * pointer advancement and the closing braces are not present in this
 * listing; a_ptr/b_ptr presumably start at aVector/bVector -- confirm
 * against the full source.
 */
162 static inline void volk_32fc_x2_multiply_32fc_neon(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
166 unsigned int quarter_points = num_points / 4;
167 float32x4x2_t a_val, b_val, c_val;
168 float32x4x2_t tmp_real, tmp_imag;
169 unsigned int number = 0;
171 for(number = 0; number < quarter_points; ++number) {
/* vld2q_f32 de-interleaves eight floats: val[0] = four real parts,
 * val[1] = four imaginary parts. */
172 a_val = vld2q_f32((
float*)a_ptr);
173 b_val = vld2q_f32((
float*)b_ptr);
/* Hint the cache about the elements four positions ahead. */
174 __builtin_prefetch(a_ptr+4);
175 __builtin_prefetch(b_ptr+4);
/* Partial products for the real part: ar*br and ai*bi. */
179 tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
181 tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
/* Partial products for the imaginary part: ar*bi and ai*br. */
185 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
187 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
/* real = ar*br - ai*bi, imag = ar*bi + ai*br */
190 c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
191 c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
/* vst2q_f32 re-interleaves (re, im) pairs while storing four results. */
192 vst2q_f32((
float*)cVector, c_val);
/* Scalar tail: handle the elements left over after the groups of four. */
199 for(number = quarter_points*4; number < num_points; number++){
200 *cVector++ = (*a_ptr++) * (*b_ptr++);
/*
 * NEON kernel variant using fused multiply-accumulate/-subtract
 * intrinsics: element-wise complex multiply, four complex values per
 * vector iteration, followed by a scalar loop for the remaining
 * num_points % 4 elements.
 *
 * cVector    - output buffer for the products
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of complex elements
 *
 * NOTE(review): the declarations of a_ptr and b_ptr, the per-iteration
 * pointer advancement and the closing braces are not present in this
 * listing; a_ptr/b_ptr presumably start at aVector/bVector -- confirm
 * against the full source.
 */
214 static inline void volk_32fc_x2_multiply_32fc_neon_opttests(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
218 unsigned int quarter_points = num_points / 4;
219 float32x4x2_t a_val, b_val;
/* Despite its name, tmp_imag carries the whole result:
 * val[0] ends up as the real parts, val[1] as the imaginary parts. */
220 float32x4x2_t tmp_imag;
221 unsigned int number = 0;
223 for(number = 0; number < quarter_points; ++number) {
/* vld2q_f32 de-interleaves eight floats: val[0] = four real parts,
 * val[1] = four imaginary parts. */
224 a_val = vld2q_f32((
float*)a_ptr);
225 b_val = vld2q_f32((
float*)b_ptr);
/* Hint the cache about the elements four positions ahead. */
226 __builtin_prefetch(a_ptr+4);
227 __builtin_prefetch(b_ptr+4);
/* Seed the accumulators: val[1] = ai*br, val[0] = ar*br. */
230 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
231 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
/* vmlaq: val[1] = ai*br + ar*bi  (imaginary part) */
234 tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
/* vmlsq: val[0] = ar*br - ai*bi  (real part) */
235 tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
/* vst2q_f32 re-interleaves (re, im) pairs while storing four results. */
238 vst2q_f32((
float*)cVector, tmp_imag);
/* Scalar tail: handle the elements left over after the groups of four. */
245 for(number = quarter_points*4; number < num_points; number++){
246 *cVector++ = (*a_ptr++) * (*b_ptr++);
/*
 * Externally defined kernel with the same signature as the other
 * multiply kernels in this file; only the declaration is visible here.
 * NOTE(review): presumably a hand-written NEON assembly implementation
 * (going by the name) -- confirm where it is defined.
 */
260 extern void volk_32fc_x2_multiply_32fc_neonasm(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points);
/*
 * Externally defined implementation called by
 * volk_32fc_x2_multiply_32fc_u_orc; only the declaration is visible here.
 * NOTE(review): presumably generated from an ORC program (going by the
 * name) -- confirm where it is defined.
 */
271 extern void volk_32fc_x2_multiply_32fc_a_orc_impl(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points);
/*
 * Thin wrapper: forwards all four arguments unchanged to
 * volk_32fc_x2_multiply_32fc_a_orc_impl.
 *
 * NOTE(review): this "_u" (unaligned-named) entry point dispatches to an
 * "_a" (aligned-named) implementation; confirm the implementation accepts
 * unaligned buffers. The closing brace is not present in this listing.
 */
272 static inline void volk_32fc_x2_multiply_32fc_u_orc(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
273 volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
/* Cross-reference: lv_32fc_t is declared as `float complex`
 * (definition: volk_complex.h:56). */