1 #ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
2 #define INCLUDED_volk_32f_x2_multiply_32f_u_H
/*!
 * \brief Element-wise product of two float vectors: cVector[i] = aVector[i] * bVector[i].
 *
 * Uses unaligned SSE loads/stores (_mm_loadu_ps / _mm_storeu_ps), so the
 * buffers do not have to be 16-byte aligned.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    First input buffer, num_points floats.
 * \param bVector    Second input buffer, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        bVal = _mm_loadu_ps(bPtr);

        cVal = _mm_mul_ps(aVal, bVal);

        _mm_storeu_ps(cPtr, cVal);

        /* Step to the next group of four floats; without this the loop
         * would keep re-processing the first four elements. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the remaining num_points % 4 elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
47 #include <immintrin.h>
/*!
 * \brief Element-wise product of two float vectors: cVector[i] = aVector[i] * bVector[i].
 *
 * Uses unaligned AVX loads/stores (_mm256_loadu_ps / _mm256_storeu_ps), so
 * the buffers do not have to be 32-byte aligned.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    First input buffer, num_points floats.
 * \param bVector    Second input buffer, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_mul_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal);

        /* Step to the next group of eight floats; without this the loop
         * would keep re-processing the first eight elements. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the remaining num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
85 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Element-wise product of two float vectors: cVector[i] = aVector[i] * bVector[i].
 *
 * Plain C implementation with no alignment requirements.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    First input buffer, num_points floats.
 * \param bVector    Second input buffer, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
107 #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
108 #define INCLUDED_volk_32f_x2_multiply_32f_a_H
114 #include <xmmintrin.h>
/*!
 * \brief Element-wise product of two float vectors: cVector[i] = aVector[i] * bVector[i].
 *
 * Uses aligned SSE loads/stores (_mm_load_ps / _mm_store_ps); all three
 * buffers must therefore be 16-byte aligned.
 *
 * \param cVector    Output buffer (16-byte aligned); receives num_points products.
 * \param aVector    First input buffer (16-byte aligned), num_points floats.
 * \param bVector    Second input buffer (16-byte aligned), num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_mul_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step to the next group of four floats; the 16-byte stride keeps
         * every subsequent aligned access aligned. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the remaining num_points % 4 elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
153 #include <immintrin.h>
/*!
 * \brief Element-wise product of two float vectors: cVector[i] = aVector[i] * bVector[i].
 *
 * Uses aligned AVX loads/stores (_mm256_load_ps / _mm256_store_ps); all
 * three buffers must therefore be 32-byte aligned.
 *
 * \param cVector    Output buffer (32-byte aligned); receives num_points products.
 * \param aVector    First input buffer (32-byte aligned), num_points floats.
 * \param bVector    Second input buffer (32-byte aligned), num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_mul_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal);

        /* Step to the next group of eight floats; the 32-byte stride keeps
         * every subsequent aligned access aligned. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the remaining num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
192 #include <arm_neon.h>
/*!
 * \brief Element-wise product of two float vectors: cVector[i] = aVector[i] * bVector[i].
 *
 * NEON implementation: processes four floats per iteration with
 * vld1q_f32 / vmulq_f32 / vst1q_f32, then finishes the remainder in scalar code.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    First input buffer, num_points floats.
 * \param bVector    Second input buffer, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    float32x4_t avec, bvec, cvec;
    for (number = 0; number < quarter_points; ++number) {
        avec = vld1q_f32(aVector);
        bvec = vld1q_f32(bVector);
        cvec = vmulq_f32(avec, bvec);
        vst1q_f32(cVector, cvec);

        /* Step to the next group of four floats; the parameters themselves
         * serve as the running cursors. */
        aVector += 4;
        bVector += 4;
        cVector += 4;
    }

    /* Scalar tail: the remaining num_points % 4 elements. */
    for (number = quarter_points * 4; number < num_points; ++number) {
        *cVector++ = *aVector++ * *bVector++;
    }
}
220 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Element-wise product of two float vectors: cVector[i] = aVector[i] * bVector[i].
 *
 * Plain C implementation; the code itself performs only scalar accesses
 * and so imposes no alignment requirement of its own.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    First input buffer, num_points floats.
 * \param bVector    Second input buffer, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
/* Implemented outside this header; only its prototype is visible here. */
extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards all arguments unchanged to
 *        volk_32f_x2_multiply_32f_a_orc_impl().
 *
 * NOTE(review): presumably computes cVector[i] = aVector[i] * bVector[i]
 * like the sibling kernels; the callee is not visible here, so confirm
 * against its implementation.
 *
 * \param cVector    Output buffer, passed through to the implementation.
 * \param aVector    First input buffer, passed through.
 * \param bVector    Second input buffer, passed through.
 * \param num_points Element count, passed through.
 */
static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}