1 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
2 #define INCLUDED_volk_32f_s32f_multiply_32f_u_H
/*!
  \brief Multiplies every element of a float vector by a scalar using SSE with unaligned loads and stores.
  \param cVector Output buffer; receives num_points products.
  \param aVector Input buffer holding num_points floats.
  \param scalar Factor applied to every input element.
  \param num_points Number of floats to process.
*/
static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m128 aVal, bVal, cVal;
  /* Broadcast the scalar into all four lanes once, outside the loop. */
  bVal = _mm_set_ps1(scalar);
  for(;number < quarterPoints; number++){
    aVal = _mm_loadu_ps(aPtr);
    cVal = _mm_mul_ps(aVal, bVal);
    _mm_storeu_ps(cPtr,cVal);

    /* Advance to the next group of four floats; without this every
       iteration would reprocess the first group. */
    aPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: the num_points % 4 elements the vector loop did not cover. */
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    *cPtr++ = (*aPtr++) * scalar;
  }
}
45 #include <immintrin.h>
/*!
  \brief Multiplies every element of a float vector by a scalar using AVX with unaligned loads and stores.
  \param cVector Output buffer; receives num_points products.
  \param aVector Input buffer holding num_points floats.
  \param scalar Factor applied to every input element.
  \param num_points Number of floats to process.
*/
static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m256 aVal, bVal, cVal;
  /* Broadcast the scalar into all eight lanes once, outside the loop. */
  bVal = _mm256_set1_ps(scalar);
  for(;number < eighthPoints; number++){
    aVal = _mm256_loadu_ps(aPtr);
    cVal = _mm256_mul_ps(aVal, bVal);
    _mm256_storeu_ps(cPtr,cVal);

    /* Advance to the next group of eight floats; without this every
       iteration would reprocess the first group. */
    aPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail: the num_points % 8 elements the vector loop did not cover. */
  number = eighthPoints * 8;
  for(;number < num_points; number++){
    *cPtr++ = (*aPtr++) * scalar;
  }
}
81 #ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies every element of a float vector by a scalar using plain C.
  \param cVector Output buffer; receives num_points products.
  \param aVector Input buffer holding num_points floats.
  \param scalar Factor applied to every input element.
  \param num_points Number of floats to process.
*/
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const float* inputPtr = aVector;
  float* outputPtr = cVector;
  for(number = 0; number < num_points; number++){
    *outputPtr = (*inputPtr) * scalar;
    /* Step both cursors; otherwise only element 0 would ever be written. */
    inputPtr++;
    outputPtr++;
  }
}
103 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
104 #define INCLUDED_volk_32f_s32f_multiply_32f_a_H
110 #include <xmmintrin.h>
/*!
  \brief Multiplies every element of a float vector by a scalar using SSE with aligned loads and stores.
  \param cVector Output buffer; receives num_points products. Accessed with _mm_store_ps, so it must be 16-byte aligned.
  \param aVector Input buffer holding num_points floats. Accessed with _mm_load_ps, so it must be 16-byte aligned.
  \param scalar Factor applied to every input element.
  \param num_points Number of floats to process.
*/
static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m128 aVal, bVal, cVal;
  /* Broadcast the scalar into all four lanes once, outside the loop. */
  bVal = _mm_set_ps1(scalar);
  for(;number < quarterPoints; number++){
    aVal = _mm_load_ps(aPtr);
    cVal = _mm_mul_ps(aVal, bVal);
    _mm_store_ps(cPtr,cVal);

    /* Advance to the next group of four floats; without this every
       iteration would reprocess the first group. */
    aPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: the num_points % 4 elements the vector loop did not cover. */
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    *cPtr++ = (*aPtr++) * scalar;
  }
}
147 #include <immintrin.h>
/*!
  \brief Multiplies every element of a float vector by a scalar using AVX with aligned loads and stores.
  \param cVector Output buffer; receives num_points products. Accessed with _mm256_store_ps, so it must be 32-byte aligned.
  \param aVector Input buffer holding num_points floats. Accessed with _mm256_load_ps, so it must be 32-byte aligned.
  \param scalar Factor applied to every input element.
  \param num_points Number of floats to process.
*/
static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m256 aVal, bVal, cVal;
  /* Broadcast the scalar into all eight lanes once, outside the loop. */
  bVal = _mm256_set1_ps(scalar);
  for(;number < eighthPoints; number++){
    aVal = _mm256_load_ps(aPtr);
    cVal = _mm256_mul_ps(aVal, bVal);
    _mm256_store_ps(cPtr,cVal);

    /* Advance to the next group of eight floats; without this every
       iteration would reprocess the first group. */
    aPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail: the num_points % 8 elements the vector loop did not cover. */
  number = eighthPoints * 8;
  for(;number < num_points; number++){
    *cPtr++ = (*aPtr++) * scalar;
  }
}
184 #include <arm_neon.h>
/*!
  \brief Multiplies every element of a float vector by a scalar using NEON intrinsics.
  \param cVector Output buffer; receives num_points products.
  \param aVector Input buffer holding num_points floats.
  \param scalar Factor applied to every input element.
  \param num_points Number of floats to process.
*/
static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const float* inputPtr = aVector;
  float* outputPtr = cVector;
  const unsigned int quarterPoints = num_points / 4;

  float32x4_t aVal, cVal;

  for(number = 0; number < quarterPoints; number++){
    aVal = vld1q_f32(inputPtr);
    cVal = vmulq_n_f32 (aVal, scalar);
    vst1q_f32(outputPtr, cVal);

    /* Advance to the next group of four floats; without this every
       iteration would reprocess the first group. */
    inputPtr += 4;
    outputPtr += 4;
  }

  /* Scalar tail: the num_points % 4 elements the vector loop did not cover. */
  for(number = quarterPoints * 4; number < num_points; number++){
    *outputPtr++ = (*inputPtr++) * scalar;
  }
}
213 #ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies every element of a float vector by a scalar using plain C.
  \param cVector Output buffer; receives num_points products.
  \param aVector Input buffer holding num_points floats.
  \param scalar Factor applied to every input element.
  \param num_points Number of floats to process.
*/
static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const float* inputPtr = aVector;
  float* outputPtr = cVector;
  for(number = 0; number < num_points; number++){
    *outputPtr = (*inputPtr) * scalar;
    /* Step both cursors; otherwise only element 0 would ever be written. */
    inputPtr++;
    outputPtr++;
  }
}
/* Implementation defined outside this file (presumably ORC-generated -- confirm against the build). */
extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, const float scalar, unsigned int num_points);

/*!
  \brief Multiplies every element of a float vector by a scalar by forwarding to volk_32f_s32f_multiply_32f_a_orc_impl.
  \param cVector Output buffer, passed through as dst.
  \param aVector Input buffer, passed through as src.
  \param scalar Factor passed through unchanged.
  \param num_points Element count passed through unchanged.
*/
static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
  volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}