1 #ifndef INCLUDED_volk_32f_x2_add_32f_u_H
2 #define INCLUDED_volk_32f_x2_add_32f_u_H
/*!
 * \brief Element-wise sum of two float vectors using SSE:
 *        cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 *
 * Uses the unaligned load/store intrinsics, so the buffers do not have to be
 * 16-byte aligned.
 *
 * \param cVector    Output buffer; receives num_points sums.
 * \param aVector    First input buffer (read only).
 * \param bVector    Second input buffer (read only).
 * \param num_points Number of floats to process from each buffer.
 */
static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;

    /* Vector part: four floats per iteration. */
    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        bVal = _mm_loadu_ps(bPtr);

        cVal = _mm_add_ps(aVal, bVal);

        _mm_storeu_ps(cPtr, cVal);

        /* Step to the next group of four; without this every iteration
         * would keep rewriting the first four outputs. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the 0..3 elements that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
46 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable element-wise sum of two float vectors:
 *        cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 *
 * \param cVector    Output buffer; receives num_points sums.
 * \param aVector    First input buffer (read only).
 * \param bVector    Second input buffer (read only).
 * \param num_points Number of floats to process from each buffer.
 */
static inline void volk_32f_x2_add_32f_generic(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
67 #ifndef INCLUDED_volk_32f_x2_add_32f_a_H
68 #define INCLUDED_volk_32f_x2_add_32f_a_H
74 #include <xmmintrin.h>
/*!
 * \brief Element-wise sum of two float vectors using SSE:
 *        cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 *
 * Uses the aligned load/store intrinsics (_mm_load_ps / _mm_store_ps), so
 * all three buffers must be 16-byte aligned.
 *
 * \param cVector    Output buffer; receives num_points sums.
 * \param aVector    First input buffer (read only).
 * \param bVector    Second input buffer (read only).
 * \param num_points Number of floats to process from each buffer.
 */
static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;

    /* Vector part: four floats per iteration. */
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_add_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step to the next group of four; advancing by 4 floats (16 bytes)
         * also keeps the pointers aligned for the next aligned access. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the 0..3 elements that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
113 #include <arm_neon.h>
/*!
 * \brief Element-wise sum of two float vectors using NEON:
 *        cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 *
 * \param cVector    Output buffer; receives num_points sums.
 * \param aVector    First input buffer (read only).
 * \param bVector    Second input buffer (read only).
 * \param num_points Number of floats to process from each buffer.
 */
static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    float32x4_t aVal, bVal, cVal;

    /* Vector part: four floats per iteration. */
    for (number = 0; number < quarterPoints; number++) {
        aVal = vld1q_f32(aPtr);
        bVal = vld1q_f32(bPtr);
        /* Hint the cache about the next group of four inputs. */
        __builtin_prefetch(aPtr + 4);
        __builtin_prefetch(bPtr + 4);

        cVal = vaddq_f32(aVal, bVal);

        vst1q_f32(cPtr, cVal);

        /* Step to the next group of four; without this every iteration
         * would keep rewriting the first four outputs. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the 0..3 elements that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
155 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable element-wise sum of two float vectors:
 *        cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 *
 * The body is plain scalar C and performs the same computation as
 * volk_32f_x2_add_32f_generic.
 *
 * \param cVector    Output buffer; receives num_points sums.
 * \param aVector    First input buffer (read only).
 * \param bVector    Second input buffer (read only).
 * \param num_points Number of floats to process from each buffer.
 */
static inline void volk_32f_x2_add_32f_a_generic(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
/*
 * Prototype only: no definition of this routine is visible in this part of
 * the file. It takes the same (output, input, input, count) argument list as
 * the other add kernels here, and volk_32f_x2_add_32f_u_orc forwards its
 * arguments to it unchanged.
 * NOTE(review): presumably an ORC-generated implementation - confirm where
 * the symbol is defined and what alignment it expects.
 */
183 extern void volk_32f_x2_add_32f_a_orc_impl(
float* cVector,
const float* aVector,
const float* bVector,
unsigned int num_points);
184 static inline void volk_32f_x2_add_32f_u_orc(
float* cVector,
const float* aVector,
const float* bVector,
unsigned int num_points){
185 volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);