1 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
2 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
9 #define MAX(X,Y) ((X) > (Y)?(X):(Y))
16 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(
float* target,
float* src0,
float* center_point_array,
float* cutoff,
unsigned int num_points) {
18 const unsigned int num_bytes = num_points*4;
29 __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
31 xmm9 = _mm_setzero_ps();
32 xmm1 = _mm_setzero_ps();
34 xmm0 = _mm_load1_ps(¢er_point_array[0]);
35 xmm6 = _mm_load1_ps(¢er_point_array[1]);
36 xmm7 = _mm_load1_ps(¢er_point_array[2]);
37 xmm8 = _mm_load1_ps(¢er_point_array[3]);
39 xmm10 = _mm_load1_ps(cutoff);
41 int bound = num_bytes >> 4;
42 int leftovers = (num_bytes >> 2) & 3;
45 for(; i < bound; ++i) {
46 xmm2 = _mm_load_ps(src0);
47 xmm2 = _mm_max_ps(xmm10, xmm2);
48 xmm3 = _mm_mul_ps(xmm2, xmm2);
49 xmm4 = _mm_mul_ps(xmm2, xmm3);
50 xmm5 = _mm_mul_ps(xmm3, xmm3);
53 xmm2 = _mm_mul_ps(xmm2, xmm0);
54 xmm3 = _mm_mul_ps(xmm3, xmm6);
55 xmm4 = _mm_mul_ps(xmm4, xmm7);
56 xmm5 = _mm_mul_ps(xmm5, xmm8);
59 xmm2 = _mm_add_ps(xmm2, xmm3);
60 xmm3 = _mm_add_ps(xmm4, xmm5);
64 xmm9 = _mm_add_ps(xmm2, xmm9);
66 xmm1 = _mm_add_ps(xmm3, xmm1);
71 xmm2 = _mm_hadd_ps(xmm9, xmm1);
72 xmm3 = _mm_hadd_ps(xmm2, xmm2);
73 xmm4 = _mm_hadd_ps(xmm3, xmm3);
75 _mm_store_ss(&result, xmm4);
79 for(i = 0; i < leftovers; ++i) {
81 fst =
MAX(fst, *cutoff);
87 result += (center_point_array[0] * fst +
88 center_point_array[1] * sq +
89 center_point_array[2] * thrd +
90 center_point_array[3] * frth);
94 result += ((float)((bound * 4) + leftovers)) * center_point_array[4];
104 #include<immintrin.h>
106 static inline void volk_32f_x3_sum_of_poly_32f_a_avx(
float* target,
float* src0,
float* center_point_array,
float* cutoff,
unsigned int num_points)
108 const unsigned int eighth_points = num_points / 8;
114 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
116 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
118 cpa0 = _mm256_set1_ps(center_point_array[0]);
119 cpa1 = _mm256_set1_ps(center_point_array[1]);
120 cpa2 = _mm256_set1_ps(center_point_array[2]);
121 cpa3 = _mm256_set1_ps(center_point_array[3]);
122 cutoff_vec = _mm256_set1_ps(*cutoff);
123 target_vec = _mm256_setzero_ps();
127 for(i = 0; i < eighth_points; ++i) {
128 x_to_1 = _mm256_load_ps(src0);
129 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
130 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);
131 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);
133 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);
135 x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
136 x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
137 x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
138 x_to_4 = _mm256_mul_ps(x_to_4, cpa3);
140 x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
141 x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
143 target_vec = _mm256_add_ps(x_to_1, target_vec);
144 target_vec = _mm256_add_ps(x_to_3, target_vec);
151 target_vec = _mm256_hadd_ps(target_vec, target_vec);
152 _mm256_store_ps(temp_results, target_vec);
153 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
156 for(i = eighth_points*8; i < num_points; ++i) {
158 fst =
MAX(fst, *cutoff);
163 *target += (center_point_array[0] * fst +
164 center_point_array[1] * sq +
165 center_point_array[2] * thrd +
166 center_point_array[3] * frth);
169 *target += ((float)(num_points)) * center_point_array[4];
172 #endif // LV_HAVE_AVX
175 #ifdef LV_HAVE_GENERIC
/*!
 * Portable reference implementation: evaluate
 *   sum over i of p(max(src0[i], *cutoff)),
 * where p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 + c4 and the
 * coefficients c0..c4 come from center_point_array[0..4].
 *
 * \param target             output: accumulated scalar sum (overwritten)
 * \param src0               input vector (no alignment requirement)
 * \param center_point_array polynomial coefficients c0..c4
 * \param cutoff             lower clamp applied to every sample
 * \param num_points         number of input samples
 */
static inline void volk_32f_x3_sum_of_poly_32f_generic(
  float* target,
  float* src0,
  float* center_point_array,
  float* cutoff,
  unsigned int num_points) {

  const unsigned int num_bytes = num_points*4;

  float result = 0.0f;
  float fst, sq, thrd, frth;
  unsigned int i;

  /* Hoist the loop-invariant cutoff load; an explicit compare also
     avoids the double-evaluating MAX() macro defined above. */
  const float cut = *cutoff;

  for(i = 0; i < (num_bytes >> 2); ++i) {
    fst = *src0++;
    if(fst < cut)
      fst = cut;              /* clamp from below */
    sq   = fst * fst;         /* x^2 */
    thrd = fst * sq;          /* x^3 */
    frth = sq * sq;           /* x^4 */

    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }

  /* Constant term c4 contributes once per processed point. */
  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);

  *target = result;
}
226 #include<immintrin.h>
228 static inline void volk_32f_x3_sum_of_poly_32f_u_avx(
float* target,
float* src0,
float* center_point_array,
float* cutoff,
unsigned int num_points)
230 const unsigned int eighth_points = num_points / 8;
236 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
238 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
240 cpa0 = _mm256_set1_ps(center_point_array[0]);
241 cpa1 = _mm256_set1_ps(center_point_array[1]);
242 cpa2 = _mm256_set1_ps(center_point_array[2]);
243 cpa3 = _mm256_set1_ps(center_point_array[3]);
244 cutoff_vec = _mm256_set1_ps(*cutoff);
245 target_vec = _mm256_setzero_ps();
249 for(i = 0; i < eighth_points; ++i) {
250 x_to_1 = _mm256_loadu_ps(src0);
251 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
252 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);
253 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);
255 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);
257 x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
258 x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
259 x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
260 x_to_4 = _mm256_mul_ps(x_to_4, cpa3);
262 x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
263 x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
265 target_vec = _mm256_add_ps(x_to_1, target_vec);
266 target_vec = _mm256_add_ps(x_to_3, target_vec);
273 target_vec = _mm256_hadd_ps(target_vec, target_vec);
274 _mm256_store_ps(temp_results, target_vec);
275 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
278 for(i = eighth_points*8; i < num_points; ++i) {
280 fst =
MAX(fst, *cutoff);
285 *target += (center_point_array[0] * fst +
286 center_point_array[1] * sq +
287 center_point_array[2] * thrd +
288 center_point_array[3] * frth);
291 *target += ((float)(num_points)) * center_point_array[4];
294 #endif // LV_HAVE_AVX
297 #include <arm_neon.h>
/*!
 * NEON "horizontal" variant: for each sample, build the lane vector
 * {x, x^2, x^3, x^4}, multiply-accumulate it against the coefficient
 * vector {c0, c1, c2, c3}, and sum the four lanes at the end.
 * Computes sum over i of p(max(src0[i], *cutoff)),
 * p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 + c4.
 *
 * \param target             output scalar (overwritten)
 * \param src0               input vector
 * \param center_point_array polynomial coefficients c0..c4
 * \param cutoff             lower clamp applied to every sample
 * \param num_points         number of input samples
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_neon(
  float* __restrict target,
  float* __restrict src0,
  float* __restrict center_point_array,
  float* __restrict cutoff,
  unsigned int num_points) {

  unsigned int i;
  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
  float accumulator;

  float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x2_t cutoff_vector;
  float32x2x2_t x_low, x_high;
  float32x4_t x_qvector, c_qvector, cpa_qvector;

  float res_accumulators[4];

  c_qvector = vld1q_f32( zero );              /* per-lane accumulator */
  cutoff_vector = vdup_n_f32( *cutoff );      /* clamp broadcast */
  cpa_qvector = vld1q_f32( center_point_array ); /* {c0, c1, c2, c3} */

  for(i=0; i < num_points; ++i) {
    /* Duplicate the scalar sample into both halves of a d-register. */
    x_to_1 = vdup_n_f32( *src0++ );

    x_to_1 = vmax_f32(x_to_1, cutoff_vector ); /* clamp from below */
    x_to_2 = vmul_f32(x_to_1, x_to_1);         /* x^2 */
    x_to_3 = vmul_f32(x_to_2, x_to_1);         /* x^3 */
    x_to_4 = vmul_f32(x_to_3, x_to_1);         /* x^4 */

    /* Interleave so that lane order becomes {x, x^2, x^3, x^4}. */
    x_low = vzip_f32(x_to_1, x_to_2);
    x_high = vzip_f32(x_to_3, x_to_4);

    x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);

    /* c_qvector += {x,x^2,x^3,x^4} * {c0,c1,c2,c3}, lane-wise */
    c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
  }

  /* Horizontal sum of the four lane accumulators. */
  vst1q_f32(res_accumulators, c_qvector );
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];

  /* Constant term c4 contributes once per point. */
  *target = accumulator + center_point_array[4] * (float)num_points;
}
/*!
 * NEON "vertical" variant: process four samples per iteration with one
 * quad accumulator per polynomial term, then reduce. Computes
 * sum over i of p(max(src0[i], *cutoff)),
 * p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 + c4.
 *
 * \param target             output scalar (overwritten)
 * \param src0               input vector
 * \param center_point_array polynomial coefficients c0..c4
 * \param cutoff             lower clamp applied to every sample
 * \param num_points         number of input samples
 */
static inline void volk_32f_x3_sum_of_poly_32f_neonvert(
  float* __restrict target,
  float* __restrict src0,
  float* __restrict center_point_array,
  float* __restrict cutoff,
  unsigned int num_points) {

  unsigned int i;
  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
  float accumulator;
  float res_accumulators[4];
  float fst, sq, thrd, frth;

  /* One accumulator per polynomial term keeps the four multiply-add
     chains independent across iterations. */
  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
  accumulator1_vec = vld1q_f32(zero);
  accumulator2_vec = vld1q_f32(zero);
  accumulator3_vec = vld1q_f32(zero);
  accumulator4_vec = vld1q_f32(zero);
  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

  cutoff_vector = vdupq_n_f32( *cutoff );

  /* Broadcast each coefficient across a quad register. */
  cpa_0 = vdupq_n_f32(center_point_array[0]);
  cpa_1 = vdupq_n_f32(center_point_array[1]);
  cpa_2 = vdupq_n_f32(center_point_array[2]);
  cpa_3 = vdupq_n_f32(center_point_array[3]);

  for(i=0; i < num_points/4; ++i) {
    x_to_1 = vld1q_f32( src0 );                 /* 4 samples */

    x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); /* clamp from below */
    x_to_2 = vmulq_f32(x_to_1, x_to_1);         /* x^2 */
    x_to_3 = vmulq_f32(x_to_2, x_to_1);         /* x^3 */
    x_to_4 = vmulq_f32(x_to_3, x_to_1);         /* x^4 */
    x_to_1 = vmulq_f32(x_to_1, cpa_0);          /* c0*x   */
    x_to_2 = vmulq_f32(x_to_2, cpa_1);          /* c1*x^2 */
    x_to_3 = vmulq_f32(x_to_3, cpa_2);          /* c2*x^3 */
    x_to_4 = vmulq_f32(x_to_4, cpa_3);          /* c3*x^4 */
    accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
    accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
    accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
    accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

    src0 += 4;
  }

  /* Fold the four term accumulators together, then reduce lanes. */
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

  vst1q_f32(res_accumulators, accumulator1_vec );
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];

  /* Scalar tail for the remaining num_points % 4 samples. */
  const float cut = *cutoff;
  for(i = 4*(num_points/4); i < num_points; ++i) {
    fst = *src0++;
    if(fst < cut)
      fst = cut;
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;

    accumulator += (center_point_array[0] * fst +
                    center_point_array[1] * sq +
                    center_point_array[2] * thrd +
                    center_point_array[3] * frth);
  }

  /* Constant term c4 contributes once per point. */
  *target = accumulator + center_point_array[4] * (float)num_points;
}
/* NOTE(review): the lines below were documentation-generator residue
   ("Definition: ..." cross-references), not code. For reference:
   MAX(X, Y) is defined near the top of this header, and
   __VOLK_ATTR_ALIGNED(x) is provided by volk_common.h. */