GNU Radio Manual and C++ API Reference  3.7.5.1
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
volk_32f_x3_sum_of_poly_32f.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
2 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
3 
4 #include<inttypes.h>
5 #include<stdio.h>
6 #include<volk/volk_complex.h>
7 
8 #ifndef MAX
9 #define MAX(X,Y) ((X) > (Y)?(X):(Y))
10 #endif
11 
12 #ifdef LV_HAVE_SSE3
13 #include<xmmintrin.h>
14 #include<pmmintrin.h>
15 
16 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {
17 
18  const unsigned int num_bytes = num_points*4;
19 
20  float result = 0.0;
21  float fst = 0.0;
22  float sq = 0.0;
23  float thrd = 0.0;
24  float frth = 0.0;
25  //float fith = 0.0;
26 
27 
28 
29  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
30 
31  xmm9 = _mm_setzero_ps();
32  xmm1 = _mm_setzero_ps();
33 
34  xmm0 = _mm_load1_ps(&center_point_array[0]);
35  xmm6 = _mm_load1_ps(&center_point_array[1]);
36  xmm7 = _mm_load1_ps(&center_point_array[2]);
37  xmm8 = _mm_load1_ps(&center_point_array[3]);
38  //xmm11 = _mm_load1_ps(&center_point_array[4]);
39  xmm10 = _mm_load1_ps(cutoff);
40 
41  int bound = num_bytes >> 4;
42  int leftovers = (num_bytes >> 2) & 3;
43  int i = 0;
44 
45  for(; i < bound; ++i) {
46  xmm2 = _mm_load_ps(src0);
47  xmm2 = _mm_max_ps(xmm10, xmm2);
48  xmm3 = _mm_mul_ps(xmm2, xmm2);
49  xmm4 = _mm_mul_ps(xmm2, xmm3);
50  xmm5 = _mm_mul_ps(xmm3, xmm3);
51  //xmm12 = _mm_mul_ps(xmm3, xmm4);
52 
53  xmm2 = _mm_mul_ps(xmm2, xmm0);
54  xmm3 = _mm_mul_ps(xmm3, xmm6);
55  xmm4 = _mm_mul_ps(xmm4, xmm7);
56  xmm5 = _mm_mul_ps(xmm5, xmm8);
57  //xmm12 = _mm_mul_ps(xmm12, xmm11);
58 
59  xmm2 = _mm_add_ps(xmm2, xmm3);
60  xmm3 = _mm_add_ps(xmm4, xmm5);
61 
62  src0 += 4;
63 
64  xmm9 = _mm_add_ps(xmm2, xmm9);
65 
66  xmm1 = _mm_add_ps(xmm3, xmm1);
67 
68  //xmm9 = _mm_add_ps(xmm12, xmm9);
69  }
70 
71  xmm2 = _mm_hadd_ps(xmm9, xmm1);
72  xmm3 = _mm_hadd_ps(xmm2, xmm2);
73  xmm4 = _mm_hadd_ps(xmm3, xmm3);
74 
75  _mm_store_ss(&result, xmm4);
76 
77 
78 
79  for(i = 0; i < leftovers; ++i) {
80  fst = src0[i];
81  fst = MAX(fst, *cutoff);
82  sq = fst * fst;
83  thrd = fst * sq;
84  frth = sq * sq;
85  //fith = sq * thrd;
86 
87  result += (center_point_array[0] * fst +
88  center_point_array[1] * sq +
89  center_point_array[2] * thrd +
90  center_point_array[3] * frth);// +
91  //center_point_array[4] * fith);
92  }
93 
94  result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
95 
96  target[0] = result;
97 }
98 
99 
100 #endif /*LV_HAVE_SSE3*/
101 
102 
103 #ifdef LV_HAVE_AVX
104 #include<immintrin.h>
105 
106 static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
107 {
108  const unsigned int eighth_points = num_points / 8;
109  float fst = 0.0;
110  float sq = 0.0;
111  float thrd = 0.0;
112  float frth = 0.0;
113 
114  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
115  __m256 target_vec;
116  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
117 
118  cpa0 = _mm256_set1_ps(center_point_array[0]);
119  cpa1 = _mm256_set1_ps(center_point_array[1]);
120  cpa2 = _mm256_set1_ps(center_point_array[2]);
121  cpa3 = _mm256_set1_ps(center_point_array[3]);
122  cutoff_vec = _mm256_set1_ps(*cutoff);
123  target_vec = _mm256_setzero_ps();
124 
125  unsigned int i;
126 
127  for(i = 0; i < eighth_points; ++i) {
128  x_to_1 = _mm256_load_ps(src0);
129  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
130  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
131  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
132  // x^1 * x^3 is slightly faster than x^2 * x^2
133  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
134 
135  x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
136  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
137  x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
138  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
139 
140  x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
141  x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
142  // this is slightly faster than result += (x_to_1 + x_to_3)
143  target_vec = _mm256_add_ps(x_to_1, target_vec);
144  target_vec = _mm256_add_ps(x_to_3, target_vec);
145 
146  src0 += 8;
147  }
148 
149  // the hadd for vector reduction has very very slight impact @ 50k iters
150  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
151  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
152  _mm256_store_ps(temp_results, target_vec);
153  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
154 
155 
156  for(i = eighth_points*8; i < num_points; ++i) {
157  fst = *(src0++);
158  fst = MAX(fst, *cutoff);
159  sq = fst * fst;
160  thrd = fst * sq;
161  frth = sq * sq;
162 
163  *target += (center_point_array[0] * fst +
164  center_point_array[1] * sq +
165  center_point_array[2] * thrd +
166  center_point_array[3] * frth);
167  }
168 
169  *target += ((float)(num_points)) * center_point_array[4];
170 
171 }
172 #endif // LV_HAVE_AVX
173 
174 
175 #ifdef LV_HAVE_GENERIC
176 
177 static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {
178 
179  const unsigned int num_bytes = num_points*4;
180 
181  float result = 0.0;
182  float fst = 0.0;
183  float sq = 0.0;
184  float thrd = 0.0;
185  float frth = 0.0;
186  //float fith = 0.0;
187 
188 
189 
190  unsigned int i = 0;
191 
192  for(; i < num_bytes >> 2; ++i) {
193  fst = src0[i];
194  fst = MAX(fst, *cutoff);
195 
196  sq = fst * fst;
197  thrd = fst * sq;
198  frth = sq * sq;
199  //fith = sq * thrd;
200 
201  result += (center_point_array[0] * fst +
202  center_point_array[1] * sq +
203  center_point_array[2] * thrd +
204  center_point_array[3] * frth); //+
205  //center_point_array[4] * fith);
206  /*printf("%f12...%d\n", (center_point_array[0] * fst +
207  center_point_array[1] * sq +
208  center_point_array[2] * thrd +
209  center_point_array[3] * frth) +
210  //center_point_array[4] * fith) +
211  (center_point_array[4]), i);
212  */
213  }
214 
215  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
216 
217 
218 
219  *target = result;
220 }
221 
222 #endif /*LV_HAVE_GENERIC*/
223 
224 
225 #ifdef LV_HAVE_AVX
226 #include<immintrin.h>
227 
228 static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
229 {
230  const unsigned int eighth_points = num_points / 8;
231  float fst = 0.0;
232  float sq = 0.0;
233  float thrd = 0.0;
234  float frth = 0.0;
235 
236  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
237  __m256 target_vec;
238  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
239 
240  cpa0 = _mm256_set1_ps(center_point_array[0]);
241  cpa1 = _mm256_set1_ps(center_point_array[1]);
242  cpa2 = _mm256_set1_ps(center_point_array[2]);
243  cpa3 = _mm256_set1_ps(center_point_array[3]);
244  cutoff_vec = _mm256_set1_ps(*cutoff);
245  target_vec = _mm256_setzero_ps();
246 
247  unsigned int i;
248 
249  for(i = 0; i < eighth_points; ++i) {
250  x_to_1 = _mm256_loadu_ps(src0);
251  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
252  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
253  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
254  // x^1 * x^3 is slightly faster than x^2 * x^2
255  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
256 
257  x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
258  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
259  x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
260  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
261 
262  x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
263  x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
264  // this is slightly faster than result += (x_to_1 + x_to_3)
265  target_vec = _mm256_add_ps(x_to_1, target_vec);
266  target_vec = _mm256_add_ps(x_to_3, target_vec);
267 
268  src0 += 8;
269  }
270 
271  // the hadd for vector reduction has very very slight impact @ 50k iters
272  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
273  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
274  _mm256_store_ps(temp_results, target_vec);
275  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
276 
277 
278  for(i = eighth_points*8; i < num_points; ++i) {
279  fst = *(src0++);
280  fst = MAX(fst, *cutoff);
281  sq = fst * fst;
282  thrd = fst * sq;
283  frth = sq * sq;
284 
285  *target += (center_point_array[0] * fst +
286  center_point_array[1] * sq +
287  center_point_array[2] * thrd +
288  center_point_array[3] * frth);
289  }
290 
291  *target += ((float)(num_points)) * center_point_array[4];
292 
293 }
294 #endif // LV_HAVE_AVX
295 
296 #ifdef LV_HAVE_NEON
297 #include <arm_neon.h>
298 
299 static inline void volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0, float* __restrict center_point_array, float* __restrict cutoff, unsigned int num_points) {
300 
301 
302  unsigned int i;
303  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
304 
305  float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
306  float32x2_t cutoff_vector;
307  float32x2x2_t x_low, x_high;
308  float32x4_t x_qvector, c_qvector, cpa_qvector;
309  float accumulator;
310  float res_accumulators[4];
311 
312  c_qvector = vld1q_f32( zero );
313  // load the cutoff in to a vector
314  cutoff_vector = vdup_n_f32( *cutoff );
315  // ... center point array
316  cpa_qvector = vld1q_f32( center_point_array );
317 
318  for(i=0; i < num_points; ++i) {
319  // load x (src0)
320  x_to_1 = vdup_n_f32( *src0++ );
321 
322  // Get a vector of max(src0, cutoff)
323  x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1
324  x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
325  x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
326  x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
327  // zip up doubles to interleave
328  x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
329  x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
330  // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
331  x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
332  // now we finally have [x^4 | x^3 | x^2 | x] !
333 
334  c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
335 
336  }
337  // there should be better vector reduction techniques
338  vst1q_f32(res_accumulators, c_qvector );
339  accumulator = res_accumulators[0] + res_accumulators[1] +
340  res_accumulators[2] + res_accumulators[3];
341 
342  *target = accumulator + center_point_array[4] * (float)num_points;
343 }
344 
345 #endif /* LV_HAVE_NEON */
346 
347 #ifdef LV_HAVE_NEON
348 
349 static inline void volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0, float* __restrict center_point_array, float* __restrict cutoff, unsigned int num_points) {
350 
351 
352  unsigned int i;
353  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
354 
355  float accumulator;
356 
357 
358  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
359  accumulator1_vec = vld1q_f32(zero);
360  accumulator2_vec = vld1q_f32(zero);
361  accumulator3_vec = vld1q_f32(zero);
362  accumulator4_vec = vld1q_f32(zero);
363  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
364  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
365 
366  // load the cutoff in to a vector
367  cutoff_vector = vdupq_n_f32( *cutoff );
368  // ... center point array
369  cpa_0 = vdupq_n_f32(center_point_array[0]);
370  cpa_1 = vdupq_n_f32(center_point_array[1]);
371  cpa_2 = vdupq_n_f32(center_point_array[2]);
372  cpa_3 = vdupq_n_f32(center_point_array[3]);
373 
374 
375  // nathan is not sure why this is slower *and* wrong compared to neonvertfma
376  for(i=0; i < num_points/4; ++i) {
377  // load x
378  x_to_1 = vld1q_f32( src0 );
379 
380  // Get a vector of max(src0, cutoff)
381  x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1
382  x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
383  x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
384  x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
385  x_to_1 = vmulq_f32(x_to_1, cpa_0);
386  x_to_2 = vmulq_f32(x_to_2, cpa_1);
387  x_to_3 = vmulq_f32(x_to_3, cpa_2);
388  x_to_4 = vmulq_f32(x_to_4, cpa_3);
389  accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
390  accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
391  accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
392  accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
393 
394  src0 += 4;
395  }
396  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
397  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
398  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
399 
400  __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
401  vst1q_f32(res_accumulators, accumulator1_vec );
402  accumulator = res_accumulators[0] + res_accumulators[1] +
403  res_accumulators[2] + res_accumulators[3];
404 
405  float fst = 0.0;
406  float sq = 0.0;
407  float thrd = 0.0;
408  float frth = 0.0;
409 
410  for(i = 4*num_points/4; i < num_points; ++i) {
411  fst = src0[i];
412  fst = MAX(fst, *cutoff);
413 
414  sq = fst * fst;
415  thrd = fst * sq;
416  frth = sq * sq;
417  //fith = sq * thrd;
418 
419  accumulator += (center_point_array[0] * fst +
420  center_point_array[1] * sq +
421  center_point_array[2] * thrd +
422  center_point_array[3] * frth); //+
423  }
424 
425  *target = accumulator + center_point_array[4] * (float)num_points;
426 }
427 
428 #endif /* LV_HAVE_NEON */
429 
430 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
#define MAX(X, Y)
Definition: volk_32f_x3_sum_of_poly_32f.h:9
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27