GNU Radio 3.7.2 C++ API
volk_32f_x3_sum_of_poly_32f.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
2 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
3 
4 #include<inttypes.h>
5 #include<stdio.h>
6 #include<volk/volk_complex.h>
7 
/*
 * MAX(X,Y): evaluates to the larger of X and Y (Y when they compare equal
 * or when the comparison X > Y is false).
 * Function-like macro: each argument may be evaluated twice, so do not pass
 * expressions with side effects (e.g. MAX(i++, j)).
 * Only defined here when no earlier header has already provided MAX.
 */
#ifndef MAX
#define MAX(X,Y) ((X) > (Y)?(X):(Y))
#endif
11 
12 #ifdef LV_HAVE_SSE3
13 #include<xmmintrin.h>
14 #include<pmmintrin.h>
15 
/*!
 * \brief Sums a clamped fourth-order polynomial over all input samples (SSE3).
 *
 * Every sample x read from src0 is first raised to at least *cutoff. The
 * kernel then accumulates
 *
 *     c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4
 *
 * over all samples, adds num_points * c[4], and stores the scalar total in
 * target[0], where c is center_point_array.
 *
 * \param target             output; only target[0] is written
 * \param src0               input samples; loaded with _mm_load_ps, so the
 *                           buffer must be 16-byte aligned
 * \param center_point_array coefficients; elements 0..4 are read
 * \param cutoff             pointer to the lower clamp applied to each sample
 * \param num_points         number of floats in src0
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {

  /*
   * Counts are taken from num_points directly. Going through a byte count
   * (num_points * 4) wraps in unsigned arithmetic once num_points reaches
   * 2^30 and would silently shorten the run.
   */
  const unsigned int quad_count = num_points / 4; /* full 4-float vectors   */
  const unsigned int tail_count = num_points & 3; /* samples left for scalar */
  unsigned int i;

  float result = 0.0f;

  /* Coefficients and clamp value broadcast to all four lanes. */
  const __m128 coef1 = _mm_load1_ps(&center_point_array[0]);
  const __m128 coef2 = _mm_load1_ps(&center_point_array[1]);
  const __m128 coef3 = _mm_load1_ps(&center_point_array[2]);
  const __m128 coef4 = _mm_load1_ps(&center_point_array[3]);
  const __m128 floor_v = _mm_load1_ps(cutoff);

  /* Two independent per-lane accumulators: low-order and high-order terms. */
  __m128 acc_low = _mm_setzero_ps();  /* c[0]*x   + c[1]*x^2 */
  __m128 acc_high = _mm_setzero_ps(); /* c[2]*x^3 + c[3]*x^4 */

  for(i = 0; i < quad_count; ++i) {
    const __m128 x = _mm_max_ps(floor_v, _mm_load_ps(src0));
    const __m128 x2 = _mm_mul_ps(x, x);
    const __m128 x3 = _mm_mul_ps(x, x2);
    const __m128 x4 = _mm_mul_ps(x2, x2);

    const __m128 low = _mm_add_ps(_mm_mul_ps(x, coef1), _mm_mul_ps(x2, coef2));
    const __m128 high = _mm_add_ps(_mm_mul_ps(x3, coef3), _mm_mul_ps(x4, coef4));

    acc_low = _mm_add_ps(low, acc_low);
    acc_high = _mm_add_ps(high, acc_high);

    src0 += 4;
  }

  /* Three horizontal adds collapse both accumulators (8 lanes) into lane 0. */
  {
    __m128 folded = _mm_hadd_ps(acc_low, acc_high);
    folded = _mm_hadd_ps(folded, folded);
    folded = _mm_hadd_ps(folded, folded);
    _mm_store_ss(&result, folded);
  }

  /* Scalar tail: src0 already points just past the last full vector. */
  for(i = 0; i < tail_count; ++i) {
    float x = src0[i];
    float x2, x3, x4;

    x = (x > *cutoff) ? x : *cutoff;
    x2 = x * x;
    x3 = x * x2;
    x4 = x2 * x2;

    result += (center_point_array[0] * x +
               center_point_array[1] * x2 +
               center_point_array[2] * x3 +
               center_point_array[3] * x4);
  }

  /* Constant term, added once per input sample. */
  result += ((float)num_points) * center_point_array[4];

  target[0] = result;
}
98 
99 
100 #endif /*LV_HAVE_SSE3*/
101 
102 #ifdef LV_HAVE_GENERIC
103 
/*!
 * \brief Sums a clamped fourth-order polynomial over all input samples
 *        (portable scalar implementation).
 *
 * Every sample x read from src0 is first raised to at least *cutoff. The
 * kernel then accumulates
 *
 *     c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4
 *
 * over all samples, adds num_points * c[4], and stores the scalar total in
 * *target, where c is center_point_array.
 *
 * \param target             output; a single float is written
 * \param src0               input samples
 * \param center_point_array coefficients; elements 0..4 are read
 * \param cutoff             pointer to the lower clamp applied to each sample
 * \param num_points         number of floats in src0
 */
static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {

  float result = 0.0f;
  unsigned int i;

  /*
   * Iterate over num_points directly. Deriving the bound from a byte count
   * (num_points * 4, then >> 2) wraps in unsigned arithmetic once num_points
   * reaches 2^30 and would silently drop samples.
   */
  for(i = 0; i < num_points; ++i) {
    float x = src0[i];
    float x2, x3, x4;

    x = (x > *cutoff) ? x : *cutoff;

    x2 = x * x;
    x3 = x * x2;
    x4 = x2 * x2;

    result += (center_point_array[0] * x +
               center_point_array[1] * x2 +
               center_point_array[2] * x3 +
               center_point_array[3] * x4);
  }

  /* Constant term, added once per input sample. */
  result += ((float)num_points) * (center_point_array[4]);

  *target = result;
}
148 
149 #endif /*LV_HAVE_GENERIC*/
150 
151 
152 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
#define MAX(X, Y)
Definition: volk_32f_x3_sum_of_poly_32f.h:9