GNU Radio 3.7.3 C++ API
volk_32fc_index_max_16u.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
2 #define INCLUDED_volk_32fc_index_max_16u_a_H
3 
4 #include <volk/volk_common.h>
5 #include<inttypes.h>
6 #include<stdio.h>
7 #include<volk/volk_complex.h>
8 
9 #ifdef LV_HAVE_SSE3
10 #include<xmmintrin.h>
11 #include<pmmintrin.h>
12 
13 
14 static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) {
15 
16  const unsigned int num_bytes = num_points*8;
17 
18  union bit128 holderf;
19  union bit128 holderi;
20  float sq_dist = 0.0;
21 
22 
23 
24 
25  union bit128 xmm5, xmm4;
26  __m128 xmm1, xmm2, xmm3;
27  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
28 
29  xmm5.int_vec = xmmfive = _mm_setzero_si128();
30  xmm4.int_vec = xmmfour = _mm_setzero_si128();
31  holderf.int_vec = holder0 = _mm_setzero_si128();
32  holderi.int_vec = holder1 = _mm_setzero_si128();
33 
34 
35  int bound = num_bytes >> 5;
36  int leftovers0 = (num_bytes >> 4) & 1;
37  int leftovers1 = (num_bytes >> 3) & 1;
38  int i = 0;
39 
40 
41  xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
42  xmm9 = xmm8 = _mm_setzero_si128();
43  xmm10 = _mm_set_epi32(4, 4, 4, 4);
44  xmm3 = _mm_setzero_ps();
45 ;
46 
47  //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
48 
49  for(; i < bound; ++i) {
50 
51  xmm1 = _mm_load_ps((float*)src0);
52  xmm2 = _mm_load_ps((float*)&src0[2]);
53 
54 
55  src0 += 4;
56 
57 
58  xmm1 = _mm_mul_ps(xmm1, xmm1);
59  xmm2 = _mm_mul_ps(xmm2, xmm2);
60 
61 
62  xmm1 = _mm_hadd_ps(xmm1, xmm2);
63 
64  xmm3 = _mm_max_ps(xmm1, xmm3);
65 
66  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
67  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
68 
69 
70 
71  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
72  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
73 
74  xmm9 = _mm_add_epi32(xmm11, xmm12);
75 
76  xmm8 = _mm_add_epi32(xmm8, xmm10);
77 
78 
79  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
80  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
81 
82  }
83 
84 
85  for(i = 0; i < leftovers0; ++i) {
86 
87 
88  xmm2 = _mm_load_ps((float*)src0);
89 
90  xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
91  xmm8 = bit128_p(&xmm1)->int_vec;
92 
93  xmm2 = _mm_mul_ps(xmm2, xmm2);
94 
95  src0 += 2;
96 
97  xmm1 = _mm_hadd_ps(xmm2, xmm2);
98 
99  xmm3 = _mm_max_ps(xmm1, xmm3);
100 
101  xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
102 
103 
104  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
105  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
106 
107 
108 
109  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
110  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
111 
112  xmm9 = _mm_add_epi32(xmm11, xmm12);
113 
114  xmm8 = _mm_add_epi32(xmm8, xmm10);
115  //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
116 
117  }
118 
119 
120 
121 
122  for(i = 0; i < leftovers1; ++i) {
123  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
124 
125 
126  sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
127 
128  xmm2 = _mm_load1_ps(&sq_dist);
129 
130  xmm1 = xmm3;
131 
132  xmm3 = _mm_max_ss(xmm3, xmm2);
133 
134 
135 
136  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
137  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
138 
139 
140  xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
141 
142  xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
143  xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
144 
145 
146  xmm9 = _mm_add_epi32(xmm11, xmm12);
147 
148  }
149 
150  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
151 
152  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
153 
154  _mm_store_ps((float*)&(holderf.f), xmm3);
155  _mm_store_si128(&(holderi.int_vec), xmm9);
156 
157  target[0] = holderi.i[0];
158  sq_dist = holderf.f[0];
159  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
160  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
161  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
162  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
163  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
164  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
165 
166 
167 
168  /*
169  float placeholder = 0.0;
170  uint32_t temp0, temp1;
171  unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
172  unsigned int l0 = g0 ^ 1;
173 
174  unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
175  unsigned int l1 = g1 ^ 1;
176 
177  temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
178  temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
179  sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
180  placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
181 
182  g0 = (sq_dist > placeholder);
183  l0 = g0 ^ 1;
184  target[0] = g0 * temp0 + l0 * temp1;
185  */
186 
187 }
188 
189 #endif /*LV_HAVE_SSE3*/
190 
191 #ifdef LV_HAVE_GENERIC
192 static inline void volk_32fc_index_max_16u_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) {
193 
194  const unsigned int num_bytes = num_points*8;
195 
196  float sq_dist = 0.0;
197  float max = 0.0;
198  unsigned int index = 0;
199 
200  unsigned int i = 0;
201 
202  for(; i < num_bytes >> 3; ++i) {
203 
204  sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
205 
206  index = sq_dist > max ? i : index;
207  max = sq_dist > max ? sq_dist : max;
208 
209 
210  }
211  target[0] = index;
212 
213 }
214 
215 #endif /*LV_HAVE_GENERIC*/
216 
217 
218 #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
#define bit128_p(x)
Definition: volk_common.h:94
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
union bit128
Definition: volk_common.h:78
#define lv_cimag(x)
Definition: volk_complex.h:78
uint32_t i[4]
Definition: volk_common.h:80