1 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
2 #define INCLUDED_volk_32fc_index_max_16u_a_H
14 static inline void volk_32fc_index_max_16u_a_sse3(
unsigned int* target,
lv_32fc_t* src0,
unsigned int num_points) {
16 const unsigned int num_bytes = num_points*8;
26 __m128 xmm1, xmm2, xmm3;
27 __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
29 xmm5.int_vec = xmmfive = _mm_setzero_si128();
30 xmm4.int_vec = xmmfour = _mm_setzero_si128();
31 holderf.int_vec = holder0 = _mm_setzero_si128();
32 holderi.int_vec = holder1 = _mm_setzero_si128();
35 int bound = num_bytes >> 5;
36 int leftovers0 = (num_bytes >> 4) & 1;
37 int leftovers1 = (num_bytes >> 3) & 1;
41 xmm8 = _mm_set_epi32(3, 2, 1, 0);
42 xmm9 = xmm8 = _mm_setzero_si128();
43 xmm10 = _mm_set_epi32(4, 4, 4, 4);
44 xmm3 = _mm_setzero_ps();
49 for(; i < bound; ++
i) {
51 xmm1 = _mm_load_ps((
float*)src0);
52 xmm2 = _mm_load_ps((
float*)&src0[2]);
58 xmm1 = _mm_mul_ps(xmm1, xmm1);
59 xmm2 = _mm_mul_ps(xmm2, xmm2);
62 xmm1 = _mm_hadd_ps(xmm1, xmm2);
64 xmm3 = _mm_max_ps(xmm1, xmm3);
66 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
67 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
71 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
72 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
74 xmm9 = _mm_add_epi32(xmm11, xmm12);
76 xmm8 = _mm_add_epi32(xmm8, xmm10);
85 for(i = 0; i < leftovers0; ++
i) {
88 xmm2 = _mm_load_ps((
float*)src0);
90 xmm1 = _mm_movelh_ps(
bit128_p(&xmm8)->float_vec,
bit128_p(&xmm8)->float_vec);
93 xmm2 = _mm_mul_ps(xmm2, xmm2);
97 xmm1 = _mm_hadd_ps(xmm2, xmm2);
99 xmm3 = _mm_max_ps(xmm1, xmm3);
101 xmm10 = _mm_set_epi32(2, 2, 2, 2);
104 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
105 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
109 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
110 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
112 xmm9 = _mm_add_epi32(xmm11, xmm12);
114 xmm8 = _mm_add_epi32(xmm8, xmm10);
122 for(i = 0; i < leftovers1; ++
i) {
128 xmm2 = _mm_load1_ps(&sq_dist);
132 xmm3 = _mm_max_ss(xmm3, xmm2);
136 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
137 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
140 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
142 xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
143 xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
146 xmm9 = _mm_add_epi32(xmm11, xmm12);
154 _mm_store_ps((
float*)&(holderf.f), xmm3);
155 _mm_store_si128(&(holderi.int_vec), xmm9);
157 target[0] = holderi.i[0];
158 sq_dist = holderf.f[0];
159 target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
160 sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
161 target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
162 sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
163 target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
164 sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
191 #ifdef LV_HAVE_GENERIC
192 static inline void volk_32fc_index_max_16u_generic(
unsigned int* target,
lv_32fc_t* src0,
unsigned int num_points) {
194 const unsigned int num_bytes = num_points*8;
198 unsigned int index = 0;
202 for(; i < num_bytes >> 3; ++
i) {
206 index = sq_dist > max ? i : index;
207 max = sq_dist > max ? sq_dist : max;
#define bit128_p(x)
Definition: volk_common.h:94
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
Definition: volk_common.h:78
#define lv_cimag(x)
Definition: volk_complex.h:78
uint32_t i[4]
Definition: volk_common.h:80