#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
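
/*
 * Overview: each kernel below selects, element by element, the largest of
 * four 16-bit input vectors:
 *
 *     target[i] = max( max(src0[i], src1[i]), max(src2[i], src3[i]) )
 *
 * The maxima are formed from the sign of the pairwise differences, so the
 * inputs are assumed to stay in a range where (a - b) does not overflow a
 * signed 16-bit value.
 */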

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_16i_x4_quad_max_star_16i_a_sse2(
short* target,
short* src0,
short* src1,
short* src2,
short* src3,
unsigned int num_points) {
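    /*
     * Vector path: for each block of 8 shorts, build max(src0, src1) and
     * max(src2, src3) with a compare/mask/blend sequence, then blend those
     * two results the same way. The scalar loop at the end handles any
     * samples left over when num_points is not a multiple of 8.
     */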
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = (num_bytes >> 4);         /* full 16-byte (8-sample) blocks */
    int bound_copy = bound;
    int leftovers = (num_bytes >> 1) & 7; /* samples left for the scalar tail */
    __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
    p_target = (__m128i*)target;
    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;
    __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

    while (bound_copy > 0) {
        /* load 8 shorts from each source */
        xmm1 = _mm_load_si128(p_src0);
        xmm2 = _mm_load_si128(p_src1);
        xmm3 = _mm_load_si128(p_src2);
        xmm4 = _mm_load_si128(p_src3);

        xmm5 = _mm_setzero_si128();
        xmm6 = _mm_setzero_si128();
        xmm7 = xmm1; /* keep src0 for the blend below */
        xmm8 = xmm3; /* keep src2 for the blend below */

        xmm1 = _mm_sub_epi16(xmm2, xmm1); /* src1 - src0 */

        xmm3 = _mm_sub_epi16(xmm4, xmm3); /* src3 - src2 */
        /* mask = 0xFFFF in lanes where the second source is larger */
        xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
        xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);

        /* blend: take src1/src3 where the mask is set, src0/src2 otherwise */
        xmm2 = _mm_and_si128(xmm5, xmm2);
        xmm4 = _mm_and_si128(xmm6, xmm4);
        xmm5 = _mm_andnot_si128(xmm5, xmm7);
        xmm6 = _mm_andnot_si128(xmm6, xmm8);

        xmm5 = _mm_add_epi16(xmm2, xmm5); /* max(src0, src1) */
        xmm6 = _mm_add_epi16(xmm4, xmm6); /* max(src2, src3) */
        xmm1 = _mm_xor_si128(xmm1, xmm1); /* zero */
        xmm2 = xmm5;                      /* keep max(src0, src1) for the blend */
        xmm5 = _mm_sub_epi16(xmm6, xmm5);

        xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);

        /* blend the two pairwise maxima into the final result */
        xmm6 = _mm_and_si128(xmm1, xmm6);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);

        xmm1 = _mm_add_epi16(xmm6, xmm1);

        _mm_store_si128(p_target, xmm1);

        p_src0 += 1;
        p_src1 += 1;
        p_src2 += 1;
        p_src3 += 1;
        p_target += 1;

        bound_copy -= 1;
    }

    short temp0 = 0;
    short temp1 = 0;
    /* scalar tail for the remaining samples */
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}
#endif /* LV_HAVE_SSE2 */
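
/*
 * Usage sketch (illustrative only; every name other than the kernel and the
 * VOLK allocation helpers is the caller's choice): the _a variant expects
 * 16-byte-aligned buffers, e.g. obtained from volk_malloc().
 *
 *   unsigned int n = 1024;
 *   short* out = (short*)volk_malloc(n * sizeof(short), volk_get_alignment());
 *   short* in0 = (short*)volk_malloc(n * sizeof(short), volk_get_alignment());
 *   // ... likewise in1, in2, in3, then fill the inputs ...
 *   volk_16i_x4_quad_max_star_16i_a_sse2(out, in0, in1, in2, in3, n);
 *   volk_free(out);
 */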

#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_16i_x4_quad_max_star_16i_neon(
short* target,
short* src0,
short* src1,
short* src2,
short* src3,
unsigned int num_points) {
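    /*
     * Same idea as the SSE2 path: two select-by-mask steps built from
     * complementary >= / < comparisons of the pairwise differences against
     * zero, followed by a scalar tail for the leftover samples.
     */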
    const unsigned int eighth_points = num_points / 8;
    unsigned int i = 0;
    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
    int16x8_t diff12, diff34;
    int16x8_t comp0, comp1, comp2, comp3;
    int16x8_t result1_vec, result2_vec;
    const int16x8_t zeros = vdupq_n_s16(0);
    for (i = 0; i < eighth_points; ++i) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        diff12 = vsubq_s16(src0_vec, src1_vec);
        diff34 = vsubq_s16(src2_vec, src3_vec);
        /* complementary masks: src0 >= src1 and src0 < src1 (likewise src2/src3) */
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
        comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
        /* select the larger element of each pair */
        comp0 = vandq_s16(src0_vec, comp0);
        comp1 = vandq_s16(src1_vec, comp1);
        comp2 = vandq_s16(src2_vec, comp2);
        comp3 = vandq_s16(src3_vec, comp3);

        result1_vec = vaddq_s16(comp0, comp1); /* max(src0, src1) */
        result2_vec = vaddq_s16(comp2, comp3); /* max(src2, src3) */
        /* pick the larger of the two pairwise maxima */
        diff12 = vsubq_s16(result1_vec, result2_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp0 = vandq_s16(result1_vec, comp0);
        comp1 = vandq_s16(result2_vec, comp1);
        result1_vec = vaddq_s16(comp0, comp1);
        vst1q_s16(target, result1_vec);

        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    short temp0 = 0;
    short temp1 = 0;
    /* scalar tail for the remaining samples */
    for (i = eighth_points * 8; i < num_points; ++i) {
        temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_GENERIC

static inline void volk_16i_x4_quad_max_star_16i_generic(
short* target,
short* src0,
short* src1,
short* src2,
short* src3,
unsigned int num_points) {
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = num_bytes >> 1; /* number of 16-bit samples */

    short temp0 = 0;
    short temp1 = 0;
    for (i = 0; i < bound; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_16i_x4_quad_max_star_16i_a_H */