1 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
2 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
/**
 * Lane-wise selection between two vectors of eight signed 16-bit values.
 *
 * For every lane the result is `a` when the wrapped 16-bit difference
 * `a - b` is strictly positive, and `b` otherwise. This is the same rule
 * as the scalar expression `((short)(a - b) > 0) ? a : b`, so the vector
 * path and the scalar tail of the kernel below always agree, including
 * when the subtraction wraps around.
 */
static inline __m128i volk_16i_x4_quad_max_star_16i_pick_sse2(__m128i a, __m128i b)
{
    const __m128i diff = _mm_sub_epi16(a, b);
    /* All-ones in the lanes where a wins, all-zeros where b wins. */
    const __m128i take_a = _mm_cmpgt_epi16(diff, _mm_setzero_si128());

    return _mm_or_si128(_mm_and_si128(take_a, a), _mm_andnot_si128(take_a, b));
}

/**
 * Element-wise four-way selection over signed 16-bit vectors (SSE2).
 *
 * For each index i:
 *   temp0     = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]
 *   temp1     = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]
 *   target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1
 *
 * When no difference overflows 16 bits this is the maximum of the four
 * inputs; otherwise the comparison is done on the wrapped difference.
 *
 * Full groups of eight elements go through aligned 128-bit loads and
 * stores, so all five buffers must be 16-byte aligned. The remaining
 * (num_points % 8) elements are handled by a scalar loop.
 *
 * @param target     Output buffer, at least num_points elements.
 * @param src0       First input buffer.
 * @param src1       Second input buffer.
 * @param src2       Third input buffer.
 * @param src3       Fourth input buffer.
 * @param num_points Number of elements to process; 0 is a no-op.
 */
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(
    short* target,
    short* src0,
    short* src1,
    short* src2,
    short* src3,
    unsigned int num_points)
{
    /* Derived straight from the element count so that nothing can wrap,
     * unlike going through an intermediate byte count (num_points * 2). */
    const unsigned int num_blocks = num_points >> 3;
    const unsigned int vector_points = num_blocks << 3;

    __m128i* p_target = (__m128i*)target;
    const __m128i* p_src0 = (const __m128i*)src0;
    const __m128i* p_src1 = (const __m128i*)src1;
    const __m128i* p_src2 = (const __m128i*)src2;
    const __m128i* p_src3 = (const __m128i*)src3;

    unsigned int block;
    unsigned int i;

    for (block = 0; block < num_blocks; ++block) {
        const __m128i first = volk_16i_x4_quad_max_star_16i_pick_sse2(
            _mm_load_si128(p_src0 + block), _mm_load_si128(p_src1 + block));
        const __m128i second = volk_16i_x4_quad_max_star_16i_pick_sse2(
            _mm_load_si128(p_src2 + block), _mm_load_si128(p_src3 + block));

        _mm_store_si128(p_target + block,
                        volk_16i_x4_quad_max_star_16i_pick_sse2(first, second));
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    for (i = vector_points; i < num_points; ++i) {
        const short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];

        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}
169 #ifdef LV_HAVE_GENERIC
170 static inline void volk_16i_x4_quad_max_star_16i_generic(
short* target,
short* src0,
short* src1,
short* src2,
short* src3,
unsigned int num_points) {
172 const unsigned int num_bytes = num_points*2;
176 int bound = num_bytes >> 1;
180 for(i = 0; i < bound; ++i) {
181 temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
182 temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
183 target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;