GNU Radio Manual and C++ API Reference  3.7.5.1
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
volk_16i_x4_quad_max_star_16i.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
2 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
3 
4 
5 #include<inttypes.h>
6 #include<stdio.h>
7 
8 
9 
10 
11 
12 #ifdef LV_HAVE_SSE2
13 
14 #include<emmintrin.h>
15 
/*
 * SSE2 kernel: for every index i, reduce src0[i], src1[i], src2[i], src3[i]
 * to a single value with a pairwise "wrap-around" maximum and store it in
 * target[i].
 *
 * The pairwise rule is: pick(a, b) = a if the 16-bit wrapped difference
 * (a - b) is > 0, otherwise b.  The result is
 *     pick(pick(src0, src1), pick(src2, src3)).
 *
 * target      - output buffer, at least num_points elements
 * src0..src3  - input buffers, at least num_points elements each
 * num_points  - number of 16-bit elements to process
 *
 * The vector loop uses aligned loads/stores (_mm_load_si128 /
 * _mm_store_si128), so whenever num_points >= 8 all five pointers must be
 * 16-byte aligned.
 *
 * The vector lanes and the scalar tail apply exactly the same rule, including
 * the case where the wrapped difference equals -32768 (then b is picked).
 */
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
    /* Number of full 8-lane vectors; the remaining (num_points % 8) elements
       are handled by the scalar loop at the end. */
    const unsigned int eighth_points = num_points / 8;
    unsigned int i;

    const __m128i zero = _mm_setzero_si128();

    const __m128i *p_src0 = (const __m128i*)src0;
    const __m128i *p_src1 = (const __m128i*)src1;
    const __m128i *p_src2 = (const __m128i*)src2;
    const __m128i *p_src3 = (const __m128i*)src3;
    __m128i *p_target = (__m128i*)target;

    for (i = 0; i < eighth_points; ++i) {
        const __m128i a = _mm_load_si128(p_src0++);
        const __m128i b = _mm_load_si128(p_src1++);
        const __m128i c = _mm_load_si128(p_src2++);
        const __m128i d = _mm_load_si128(p_src3++);

        /* Lane mask is all-ones where the wrapped difference (first - second)
           is strictly positive, i.e. where the first operand is selected. */
        const __m128i a_wins = _mm_cmpgt_epi16(_mm_sub_epi16(a, b), zero);
        const __m128i c_wins = _mm_cmpgt_epi16(_mm_sub_epi16(c, d), zero);

        /* Branch-free select: (mask & first) | (~mask & second). */
        const __m128i max_ab = _mm_or_si128(_mm_and_si128(a_wins, a),
                                            _mm_andnot_si128(a_wins, b));
        const __m128i max_cd = _mm_or_si128(_mm_and_si128(c_wins, c),
                                            _mm_andnot_si128(c_wins, d));

        const __m128i ab_wins = _mm_cmpgt_epi16(_mm_sub_epi16(max_ab, max_cd), zero);
        const __m128i result = _mm_or_si128(_mm_and_si128(ab_wins, max_ab),
                                            _mm_andnot_si128(ab_wins, max_cd));

        _mm_store_si128(p_target++, result);
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        const short max01 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short max23 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(max01 - max23) > 0) ? max01 : max23;
    }
}
165 
166 #endif /*LV_HAVE_SSE2*/
167 
168 #ifdef LV_HAVE_NEON
169 #include <arm_neon.h>
/*
 * NEON kernel: for every index i, reduce src0[i], src1[i], src2[i], src3[i]
 * to a single value with a pairwise "wrap-around" maximum and store it in
 * target[i].
 *
 * Vector lanes pick the first operand when the 16-bit wrapped difference
 * (first - second) is >= 0 and the second operand otherwise; the scalar tail
 * picks the first operand when the difference is > 0.  Both give the same
 * value, because a zero difference means the two operands are equal.
 *
 * target      - output buffer, at least num_points elements
 * src0..src3  - input buffers, at least num_points elements each
 * num_points  - number of 16-bit elements to process
 */
static inline void volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
    const unsigned int eighth_points = num_points / 8;
    unsigned int i;

    /* Explicit zero vector; XOR-ing an uninitialized register with itself
       would read an indeterminate value. */
    const int16x8_t zeros = vdupq_n_s16(0);

    for (i = 0; i < eighth_points; ++i) {
        const int16x8_t src0_vec = vld1q_s16(src0);
        const int16x8_t src1_vec = vld1q_s16(src1);
        const int16x8_t src2_vec = vld1q_s16(src2);
        const int16x8_t src3_vec = vld1q_s16(src3);

        /* Masks are all-ones in lanes where (first - second) >= 0. */
        const uint16x8_t first_wins01 = vcgeq_s16(vsubq_s16(src0_vec, src1_vec), zeros);
        const uint16x8_t first_wins23 = vcgeq_s16(vsubq_s16(src2_vec, src3_vec), zeros);

        /* Bitwise select: take the first operand where the mask is set. */
        const int16x8_t max01 = vbslq_s16(first_wins01, src0_vec, src1_vec);
        const int16x8_t max23 = vbslq_s16(first_wins23, src2_vec, src3_vec);

        const uint16x8_t first_wins = vcgeq_s16(vsubq_s16(max01, max23), zeros);
        vst1q_s16(target, vbslq_s16(first_wins, max01, max23));

        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        const short max01 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        const short max23 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(max01 - max23) > 0) ? max01 : max23;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
226 #endif /* LV_HAVE_NEON */
227 
228 
229 #ifdef LV_HAVE_GENERIC
/*
 * Portable reference kernel: for every index i, reduce src0[i], src1[i],
 * src2[i], src3[i] to a single value with a pairwise "wrap-around" maximum
 * and store it in target[i].
 *
 * The pairwise rule is: pick(a, b) = a if (short)(a - b) > 0, otherwise b.
 * The subtraction is done in int and narrowed back to short, so the
 * comparison is made on the difference wrapped to 16 bits (this relies on
 * the compiler's wrapping int-to-short conversion).  The result is
 *     pick(pick(src0, src1), pick(src2, src3)).
 *
 * target      - output buffer, at least num_points elements
 * src0..src3  - input buffers, at least num_points elements each
 * num_points  - number of 16-bit elements to process
 */
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
    unsigned int i;

    /* Iterate over the element count directly; converting to a byte count
       and back would wrap for very large num_points. */
    for (i = 0; i < num_points; ++i) {
        const short max01 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short max23 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(max01 - max23) > 0) ? max01 : max23;
    }
}
246 
247 
248 
249 
250 #endif /*LV_HAVE_GENERIC*/
251 
252 #endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/