#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
typedef union {
    unsigned int w[64 / 32];
    unsigned short s[64 / 16];
    unsigned char c[64 / 8];
} decision_t __attribute__((aligned(16)));
static inline void renormalize(unsigned char* X, unsigned char threshold)
{
    int NUMSTATES = 64;
    int i;

    /* Subtract the smallest path metric from all states so the 8-bit
     * metrics cannot overflow. */
    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
}
/* Scalar add-compare-select butterfly for states i and i + NUMSTATES/2 of
 * trellis stage s. */
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j, decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 2;
    int PRECISIONSHIFT = 2;

    /* Branch metric: sum of the symbol/branch distances for this stage. */
    metric = 0;
    for (j = 0; j < RATE; j++)
        metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
    metric = metric >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);
    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;
    decision0 = (signed int)(m0 - m1) > 0;
    decision1 = (signed int)(m2 - m3) > 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;
    /* Pack the two decision bits for this butterfly into the decision word
     * belonging to stage s. */
    d->w[i / (sizeof(unsigned int) * 8 / 2) + s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}
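
/*
 * Illustrative companion to BFLY() above, not part of this kernel or of the
 * VOLK API: a sketch of how a traceback routine could read back the two
 * decision bits that BFLY() packs per butterfly. The helper name is
 * hypothetical; the indexing simply mirrors the expression used in BFLY().
 */
static inline unsigned int get_decision_pair(const decision_t* d, int i, int s)
{
    /* Same addressing as in BFLY(): 16 butterflies (32 bits) per word,
     * sizeof(decision_t)/sizeof(unsigned int) words per trellis stage. */
    unsigned int word = d->w[i / (sizeof(unsigned int) * 8 / 2) +
                             s * (sizeof(decision_t) / sizeof(unsigned int))];
    /* Bit 0 is decision0 (successor state 2*i), bit 1 is decision1 (2*i+1). */
    return (word >> ((2 * i) & (sizeof(unsigned int) * 8 - 1))) & 3u;
}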

#if LV_HAVE_SSE3

#include <pmmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
/* SSE3 implementation (the _spiral suffix refers to the Spiral code
 * generator); each loop iteration processes two trellis stages, i.e. two
 * decoded bits. Only the arithmetic core of the routine appears in this
 * listing; the scalar address computations, loads, and stores between these
 * steps are omitted. */
static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83,
            *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109,
            a76, a78, a79, a82, a84, a85, a88, a89,
            a90, d10, d11, d12, d9, m23, m24, m25,
            m26, m27, m28, m29, m30, s18, s19, s22,
            s23, s24, s25, s28, s29, t13, t14, t15,
            t16, t17, t18;
        /* First stage: the old path metrics s18/s19 come from X and the two
         * received symbols a75/a81 from syms. */
        a71 = ((__m128i *) X);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i *) Branchtab);
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(a81);
        a85 = _mm_xor_si128(a82, a84);
        /* Branch metric: average the two symbol distances, scale to 6 bits. */
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i) t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i) a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        /* Add-compare-select over 16 butterflies at a time. */
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        /* Decision bits are packed with movemask and written out to dec. */
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int *) dec);
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        /* Interleave the surviving metrics and store them into Y. */
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i *) Y);
        /* Second group of 16 butterflies for this stage: the same pattern
         * applied to the other half of the old metrics (s24/s25) and the
         * remaining branch-table entries. */
        a101 = _mm_xor_si128(a76, a100);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i) t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i) a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
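        /* Conditional renormalization of Y, the vectorized counterpart of
         * renormalize() above: when the metric of state 0 exceeds 210, the
         * minimum over all 64 metrics is found with a horizontal
         * _mm_min_epu8 reduction, broadcast to every byte lane, and
         * subtracted from every state. */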
        if ((((unsigned char *) Y)[0] > 210)) {
            __m128i m5, m6, m7;
            m5 = ((__m128i *) Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]);
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 = ((__m128i) _mm_min_epu8(((__m128i) _mm_srli_epi64(m7, 32)), ((__m128i) m7)));
            m7 = ((__m128i) _mm_min_epu8(((__m128i) _mm_srli_epi64(m7, 16)), ((__m128i) m7)));
            m7 = ((__m128i) _mm_min_epu8(((__m128i) _mm_srli_epi64(m7, 8)), ((__m128i) m7)));
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6);
            ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6);
            ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6);
            ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6);
        }
        /* Second trellis stage of this iteration: metrics flow from Y back
         * into X, decisions go to the next decision word in dec. */
        unsigned char a188, a194;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210,
            *a211, *a212, *a215, *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201,
            a202, a203, a213, a214, a216, a217, a220, a221,
            a222, d17, d18, d19, d20, m39, m40, m41,
            m42, m43, m44, m45, m46, s46, s47, s50,
            s51, s52, s53, s56, s57, t25, t26, t27,
            t28, t29, t30;
        a184 = ((__m128i *) Y);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i *) Branchtab);
        a192 = _mm_xor_si128(a189, a191);
        a195 = _mm_set1_epi8(a194);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i) t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i) a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int *) dec);
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i *) X);
263 a208 = ((__m128i *) X);
273 a214 = _mm_xor_si128(a189, a213);
276 a217 = _mm_xor_si128(a195, a216);
277 t28 = _mm_avg_epu8(a214,a217);
278 a218 = ((__m128i ) t28);
279 a219 = _mm_srli_epi16(a218, 2);
280 a220 = ((__m128i ) a219);
281 t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
282 , 63, 63, 63, 63, 63, 63, 63, 63
284 t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
285 , 63, 63, 63, 63, 63, 63, 63, 63
287 m43 = _mm_adds_epu8(s52, t29);
288 m44 = _mm_adds_epu8(s53, t30);
289 m45 = _mm_adds_epu8(s52, t30);
290 m46 = _mm_adds_epu8(s53, t29);
291 a221 = _mm_min_epu8(m44, m43);
292 d19 = _mm_cmpeq_epi8(a221, m44);
293 a222 = _mm_min_epu8(m46, m45);
294 d20 = _mm_cmpeq_epi8(a222, m46);
295 s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20));
298 s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20));
301 s56 = _mm_unpacklo_epi8(a221, a222);
302 s57 = _mm_unpackhi_epi8(a221, a222);
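        /* Conditional renormalization of X, mirroring the Y block above. */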
        if ((((unsigned char *) X)[0] > 210)) {
            __m128i m12, m13, m14;
            m12 = ((__m128i *) X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]);
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i) _mm_min_epu8(((__m128i) _mm_srli_epi64(m14, 32)), ((__m128i) m14)));
            m14 = ((__m128i) _mm_min_epu8(((__m128i) _mm_srli_epi64(m14, 16)), ((__m128i) m14)));
            m14 = ((__m128i) _mm_min_epu8(((__m128i) _mm_srli_epi64(m14, 8)), ((__m128i) m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13);
            ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13);
            ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13);
            ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13);
        }
    }
    /* Handle a final odd stage, if any, with the scalar butterfly. */
    int i, j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i, (((framebits + excess) >> 1) << 1) + j, syms, Y, X,
                 (decision_t *) dec, Branchtab);
        }
    }
}

#endif /*LV_HAVE_SSE3*/

static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int RENORMALIZE_THRESHOLD = 210;
    int s, i;
    void* tmp;

    for (s = 0; s < nbits; s++) {
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t *) dec, Branchtab);
        }

        renormalize(Y, RENORMALIZE_THRESHOLD);

        /* Swap the old and new metric buffers for the next stage. */
        tmp = (void*) X;
        X = Y;
        Y = (unsigned char*) tmp;
    }
}
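
/*
 * Minimal usage sketch (illustrative only; the helper below is not part of
 * VOLK). Buffer sizes follow from the kernel: X and Y each hold the 64
 * 8-bit path metrics, syms supplies 2 * (framebits + excess) input symbols,
 * dec receives one decision_t (sizeof(decision_t) bytes) per decoded bit,
 * and Branchtab holds the 64 branch-metric bytes prepared by the caller.
 * Starting all metrics at zero is an assumption made for this example.
 */
static inline void volk_8u_x4_conv_k7_r2_8u_usage_example(unsigned char* syms,
                                                          unsigned char* dec,
                                                          unsigned char* Branchtab,
                                                          unsigned int framebits,
                                                          unsigned int excess)
{
    unsigned char X[64] = { 0 }; /* path metrics entering the current stage */
    unsigned char Y[64] = { 0 }; /* scratch buffer for the updated metrics  */

    /* The kernel alternates between X and Y internally, one swap per bit. */
    volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, framebits, excess, Branchtab);
}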

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/