GNU Radio Manual and C++ API Reference  3.7.5.1
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
volk_16i_max_star_horizontal_16i.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
2 #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
3 
4 #include <volk/volk_common.h>
5 
6 #include<inttypes.h>
7 #include<stdio.h>
8 
9 
10 #ifdef LV_HAVE_SSSE3
11 
12 #include<xmmintrin.h>
13 #include<emmintrin.h>
14 #include<tmmintrin.h>
15 
16 static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points) {
17 
18  const unsigned int num_bytes = num_points*2;
19 
20  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
21  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
22  const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
23  const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
24 
25 
26 
27  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
28  __m128i xmm5, xmm6, xmm7, xmm8;
29 
30  xmm4 = _mm_load_si128((__m128i*)shufmask0);
31  xmm5 = _mm_load_si128((__m128i*)shufmask1);
32  xmm6 = _mm_load_si128((__m128i*)andmask0);
33  xmm7 = _mm_load_si128((__m128i*)andmask1);
34 
35  __m128i *p_target, *p_src0;
36 
37  p_target = (__m128i*)target;
38  p_src0 = (__m128i*)src0;
39 
40  int bound = num_bytes >> 5;
41  int intermediate = (num_bytes >> 4) & 1;
42  int leftovers = (num_bytes >> 1) & 7;
43 
44  int i = 0;
45 
46 
47  for(i = 0; i < bound; ++i) {
48 
49  xmm0 = _mm_load_si128(p_src0);
50  xmm1 = _mm_load_si128(&p_src0[1]);
51 
52 
53 
54  xmm2 = _mm_xor_si128(xmm2, xmm2);
55  p_src0 += 2;
56 
57  xmm3 = _mm_hsub_epi16(xmm0, xmm1);
58 
59  xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
60 
61  xmm8 = _mm_and_si128(xmm2, xmm6);
62  xmm3 = _mm_and_si128(xmm2, xmm7);
63 
64 
65  xmm8 = _mm_add_epi8(xmm8, xmm4);
66  xmm3 = _mm_add_epi8(xmm3, xmm5);
67 
68  xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
69  xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
70 
71 
72  xmm3 = _mm_add_epi16(xmm0, xmm1);
73 
74 
75  _mm_store_si128(p_target, xmm3);
76 
77  p_target += 1;
78 
79  }
80 
81  for(i = 0; i < intermediate; ++i) {
82 
83  xmm0 = _mm_load_si128(p_src0);
84 
85 
86  xmm2 = _mm_xor_si128(xmm2, xmm2);
87  p_src0 += 1;
88 
89  xmm3 = _mm_hsub_epi16(xmm0, xmm1);
90  xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
91 
92  xmm8 = _mm_and_si128(xmm2, xmm6);
93 
94  xmm3 = _mm_add_epi8(xmm8, xmm4);
95 
96  xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
97 
98  _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
99 
100  p_target = (__m128i*)((int8_t*)p_target + 8);
101 
102  }
103 
104  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
105  target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
106  }
107 
108 
109 }
110 
111 #endif /*LV_HAVE_SSSE3*/
112 
113 #ifdef LV_HAVE_NEON
114 #include <arm_neon.h>
/*
 * Pairwise selection over adjacent 16-bit samples (NEON intrinsics).
 *
 * For every complete pair (src0[2k], src0[2k+1]) writes to target[k] the
 * first element when the 16-bit (wrapping) difference first - second is
 * non-negative, otherwise the second element.
 *
 * target      output buffer, one sample per input pair.
 * src0        input buffer holding num_points samples.
 * num_points  number of input samples in src0. A trailing unpaired sample
 *             (odd num_points) is ignored instead of being compared against
 *             the element one past the end of the input.
 */
static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points) {
  /* Each vector iteration consumes 16 input samples and yields 8 outputs. */
  const unsigned int num_blocks = num_points / 16;
  const unsigned int remaining = num_points % 16;
  const int16x8_t zeros = vdupq_n_s16(0);
  unsigned int number;

  for (number = 0; number < num_blocks; ++number) {
    /* De-interleaving load: val[0] holds the first sample of each pair,
       val[1] the second. */
    const int16x8x2_t pairs = vld2q_s16(src0);
    const int16x8_t diff = vsubq_s16(pairs.val[0], pairs.val[1]);
    /* All-ones lanes where first - second >= 0. */
    const uint16x8_t first_wins = vcgeq_s16(diff, zeros);
    /* Bitwise select: first sample where the mask is set, second elsewhere. */
    const int16x8_t max_vec = vbslq_s16(first_wins, pairs.val[0], pairs.val[1]);

    vst1q_s16(target, max_vec);
    src0 += 16;
    target += 8;
  }

  /* Scalar tail over the remaining complete pairs; src0 and target already
     point past the data handled by the vector loop. */
  for (number = 0; number + 1 < remaining; number += 2) {
    const int16_t diff = (int16_t)(src0[number] - src0[number + 1]);
    target[number >> 1] = (diff > 0) ? src0[number] : src0[number + 1];
  }
}
142 #endif /* LV_HAVE_NEON */
143 
144 #ifdef LV_HAVE_NEON
/* Declaration only: the definition is not part of this header and must be
   supplied by another translation unit at link time. Takes the same
   arguments as the other variants in this file.
   NOTE(review): presumably a hand-written NEON assembly implementation,
   judging by the "neonasm" suffix -- confirm against the build. */
extern void volk_16i_max_star_horizontal_16i_neonasm(int16_t* target, int16_t* src0, unsigned int num_points);
146 #endif /* LV_HAVE_NEON */
147 
148 #ifdef LV_HAVE_GENERIC
/*
 * Pairwise selection over adjacent 16-bit samples (portable C).
 *
 * For every complete pair (src0[2k], src0[2k+1]) writes to target[k] the
 * first element when the difference first - second, narrowed to 16 bits,
 * is positive, otherwise the second element.
 *
 * target      output buffer, one sample per input pair.
 * src0        input buffer holding num_points samples.
 * num_points  number of input samples in src0. A trailing unpaired sample
 *             (odd num_points) is ignored instead of being compared against
 *             the element one past the end of the input.
 */
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) {
  unsigned int i;

  /* i + 1 < num_points guarantees both members of the pair are in range. */
  for (i = 0; i + 1 < num_points; i += 2) {
    /* The narrowing cast keeps the comparison in 16-bit arithmetic. */
    const int16_t diff = (int16_t)(src0[i] - src0[i + 1]);
    target[i >> 1] = (diff > 0) ? src0[i] : src0[i + 1];
  }
}
163 
164 
165 
166 #endif /*LV_HAVE_GENERIC*/
167 
168 #endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
#define bit128_p(x)
Definition: volk_common.h:94
unsigned char uint8_t
Definition: stdint.h:78
signed short int16_t
Definition: stdint.h:76
signed char int8_t
Definition: stdint.h:75