GNU Radio Manual and C++ API Reference  3.7.5.1
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
volk_16i_x5_add_quad_16i_x4.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
2 #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
3 
4 
5 #include<inttypes.h>
6 #include<stdio.h>
7 
8 
9 #ifdef LV_HAVE_SSE2
10 #include<xmmintrin.h>
11 #include<emmintrin.h>
12 
/*
 * SSE2 kernel (aligned variant): adds src0, element by element, to each of
 * src1, src2, src3 and src4 and writes the four sums to target0..target3:
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]        for 0 <= i < num_points
 *
 * target0..target3  output buffers, at least num_points shorts each
 * src0..src4        input buffers, at least num_points shorts each
 * num_points        number of 16-bit elements to process
 *
 * The vector part uses _mm_load_si128 / _mm_store_si128, so every buffer
 * must be 16-byte aligned. Vector lanes are added with _mm_add_epi16, which
 * wraps on overflow (no saturation).
 */
static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
    /* Work in units of points, not bytes: num_points * 2 would wrap for
     * num_points >= 2^31 and silently shorten the run. */
    const unsigned int eighth_points = num_points / 8; /* full 8-lane vectors */
    unsigned int number;

    const __m128i *p_src0 = (const __m128i*)src0;
    const __m128i *p_src1 = (const __m128i*)src1;
    const __m128i *p_src2 = (const __m128i*)src2;
    const __m128i *p_src3 = (const __m128i*)src3;
    const __m128i *p_src4 = (const __m128i*)src4;

    __m128i *p_target0 = (__m128i*)target0;
    __m128i *p_target1 = (__m128i*)target1;
    __m128i *p_target2 = (__m128i*)target2;
    __m128i *p_target3 = (__m128i*)target3;

    for (number = 0; number < eighth_points; ++number) {
        /* All five loads are issued before any store so the result is the
         * same even when a target buffer aliases a source buffer. */
        const __m128i base = _mm_load_si128(p_src0 + number);
        const __m128i sum0 = _mm_add_epi16(base, _mm_load_si128(p_src1 + number));
        const __m128i sum1 = _mm_add_epi16(base, _mm_load_si128(p_src2 + number));
        const __m128i sum2 = _mm_add_epi16(base, _mm_load_si128(p_src3 + number));
        const __m128i sum3 = _mm_add_epi16(base, _mm_load_si128(p_src4 + number));

        _mm_store_si128(p_target0 + number, sum0);
        _mm_store_si128(p_target1 + number, sum1);
        _mm_store_si128(p_target2 + number, sum2);
        _mm_store_si128(p_target3 + number, sum3);
    }

    /* Scalar tail: the num_points % 8 elements that do not fill a vector.
     * The sum is formed in int and converted back to short. */
    for (number = eighth_points * 8; number < num_points; ++number) {
        target0[number] = (short)(src0[number] + src1[number]);
        target1[number] = (short)(src0[number] + src2[number]);
        target2[number] = (short)(src0[number] + src3[number]);
        target3[number] = (short)(src0[number] + src4[number]);
    }
}
110 #endif /*LV_HAVE_SSE2*/
111 
112 #ifdef LV_HAVE_NEON
113 #include <arm_neon.h>
/*
 * NEON kernel: adds src0, element by element, to each of src1, src2, src3
 * and src4 and writes the four sums to target0..target3:
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]        for 0 <= i < num_points
 *
 * target0..target3  output buffers, at least num_points shorts each
 * src0..src4        input buffers, at least num_points shorts each
 * num_points        number of 16-bit elements to process
 *
 * Eight elements are handled per iteration with vld1q_s16 / vaddq_s16 /
 * vst1q_s16; the remaining num_points % 8 elements are handled one by one.
 */
static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
    const unsigned int vector_iters = num_points / 8;
    const unsigned int tail_begin = vector_iters * 8;
    unsigned int idx;

    /* Vector part: walk the buffers by offset rather than bumping pointers. */
    for (idx = 0; idx < tail_begin; idx += 8) {
        /* Load every input first, then compute, then store. */
        const int16x8_t base = vld1q_s16(src0 + idx);
        const int16x8_t in1 = vld1q_s16(src1 + idx);
        const int16x8_t in2 = vld1q_s16(src2 + idx);
        const int16x8_t in3 = vld1q_s16(src3 + idx);
        const int16x8_t in4 = vld1q_s16(src4 + idx);

        const int16x8_t out0 = vaddq_s16(base, in1);
        const int16x8_t out1 = vaddq_s16(base, in2);
        const int16x8_t out2 = vaddq_s16(base, in3);
        const int16x8_t out3 = vaddq_s16(base, in4);

        vst1q_s16(target0 + idx, out0);
        vst1q_s16(target1 + idx, out1);
        vst1q_s16(target2 + idx, out2);
        vst1q_s16(target3 + idx, out3);
    }

    /* Scalar tail. src0 is re-read for every sum (not cached) so the
     * read/write order matches a straightforward per-statement evaluation. */
    for (idx = tail_begin; idx < num_points; ++idx) {
        target0[idx] = src0[idx] + src1[idx];
        target1[idx] = src0[idx] + src2[idx];
        target2[idx] = src0[idx] + src3[idx];
        target3[idx] = src0[idx] + src4[idx];
    }
}
155 
156 #endif /* LV_HAVE_NEON */
157 
158 
159 #ifdef LV_HAVE_GENERIC
160 
/*
 * Portable scalar kernel: adds src0, element by element, to each of src1,
 * src2, src3 and src4 and writes the four sums to target0..target3:
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]        for 0 <= i < num_points
 *
 * target0..target3  output buffers, at least num_points shorts each
 * src0..src4        input buffers, at least num_points shorts each
 * num_points        number of 16-bit elements to process
 *
 * Each sum is formed in int and converted back to short.
 */
static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
    unsigned int number;

    /* Iterate over num_points directly with an unsigned index. Going through
     * a byte count (num_points * 2) and a signed bound would wrap for
     * num_points >= 2^31 and silently process fewer elements. */
    for (number = 0; number < num_points; ++number) {
        target0[number] = (short)(src0[number] + src1[number]);
        target1[number] = (short)(src0[number] + src2[number]);
        target2[number] = (short)(src0[number] + src3[number]);
        target3[number] = (short)(src0[number] + src4[number]);
    }
}
176 
177 #endif /* LV_HAVE_GENERIC */
178 
179 
180 
181 
182 
183 #endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/