1 #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
2 #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
/*!
 * \brief Adds src0 element-wise to each of src1..src4, producing four outputs.
 *
 * For every k in [0, num_points):
 *   target0[k] = src0[k] + src1[k]
 *   target1[k] = src0[k] + src2[k]
 *   target2[k] = src0[k] + src3[k]
 *   target3[k] = src0[k] + src4[k]
 *
 * Blocks of eight 16-bit values are processed with SSE2; the remaining
 * (num_points % 8) values are handled by a scalar loop.
 *
 * The SSE2 path uses aligned loads and stores (_mm_load_si128 /
 * _mm_store_si128), so whenever num_points >= 8 all nine buffers must be
 * 16-byte aligned. Vector sums wrap around on 16-bit overflow
 * (_mm_add_epi16 does not saturate).
 *
 * \param target0    output: src0 + src1
 * \param target1    output: src0 + src2
 * \param target2    output: src0 + src3
 * \param target3    output: src0 + src4
 * \param src0       common addend
 * \param src1       addend for target0
 * \param src2       addend for target1
 * \param src3       addend for target2
 * \param src4       addend for target3
 * \param num_points number of 16-bit elements in every buffer
 */
static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(
    short* target0,
    short* target1,
    short* target2,
    short* target3,
    short* src0,
    short* src1,
    short* src2,
    short* src3,
    short* src4,
    unsigned int num_points)
{
    /* Eight 16-bit lanes fit in one 128-bit register. Deriving the block
     * count straight from num_points avoids the num_points * 2 byte count,
     * which wraps for very large inputs. */
    const unsigned int bound = num_points >> 3;
    unsigned int i;

    __m128i* p_target0 = (__m128i*)target0;
    __m128i* p_target1 = (__m128i*)target1;
    __m128i* p_target2 = (__m128i*)target2;
    __m128i* p_target3 = (__m128i*)target3;

    const __m128i* p_src0 = (const __m128i*)src0;
    const __m128i* p_src1 = (const __m128i*)src1;
    const __m128i* p_src2 = (const __m128i*)src2;
    const __m128i* p_src3 = (const __m128i*)src3;
    const __m128i* p_src4 = (const __m128i*)src4;

    for (i = 0; i < bound; ++i) {
        /* Indexing by i steps every stream forward one 128-bit block per
         * iteration. All loads are issued before any store so the result
         * is unchanged if an output buffer is also one of the inputs. */
        const __m128i xmm0 = _mm_load_si128(p_src0 + i);
        const __m128i xmm1 = _mm_add_epi16(xmm0, _mm_load_si128(p_src1 + i));
        const __m128i xmm2 = _mm_add_epi16(xmm0, _mm_load_si128(p_src2 + i));
        const __m128i xmm3 = _mm_add_epi16(xmm0, _mm_load_si128(p_src3 + i));
        const __m128i xmm4 = _mm_add_epi16(xmm0, _mm_load_si128(p_src4 + i));

        _mm_store_si128(p_target0 + i, xmm1);
        _mm_store_si128(p_target1 + i, xmm2);
        _mm_store_si128(p_target2 + i, xmm3);
        _mm_store_si128(p_target3 + i, xmm4);
    }

    /* Scalar tail: the last (num_points % 8) elements. */
    for (i = bound * 8u; i < num_points; ++i) {
        target0[i] = (short)(src0[i] + src1[i]);
        target1[i] = (short)(src0[i] + src2[i]);
        target2[i] = (short)(src0[i] + src3[i]);
        target3[i] = (short)(src0[i] + src4[i]);
    }
}
116 #ifdef LV_HAVE_GENERIC
118 static inline void volk_16i_x5_add_quad_16i_x4_generic(
short* target0,
short* target1,
short* target2,
short* target3,
short* src0,
short* src1,
short* src2,
short* src3,
short* src4,
unsigned int num_points) {
120 const unsigned int num_bytes = num_points*2;
124 int bound = num_bytes >> 1;
126 for(i = 0; i < bound; ++i) {
127 target0[i] = src0[i] + src1[i];
128 target1[i] = src0[i] + src2[i];
129 target2[i] = src0[i] + src3[i];
130 target3[i] = src0[i] + src4[i];