/*
 * Source: root / volk / include / volk / volk_16s_add_quad_a16.h @ d486ff4b
 * (captured from a web repository viewer; size reported as 3.5 kB)
 */

1
#ifndef INCLUDED_volk_16s_add_quad_a16_H
2
#define INCLUDED_volk_16s_add_quad_a16_H
3
4
5
#include<inttypes.h>
6
#include<stdio.h>        
7
8
9
10
11
12
#if LV_HAVE_SSE2
13
#include<xmmintrin.h>
14
#include<emmintrin.h>
15
16
/*
 * SSE2 kernel: adds the 16-bit vector src0 element-wise to each of src1..src4.
 *
 *   target0[k] = src0[k] + src1[k]
 *   target1[k] = src0[k] + src2[k]
 *   target2[k] = src0[k] + src3[k]
 *   target3[k] = src0[k] + src4[k]
 *
 * num_bytes is the length of each buffer in bytes; num_bytes / 2 shorts are
 * processed per buffer (an odd trailing byte is ignored).  Sums wrap to
 * 16 bits in the vector path (_mm_add_epi16).
 *
 * The vector path uses aligned loads/stores (_mm_load_si128 /
 * _mm_store_si128), so every buffer must be 16-byte aligned when
 * num_bytes >= 16.
 */
static inline void volk_16s_add_quad_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
  /* Number of full 16-byte blocks (8 shorts each) handled with SSE2. */
  const unsigned int num_blocks = num_bytes >> 4;
  /* Total number of shorts to process per buffer. */
  const unsigned int num_points = num_bytes >> 1;

  __m128i* p_target0 = (__m128i*)target0;
  __m128i* p_target1 = (__m128i*)target1;
  __m128i* p_target2 = (__m128i*)target2;
  __m128i* p_target3 = (__m128i*)target3;

  const __m128i* p_src0 = (const __m128i*)src0;
  const __m128i* p_src1 = (const __m128i*)src1;
  const __m128i* p_src2 = (const __m128i*)src2;
  const __m128i* p_src3 = (const __m128i*)src3;
  const __m128i* p_src4 = (const __m128i*)src4;

  unsigned int i;

  for(i = 0; i < num_blocks; ++i) {
    /* Load every input block before storing anything, so the result of a
       block does not depend on whether a target aliases a source. */
    const __m128i base = _mm_load_si128(p_src0 + i);
    const __m128i in1 = _mm_load_si128(p_src1 + i);
    const __m128i in2 = _mm_load_si128(p_src2 + i);
    const __m128i in3 = _mm_load_si128(p_src3 + i);
    const __m128i in4 = _mm_load_si128(p_src4 + i);

    const __m128i sum1 = _mm_add_epi16(base, in1);
    const __m128i sum2 = _mm_add_epi16(base, in2);
    const __m128i sum3 = _mm_add_epi16(base, in3);
    const __m128i sum4 = _mm_add_epi16(base, in4);

    _mm_store_si128(p_target0 + i, sum1);
    _mm_store_si128(p_target1 + i, sum2);
    _mm_store_si128(p_target2 + i, sum3);
    _mm_store_si128(p_target3 + i, sum4);
  }

  /* Scalar tail: the up-to-7 shorts that do not fill a whole 16-byte block. */
  for(i = num_blocks * 8; i < num_points; ++i) {
    target0[i] = src0[i] + src1[i];
    target1[i] = src0[i] + src2[i];
    target2[i] = src0[i] + src3[i];
    target3[i] = src0[i] + src4[i];
  }
}
111
#endif /*LV_HAVE_SSE2*/
112
113
114
#if LV_HAVE_GENERIC
115
116
/*
 * Portable kernel: adds the 16-bit vector src0 element-wise to each of
 * src1..src4.
 *
 *   target0[k] = src0[k] + src1[k]
 *   target1[k] = src0[k] + src2[k]
 *   target2[k] = src0[k] + src3[k]
 *   target3[k] = src0[k] + src4[k]
 *
 * num_bytes is the length of each buffer in bytes; num_bytes / 2 shorts are
 * processed per buffer (an odd trailing byte is ignored).
 */
static inline void volk_16s_add_quad_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
        /* Two bytes per short. */
        const unsigned int num_points = num_bytes >> 1;
        unsigned int i;

        for(i = 0; i < num_points; ++i) {
                target0[i] = src0[i] + src1[i];
                target1[i] = src0[i] + src2[i];
                target2[i] = src0[i] + src3[i];
                target3[i] = src0[i] + src4[i];
        }
}
129
130
#endif /* LV_HAVE_GENERIC */
131
132
133
134
135
136
#endif /*INCLUDED_volk_16s_add_quad_a16_H*/