GNU Radio 3.7.3 C++ API
volk_32f_x2_dot_prod_16i.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
2 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
3 
4 #include <volk/volk_common.h>
5 #include<stdio.h>
6 
7 
8 #ifdef LV_HAVE_GENERIC
9 
10 
11 static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
12 
13  float dotProduct = 0;
14  const float* aPtr = input;
15  const float* bPtr= taps;
16  unsigned int number = 0;
17 
18  for(number = 0; number < num_points; number++){
19  dotProduct += ((*aPtr++) * (*bPtr++));
20  }
21 
22  *result = (int16_t)dotProduct;
23 }
24 
25 #endif /*LV_HAVE_GENERIC*/
26 
27 
28 #ifdef LV_HAVE_SSE
29 
30 static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
31 
32  unsigned int number = 0;
33  const unsigned int sixteenthPoints = num_points / 16;
34 
35  float dotProduct = 0;
36  const float* aPtr = input;
37  const float* bPtr = taps;
38 
39  __m128 a0Val, a1Val, a2Val, a3Val;
40  __m128 b0Val, b1Val, b2Val, b3Val;
41  __m128 c0Val, c1Val, c2Val, c3Val;
42 
43  __m128 dotProdVal0 = _mm_setzero_ps();
44  __m128 dotProdVal1 = _mm_setzero_ps();
45  __m128 dotProdVal2 = _mm_setzero_ps();
46  __m128 dotProdVal3 = _mm_setzero_ps();
47 
48  for(;number < sixteenthPoints; number++){
49 
50  a0Val = _mm_load_ps(aPtr);
51  a1Val = _mm_load_ps(aPtr+4);
52  a2Val = _mm_load_ps(aPtr+8);
53  a3Val = _mm_load_ps(aPtr+12);
54  b0Val = _mm_load_ps(bPtr);
55  b1Val = _mm_load_ps(bPtr+4);
56  b2Val = _mm_load_ps(bPtr+8);
57  b3Val = _mm_load_ps(bPtr+12);
58 
59  c0Val = _mm_mul_ps(a0Val, b0Val);
60  c1Val = _mm_mul_ps(a1Val, b1Val);
61  c2Val = _mm_mul_ps(a2Val, b2Val);
62  c3Val = _mm_mul_ps(a3Val, b3Val);
63 
64  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
65  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
66  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
67  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
68 
69  aPtr += 16;
70  bPtr += 16;
71  }
72 
73  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
74  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
75  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
76 
77  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
78 
79  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
80 
81  dotProduct = dotProductVector[0];
82  dotProduct += dotProductVector[1];
83  dotProduct += dotProductVector[2];
84  dotProduct += dotProductVector[3];
85 
86  number = sixteenthPoints*16;
87  for(;number < num_points; number++){
88  dotProduct += ((*aPtr++) * (*bPtr++));
89  }
90 
91  *result = (short)dotProduct;
92 }
93 
94 #endif /*LV_HAVE_SSE*/
95 
96 
97 #ifdef LV_HAVE_SSE
98 
99 static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
100 
101  unsigned int number = 0;
102  const unsigned int sixteenthPoints = num_points / 16;
103 
104  float dotProduct = 0;
105  const float* aPtr = input;
106  const float* bPtr = taps;
107 
108  __m128 a0Val, a1Val, a2Val, a3Val;
109  __m128 b0Val, b1Val, b2Val, b3Val;
110  __m128 c0Val, c1Val, c2Val, c3Val;
111 
112  __m128 dotProdVal0 = _mm_setzero_ps();
113  __m128 dotProdVal1 = _mm_setzero_ps();
114  __m128 dotProdVal2 = _mm_setzero_ps();
115  __m128 dotProdVal3 = _mm_setzero_ps();
116 
117  for(;number < sixteenthPoints; number++){
118 
119  a0Val = _mm_loadu_ps(aPtr);
120  a1Val = _mm_loadu_ps(aPtr+4);
121  a2Val = _mm_loadu_ps(aPtr+8);
122  a3Val = _mm_loadu_ps(aPtr+12);
123  b0Val = _mm_loadu_ps(bPtr);
124  b1Val = _mm_loadu_ps(bPtr+4);
125  b2Val = _mm_loadu_ps(bPtr+8);
126  b3Val = _mm_loadu_ps(bPtr+12);
127 
128  c0Val = _mm_mul_ps(a0Val, b0Val);
129  c1Val = _mm_mul_ps(a1Val, b1Val);
130  c2Val = _mm_mul_ps(a2Val, b2Val);
131  c3Val = _mm_mul_ps(a3Val, b3Val);
132 
133  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
134  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
135  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
136  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
137 
138  aPtr += 16;
139  bPtr += 16;
140  }
141 
142  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
143  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
144  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
145 
146  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
147 
148  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
149 
150  dotProduct = dotProductVector[0];
151  dotProduct += dotProductVector[1];
152  dotProduct += dotProductVector[2];
153  dotProduct += dotProductVector[3];
154 
155  number = sixteenthPoints*16;
156  for(;number < num_points; number++){
157  dotProduct += ((*aPtr++) * (*bPtr++));
158  }
159 
160  *result = (short)dotProduct;
161 }
162 
163 #endif /*LV_HAVE_SSE*/
164 
165 #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
signed short int16_t
Definition: stdint.h:76
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9