GNU Radio 3.7.3 C++ API
volk_16i_32fc_dot_prod_32fc.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
2 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
3 
4 #include <volk/volk_common.h>
5 #include<stdio.h>
6 
7 
8 #ifdef LV_HAVE_GENERIC
9 
10 
11 static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
12 
13  static const int N_UNROLL = 4;
14 
15  lv_32fc_t acc0 = 0;
16  lv_32fc_t acc1 = 0;
17  lv_32fc_t acc2 = 0;
18  lv_32fc_t acc3 = 0;
19 
20  unsigned i = 0;
21  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
22 
23  for(i = 0; i < n; i += N_UNROLL) {
24  acc0 += taps[i + 0] * (float)input[i + 0];
25  acc1 += taps[i + 1] * (float)input[i + 1];
26  acc2 += taps[i + 2] * (float)input[i + 2];
27  acc3 += taps[i + 3] * (float)input[i + 3];
28  }
29 
30  for(; i < num_points; i++) {
31  acc0 += taps[i] * (float)input[i];
32  }
33 
34  *result = acc0 + acc1 + acc2 + acc3;
35 }
36 
37 #endif /*LV_HAVE_GENERIC*/
38 
39 
40 
41 #if LV_HAVE_SSE && LV_HAVE_MMX
42 
43 static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
44 
45  unsigned int number = 0;
46  const unsigned int sixteenthPoints = num_points / 8;
47 
48  float res[2];
49  float *realpt = &res[0], *imagpt = &res[1];
50  const short* aPtr = input;
51  const float* bPtr = (float*)taps;
52 
53  __m64 m0, m1;
54  __m128 f0, f1, f2, f3;
55  __m128 a0Val, a1Val, a2Val, a3Val;
56  __m128 b0Val, b1Val, b2Val, b3Val;
57  __m128 c0Val, c1Val, c2Val, c3Val;
58 
59  __m128 dotProdVal0 = _mm_setzero_ps();
60  __m128 dotProdVal1 = _mm_setzero_ps();
61  __m128 dotProdVal2 = _mm_setzero_ps();
62  __m128 dotProdVal3 = _mm_setzero_ps();
63 
64  for(;number < sixteenthPoints; number++){
65 
66  m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
67  m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
68  f0 = _mm_cvtpi16_ps(m0);
69  f1 = _mm_cvtpi16_ps(m0);
70  f2 = _mm_cvtpi16_ps(m1);
71  f3 = _mm_cvtpi16_ps(m1);
72 
73  a0Val = _mm_unpacklo_ps(f0, f1);
74  a1Val = _mm_unpackhi_ps(f0, f1);
75  a2Val = _mm_unpacklo_ps(f2, f3);
76  a3Val = _mm_unpackhi_ps(f2, f3);
77 
78  b0Val = _mm_loadu_ps(bPtr);
79  b1Val = _mm_loadu_ps(bPtr+4);
80  b2Val = _mm_loadu_ps(bPtr+8);
81  b3Val = _mm_loadu_ps(bPtr+12);
82 
83  c0Val = _mm_mul_ps(a0Val, b0Val);
84  c1Val = _mm_mul_ps(a1Val, b1Val);
85  c2Val = _mm_mul_ps(a2Val, b2Val);
86  c3Val = _mm_mul_ps(a3Val, b3Val);
87 
88  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
89  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
90  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
91  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
92 
93  aPtr += 8;
94  bPtr += 16;
95  }
96 
97  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
98  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
99  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
100 
101  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
102 
103  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
104 
105  *realpt = dotProductVector[0];
106  *imagpt = dotProductVector[1];
107  *realpt += dotProductVector[2];
108  *imagpt += dotProductVector[3];
109 
110  number = sixteenthPoints*8;
111  for(;number < num_points; number++){
112  *realpt += ((*aPtr) * (*bPtr++));
113  *imagpt += ((*aPtr++) * (*bPtr++));
114  }
115 
116  *result = *(lv_32fc_t*)(&res[0]);
117 }
118 
119 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
120 
121 
122 
123 
124 #if LV_HAVE_SSE && LV_HAVE_MMX
125 
126 
127 static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
128 
129  unsigned int number = 0;
130  const unsigned int sixteenthPoints = num_points / 8;
131 
132  float res[2];
133  float *realpt = &res[0], *imagpt = &res[1];
134  const short* aPtr = input;
135  const float* bPtr = (float*)taps;
136 
137  __m64 m0, m1;
138  __m128 f0, f1, f2, f3;
139  __m128 a0Val, a1Val, a2Val, a3Val;
140  __m128 b0Val, b1Val, b2Val, b3Val;
141  __m128 c0Val, c1Val, c2Val, c3Val;
142 
143  __m128 dotProdVal0 = _mm_setzero_ps();
144  __m128 dotProdVal1 = _mm_setzero_ps();
145  __m128 dotProdVal2 = _mm_setzero_ps();
146  __m128 dotProdVal3 = _mm_setzero_ps();
147 
148  for(;number < sixteenthPoints; number++){
149 
150  m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
151  m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
152  f0 = _mm_cvtpi16_ps(m0);
153  f1 = _mm_cvtpi16_ps(m0);
154  f2 = _mm_cvtpi16_ps(m1);
155  f3 = _mm_cvtpi16_ps(m1);
156 
157  a0Val = _mm_unpacklo_ps(f0, f1);
158  a1Val = _mm_unpackhi_ps(f0, f1);
159  a2Val = _mm_unpacklo_ps(f2, f3);
160  a3Val = _mm_unpackhi_ps(f2, f3);
161 
162  b0Val = _mm_load_ps(bPtr);
163  b1Val = _mm_load_ps(bPtr+4);
164  b2Val = _mm_load_ps(bPtr+8);
165  b3Val = _mm_load_ps(bPtr+12);
166 
167  c0Val = _mm_mul_ps(a0Val, b0Val);
168  c1Val = _mm_mul_ps(a1Val, b1Val);
169  c2Val = _mm_mul_ps(a2Val, b2Val);
170  c3Val = _mm_mul_ps(a3Val, b3Val);
171 
172  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
173  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
174  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
175  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
176 
177  aPtr += 8;
178  bPtr += 16;
179  }
180 
181  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
182  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
183  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
184 
185  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
186 
187  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
188 
189  *realpt = dotProductVector[0];
190  *imagpt = dotProductVector[1];
191  *realpt += dotProductVector[2];
192  *imagpt += dotProductVector[3];
193 
194  number = sixteenthPoints*8;
195  for(;number < num_points; number++){
196  *realpt += ((*aPtr) * (*bPtr++));
197  *imagpt += ((*aPtr++) * (*bPtr++));
198  }
199 
200  *result = *(lv_32fc_t*)(&res[0]);
201 }
202 
203 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
204 
205 
206 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56