GNU Radio 3.7.2 C++ API
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
2 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
3 
4 #include <volk/volk_common.h>
5 #include<stdio.h>
6 
7 
8 #ifdef LV_HAVE_GENERIC
9 
10 
11 static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
12 
13  float res[2];
14  float *realpt = &res[0], *imagpt = &res[1];
15  const float* aPtr = (float*)input;
16  const float* bPtr= taps;
17  unsigned int number = 0;
18 
19  *realpt = 0;
20  *imagpt = 0;
21 
22  for(number = 0; number < num_points; number++){
23  *realpt += ((*aPtr++) * (*bPtr));
24  *imagpt += ((*aPtr++) * (*bPtr++));
25  }
26 
27  *result = *(lv_32fc_t*)(&res[0]);
28 }
29 
30 #endif /*LV_HAVE_GENERIC*/
31 
32 
33 #ifdef LV_HAVE_SSE
34 
35 
36 static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
37 
38  unsigned int number = 0;
39  const unsigned int sixteenthPoints = num_points / 8;
40 
41  float res[2];
42  float *realpt = &res[0], *imagpt = &res[1];
43  const float* aPtr = (float*)input;
44  const float* bPtr = taps;
45 
46  __m128 a0Val, a1Val, a2Val, a3Val;
47  __m128 b0Val, b1Val, b2Val, b3Val;
48  __m128 x0Val, x1Val, x2Val, x3Val;
49  __m128 c0Val, c1Val, c2Val, c3Val;
50 
51  __m128 dotProdVal0 = _mm_setzero_ps();
52  __m128 dotProdVal1 = _mm_setzero_ps();
53  __m128 dotProdVal2 = _mm_setzero_ps();
54  __m128 dotProdVal3 = _mm_setzero_ps();
55 
56  for(;number < sixteenthPoints; number++){
57 
58  a0Val = _mm_load_ps(aPtr);
59  a1Val = _mm_load_ps(aPtr+4);
60  a2Val = _mm_load_ps(aPtr+8);
61  a3Val = _mm_load_ps(aPtr+12);
62 
63  x0Val = _mm_load_ps(bPtr);
64  x1Val = _mm_load_ps(bPtr);
65  x2Val = _mm_load_ps(bPtr+4);
66  x3Val = _mm_load_ps(bPtr+4);
67  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
68  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
69  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
70  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
71 
72  c0Val = _mm_mul_ps(a0Val, b0Val);
73  c1Val = _mm_mul_ps(a1Val, b1Val);
74  c2Val = _mm_mul_ps(a2Val, b2Val);
75  c3Val = _mm_mul_ps(a3Val, b3Val);
76 
77  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
78  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
79  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
80  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
81 
82  aPtr += 16;
83  bPtr += 8;
84  }
85 
86  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
87  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
88  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
89 
90  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
91 
92  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
93 
94  *realpt = dotProductVector[0];
95  *imagpt = dotProductVector[1];
96  *realpt += dotProductVector[2];
97  *imagpt += dotProductVector[3];
98 
99  number = sixteenthPoints*8;
100  for(;number < num_points; number++){
101  *realpt += ((*aPtr++) * (*bPtr));
102  *imagpt += ((*aPtr++) * (*bPtr++));
103  }
104 
105  *result = *(lv_32fc_t*)(&res[0]);
106 }
107 
108 #endif /*LV_HAVE_SSE*/
109 
110 
111 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56