GNU Radio 3.7.3 C++ API
volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
2 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
3 
4 #include<inttypes.h>
5 #include<stdio.h>
6 #include<volk/volk_complex.h>
7 #include <string.h>
8 
9 #ifdef LV_HAVE_SSE3
10 #include<xmmintrin.h>
11 #include<pmmintrin.h>
12 
13 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
14 
15  const unsigned int num_bytes = num_points*8;
16 
17  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
18 
19  lv_32fc_t diff;
20  memset(&diff, 0x0, 2*sizeof(float));
21 
22  float sq_dist = 0.0;
23  int bound = num_bytes >> 5;
24  int leftovers0 = (num_bytes >> 4) & 1;
25  int leftovers1 = (num_bytes >> 3) & 1;
26  int i = 0;
27 
28 
29 
30  xmm1 = _mm_setzero_ps();
31  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
32  xmm2 = _mm_load_ps((float*)&points[0]);
33  xmm8 = _mm_load1_ps(&scalar);
34  xmm1 = _mm_movelh_ps(xmm1, xmm1);
35  xmm3 = _mm_load_ps((float*)&points[2]);
36 
37 
38  for(; i < bound - 1; ++i) {
39 
40  xmm4 = _mm_sub_ps(xmm1, xmm2);
41  xmm5 = _mm_sub_ps(xmm1, xmm3);
42  points += 4;
43  xmm6 = _mm_mul_ps(xmm4, xmm4);
44  xmm7 = _mm_mul_ps(xmm5, xmm5);
45 
46  xmm2 = _mm_load_ps((float*)&points[0]);
47 
48  xmm4 = _mm_hadd_ps(xmm6, xmm7);
49 
50  xmm3 = _mm_load_ps((float*)&points[2]);
51 
52  xmm4 = _mm_mul_ps(xmm4, xmm8);
53 
54  _mm_store_ps(target, xmm4);
55 
56  target += 4;
57 
58  }
59 
60  xmm4 = _mm_sub_ps(xmm1, xmm2);
61  xmm5 = _mm_sub_ps(xmm1, xmm3);
62 
63 
64 
65  points += 4;
66  xmm6 = _mm_mul_ps(xmm4, xmm4);
67  xmm7 = _mm_mul_ps(xmm5, xmm5);
68 
69  xmm4 = _mm_hadd_ps(xmm6, xmm7);
70 
71  xmm4 = _mm_mul_ps(xmm4, xmm8);
72 
73  _mm_store_ps(target, xmm4);
74 
75  target += 4;
76 
77 
78  for(i = 0; i < leftovers0; ++i) {
79 
80  xmm2 = _mm_load_ps((float*)&points[0]);
81 
82  xmm4 = _mm_sub_ps(xmm1, xmm2);
83 
84  points += 2;
85 
86  xmm6 = _mm_mul_ps(xmm4, xmm4);
87 
88  xmm4 = _mm_hadd_ps(xmm6, xmm6);
89 
90  xmm4 = _mm_mul_ps(xmm4, xmm8);
91 
92  _mm_storeh_pi((__m64*)target, xmm4);
93 
94  target += 2;
95  }
96 
97  for(i = 0; i < leftovers1; ++i) {
98 
99  diff = src0[0] - points[0];
100 
101  sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
102 
103  target[0] = sq_dist;
104  }
105 }
106 
107 #endif /*LV_HAVE_SSE3*/
108 
109 #ifdef LV_HAVE_GENERIC
110 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
111 
112  const unsigned int num_bytes = num_points*8;
113 
114  lv_32fc_t diff;
115  float sq_dist;
116  unsigned int i = 0;
117 
118  for(; i < num_bytes >> 3; ++i) {
119  diff = src0[0] - points[i];
120 
121  sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
122 
123  target[i] = sq_dist;
124  }
125 }
126 
127 #endif /*LV_HAVE_GENERIC*/
128 
129 
130 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define lv_cimag(x)
Definition: volk_complex.h:78