GNU Radio Manual and C++ API Reference  3.7.5.1
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
volk_32fc_x2_square_dist_32f.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
2 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
3 
4 #include<inttypes.h>
5 #include<stdio.h>
6 #include<volk/volk_complex.h>
7 
8 #ifdef LV_HAVE_SSE3
9 #include<xmmintrin.h>
10 #include<pmmintrin.h>
11 
12 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
13 
14  const unsigned int num_bytes = num_points*8;
15 
16  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
17 
18  lv_32fc_t diff;
19  float sq_dist;
20  int bound = num_bytes >> 5;
21  int leftovers0 = (num_bytes >> 4) & 1;
22  int leftovers1 = (num_bytes >> 3) & 1;
23  int i = 0;
24 
25  xmm1 = _mm_setzero_ps();
26  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
27  xmm2 = _mm_load_ps((float*)&points[0]);
28  xmm1 = _mm_movelh_ps(xmm1, xmm1);
29  xmm3 = _mm_load_ps((float*)&points[2]);
30 
31 
32  for(; i < bound - 1; ++i) {
33  xmm4 = _mm_sub_ps(xmm1, xmm2);
34  xmm5 = _mm_sub_ps(xmm1, xmm3);
35  points += 4;
36  xmm6 = _mm_mul_ps(xmm4, xmm4);
37  xmm7 = _mm_mul_ps(xmm5, xmm5);
38 
39  xmm2 = _mm_load_ps((float*)&points[0]);
40 
41  xmm4 = _mm_hadd_ps(xmm6, xmm7);
42 
43  xmm3 = _mm_load_ps((float*)&points[2]);
44 
45  _mm_store_ps(target, xmm4);
46 
47  target += 4;
48 
49  }
50 
51  xmm4 = _mm_sub_ps(xmm1, xmm2);
52  xmm5 = _mm_sub_ps(xmm1, xmm3);
53 
54 
55 
56  points += 4;
57  xmm6 = _mm_mul_ps(xmm4, xmm4);
58  xmm7 = _mm_mul_ps(xmm5, xmm5);
59 
60  xmm4 = _mm_hadd_ps(xmm6, xmm7);
61 
62  _mm_store_ps(target, xmm4);
63 
64  target += 4;
65 
66  for(i = 0; i < leftovers0; ++i) {
67 
68  xmm2 = _mm_load_ps((float*)&points[0]);
69 
70  xmm4 = _mm_sub_ps(xmm1, xmm2);
71 
72  points += 2;
73 
74  xmm6 = _mm_mul_ps(xmm4, xmm4);
75 
76  xmm4 = _mm_hadd_ps(xmm6, xmm6);
77 
78  _mm_storeh_pi((__m64*)target, xmm4);
79 
80  target += 2;
81  }
82 
83  for(i = 0; i < leftovers1; ++i) {
84 
85  diff = src0[0] - points[0];
86 
87  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
88 
89  target[0] = sq_dist;
90  }
91 }
92 
93 #endif /*LV_HAVE_SSE3*/
94 
95 #ifdef LV_HAVE_NEON
96 #include <arm_neon.h>
/*
 * Squared Euclidean distance between one complex reference sample and each of
 * num_points complex points (NEON kernel).
 *
 * For every k in [0, num_points):
 *   target[k] = (re(src0[0]) - re(points[k]))^2 + (im(src0[0]) - im(points[k]))^2
 *
 * target      output buffer of num_points floats.
 * src0        reference sample; only src0[0] is read.
 * points      input points, read as interleaved (re, im) float pairs.
 * num_points  number of points to process.
 */
static inline void volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
  const unsigned int vector_iters = num_points / 4;
  unsigned int tail_count = num_points - vector_iters * 4;
  unsigned int remaining_blocks = vector_iters;

  /* Reference real/imag parts broadcast across all four lanes. */
  const float32x4_t ref_re = vdupq_n_f32( lv_creal(src0[0]) );
  const float32x4_t ref_im = vdupq_n_f32( lv_cimag(src0[0]) );

  /* Vector body: four points per pass. */
  while (remaining_blocks != 0) {
    /* vld2q deinterleaves: val[0] holds real parts, val[1] imaginary parts. */
    const float32x4x2_t pts = vld2q_f32((float*)points);
    const float32x4_t delta_re = vsubq_f32(ref_re, pts.val[0]);
    const float32x4_t delta_im = vsubq_f32(ref_im, pts.val[1]);
    const float32x4_t re_sq = vmulq_f32(delta_re, delta_re);
    const float32x4_t im_sq = vmulq_f32(delta_im, delta_im);

    vst1q_f32(target, vaddq_f32(re_sq, im_sq));

    points += 4;
    target += 4;
    --remaining_blocks;
  }

  /* Scalar tail: the 0..3 points that did not fill a vector. */
  for (; tail_count != 0; --tail_count) {
    lv_32fc_t diff = src0[0] - *points++;
    *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
  }
}
123 #endif /* LV_HAVE_NEON */
124 
125 #ifdef LV_HAVE_GENERIC
126 static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
127 
128  const unsigned int num_bytes = num_points*8;
129 
130  lv_32fc_t diff;
131  float sq_dist;
132  unsigned int i = 0;
133 
134  for(; i < num_bytes >> 3; ++i) {
135  diff = src0[0] - points[i];
136 
137  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
138 
139  target[i] = sq_dist;
140  }
141 }
142 
143 #endif /*LV_HAVE_GENERIC*/
144 
145 
146 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define lv_cimag(x)
Definition: volk_complex.h:78