GNU Radio 3.7.2 C++ API
volk_32fc_x2_square_dist_32f.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
2 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
3 
4 #include<inttypes.h>
5 #include<stdio.h>
6 #include<volk/volk_complex.h>
7 
8 #ifdef LV_HAVE_SSE3
9 #include<xmmintrin.h>
10 #include<pmmintrin.h>
11 
12 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
13 
14  const unsigned int num_bytes = num_points*8;
15 
16  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
17 
18  lv_32fc_t diff;
19  float sq_dist;
20  int bound = num_bytes >> 5;
21  int leftovers0 = (num_bytes >> 4) & 1;
22  int leftovers1 = (num_bytes >> 3) & 1;
23  int i = 0;
24 
25  xmm1 = _mm_setzero_ps();
26  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
27  xmm2 = _mm_load_ps((float*)&points[0]);
28  xmm1 = _mm_movelh_ps(xmm1, xmm1);
29  xmm3 = _mm_load_ps((float*)&points[2]);
30 
31 
32  for(; i < bound - 1; ++i) {
33  xmm4 = _mm_sub_ps(xmm1, xmm2);
34  xmm5 = _mm_sub_ps(xmm1, xmm3);
35  points += 4;
36  xmm6 = _mm_mul_ps(xmm4, xmm4);
37  xmm7 = _mm_mul_ps(xmm5, xmm5);
38 
39  xmm2 = _mm_load_ps((float*)&points[0]);
40 
41  xmm4 = _mm_hadd_ps(xmm6, xmm7);
42 
43  xmm3 = _mm_load_ps((float*)&points[2]);
44 
45  _mm_store_ps(target, xmm4);
46 
47  target += 4;
48 
49  }
50 
51  xmm4 = _mm_sub_ps(xmm1, xmm2);
52  xmm5 = _mm_sub_ps(xmm1, xmm3);
53 
54 
55 
56  points += 4;
57  xmm6 = _mm_mul_ps(xmm4, xmm4);
58  xmm7 = _mm_mul_ps(xmm5, xmm5);
59 
60  xmm4 = _mm_hadd_ps(xmm6, xmm7);
61 
62  _mm_store_ps(target, xmm4);
63 
64  target += 4;
65 
66  for(i = 0; i < leftovers0; ++i) {
67 
68  xmm2 = _mm_load_ps((float*)&points[0]);
69 
70  xmm4 = _mm_sub_ps(xmm1, xmm2);
71 
72  points += 2;
73 
74  xmm6 = _mm_mul_ps(xmm4, xmm4);
75 
76  xmm4 = _mm_hadd_ps(xmm6, xmm6);
77 
78  _mm_storeh_pi((__m64*)target, xmm4);
79 
80  target += 2;
81  }
82 
83  for(i = 0; i < leftovers1; ++i) {
84 
85  diff = src0[0] - points[0];
86 
87  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
88 
89  target[0] = sq_dist;
90  }
91 }
92 
93 #endif /*LV_HAVE_SSE3*/
94 
95 #ifdef LV_HAVE_GENERIC
96 static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
97 
98  const unsigned int num_bytes = num_points*8;
99 
100  lv_32fc_t diff;
101  float sq_dist;
102  unsigned int i = 0;
103 
104  for(; i < num_bytes >> 3; ++i) {
105  diff = src0[0] - points[i];
106 
107  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
108 
109  target[i] = sq_dist;
110  }
111 }
112 
113 #endif /*LV_HAVE_GENERIC*/
114 
115 
116 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define lv_cimag(x)
Definition: volk_complex.h:78