GNU Radio 3.5.3.2 C++ API
|
00001 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H 00002 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H 00003 00004 #include<inttypes.h> 00005 #include<stdio.h> 00006 #include<volk/volk_complex.h> 00007 00008 #ifdef LV_HAVE_SSE3 00009 #include<xmmintrin.h> 00010 #include<pmmintrin.h> 00011 00012 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { 00013 00014 00015 __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; 00016 00017 lv_32fc_t diff; 00018 float sq_dist; 00019 int bound = num_bytes >> 5; 00020 int leftovers0 = (num_bytes >> 4) & 1; 00021 int leftovers1 = (num_bytes >> 3) & 1; 00022 int i = 0; 00023 00024 xmm1 = _mm_setzero_ps(); 00025 xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); 00026 xmm2 = _mm_load_ps((float*)&points[0]); 00027 xmm1 = _mm_movelh_ps(xmm1, xmm1); 00028 xmm3 = _mm_load_ps((float*)&points[2]); 00029 00030 00031 for(; i < bound - 1; ++i) { 00032 xmm4 = _mm_sub_ps(xmm1, xmm2); 00033 xmm5 = _mm_sub_ps(xmm1, xmm3); 00034 points += 4; 00035 xmm6 = _mm_mul_ps(xmm4, xmm4); 00036 xmm7 = _mm_mul_ps(xmm5, xmm5); 00037 00038 xmm2 = _mm_load_ps((float*)&points[0]); 00039 00040 xmm4 = _mm_hadd_ps(xmm6, xmm7); 00041 00042 xmm3 = _mm_load_ps((float*)&points[2]); 00043 00044 _mm_store_ps(target, xmm4); 00045 00046 target += 4; 00047 00048 } 00049 00050 xmm4 = _mm_sub_ps(xmm1, xmm2); 00051 xmm5 = _mm_sub_ps(xmm1, xmm3); 00052 00053 00054 00055 points += 4; 00056 xmm6 = _mm_mul_ps(xmm4, xmm4); 00057 xmm7 = _mm_mul_ps(xmm5, xmm5); 00058 00059 xmm4 = _mm_hadd_ps(xmm6, xmm7); 00060 00061 _mm_store_ps(target, xmm4); 00062 00063 target += 4; 00064 00065 for(i = 0; i < leftovers0; ++i) { 00066 00067 xmm2 = _mm_load_ps((float*)&points[0]); 00068 00069 xmm4 = _mm_sub_ps(xmm1, xmm2); 00070 00071 points += 2; 00072 00073 xmm6 = _mm_mul_ps(xmm4, xmm4); 00074 00075 xmm4 = _mm_hadd_ps(xmm6, xmm6); 00076 00077 _mm_storeh_pi((__m64*)target, xmm4); 00078 00079 target += 2; 00080 } 00081 00082 for(i = 0; i < leftovers1; ++i) { 00083 00084 diff = src0[0] - points[0]; 00085 00086 sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); 00087 00088 target[0] = sq_dist; 00089 } 00090 } 00091 00092 #endif /*LV_HAVE_SSE3*/ 00093 00094 #ifdef LV_HAVE_GENERIC 00095 static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { 00096 lv_32fc_t diff; 00097 float sq_dist; 00098 unsigned int i = 0; 00099 00100 for(; i < num_bytes >> 3; ++i) { 00101 diff = src0[0] - points[i]; 00102 00103 sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); 00104 00105 target[i] = sq_dist; 00106 } 00107 } 00108 00109 #endif /*LV_HAVE_GENERIC*/ 00110 00111 00112 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/