GNU Radio 3.6.5 C++ API
|
00001 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H 00002 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H 00003 00004 #include<inttypes.h> 00005 #include<stdio.h> 00006 #include<volk/volk_complex.h> 00007 #include <string.h> 00008 00009 #ifdef LV_HAVE_SSE3 00010 #include<xmmintrin.h> 00011 #include<pmmintrin.h> 00012 00013 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { 00014 00015 00016 __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; 00017 00018 lv_32fc_t diff; 00019 memset(&diff, 0x0, 2*sizeof(float)); 00020 00021 float sq_dist = 0.0; 00022 int bound = num_bytes >> 5; 00023 int leftovers0 = (num_bytes >> 4) & 1; 00024 int leftovers1 = (num_bytes >> 3) & 1; 00025 int i = 0; 00026 00027 00028 00029 xmm1 = _mm_setzero_ps(); 00030 xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); 00031 xmm2 = _mm_load_ps((float*)&points[0]); 00032 xmm8 = _mm_load1_ps(&scalar); 00033 xmm1 = _mm_movelh_ps(xmm1, xmm1); 00034 xmm3 = _mm_load_ps((float*)&points[2]); 00035 00036 00037 for(; i < bound - 1; ++i) { 00038 00039 xmm4 = _mm_sub_ps(xmm1, xmm2); 00040 xmm5 = _mm_sub_ps(xmm1, xmm3); 00041 points += 4; 00042 xmm6 = _mm_mul_ps(xmm4, xmm4); 00043 xmm7 = _mm_mul_ps(xmm5, xmm5); 00044 00045 xmm2 = _mm_load_ps((float*)&points[0]); 00046 00047 xmm4 = _mm_hadd_ps(xmm6, xmm7); 00048 00049 xmm3 = _mm_load_ps((float*)&points[2]); 00050 00051 xmm4 = _mm_mul_ps(xmm4, xmm8); 00052 00053 _mm_store_ps(target, xmm4); 00054 00055 target += 4; 00056 00057 } 00058 00059 xmm4 = _mm_sub_ps(xmm1, xmm2); 00060 xmm5 = _mm_sub_ps(xmm1, xmm3); 00061 00062 00063 00064 points += 4; 00065 xmm6 = _mm_mul_ps(xmm4, xmm4); 00066 xmm7 = _mm_mul_ps(xmm5, xmm5); 00067 00068 xmm4 = _mm_hadd_ps(xmm6, xmm7); 00069 00070 xmm4 = _mm_mul_ps(xmm4, xmm8); 00071 00072 _mm_store_ps(target, xmm4); 00073 00074 target += 4; 00075 00076 00077 for(i = 0; i < leftovers0; ++i) { 00078 00079 xmm2 = _mm_load_ps((float*)&points[0]); 00080 00081 xmm4 = _mm_sub_ps(xmm1, xmm2); 00082 00083 points += 2; 00084 00085 xmm6 = _mm_mul_ps(xmm4, xmm4); 00086 00087 xmm4 = _mm_hadd_ps(xmm6, xmm6); 00088 00089 xmm4 = _mm_mul_ps(xmm4, xmm8); 00090 00091 _mm_storeh_pi((__m64*)target, xmm4); 00092 00093 target += 2; 00094 } 00095 00096 for(i = 0; i < leftovers1; ++i) { 00097 00098 diff = src0[0] - points[0]; 00099 00100 sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); 00101 00102 target[0] = sq_dist; 00103 } 00104 } 00105 00106 #endif /*LV_HAVE_SSE3*/ 00107 00108 #ifdef LV_HAVE_GENERIC 00109 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { 00110 lv_32fc_t diff; 00111 float sq_dist; 00112 unsigned int i = 0; 00113 00114 for(; i < num_bytes >> 3; ++i) { 00115 diff = src0[0] - points[i]; 00116 00117 sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); 00118 00119 target[i] = sq_dist; 00120 } 00121 } 00122 00123 #endif /*LV_HAVE_GENERIC*/ 00124 00125 00126 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/