GNU Radio 3.5.3.2 C++ API
volk_32fc_x2_square_dist_32f_a.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
00002 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
00003 
00004 #include<inttypes.h>
00005 #include<stdio.h>
00006 #include<volk/volk_complex.h>
00007 
00008 #ifdef LV_HAVE_SSE3
00009 #include<xmmintrin.h>
00010 #include<pmmintrin.h>
00011 
00012 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
00013   
00014 
00015   __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
00016 
00017   lv_32fc_t diff;
00018   float sq_dist;
00019   int bound = num_bytes >> 5;
00020   int leftovers0 = (num_bytes >> 4) & 1;
00021   int leftovers1 = (num_bytes >> 3) & 1;
00022   int i = 0;
00023 
00024   xmm1 = _mm_setzero_ps();
00025   xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
00026   xmm2 = _mm_load_ps((float*)&points[0]);
00027   xmm1 = _mm_movelh_ps(xmm1, xmm1);
00028   xmm3 = _mm_load_ps((float*)&points[2]);
00029   
00030 
00031   for(; i < bound - 1; ++i) {
00032     xmm4 = _mm_sub_ps(xmm1, xmm2);
00033     xmm5 = _mm_sub_ps(xmm1, xmm3);
00034     points += 4;
00035     xmm6 = _mm_mul_ps(xmm4, xmm4);
00036     xmm7 = _mm_mul_ps(xmm5, xmm5);
00037     
00038     xmm2 = _mm_load_ps((float*)&points[0]);
00039     
00040     xmm4 = _mm_hadd_ps(xmm6, xmm7);
00041 
00042     xmm3 = _mm_load_ps((float*)&points[2]);
00043 
00044     _mm_store_ps(target, xmm4);
00045 
00046     target += 4;
00047 
00048   }
00049   
00050   xmm4 = _mm_sub_ps(xmm1, xmm2);
00051   xmm5 = _mm_sub_ps(xmm1, xmm3);
00052   
00053   
00054 
00055   points += 4;
00056   xmm6 = _mm_mul_ps(xmm4, xmm4);
00057   xmm7 = _mm_mul_ps(xmm5, xmm5);
00058     
00059   xmm4 = _mm_hadd_ps(xmm6, xmm7);
00060    
00061   _mm_store_ps(target, xmm4);
00062   
00063   target += 4;
00064 
00065   for(i = 0; i < leftovers0; ++i) {
00066     
00067     xmm2 = _mm_load_ps((float*)&points[0]);
00068     
00069     xmm4 = _mm_sub_ps(xmm1, xmm2);
00070     
00071     points += 2;
00072     
00073     xmm6 = _mm_mul_ps(xmm4, xmm4);
00074 
00075     xmm4 = _mm_hadd_ps(xmm6, xmm6);
00076     
00077     _mm_storeh_pi((__m64*)target, xmm4);
00078 
00079     target += 2;
00080   }
00081 
00082   for(i = 0; i < leftovers1; ++i) {
00083     
00084     diff = src0[0] - points[0];
00085 
00086     sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
00087 
00088     target[0] = sq_dist;
00089   }
00090 }
00091 
00092 #endif /*LV_HAVE_SSE3*/
00093 
00094 #ifdef LV_HAVE_GENERIC
00095 static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
00096   lv_32fc_t diff;
00097   float sq_dist;
00098   unsigned int i = 0; 
00099   
00100   for(; i < num_bytes >> 3; ++i) {
00101     diff = src0[0] - points[i];
00102 
00103     sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
00104     
00105     target[i] = sq_dist;
00106   }
00107 }
00108 
00109 #endif /*LV_HAVE_GENERIC*/
00110 
00111 
00112 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/