GNU Radio 3.5.3.2 C++ API
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
00002 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
00003 
00004 #include<inttypes.h>
00005 #include<stdio.h>
00006 #include<volk/volk_complex.h>
00007 #include <string.h>
00008 
00009 #ifdef LV_HAVE_SSE3
00010 #include<xmmintrin.h>
00011 #include<pmmintrin.h>
00012 
00013 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
00014   
00015 
00016   __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
00017 
00018   lv_32fc_t diff;
00019   memset(&diff, 0x0, 2*sizeof(float));
00020 
00021   float sq_dist = 0.0;
00022   int bound = num_bytes >> 5;
00023   int leftovers0 = (num_bytes >> 4) & 1;
00024   int leftovers1 = (num_bytes >> 3) & 1;
00025   int i = 0;
00026   
00027   
00028   
00029   xmm1 = _mm_setzero_ps();
00030   xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
00031   xmm2 = _mm_load_ps((float*)&points[0]);
00032   xmm8 = _mm_load1_ps(&scalar);
00033   xmm1 = _mm_movelh_ps(xmm1, xmm1);
00034   xmm3 = _mm_load_ps((float*)&points[2]);
00035   
00036   
00037   for(; i < bound - 1; ++i) {
00038   
00039     xmm4 = _mm_sub_ps(xmm1, xmm2);
00040     xmm5 = _mm_sub_ps(xmm1, xmm3);
00041     points += 4;
00042     xmm6 = _mm_mul_ps(xmm4, xmm4);
00043     xmm7 = _mm_mul_ps(xmm5, xmm5);
00044     
00045     xmm2 = _mm_load_ps((float*)&points[0]);
00046     
00047     xmm4 = _mm_hadd_ps(xmm6, xmm7);
00048 
00049     xmm3 = _mm_load_ps((float*)&points[2]);
00050     
00051     xmm4 = _mm_mul_ps(xmm4, xmm8);
00052 
00053     _mm_store_ps(target, xmm4);
00054 
00055     target += 4;
00056 
00057   }
00058   
00059   xmm4 = _mm_sub_ps(xmm1, xmm2);
00060   xmm5 = _mm_sub_ps(xmm1, xmm3);
00061   
00062   
00063 
00064   points += 4;
00065   xmm6 = _mm_mul_ps(xmm4, xmm4);
00066   xmm7 = _mm_mul_ps(xmm5, xmm5);
00067     
00068   xmm4 = _mm_hadd_ps(xmm6, xmm7);
00069   
00070   xmm4 = _mm_mul_ps(xmm4, xmm8);
00071    
00072   _mm_store_ps(target, xmm4);
00073   
00074   target += 4;
00075   
00076 
00077   for(i = 0; i < leftovers0; ++i) {
00078     
00079     xmm2 = _mm_load_ps((float*)&points[0]);
00080     
00081     xmm4 = _mm_sub_ps(xmm1, xmm2);
00082     
00083     points += 2;
00084     
00085     xmm6 = _mm_mul_ps(xmm4, xmm4);
00086 
00087     xmm4 = _mm_hadd_ps(xmm6, xmm6);
00088 
00089     xmm4 = _mm_mul_ps(xmm4, xmm8);
00090     
00091     _mm_storeh_pi((__m64*)target, xmm4);
00092 
00093     target += 2;
00094   }
00095 
00096   for(i = 0; i < leftovers1; ++i) {
00097     
00098     diff = src0[0] - points[0];
00099 
00100     sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
00101 
00102     target[0] = sq_dist;
00103   }
00104 }
00105 
00106 #endif /*LV_HAVE_SSE3*/
00107 
00108 #ifdef LV_HAVE_GENERIC
00109 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
00110   lv_32fc_t diff;
00111   float sq_dist;
00112   unsigned int i = 0; 
00113   
00114   for(; i < num_bytes >> 3; ++i) {
00115     diff = src0[0] - points[i];
00116 
00117     sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
00118     
00119     target[i] = sq_dist;
00120   }
00121 }
00122 
00123 #endif /*LV_HAVE_GENERIC*/
00124 
00125 
00126 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/