GNU Radio 3.5.3.2 C++ API
volk_32fc_index_max_16u_a.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
00002 #define INCLUDED_volk_32fc_index_max_16u_a_H
00003 
00004 #include <volk/volk_common.h>
00005 #include<inttypes.h>
00006 #include<stdio.h>
00007 #include<volk/volk_complex.h>
00008 
00009 #ifdef LV_HAVE_SSE3
00010 #include<xmmintrin.h>
00011 #include<pmmintrin.h>
00012 
00013 
00014 static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
00015   
00016   
00017   
00018   union bit128 holderf;
00019   union bit128 holderi;
00020   float sq_dist = 0.0;
00021 
00022 
00023 
00024   
00025   union bit128 xmm5, xmm4;
00026   __m128 xmm1, xmm2, xmm3;
00027   __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
00028 
00029   xmm5.int_vec = xmmfive = _mm_setzero_si128();
00030   xmm4.int_vec = xmmfour = _mm_setzero_si128();
00031   holderf.int_vec = holder0 = _mm_setzero_si128();
00032   holderi.int_vec = holder1 = _mm_setzero_si128();
00033  
00034   
00035   int bound = num_bytes >> 5;
00036   int leftovers0 = (num_bytes >> 4) & 1;
00037   int leftovers1 = (num_bytes >> 3) & 1;
00038   int i = 0;
00039   
00040   
00041   xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
00042   xmm9 = xmm8 = _mm_setzero_si128();
00043   xmm10 = _mm_set_epi32(4, 4, 4, 4);
00044   xmm3 = _mm_setzero_ps();
00045 ;
00046   
00047   //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
00048   
00049   for(; i < bound; ++i) {
00050   
00051     xmm1 = _mm_load_ps((float*)src0);
00052     xmm2 = _mm_load_ps((float*)&src0[2]);
00053     
00054 
00055     src0 += 4;
00056   
00057   
00058     xmm1 = _mm_mul_ps(xmm1, xmm1);
00059     xmm2 = _mm_mul_ps(xmm2, xmm2);
00060     
00061     
00062     xmm1 = _mm_hadd_ps(xmm1, xmm2);
00063 
00064     xmm3 = _mm_max_ps(xmm1, xmm3);
00065   
00066     xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
00067     xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
00068     
00069         
00070     
00071     xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
00072     xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
00073     
00074     xmm9 = _mm_add_epi32(xmm11,  xmm12);
00075 
00076     xmm8 = _mm_add_epi32(xmm8, xmm10);    
00077 
00078     
00079     //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
00080     //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
00081 
00082   }
00083   
00084   
00085   for(i = 0; i < leftovers0; ++i) {
00086 
00087 
00088     xmm2 = _mm_load_ps((float*)src0);
00089     
00090     xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
00091     xmm8 = bit128_p(&xmm1)->int_vec;
00092 
00093     xmm2 = _mm_mul_ps(xmm2, xmm2);
00094 
00095     src0 += 2;
00096 
00097     xmm1 = _mm_hadd_ps(xmm2, xmm2);
00098 
00099     xmm3 = _mm_max_ps(xmm1, xmm3);
00100 
00101     xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
00102     
00103     
00104     xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
00105     xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
00106     
00107         
00108     
00109     xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
00110     xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
00111     
00112     xmm9 = _mm_add_epi32(xmm11, xmm12);
00113 
00114     xmm8 = _mm_add_epi32(xmm8, xmm10);    
00115     //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
00116 
00117   }
00118     
00119   
00120     
00121 
00122   for(i = 0; i < leftovers1; ++i) {
00123     //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
00124     
00125 
00126     sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
00127     
00128     xmm2 = _mm_load1_ps(&sq_dist);
00129 
00130     xmm1 = xmm3;
00131     
00132     xmm3 = _mm_max_ss(xmm3, xmm2);
00133 
00134     
00135         
00136     xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
00137     xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
00138     
00139         
00140     xmm8 = _mm_shuffle_epi32(xmm8, 0x00);  
00141     
00142     xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
00143     xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
00144     
00145 
00146     xmm9 = _mm_add_epi32(xmm11, xmm12);
00147 
00148   }
00149   
00150   //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
00151 
00152   //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
00153 
00154   _mm_store_ps((float*)&(holderf.f), xmm3);
00155   _mm_store_si128(&(holderi.int_vec), xmm9);
00156   
00157   target[0] = holderi.i[0];
00158   sq_dist = holderf.f[0]; 
00159   target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
00160   sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
00161   target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
00162   sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
00163   target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
00164   sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
00165 
00166  
00167   
00168   /*
00169   float placeholder = 0.0;
00170   uint32_t temp0, temp1; 
00171   unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
00172   unsigned int l0 = g0 ^ 1;
00173 
00174   unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
00175   unsigned int l1 = g1 ^ 1;
00176   
00177   temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
00178   temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
00179   sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; 
00180   placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
00181   
00182   g0 = (sq_dist > placeholder);
00183   l0 = g0 ^ 1;
00184   target[0] = g0 * temp0 + l0 * temp1;
00185   */
00186   
00187 }
00188 
00189 #endif /*LV_HAVE_SSE3*/
00190 
00191 #ifdef LV_HAVE_GENERIC
00192 static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
00193   float sq_dist = 0.0;
00194   float max = 0.0;
00195   unsigned int index = 0;
00196   
00197   unsigned int i = 0; 
00198   
00199   for(; i < num_bytes >> 3; ++i) {
00200 
00201     sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
00202     
00203     index = sq_dist > max ? i : index;
00204     max = sq_dist > max ? sq_dist : max;
00205     
00206     
00207   }
00208   target[0] = index;
00209   
00210 }
00211 
00212 #endif /*LV_HAVE_GENERIC*/
00213 
00214 
00215 #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/