GNU Radio 3.6.5 C++ API
|
00001 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H 00002 #define INCLUDED_volk_32fc_index_max_16u_a_H 00003 00004 #include <volk/volk_common.h> 00005 #include<inttypes.h> 00006 #include<stdio.h> 00007 #include<volk/volk_complex.h> 00008 00009 #ifdef LV_HAVE_SSE3 00010 #include<xmmintrin.h> 00011 #include<pmmintrin.h> 00012 00013 00014 static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { 00015 00016 00017 00018 union bit128 holderf; 00019 union bit128 holderi; 00020 float sq_dist = 0.0; 00021 00022 00023 00024 00025 union bit128 xmm5, xmm4; 00026 __m128 xmm1, xmm2, xmm3; 00027 __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; 00028 00029 xmm5.int_vec = xmmfive = _mm_setzero_si128(); 00030 xmm4.int_vec = xmmfour = _mm_setzero_si128(); 00031 holderf.int_vec = holder0 = _mm_setzero_si128(); 00032 holderi.int_vec = holder1 = _mm_setzero_si128(); 00033 00034 00035 int bound = num_bytes >> 5; 00036 int leftovers0 = (num_bytes >> 4) & 1; 00037 int leftovers1 = (num_bytes >> 3) & 1; 00038 int i = 0; 00039 00040 00041 xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! 00042 xmm9 = xmm8 = _mm_setzero_si128(); 00043 xmm10 = _mm_set_epi32(4, 4, 4, 4); 00044 xmm3 = _mm_setzero_ps(); 00045 ; 00046 00047 //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); 00048 00049 for(; i < bound; ++i) { 00050 00051 xmm1 = _mm_load_ps((float*)src0); 00052 xmm2 = _mm_load_ps((float*)&src0[2]); 00053 00054 00055 src0 += 4; 00056 00057 00058 xmm1 = _mm_mul_ps(xmm1, xmm1); 00059 xmm2 = _mm_mul_ps(xmm2, xmm2); 00060 00061 00062 xmm1 = _mm_hadd_ps(xmm1, xmm2); 00063 00064 xmm3 = _mm_max_ps(xmm1, xmm3); 00065 00066 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); 00067 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); 00068 00069 00070 00071 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); 00072 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); 00073 00074 xmm9 = _mm_add_epi32(xmm11, xmm12); 00075 00076 xmm8 = _mm_add_epi32(xmm8, xmm10); 00077 00078 00079 //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); 00080 //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); 00081 00082 } 00083 00084 00085 for(i = 0; i < leftovers0; ++i) { 00086 00087 00088 xmm2 = _mm_load_ps((float*)src0); 00089 00090 xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); 00091 xmm8 = bit128_p(&xmm1)->int_vec; 00092 00093 xmm2 = _mm_mul_ps(xmm2, xmm2); 00094 00095 src0 += 2; 00096 00097 xmm1 = _mm_hadd_ps(xmm2, xmm2); 00098 00099 xmm3 = _mm_max_ps(xmm1, xmm3); 00100 00101 xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); 00102 00103 00104 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); 00105 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); 00106 00107 00108 00109 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); 00110 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); 00111 00112 xmm9 = _mm_add_epi32(xmm11, xmm12); 00113 00114 xmm8 = _mm_add_epi32(xmm8, xmm10); 00115 //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); 00116 00117 } 00118 00119 00120 00121 00122 for(i = 0; i < leftovers1; ++i) { 00123 //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); 00124 00125 00126 sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); 00127 00128 xmm2 = _mm_load1_ps(&sq_dist); 00129 00130 xmm1 = xmm3; 00131 00132 xmm3 = _mm_max_ss(xmm3, xmm2); 00133 00134 00135 00136 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); 00137 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); 00138 00139 00140 xmm8 = _mm_shuffle_epi32(xmm8, 0x00); 00141 00142 xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); 00143 xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); 00144 00145 00146 xmm9 = _mm_add_epi32(xmm11, xmm12); 00147 00148 } 00149 00150 //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); 00151 00152 //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); 00153 00154 _mm_store_ps((float*)&(holderf.f), xmm3); 00155 _mm_store_si128(&(holderi.int_vec), xmm9); 00156 00157 target[0] = holderi.i[0]; 00158 sq_dist = holderf.f[0]; 00159 target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; 00160 sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; 00161 target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; 00162 sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; 00163 target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; 00164 sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; 00165 00166 00167 00168 /* 00169 float placeholder = 0.0; 00170 uint32_t temp0, temp1; 00171 unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); 00172 unsigned int l0 = g0 ^ 1; 00173 00174 unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); 00175 unsigned int l1 = g1 ^ 1; 00176 00177 temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; 00178 temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; 00179 sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; 00180 placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; 00181 00182 g0 = (sq_dist > placeholder); 00183 l0 = g0 ^ 1; 00184 target[0] = g0 * temp0 + l0 * temp1; 00185 */ 00186 00187 } 00188 00189 #endif /*LV_HAVE_SSE3*/ 00190 00191 #ifdef LV_HAVE_GENERIC 00192 static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { 00193 float sq_dist = 0.0; 00194 float max = 0.0; 00195 unsigned int index = 0; 00196 00197 unsigned int i = 0; 00198 00199 for(; i < num_bytes >> 3; ++i) { 00200 00201 sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); 00202 00203 index = sq_dist > max ? i : index; 00204 max = sq_dist > max ? sq_dist : max; 00205 00206 00207 } 00208 target[0] = index; 00209 00210 } 00211 00212 #endif /*LV_HAVE_GENERIC*/ 00213 00214 00215 #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/