/* GNU Radio 3.6.5 C++ API */
#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

/**
 * Lane-wise "max star" selection of two vectors of eight 16-bit integers.
 *
 * For every lane the 16-bit difference (second - first) is computed with
 * wraparound; the lane of `second` is returned when that difference is
 * strictly greater than zero, otherwise the lane of `first` is returned.
 *
 * @param first   vector returned on ties and when the difference is <= 0
 * @param second  vector returned when (second - first) > 0
 * @return        the per-lane selection
 */
static inline __m128i volk_16i_x4_quad_max_star_16i_a_sse2_select(__m128i first,
                                                                  __m128i second)
{
    const __m128i diff = _mm_sub_epi16(second, first);
    /* All-ones in every lane where `second` wins, all-zeros elsewhere. */
    const __m128i take_second = _mm_cmpgt_epi16(diff, _mm_setzero_si128());
    const __m128i from_second = _mm_and_si128(take_second, second);
    const __m128i from_first = _mm_andnot_si128(take_second, first);

    /* The two masked halves never overlap, so the add merges them. */
    return _mm_add_epi16(from_second, from_first);
}

/**
 * SSE2 kernel: element-wise "max star" of four 16-bit integer streams.
 *
 * Computes, for every element, the winner of (src0, src1), the winner of
 * (src2, src3) and then the winner of those two, where "winner" is decided by
 * the sign of the wrapped 16-bit difference rather than by a direct compare.
 *
 * Full groups of eight elements are processed with aligned 128-bit loads and
 * stores (_mm_load_si128 / _mm_store_si128), so all five pointers must be
 * 16-byte aligned. The remaining (num_bytes / 2) % 8 elements are handled by
 * a scalar loop.
 *
 * NOTE(review): the vector loop tests (b - a) > 0 while the scalar tail tests
 * (a - b) > 0. The two agree except when the wrapped difference is exactly
 * -32768, where the vector loop keeps `a` and the scalar tail keeps `b`.
 * This is preserved from the original implementation.
 *
 * @param target     output buffer, at least num_bytes bytes
 * @param src0       first input stream
 * @param src1       second input stream
 * @param src2       third input stream
 * @param src3       fourth input stream
 * @param num_bytes  size of each buffer in bytes (two bytes per element)
 */
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
                                                        short* src0,
                                                        short* src1,
                                                        short* src2,
                                                        short* src3,
                                                        unsigned int num_bytes)
{
    int i = 0;

    int bound = (num_bytes >> 4);         /* number of full 16-byte vectors */
    int bound_copy = bound;
    int leftovers = (num_bytes >> 1) & 7; /* elements left for the scalar tail */

    __m128i* p_target = (__m128i*)target;
    __m128i* p_src0 = (__m128i*)src0;
    __m128i* p_src1 = (__m128i*)src1;
    __m128i* p_src2 = (__m128i*)src2;
    __m128i* p_src3 = (__m128i*)src3;

    while (bound_copy > 0) {
        const __m128i in0 = _mm_load_si128(p_src0);
        const __m128i in1 = _mm_load_si128(p_src1);
        const __m128i in2 = _mm_load_si128(p_src2);
        const __m128i in3 = _mm_load_si128(p_src3);

        const __m128i best01 = volk_16i_x4_quad_max_star_16i_a_sse2_select(in0, in1);
        const __m128i best23 = volk_16i_x4_quad_max_star_16i_a_sse2_select(in2, in3);

        _mm_store_si128(p_target,
                        volk_16i_x4_quad_max_star_16i_a_sse2_select(best01, best23));

        p_src0 += 1;
        p_src1 += 1;
        p_src2 += 1;
        p_src3 += 1;
        p_target += 1;
        bound_copy -= 1;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}

#endif /*LV_HAVE_SSE2*/


#ifdef LV_HAVE_GENERIC
/**
 * Portable kernel: element-wise "max star" of four 16-bit integer streams.
 *
 * For each of the num_bytes / 2 elements, picks between src0 and src1 and
 * between src2 and src3 by the sign of their difference truncated to 16 bits,
 * then picks between those two intermediate results the same way. On a tie
 * (or a non-positive truncated difference) the second operand is kept.
 *
 * @param target     output buffer, at least num_bytes bytes
 * @param src0       first input stream
 * @param src1       second input stream
 * @param src2       third input stream
 * @param src3       fourth input stream
 * @param num_bytes  size of each buffer in bytes (two bytes per element)
 */
static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target,
                                                           short* src0,
                                                           short* src1,
                                                           short* src2,
                                                           short* src3,
                                                           unsigned int num_bytes)
{
    int i = 0;

    int bound = num_bytes >> 1; /* element count: two bytes per short */

    for (i = 0; i < bound; ++i) {
        short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/