GNU Radio 3.6.5 C++ API
|
00001 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H 00002 #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H 00003 00004 #include <volk/volk_common.h> 00005 00006 #include<inttypes.h> 00007 #include<stdio.h> 00008 00009 00010 #ifdef LV_HAVE_SSSE3 00011 00012 #include<xmmintrin.h> 00013 #include<emmintrin.h> 00014 #include<tmmintrin.h> 00015 00016 static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { 00017 00018 const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; 00019 const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; 00020 const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; 00021 const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; 00022 00023 00024 00025 __m128i xmm0, xmm1, xmm2, xmm3, xmm4; 00026 __m128i xmm5, xmm6, xmm7, xmm8; 00027 00028 xmm4 = _mm_load_si128((__m128i*)shufmask0); 00029 xmm5 = _mm_load_si128((__m128i*)shufmask1); 00030 xmm6 = _mm_load_si128((__m128i*)andmask0); 00031 xmm7 = _mm_load_si128((__m128i*)andmask1); 00032 00033 __m128i *p_target, *p_src0; 00034 00035 p_target = (__m128i*)target; 00036 p_src0 = (__m128i*)src0; 00037 00038 int bound = num_bytes >> 5; 00039 int intermediate = (num_bytes >> 4) & 1; 00040 int leftovers = (num_bytes >> 1) & 7; 00041 00042 int i = 0; 00043 00044 00045 for(i = 0; i < bound; ++i) { 00046 00047 xmm0 = _mm_load_si128(p_src0); 00048 xmm1 = _mm_load_si128(&p_src0[1]); 00049 00050 00051 00052 xmm2 = _mm_xor_si128(xmm2, xmm2); 00053 p_src0 += 2; 00054 00055 xmm3 = _mm_hsub_epi16(xmm0, xmm1); 00056 00057 xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); 00058 00059 xmm8 = _mm_and_si128(xmm2, xmm6); 00060 xmm3 = _mm_and_si128(xmm2, xmm7); 00061 00062 00063 xmm8 = _mm_add_epi8(xmm8, xmm4); 00064 xmm3 = _mm_add_epi8(xmm3, xmm5); 00065 00066 xmm0 = _mm_shuffle_epi8(xmm0, xmm8); 00067 xmm1 = _mm_shuffle_epi8(xmm1, xmm3); 00068 00069 00070 xmm3 = _mm_add_epi16(xmm0, xmm1); 00071 00072 00073 _mm_store_si128(p_target, xmm3); 00074 00075 p_target += 1; 00076 00077 } 00078 00079 for(i = 0; i < intermediate; ++i) { 00080 00081 xmm0 = _mm_load_si128(p_src0); 00082 00083 00084 xmm2 = _mm_xor_si128(xmm2, xmm2); 00085 p_src0 += 1; 00086 00087 xmm3 = _mm_hsub_epi16(xmm0, xmm1); 00088 xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); 00089 00090 xmm8 = _mm_and_si128(xmm2, xmm6); 00091 00092 xmm3 = _mm_add_epi8(xmm8, xmm4); 00093 00094 xmm0 = _mm_shuffle_epi8(xmm0, xmm3); 00095 00096 _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); 00097 00098 p_target = (__m128i*)((int8_t*)p_target + 8); 00099 00100 } 00101 00102 for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { 00103 target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; 00104 } 00105 00106 00107 } 00108 00109 #endif /*LV_HAVE_SSSE3*/ 00110 00111 00112 #ifdef LV_HAVE_GENERIC 00113 static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { 00114 00115 int i = 0; 00116 00117 int bound = num_bytes >> 1; 00118 00119 00120 for(i = 0; i < bound; i += 2) { 00121 target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; 00122 } 00123 00124 } 00125 00126 00127 00128 #endif /*LV_HAVE_GENERIC*/ 00129 00130 #endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/