GNU Radio 3.5.3.2 C++ API
|
#ifndef INCLUDED_volk_16i_branch_4_state_8_a_H
#define INCLUDED_volk_16i_branch_4_state_8_a_H

/*
 * volk_16i_branch_4_state_8_a
 *
 * Produces four "branches" of eight 16-bit values each (32 outputs) from a
 * single group of eight 16-bit source samples.  For branch b (0..3) and
 * lane j (0..7):
 *
 *   target[8*b + j] = src0[ selected by permuters[b] for lane j ]
 *                   + (b is even ? scalars[0] : 0)
 *                   + (b < 2     ? scalars[1] : 0)
 *                   + (cntl2[8*b + j] & scalars[2])
 *                   + (cntl3[8*b + j] & scalars[3])
 *
 * Parameters (identical for both implementations):
 *   target    - output, 32 shorts.
 *   src0      - input, 8 shorts.
 *   permuters - 4 pointers, each to a 16-byte shuffle mask.  Byte 2*j of a
 *               mask is the byte offset into src0 of the sample used for
 *               lane j.
 *   cntl2     - input, 32 shorts, masked with scalars[2].
 *   cntl3     - input, 32 shorts, masked with scalars[3].
 *   scalars   - input; elements 0..3 are used (the SSSE3 version loads a
 *               full 16-byte vector from this address).
 */

#include<inttypes.h>
#include<stdio.h>


#ifdef LV_HAVE_SSSE3

#include<xmmintrin.h>
#include<emmintrin.h>
#include<tmmintrin.h>

/*
 * SSSE3 implementation.
 *
 * Uses aligned vector loads/stores (_mm_load_si128 / _mm_store_si128) on
 * target, src0, cntl2, cntl3, scalars and every permuters[b], so all of
 * those addresses must be 16-byte aligned.  Sums are formed with
 * _mm_add_epi16, i.e. they wrap modulo 2^16.
 *
 * All inputs are read before anything is written to target.
 */
static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
  __m128i* p_target = (__m128i*)target;
  const __m128i* p_cntl2 = (const __m128i*)cntl2;
  const __m128i* p_cntl3 = (const __m128i*)cntl3;

  /* Broadcast scalars[0..3] to all eight 16-bit lanes of their own vector:
     shufflelo replicates the chosen word across the low 64 bits, then
     shuffle_epi32 replicates the low dword across the register. */
  const __m128i packed = _mm_load_si128((const __m128i*)scalars);
  const __m128i add_even  = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed, 0x00), 0x00); /* scalars[0] */
  const __m128i add_low   = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed, 0x55), 0x00); /* scalars[1] */
  const __m128i mask_ctl2 = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed, 0xaa), 0x00); /* scalars[2] */
  const __m128i mask_ctl3 = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed, 0xff), 0x00); /* scalars[3] */

  const __m128i samples = _mm_load_si128((const __m128i*)src0);

  /* Permute the source samples once per branch. */
  __m128i branch[4];
  for(int b = 0; b < 4; ++b) {
    const __m128i shuffle_mask = _mm_load_si128((const __m128i*)permuters[b]);
    branch[b] = _mm_shuffle_epi8(samples, shuffle_mask);
  }

  /* Per-branch constant offsets: branch 0 gets both scalars, branch 1 only
     scalars[1], branch 2 only scalars[0], branch 3 neither. */
  branch[0] = _mm_add_epi16(_mm_add_epi16(add_even, add_low), branch[0]);
  branch[1] = _mm_add_epi16(add_low, branch[1]);
  branch[2] = _mm_add_epi16(add_even, branch[2]);

  /* Add the masked control words for each branch. */
  for(int b = 0; b < 4; ++b) {
    const __m128i ctl2 = _mm_and_si128(_mm_load_si128(&p_cntl2[b]), mask_ctl2);
    const __m128i ctl3 = _mm_and_si128(_mm_load_si128(&p_cntl3[b]), mask_ctl3);
    branch[b] = _mm_add_epi16(branch[b], _mm_add_epi16(ctl2, ctl3));
  }

  /* Write the results only after every input has been consumed. */
  for(int b = 0; b < 4; ++b) {
    _mm_store_si128(&p_target[b], branch[b]);
  }
}

#endif /*LV_HAVE_SSSE3*/

#ifdef LV_HAVE_GENERIC
/*
 * Portable implementation.
 *
 * Reads only the even-indexed bytes of each shuffle mask and halves them to
 * turn a byte offset into a sample index, so it agrees with the SSSE3
 * version for masks that select whole 16-bit samples of src0.
 * The sum is computed in int and converted to short on assignment.
 */
static inline void volk_16i_branch_4_state_8_a_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
  for(int branch = 0; branch < 4; ++branch) {
    for(int lane = 0; lane < 8; ++lane) {
      const int out = branch * 8 + lane;

      target[out] = src0[permuters[branch][lane * 2] / 2]
        + ((branch + 1) % 2 * scalars[0])      /* 1 for branches 0 and 2 */
        + (((branch >> 1) ^ 1) * scalars[1])   /* 1 for branches 0 and 1 */
        + (cntl2[out] & scalars[2])
        + (cntl3[out] & scalars[3]);
    }
  }
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_16i_branch_4_state_8_a_H*/