GNU Radio 3.6.5 C++ API
|
#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
#define INCLUDED_volk_16i_permute_and_scalar_add_a_H

#include<inttypes.h>
#include<stdio.h>

#ifdef LV_HAVE_SSE2

#include<xmmintrin.h>
#include<emmintrin.h>

/*!
 * \brief Permutes src0 and adds four masked scalars to every element (SSE2).
 *
 * For every 16-bit element i in [0, num_bytes / 2):
 *
 *   target[i] = src0[permute_indexes[i]]
 *             + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1])
 *             + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3])
 *
 * Elements are processed eight at a time with 16-bit wrapping adds; the
 * remaining (num_bytes / 2) % 8 elements are handled by a scalar loop.
 *
 * \param target          Output buffer. Written with aligned 128-bit stores,
 *                        so it must be 16-byte aligned.
 * \param src0            Source values, read one element at a time through
 *                        permute_indexes (no alignment requirement).
 * \param permute_indexes Per-output index into src0; each value is used
 *                        directly as an array subscript.
 * \param cntl0           Bit mask applied to scalars[0]; read with aligned
 *                        128-bit loads, so it must be 16-byte aligned.
 * \param cntl1           Same as cntl0, for scalars[1].
 * \param cntl2           Same as cntl0, for scalars[2].
 * \param cntl3           Same as cntl0, for scalars[3].
 * \param scalars         The four scalars. Only scalars[0..3] are read and no
 *                        alignment is required.
 * \param num_bytes       Length of the buffers in bytes (two bytes per
 *                        element); an odd trailing byte is ignored.
 */
static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {

  const unsigned int num_vectors = num_bytes >> 4;          /* full groups of 8 shorts */
  const unsigned int num_leftovers = (num_bytes >> 1) & 7;  /* trailing shorts */
  const unsigned int tail_begin = num_vectors * 8;

  __m128i* p_target = (__m128i*)target;
  const __m128i* p_cntl0 = (const __m128i*)cntl0;
  const __m128i* p_cntl1 = (const __m128i*)cntl1;
  const __m128i* p_cntl2 = (const __m128i*)cntl2;
  const __m128i* p_cntl3 = (const __m128i*)cntl3;

  __m128i packed_scalars, scalar0, scalar1, scalar2, scalar3;
  unsigned int block;
  unsigned int i;

  /* Load exactly the four scalars (64 bits). A full 128-bit aligned load
     would read past scalars[3] and impose an alignment requirement that the
     generic implementation does not have. */
  packed_scalars = _mm_loadl_epi64((const __m128i*)scalars);

  /* Broadcast scalar k to all eight 16-bit lanes: first replicate it across
     the low four words, then replicate the low dword across the register. */
  scalar0 = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed_scalars, 0x00), 0x00);
  scalar1 = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed_scalars, 0x55), 0x00);
  scalar2 = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed_scalars, 0xaa), 0x00);
  scalar3 = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed_scalars, 0xff), 0x00);

  for(block = 0; block < num_vectors; ++block) {
    const short* idx = permute_indexes + block * 8;
    __m128i sum;

    /* Gather the eight permuted source values; _mm_set_epi16 takes its
       arguments from the highest lane down to lane 0. */
    sum = _mm_set_epi16(src0[idx[7]], src0[idx[6]], src0[idx[5]], src0[idx[4]],
                        src0[idx[3]], src0[idx[2]], src0[idx[1]], src0[idx[0]]);

    sum = _mm_add_epi16(sum, _mm_and_si128(_mm_load_si128(p_cntl0 + block), scalar0));
    sum = _mm_add_epi16(sum, _mm_and_si128(_mm_load_si128(p_cntl1 + block), scalar1));
    sum = _mm_add_epi16(sum, _mm_and_si128(_mm_load_si128(p_cntl2 + block), scalar2));
    sum = _mm_add_epi16(sum, _mm_and_si128(_mm_load_si128(p_cntl3 + block), scalar3));

    _mm_store_si128(p_target + block, sum);
  }

  /* Scalar tail for the elements that do not fill a whole vector. */
  for(i = tail_begin; i < tail_begin + num_leftovers; ++i) {
    target[i] = src0[permute_indexes[i]]
      + (cntl0[i] & scalars[0])
      + (cntl1[i] & scalars[1])
      + (cntl2[i] & scalars[2])
      + (cntl3[i] & scalars[3]);
  }
}
#endif /*LV_HAVE_SSE2*/


#ifdef LV_HAVE_GENERIC
/*!
 * \brief Permutes src0 and adds four masked scalars to every element
 *        (portable implementation).
 *
 * For every 16-bit element i in [0, num_bytes / 2):
 *
 *   target[i] = src0[permute_indexes[i]]
 *             + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1])
 *             + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3])
 *
 * \param target          Output buffer.
 * \param src0            Source values, indexed through permute_indexes.
 * \param permute_indexes Per-output index into src0; each value is used
 *                        directly as an array subscript.
 * \param cntl0           Bit mask applied to scalars[0].
 * \param cntl1           Bit mask applied to scalars[1].
 * \param cntl2           Bit mask applied to scalars[2].
 * \param cntl3           Bit mask applied to scalars[3].
 * \param scalars         The four scalars; only scalars[0..3] are read.
 * \param num_bytes       Length of the buffers in bytes (two bytes per
 *                        element); an odd trailing byte is ignored.
 */
static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {

  const unsigned int num_points = num_bytes >> 1;
  unsigned int i;

  for(i = 0; i < num_points; ++i) {
    target[i] = src0[permute_indexes[i]]
      + (cntl0[i] & scalars[0])
      + (cntl1[i] & scalars[1])
      + (cntl2[i] & scalars[2])
      + (cntl3[i] & scalars[3]);
  }
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/