GNU Radio 3.6.5 C++ API

volk_16i_permute_and_scalar_add_a.h

Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
00002 #define INCLUDED_volk_16i_permute_and_scalar_add_a_H
00003 
00004 
00005 #include<inttypes.h>
00006 #include<stdio.h>
00007 
00008 
00009 
00010 
00011 #ifdef LV_HAVE_SSE2
00012 
00013 #include<xmmintrin.h>
00014 #include<emmintrin.h>
00015 
/*!
 * \brief SSE2 kernel computing, for every 16-bit element i,
 *        target[i] = src0[permute_indexes[i]]
 *                  + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1])
 *                  + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3])
 *        with 16-bit wrap-around addition in the vector part.
 *
 * \param target          Output buffer; written with _mm_store_si128, so it must
 *                        be 16-byte aligned.
 * \param src0            Source values, gathered element by element through
 *                        permute_indexes.
 * \param permute_indexes Per-element index into src0.
 * \param cntl0           Mask buffer ANDed with scalars[0]; read with
 *                        _mm_load_si128, so it must be 16-byte aligned.
 * \param cntl1           Same as cntl0, paired with scalars[1].
 * \param cntl2           Same as cntl0, paired with scalars[2].
 * \param cntl3           Same as cntl0, paired with scalars[3].
 * \param scalars         Four 16-bit scalars. Only scalars[0..3] are read and no
 *                        particular alignment is required.
 * \param num_bytes       Size of the buffers in bytes; num_bytes / 2 elements
 *                        are produced.
 */
static inline  void volk_16i_permute_and_scalar_add_a_sse2(short* target,  short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {

  /* Unsigned counters: avoids signed overflow in the tail-start computation
     for very large num_bytes. */
  const unsigned int num_points = num_bytes >> 1;  /* 2 bytes per element   */
  const unsigned int num_blocks = num_bytes >> 4;  /* 8 elements per vector */

  const __m128i *p_cntl0 = (const __m128i*)cntl0;
  const __m128i *p_cntl1 = (const __m128i*)cntl1;
  const __m128i *p_cntl2 = (const __m128i*)cntl2;
  const __m128i *p_cntl3 = (const __m128i*)cntl3;
  __m128i *p_target = (__m128i*)target;

  /* Load exactly the four scalars (64 bits). A full 128-bit aligned load would
     read past scalars[3] and fault on an unaligned pointer, although only the
     low four words are ever used below. */
  const __m128i packed = _mm_loadl_epi64((const __m128i*)scalars);

  /* Broadcast scalars[k] to all eight 16-bit lanes: replicate word k across
     the low four words, then replicate the low dword across the register. */
  const __m128i s0 = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed, 0x00), 0x00);
  const __m128i s1 = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed, 0x55), 0x00);
  const __m128i s2 = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed, 0xaa), 0x00);
  const __m128i s3 = _mm_shuffle_epi32(_mm_shufflelo_epi16(packed, 0xff), 0x00);

  unsigned int block;
  unsigned int i;

  for(block = 0; block < num_blocks; ++block) {
    const short *idx = permute_indexes + 8 * block;

    /* Gather eight permuted source values; lane n holds src0[idx[n]]
       (_mm_set_epi16 takes its arguments from the highest lane down). */
    __m128i acc = _mm_set_epi16(src0[idx[7]], src0[idx[6]],
                                src0[idx[5]], src0[idx[4]],
                                src0[idx[3]], src0[idx[2]],
                                src0[idx[1]], src0[idx[0]]);

    /* Add the masked scalars; 16-bit adds wrap, so the order is irrelevant. */
    acc = _mm_add_epi16(acc, _mm_and_si128(_mm_load_si128(p_cntl0 + block), s0));
    acc = _mm_add_epi16(acc, _mm_and_si128(_mm_load_si128(p_cntl1 + block), s1));
    acc = _mm_add_epi16(acc, _mm_and_si128(_mm_load_si128(p_cntl2 + block), s2));
    acc = _mm_add_epi16(acc, _mm_and_si128(_mm_load_si128(p_cntl3 + block), s3));

    _mm_store_si128(p_target + block, acc);
  }

  /* Scalar tail for the elements that do not fill a whole vector. */
  for(i = num_blocks * 8; i < num_points; ++i) {
    target[i] = src0[permute_indexes[i]]
      + (cntl0[i] & scalars[0])
      + (cntl1[i] & scalars[1])
      + (cntl2[i] & scalars[2])
      + (cntl3[i] & scalars[3]);
  }
}
00116 #endif /*LV_HAVE_SSE2*/
00117 
00118 
00119 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable kernel computing, for every 16-bit element n,
 *        target[n] = src0[permute_indexes[n]]
 *                  + (cntl0[n] & scalars[0]) + (cntl1[n] & scalars[1])
 *                  + (cntl2[n] & scalars[2]) + (cntl3[n] & scalars[3]).
 *
 * \param target          Output buffer, one value per element.
 * \param src0            Source values, read through permute_indexes.
 * \param permute_indexes Per-element index into src0.
 * \param cntl0           Mask buffer ANDed with scalars[0].
 * \param cntl1           Mask buffer ANDed with scalars[1].
 * \param cntl2           Mask buffer ANDed with scalars[2].
 * \param cntl3           Mask buffer ANDed with scalars[3].
 * \param scalars         Scalars; elements 0..3 are read.
 * \param num_bytes       Size of the buffers in bytes; num_bytes / 2 elements
 *                        are produced.
 */
static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {

        /* Two bytes per 16-bit element. */
        const int count = num_bytes >> 1;
        int n;

        for(n = 0; n < count; n++) {
                /* Accumulate in int, then narrow to short on the store. */
                int sum = src0[permute_indexes[n]];

                sum += cntl0[n] & scalars[0];
                sum += cntl1[n] & scalars[1];
                sum += cntl2[n] & scalars[2];
                sum += cntl3[n] & scalars[3];

                target[n] = sum;
        }
}
00135 
00136 #endif /*LV_HAVE_GENERIC*/
00137 
00138 
00139 #endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/