GNU Radio 3.5.3.2 C++ API
volk_16i_x5_add_quad_16i_x4_a.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
00002 #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
00003 
00004 
00005 #include<inttypes.h>
00006 #include<stdio.h>       
00007 
00008 
00009 
00010 
00011 
00012 #ifdef LV_HAVE_SSE2
00013 #include<xmmintrin.h>
00014 #include<emmintrin.h>
00015 
/*
 * SSE2 kernel: adds src0 element-wise to each of src1..src4.
 *
 * For every 16-bit element index n in [0, num_bytes / 2):
 *   target0[n] = src0[n] + src1[n]
 *   target1[n] = src0[n] + src2[n]
 *   target2[n] = src0[n] + src3[n]
 *   target3[n] = src0[n] + src4[n]
 *
 * target0..target3  output buffers, written in place.
 * src0..src4        input buffers; src0 is the common addend.
 * num_bytes         byte length of each buffer; an odd trailing byte is
 *                   ignored because elements are counted as num_bytes >> 1.
 *
 * Full groups of eight elements use _mm_load_si128 / _mm_store_si128, which
 * require 16-byte aligned addresses, so all nine buffers must be 16-byte
 * aligned whenever num_bytes >= 16.  The vector add is _mm_add_epi16, a
 * lane-wise 16-bit add that wraps on overflow.  The remaining (up to seven)
 * elements are handled by a scalar loop.
 */
static inline  void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
  /* One 128-bit vector holds 16 bytes, i.e. eight 16-bit elements. */
  const unsigned int num_vectors = num_bytes >> 4;
  /* Total number of 16-bit elements to produce per output buffer. */
  const unsigned int num_points = num_bytes >> 1;
  unsigned int n;

  __m128i *p_target0 = (__m128i*)target0;
  __m128i *p_target1 = (__m128i*)target1;
  __m128i *p_target2 = (__m128i*)target2;
  __m128i *p_target3 = (__m128i*)target3;

  const __m128i *p_src0 = (const __m128i*)src0;
  const __m128i *p_src1 = (const __m128i*)src1;
  const __m128i *p_src2 = (const __m128i*)src2;
  const __m128i *p_src3 = (const __m128i*)src3;
  const __m128i *p_src4 = (const __m128i*)src4;

  for(n = 0; n < num_vectors; ++n) {
    /* All five loads happen before any store within one vector step. */
    const __m128i base = _mm_load_si128(p_src0 + n);
    const __m128i in1 = _mm_load_si128(p_src1 + n);
    const __m128i in2 = _mm_load_si128(p_src2 + n);
    const __m128i in3 = _mm_load_si128(p_src3 + n);
    const __m128i in4 = _mm_load_si128(p_src4 + n);

    _mm_store_si128(p_target0 + n, _mm_add_epi16(base, in1));
    _mm_store_si128(p_target1 + n, _mm_add_epi16(base, in2));
    _mm_store_si128(p_target2 + n, _mm_add_epi16(base, in3));
    _mm_store_si128(p_target3 + n, _mm_add_epi16(base, in4));
  }

  /* Scalar tail: the elements that did not fill a whole vector. */
  for(n = num_vectors * 8; n < num_points; ++n) {
    target0[n] = src0[n] + src1[n];
    target1[n] = src0[n] + src2[n];
    target2[n] = src0[n] + src3[n];
    target3[n] = src0[n] + src4[n];
  }
}
00111 #endif /*LV_HAVE_SSE2*/
00112 
00113 
00114 #ifdef LV_HAVE_GENERIC
00115 
/*
 * Portable scalar kernel: adds src0 element-wise to each of src1..src4.
 *
 * For every element index n in [0, num_bytes >> 1):
 *   target0[n] = src0[n] + src1[n]
 *   target1[n] = src0[n] + src2[n]
 *   target2[n] = src0[n] + src3[n]
 *   target3[n] = src0[n] + src4[n]
 *
 * num_bytes is halved to get the element count, so an odd trailing byte
 * is ignored.
 */
static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
        const int num_points = num_bytes >> 1;
        int n = 0;

        while(n < num_points) {
                /* src0[n] is re-read for each output, exactly one store per line. */
                target0[n] = src0[n] + src1[n];
                target1[n] = src0[n] + src2[n];
                target2[n] = src0[n] + src3[n];
                target3[n] = src0[n] + src4[n];
                ++n;
        }
}
00129 
00130 #endif /* LV_HAVE_GENERIC */
00131 
00132 
00133 
00134 
00135 
00136 #endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/