GNU Radio 3.6.5 C++ API

volk_32f_x3_sum_of_poly_32f_a.h

Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
00002 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
00003 
00004 #include<inttypes.h>
00005 #include<stdio.h>
00006 #include<volk/volk_complex.h>
00007 
00008 #ifndef MAX
00009 #define MAX(X,Y) ((X) > (Y)?(X):(Y))
00010 #endif
00011 
00012 #ifdef LV_HAVE_SSE3
00013 #include<xmmintrin.h>
00014 #include<pmmintrin.h>
00015 
00016 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
00017 
00018 
00019   float result = 0.0;
00020   float fst = 0.0;
00021   float sq = 0.0;
00022   float thrd = 0.0;
00023   float frth = 0.0;
00024   //float fith = 0.0;
00025 
00026 
00027 
00028   __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
00029 
00030   xmm9 = _mm_setzero_ps();
00031   xmm1 = _mm_setzero_ps();
00032 
00033   xmm0 = _mm_load1_ps(&center_point_array[0]);
00034   xmm6 = _mm_load1_ps(&center_point_array[1]);
00035   xmm7 = _mm_load1_ps(&center_point_array[2]);
00036   xmm8 = _mm_load1_ps(&center_point_array[3]);
00037   //xmm11 = _mm_load1_ps(&center_point_array[4]);
00038   xmm10 = _mm_load1_ps(cutoff);
00039 
00040   int bound = num_bytes >> 4;
00041   int leftovers = (num_bytes >> 2) & 3;
00042   int i = 0;
00043 
00044   for(; i < bound; ++i) {
00045     xmm2 = _mm_load_ps(src0);
00046     xmm2 = _mm_max_ps(xmm10, xmm2);
00047     xmm3 = _mm_mul_ps(xmm2, xmm2);
00048     xmm4 = _mm_mul_ps(xmm2, xmm3);
00049     xmm5 = _mm_mul_ps(xmm3, xmm3);
00050     //xmm12 = _mm_mul_ps(xmm3, xmm4);
00051 
00052     xmm2 = _mm_mul_ps(xmm2, xmm0);
00053     xmm3 = _mm_mul_ps(xmm3, xmm6);
00054     xmm4 = _mm_mul_ps(xmm4, xmm7);
00055     xmm5 = _mm_mul_ps(xmm5, xmm8);
00056     //xmm12 = _mm_mul_ps(xmm12, xmm11);
00057 
00058     xmm2 = _mm_add_ps(xmm2, xmm3);
00059     xmm3 = _mm_add_ps(xmm4, xmm5);
00060 
00061     src0 += 4;
00062 
00063     xmm9 = _mm_add_ps(xmm2, xmm9);
00064 
00065     xmm1 = _mm_add_ps(xmm3, xmm1);
00066 
00067     //xmm9 = _mm_add_ps(xmm12, xmm9);
00068   }
00069 
00070   xmm2 = _mm_hadd_ps(xmm9, xmm1);
00071   xmm3 = _mm_hadd_ps(xmm2, xmm2);
00072   xmm4 = _mm_hadd_ps(xmm3, xmm3);
00073 
00074   _mm_store_ss(&result, xmm4);
00075 
00076 
00077 
00078   for(i = 0; i < leftovers; ++i) {
00079     fst = src0[i];
00080     fst = MAX(fst, *cutoff);
00081     sq = fst * fst;
00082     thrd = fst * sq;
00083     frth = sq * sq;
00084     //fith = sq * thrd;
00085 
00086     result += (center_point_array[0] * fst +
00087                center_point_array[1] * sq +
00088                center_point_array[2] * thrd +
00089                center_point_array[3] * frth);// +
00090                //center_point_array[4] * fith);
00091   }
00092 
00093   result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
00094 
00095   target[0] = result;
00096 }
00097 
00098 
00099 #endif /*LV_HAVE_SSE3*/
00100 
00101 #ifdef LV_HAVE_GENERIC
00102 
00103 static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
00104 
00105 
00106 
00107   float result = 0.0;
00108   float fst = 0.0;
00109   float sq = 0.0;
00110   float thrd = 0.0;
00111   float frth = 0.0;
00112   //float fith = 0.0;
00113 
00114 
00115 
00116   unsigned int i = 0;
00117 
00118   for(; i < num_bytes >> 2; ++i) {
00119     fst = src0[i];
00120     fst = MAX(fst, *cutoff);
00121 
00122     sq = fst * fst;
00123     thrd = fst * sq;
00124     frth = sq * sq;
00125     //fith = sq * thrd;
00126 
00127     result += (center_point_array[0] * fst +
00128                center_point_array[1] * sq +
00129                center_point_array[2] * thrd +
00130                center_point_array[3] * frth); //+
00131                //center_point_array[4] * fith);
00132     /*printf("%f12...%d\n", (center_point_array[0] * fst +
00133                   center_point_array[1] * sq +
00134                   center_point_array[2] * thrd +
00135                          center_point_array[3] * frth) +
00136            //center_point_array[4] * fith) +
00137            (center_point_array[4]), i);
00138     */
00139   }
00140 
00141   result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
00142 
00143 
00144 
00145   *target = result;
00146 }
00147 
00148 #endif /*LV_HAVE_GENERIC*/
00149 
00150 
00151 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/