doc/doxygen-3.6/volk__32f__x3__sum__of__poly__32f__a_8h_source.html

00001 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
00002 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
00003
00004 #include<inttypes.h>
00005 #include<stdio.h>
00006 #include<volk/volk_complex.h>
00007
00008 #ifndef MAX
00009 #define MAX(X,Y) ((X) > (Y)?(X):(Y))
00010 #endif
00011
00012 #ifdef LV_HAVE_SSE3
00013 #include<xmmintrin.h>
00014 #include<pmmintrin.h>
00015
00016 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
00017
00018
00019   float result = 0.0;
00020   float fst = 0.0;
00021   float sq = 0.0;
00022   float thrd = 0.0;
00023   float frth = 0.0;
00024   //float fith = 0.0;
00025
00026
00027
00028   __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
00029
00030   xmm9 = _mm_setzero_ps();
00031   xmm1 = _mm_setzero_ps();
00032
00033   xmm0 = _mm_load1_ps(&center_point_array[0]);
00034   xmm6 = _mm_load1_ps(&center_point_array[1]);
00035   xmm7 = _mm_load1_ps(&center_point_array[2]);
00036   xmm8 = _mm_load1_ps(&center_point_array[3]);
00037   //xmm11 = _mm_load1_ps(&center_point_array[4]);
00038   xmm10 = _mm_load1_ps(cutoff);
00039
00040   int bound = num_bytes >> 4;
00041   int leftovers = (num_bytes >> 2) & 3;
00042   int i = 0;
00043
00044   for(; i < bound; ++i) {
00045     xmm2 = _mm_load_ps(src0);
00046     xmm2 = _mm_max_ps(xmm10, xmm2);
00047     xmm3 = _mm_mul_ps(xmm2, xmm2);
00048     xmm4 = _mm_mul_ps(xmm2, xmm3);
00049     xmm5 = _mm_mul_ps(xmm3, xmm3);
00050     //xmm12 = _mm_mul_ps(xmm3, xmm4);
00051
00052     xmm2 = _mm_mul_ps(xmm2, xmm0);
00053     xmm3 = _mm_mul_ps(xmm3, xmm6);
00054     xmm4 = _mm_mul_ps(xmm4, xmm7);
00055     xmm5 = _mm_mul_ps(xmm5, xmm8);
00056     //xmm12 = _mm_mul_ps(xmm12, xmm11);
00057
00058     xmm2 = _mm_add_ps(xmm2, xmm3);
00059     xmm3 = _mm_add_ps(xmm4, xmm5);
00060
00061     src0 += 4;
00062
00063     xmm9 = _mm_add_ps(xmm2, xmm9);
00064
00065     xmm1 = _mm_add_ps(xmm3, xmm1);
00066
00067     //xmm9 = _mm_add_ps(xmm12, xmm9);
00068   }
00069
00070   xmm2 = _mm_hadd_ps(xmm9, xmm1);
00071   xmm3 = _mm_hadd_ps(xmm2, xmm2);
00072   xmm4 = _mm_hadd_ps(xmm3, xmm3);
00073
00074   _mm_store_ss(&result, xmm4);
00075
00076
00077
00078   for(i = 0; i < leftovers; ++i) {
00079     fst = src0[i];
00080     fst = MAX(fst, *cutoff);
00081     sq = fst * fst;
00082     thrd = fst * sq;
00083     frth = sq * sq;
00084     //fith = sq * thrd;
00085
00086     result += (center_point_array[0] * fst +
00087                center_point_array[1] * sq +
00088                center_point_array[2] * thrd +
00089                center_point_array[3] * frth);// +
00090                //center_point_array[4] * fith);
00091   }
00092
00093   result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
00094
00095   target[0] = result;
00096 }
00097
00098
00099 #endif /*LV_HAVE_SSE3*/
00100
00101 #ifdef LV_HAVE_GENERIC
00102
00103 static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
00104
00105
00106
00107   float result = 0.0;
00108   float fst = 0.0;
00109   float sq = 0.0;
00110   float thrd = 0.0;
00111   float frth = 0.0;
00112   //float fith = 0.0;
00113
00114
00115
00116   unsigned int i = 0;
00117
00118   for(; i < num_bytes >> 2; ++i) {
00119     fst = src0[i];
00120     fst = MAX(fst, *cutoff);
00121
00122     sq = fst * fst;
00123     thrd = fst * sq;
00124     frth = sq * sq;
00125     //fith = sq * thrd;
00126
00127     result += (center_point_array[0] * fst +
00128                center_point_array[1] * sq +
00129                center_point_array[2] * thrd +
00130                center_point_array[3] * frth); //+
00131                //center_point_array[4] * fith);
00132     /*printf("%f12...%d\n", (center_point_array[0] * fst +
00133                   center_point_array[1] * sq +
00134                   center_point_array[2] * thrd +
00135                          center_point_array[3] * frth) +
00136            //center_point_array[4] * fith) +
00137            (center_point_array[4]), i);
00138     */
00139   }
00140
00141   result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
00142
00143
00144
00145   *target = result;
00146 }
00147
00148 #endif /*LV_HAVE_GENERIC*/
00149
00150
00151 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/