GNU Radio 3.6.5 C++ API
|
00001 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H 00002 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H 00003 00004 #include<inttypes.h> 00005 #include<stdio.h> 00006 #include<volk/volk_complex.h> 00007 00008 #ifndef MAX 00009 #define MAX(X,Y) ((X) > (Y)?(X):(Y)) 00010 #endif 00011 00012 #ifdef LV_HAVE_SSE3 00013 #include<xmmintrin.h> 00014 #include<pmmintrin.h> 00015 00016 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { 00017 00018 00019 float result = 0.0; 00020 float fst = 0.0; 00021 float sq = 0.0; 00022 float thrd = 0.0; 00023 float frth = 0.0; 00024 //float fith = 0.0; 00025 00026 00027 00028 __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; 00029 00030 xmm9 = _mm_setzero_ps(); 00031 xmm1 = _mm_setzero_ps(); 00032 00033 xmm0 = _mm_load1_ps(¢er_point_array[0]); 00034 xmm6 = _mm_load1_ps(¢er_point_array[1]); 00035 xmm7 = _mm_load1_ps(¢er_point_array[2]); 00036 xmm8 = _mm_load1_ps(¢er_point_array[3]); 00037 //xmm11 = _mm_load1_ps(¢er_point_array[4]); 00038 xmm10 = _mm_load1_ps(cutoff); 00039 00040 int bound = num_bytes >> 4; 00041 int leftovers = (num_bytes >> 2) & 3; 00042 int i = 0; 00043 00044 for(; i < bound; ++i) { 00045 xmm2 = _mm_load_ps(src0); 00046 xmm2 = _mm_max_ps(xmm10, xmm2); 00047 xmm3 = _mm_mul_ps(xmm2, xmm2); 00048 xmm4 = _mm_mul_ps(xmm2, xmm3); 00049 xmm5 = _mm_mul_ps(xmm3, xmm3); 00050 //xmm12 = _mm_mul_ps(xmm3, xmm4); 00051 00052 xmm2 = _mm_mul_ps(xmm2, xmm0); 00053 xmm3 = _mm_mul_ps(xmm3, xmm6); 00054 xmm4 = _mm_mul_ps(xmm4, xmm7); 00055 xmm5 = _mm_mul_ps(xmm5, xmm8); 00056 //xmm12 = _mm_mul_ps(xmm12, xmm11); 00057 00058 xmm2 = _mm_add_ps(xmm2, xmm3); 00059 xmm3 = _mm_add_ps(xmm4, xmm5); 00060 00061 src0 += 4; 00062 00063 xmm9 = _mm_add_ps(xmm2, xmm9); 00064 00065 xmm1 = _mm_add_ps(xmm3, xmm1); 00066 00067 //xmm9 = _mm_add_ps(xmm12, xmm9); 00068 } 00069 00070 xmm2 = _mm_hadd_ps(xmm9, xmm1); 00071 xmm3 = _mm_hadd_ps(xmm2, xmm2); 00072 xmm4 = _mm_hadd_ps(xmm3, xmm3); 00073 00074 _mm_store_ss(&result, xmm4); 00075 00076 00077 00078 for(i = 0; i < leftovers; ++i) { 00079 fst = src0[i]; 00080 fst = MAX(fst, *cutoff); 00081 sq = fst * fst; 00082 thrd = fst * sq; 00083 frth = sq * sq; 00084 //fith = sq * thrd; 00085 00086 result += (center_point_array[0] * fst + 00087 center_point_array[1] * sq + 00088 center_point_array[2] * thrd + 00089 center_point_array[3] * frth);// + 00090 //center_point_array[4] * fith); 00091 } 00092 00093 result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; 00094 00095 target[0] = result; 00096 } 00097 00098 00099 #endif /*LV_HAVE_SSE3*/ 00100 00101 #ifdef LV_HAVE_GENERIC 00102 00103 static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { 00104 00105 00106 00107 float result = 0.0; 00108 float fst = 0.0; 00109 float sq = 0.0; 00110 float thrd = 0.0; 00111 float frth = 0.0; 00112 //float fith = 0.0; 00113 00114 00115 00116 unsigned int i = 0; 00117 00118 for(; i < num_bytes >> 2; ++i) { 00119 fst = src0[i]; 00120 fst = MAX(fst, *cutoff); 00121 00122 sq = fst * fst; 00123 thrd = fst * sq; 00124 frth = sq * sq; 00125 //fith = sq * thrd; 00126 00127 result += (center_point_array[0] * fst + 00128 center_point_array[1] * sq + 00129 center_point_array[2] * thrd + 00130 center_point_array[3] * frth); //+ 00131 //center_point_array[4] * fith); 00132 /*printf("%f12...%d\n", (center_point_array[0] * fst + 00133 center_point_array[1] * sq + 00134 center_point_array[2] * thrd + 00135 center_point_array[3] * frth) + 00136 //center_point_array[4] * fith) + 00137 (center_point_array[4]), i); 00138 */ 00139 } 00140 00141 result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); 00142 00143 00144 00145 *target = result; 00146 } 00147 00148 #endif /*LV_HAVE_GENERIC*/ 00149 00150 00151 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/