root / volk / include / volk / volk_32f_x3_sum_of_poly_32f_a.h @ ccfac187
History | View | Annotate | Download (3.5 kB)
| 1 | #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
|
|---|---|
| 2 | #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
|
| 3 | |
| 4 | #include<inttypes.h> |
| 5 | #include<stdio.h> |
| 6 | #include<volk/volk_complex.h> |
| 7 | |
/*
 * Two-argument maximum, defined only when no MAX macro is already in scope.
 * This is a function-like macro: the selected argument is evaluated twice,
 * so do not pass expressions with side effects (e.g. MAX(i++, j)).
 */
#ifndef MAX
#define MAX(X,Y) ((X) > (Y)?(X):(Y))
#endif
| 11 | |
| 12 | #ifdef LV_HAVE_SSE3
|
| 13 | #include<xmmintrin.h> |
| 14 | #include<pmmintrin.h> |
| 15 | |
/*
 * SSE3 kernel: clamped fourth-order polynomial, summed over the input.
 *
 * Each sample is first raised to at least *cutoff, x = max(src0[i], *cutoff),
 * and the function then stores
 *
 *   target[0] = sum_i (c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4) + N * c[4]
 *
 * where c = center_point_array and N = num_bytes / 4 is the number of
 * floats processed.
 *
 * target             - receives the single float result in target[0].
 * src0               - input samples; read with _mm_load_ps, which requires
 *                      16-byte alignment.
 * center_point_array - five coefficients are read: [0..3] weight x..x^4 and
 *                      [4] is added once per processed sample.
 * cutoff             - pointer to the lower clamp applied to every sample.
 * num_bytes          - size of src0 in bytes; trailing bytes that do not
 *                      form a whole float are ignored.
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {

  float result = 0.0;
  float fst = 0.0;
  float sq = 0.0;
  float thrd = 0.0;
  float frth = 0.0;

  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;

  /* Two independent vector accumulators: xmm9 collects the x and x^2 terms,
     xmm1 collects the x^3 and x^4 terms. */
  xmm9 = _mm_setzero_ps();
  xmm1 = _mm_setzero_ps();

  /* Broadcast each coefficient and the cutoff into all four lanes. */
  xmm0 = _mm_load1_ps(&center_point_array[0]);
  xmm6 = _mm_load1_ps(&center_point_array[1]);
  xmm7 = _mm_load1_ps(&center_point_array[2]);
  xmm8 = _mm_load1_ps(&center_point_array[3]);
  xmm10 = _mm_load1_ps(cutoff);

  int bound = num_bytes >> 4;            /* number of full 4-float vectors */
  int leftovers = (num_bytes >> 2) & 3;  /* floats remaining after the vectors */
  int i = 0;

  for(; i < bound; ++i) {
    xmm2 = _mm_load_ps(src0);
    xmm2 = _mm_max_ps(xmm10, xmm2);      /* clamp from below by the cutoff */
    xmm3 = _mm_mul_ps(xmm2, xmm2);       /* x^2 */
    xmm4 = _mm_mul_ps(xmm2, xmm3);       /* x^3 */
    xmm5 = _mm_mul_ps(xmm3, xmm3);       /* x^4 */

    xmm2 = _mm_mul_ps(xmm2, xmm0);
    xmm3 = _mm_mul_ps(xmm3, xmm6);
    xmm4 = _mm_mul_ps(xmm4, xmm7);
    xmm5 = _mm_mul_ps(xmm5, xmm8);

    xmm2 = _mm_add_ps(xmm2, xmm3);
    xmm3 = _mm_add_ps(xmm4, xmm5);

    src0 += 4;

    xmm9 = _mm_add_ps(xmm2, xmm9);
    xmm1 = _mm_add_ps(xmm3, xmm1);
  }

  /* Three horizontal adds fold the eight partial sums held in xmm9 and xmm1
     into lane 0. */
  xmm2 = _mm_hadd_ps(xmm9, xmm1);
  xmm3 = _mm_hadd_ps(xmm2, xmm2);
  xmm4 = _mm_hadd_ps(xmm3, xmm3);

  _mm_store_ss(&result, xmm4);

  /* Scalar tail: src0 already points just past the last full vector. */
  for(i = 0; i < leftovers; ++i) {
    fst = src0[i];
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;

    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }

  /* The constant term is applied once per processed sample. */
  result += ((float)((bound * 4) + leftovers)) * center_point_array[4];

  target[0] = result;
}
| 97 | |
| 98 | |
| 99 | #endif /*LV_HAVE_SSE3*/ |
| 100 | |
| 101 | #ifdef LV_HAVE_GENERIC
|
| 102 | |
/*
 * Portable kernel: clamped fourth-order polynomial, summed over the input.
 *
 * Each sample is first raised to at least *cutoff, x = max(src0[i], *cutoff),
 * and the function then stores
 *
 *   *target = sum_i (c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4) + N * c[4]
 *
 * where c = center_point_array and N = num_bytes / 4 is the number of
 * floats processed.
 *
 * target             - receives the single float result.
 * src0               - input samples.
 * center_point_array - five coefficients are read: [0..3] weight x..x^4 and
 *                      [4] is added once per processed sample.
 * cutoff             - pointer to the lower clamp applied to every sample.
 * num_bytes          - size of src0 in bytes; trailing bytes that do not
 *                      form a whole float are ignored.
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {

  /* The length arrives in bytes; each float sample occupies four of them. */
  const unsigned int num_points = num_bytes >> 2;

  float result = 0.0;
  float fst = 0.0;
  float sq = 0.0;
  float thrd = 0.0;
  float frth = 0.0;

  unsigned int i = 0;

  for(; i < num_points; ++i) {
    /* Clamp from below by the cutoff (same result as MAX(fst, *cutoff)). */
    fst = src0[i];
    fst = (fst > *cutoff) ? fst : *cutoff;

    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;

    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }

  /* The constant term is applied once per processed sample. */
  result += ((float)num_points) * (center_point_array[4]);

  *target = result;
}
| 147 | |
| 148 | #endif /*LV_HAVE_GENERIC*/ |
| 149 | |
| 150 | |
| 151 | #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/ |