Statistics
| Branch: | Tag: | Revision:

root / volk / include / volk / volk_32f_x3_sum_of_poly_32f_a.h @ ccfac187

History | View | Annotate | Download (3.5 kB)

1
#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
2
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
3
4
#include<inttypes.h>
5
#include<stdio.h>
6
#include<volk/volk_complex.h>
7
8
/*
 * Fallback two-argument maximum, defined only when no MAX macro is already
 * in scope. Yields X when X > Y, otherwise Y.
 * Each argument is evaluated twice, so do not pass expressions with side
 * effects (e.g. MAX(i++, j)).
 */
#ifndef MAX
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
#endif
11
12
#ifdef LV_HAVE_SSE3
13
#include<xmmintrin.h>
14
#include<pmmintrin.h>
15
16
/*!
 * \brief Sum of a clamped fourth-order polynomial over a float buffer (SSE3).
 *
 * Every input sample x is first raised to at least *cutoff; then
 *   center_point_array[0]*x   + center_point_array[1]*x^2 +
 *   center_point_array[2]*x^3 + center_point_array[3]*x^4
 * is accumulated over all samples. Finally center_point_array[4] multiplied
 * by the number of samples is added and the total is written to target[0].
 *
 * \param target              receives the scalar result in target[0]
 * \param src0                input samples; read with _mm_load_ps (aligned load)
 * \param center_point_array  coefficients; elements 0 through 4 are read
 * \param cutoff              pointer to the lower clamp applied to each sample
 * \param num_bytes           size of src0 in bytes; num_bytes >> 2 samples are used
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
  const unsigned int quad_count = num_bytes >> 4;        /* full groups of four floats */
  const unsigned int leftovers = (num_bytes >> 2) & 3;   /* floats left after the groups */
  unsigned int i;

  float result = 0.0;
  float fst = 0.0;
  float sq = 0.0;
  float thrd = 0.0;
  float frth = 0.0;

  /* Two independent accumulators: one for the x and x^2 terms, one for the
     x^3 and x^4 terms. */
  __m128 acc_low = _mm_setzero_ps();
  __m128 acc_high = _mm_setzero_ps();

  /* Broadcast each coefficient and the clamp value to all four lanes. */
  const __m128 coeff1 = _mm_load1_ps(&center_point_array[0]);
  const __m128 coeff2 = _mm_load1_ps(&center_point_array[1]);
  const __m128 coeff3 = _mm_load1_ps(&center_point_array[2]);
  const __m128 coeff4 = _mm_load1_ps(&center_point_array[3]);
  const __m128 clamp = _mm_load1_ps(cutoff);

  for (i = 0; i < quad_count; ++i) {
    /* NOTE(review): _mm_max_ps returns its second operand when either input
       is NaN, so a NaN sample would propagate here, whereas MAX() in the
       scalar tail below replaces it with *cutoff -- confirm whether this
       difference matters to callers. */
    __m128 x1 = _mm_max_ps(clamp, _mm_load_ps(src0));
    __m128 x2 = _mm_mul_ps(x1, x1);
    __m128 x3 = _mm_mul_ps(x1, x2);
    __m128 x4 = _mm_mul_ps(x2, x2);

    x1 = _mm_mul_ps(x1, coeff1);
    x2 = _mm_mul_ps(x2, coeff2);
    x3 = _mm_mul_ps(x3, coeff3);
    x4 = _mm_mul_ps(x4, coeff4);

    acc_low = _mm_add_ps(_mm_add_ps(x1, x2), acc_low);
    acc_high = _mm_add_ps(_mm_add_ps(x3, x4), acc_high);

    src0 += 4;
  }

  /* Horizontal reduction: the first hadd forms pairwise sums of both
     accumulators, the second forms one total per accumulator, and the third
     adds those two totals, leaving the grand total in every lane. */
  {
    __m128 reduced = _mm_hadd_ps(acc_low, acc_high);
    reduced = _mm_hadd_ps(reduced, reduced);
    reduced = _mm_hadd_ps(reduced, reduced);
    _mm_store_ss(&result, reduced);
  }

  /* Scalar tail; src0 already points just past the vectorized samples. */
  for (i = 0; i < leftovers; ++i) {
    fst = src0[i];
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;

    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }

  /* Constant term, added once per processed sample. */
  result += ((float)((quad_count * 4) + leftovers)) * center_point_array[4];

  target[0] = result;
}
97
 
98
99
#endif /*LV_HAVE_SSE3*/
100
101
#ifdef LV_HAVE_GENERIC
102
103
/*!
 * \brief Sum of a clamped fourth-order polynomial over a float buffer
 *        (portable scalar version).
 *
 * Every input sample x is first raised to at least *cutoff; then
 *   center_point_array[0]*x   + center_point_array[1]*x^2 +
 *   center_point_array[2]*x^3 + center_point_array[3]*x^4
 * is accumulated over all samples. Finally center_point_array[4] multiplied
 * by the number of samples is added and the total is written to *target.
 *
 * \param target              receives the scalar result
 * \param src0                input samples
 * \param center_point_array  coefficients; elements 0 through 4 are read
 * \param cutoff              pointer to the lower clamp applied to each sample
 * \param num_bytes           size of src0 in bytes; num_bytes >> 2 samples are used
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
  const unsigned int num_points = num_bytes >> 2;  /* four bytes per float sample */
  const float lower_bound = *cutoff;
  float result = 0.0;
  unsigned int i;

  for (i = 0; i < num_points; ++i) {
    float fst = src0[i];
    float sq;
    float thrd;
    float frth;

    /* Same comparison the file's MAX(fst, *cutoff) macro expands to. */
    fst = (fst > lower_bound) ? fst : lower_bound;

    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;

    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }

  /* Constant term, added once per processed sample. */
  result += ((float)num_points) * (center_point_array[4]);

  *target = result;
}
147
148
#endif /*LV_HAVE_GENERIC*/
149
150
151
#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/