Statistics
| Branch: | Tag: | Revision:

root / volk / include / volk / volk_32f_x2_dot_prod_32f_a.h @ 5f145a32

History | View | Annotate | Download (4.4 kB)

1
#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
2
#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
3
4
#include <volk/volk_common.h>
5
#include<stdio.h>
6
7
8
#ifdef LV_HAVE_GENERIC
9
10
11
static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
12
13
  float dotProduct = 0;
14
  const float* aPtr = input;
15
  const float* bPtr=  taps;
16
  unsigned int number = 0;
17
18
  for(number = 0; number < num_points; number++){
19
    dotProduct += ((*aPtr++) * (*bPtr++));
20
  }
21
  
22
  *result = dotProduct;
23
}
24
25
#endif /*LV_HAVE_GENERIC*/
26
27
28
#ifdef LV_HAVE_SSE
29
30
31
static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
32
  
33
  unsigned int number = 0;
34
  const unsigned int quarterPoints = num_points / 4;
35
36
  float dotProduct = 0;
37
  const float* aPtr = input;
38
  const float* bPtr = taps;
39
40
  __m128 aVal, bVal, cVal;
41
42
  __m128 dotProdVal = _mm_setzero_ps();
43
44
  for(;number < quarterPoints; number++){
45
      
46
    aVal = _mm_load_ps(aPtr); 
47
    bVal = _mm_load_ps(bPtr);
48
      
49
    cVal = _mm_mul_ps(aVal, bVal); 
50
51
    dotProdVal = _mm_add_ps(cVal, dotProdVal);
52
53
    aPtr += 4;
54
    bPtr += 4;
55
  }
56
57
  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
58
59
  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
60
61
  dotProduct = dotProductVector[0];
62
  dotProduct += dotProductVector[1];
63
  dotProduct += dotProductVector[2];
64
  dotProduct += dotProductVector[3];
65
66
  number = quarterPoints * 4;
67
  for(;number < num_points; number++){
68
    dotProduct += ((*aPtr++) * (*bPtr++));
69
  }
70
71
  *result = dotProduct;
72
  
73
}
74
75
#endif /*LV_HAVE_SSE*/  
76
77
#ifdef LV_HAVE_SSE3
78
79
#include <pmmintrin.h>
80
81
static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
82
  unsigned int number = 0;
83
  const unsigned int quarterPoints = num_points / 4;
84
85
  float dotProduct = 0;
86
  const float* aPtr = input;
87
  const float* bPtr = taps;
88
89
  __m128 aVal, bVal, cVal;
90
91
  __m128 dotProdVal = _mm_setzero_ps();
92
93
  for(;number < quarterPoints; number++){
94
      
95
    aVal = _mm_load_ps(aPtr); 
96
    bVal = _mm_load_ps(bPtr);
97
      
98
    cVal = _mm_mul_ps(aVal, bVal); 
99
100
    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
101
102
    aPtr += 4;
103
    bPtr += 4;
104
  }
105
106
  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
107
  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
108
109
  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
110
111
  dotProduct = dotProductVector[0];
112
  dotProduct += dotProductVector[1];
113
114
  number = quarterPoints * 4;
115
  for(;number < num_points; number++){
116
    dotProduct += ((*aPtr++) * (*bPtr++));
117
  }
118
119
  *result = dotProduct;
120
}  
121
122
#endif /*LV_HAVE_SSE3*/
123
124
#ifdef LV_HAVE_SSE4_1
125
126
#include <smmintrin.h>
127
128
static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
129
  unsigned int number = 0;
130
  const unsigned int sixteenthPoints = num_points / 16;
131
132
  float dotProduct = 0;
133
  const float* aPtr = input;
134
  const float* bPtr = taps;
135
136
  __m128 aVal1, bVal1, cVal1;
137
  __m128 aVal2, bVal2, cVal2;
138
  __m128 aVal3, bVal3, cVal3;
139
  __m128 aVal4, bVal4, cVal4;
140
141
  __m128 dotProdVal = _mm_setzero_ps();
142
143
  for(;number < sixteenthPoints; number++){      
144
145
    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
146
    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
147
    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
148
    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
149
150
    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
151
    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
152
    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
153
    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
154
    
155
    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
156
    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
157
    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
158
    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
159
160
    cVal1 = _mm_or_ps(cVal1, cVal2);
161
    cVal3 = _mm_or_ps(cVal3, cVal4);
162
    cVal1 = _mm_or_ps(cVal1, cVal3);
163
164
    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
165
  }
166
167
  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
168
  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
169
170
  dotProduct = dotProductVector[0];
171
  dotProduct += dotProductVector[1];
172
  dotProduct += dotProductVector[2];
173
  dotProduct += dotProductVector[3];
174
175
  number = sixteenthPoints * 16;
176
  for(;number < num_points; number++){
177
    dotProduct += ((*aPtr++) * (*bPtr++));
178
  }
179
180
  *result = dotProduct;
181
}  
182
183
#endif /*LV_HAVE_SSE4_1*/
184
185
#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/