1 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
2 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
10 #include <smmintrin.h>
/*
 * SSE4.1 kernel: accumulates the sum and the sum of squares of a float
 * buffer and derives the standard deviation as
 *     sqrt(sum(x^2) / num_points - (sum(x) / num_points)^2).
 *
 * stddev      - output; receives the computed standard deviation.
 * mean        - output pointer. NOTE(review): the store to *mean is not
 *               visible in this listing - confirm against the full source.
 * inputBuffer - input samples; read with _mm_load_ps, which requires a
 *               16-byte aligned address.
 * num_points  - number of floats to process. No guard for num_points == 0
 *               is visible; the divisions near the end would then divide
 *               by zero.
 *
 * NOTE(review): the embedded listing numbers skip, so some original lines
 * are not shown here. meanBuffer, squareBuffer and newMean are used without
 * a visible declaration, and the closing braces of the loops and of the
 * function are also missing from this view.
 */
18 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(
float* stddev,
float* mean,
const float* inputBuffer,
unsigned int num_points){
/* Holds the running sum of squares and, at the end, the standard deviation. */
19 float returnValue = 0;
22 unsigned int number = 0;
/* Each vector iteration consumes 16 floats (four 4-lane loads). */
23 const unsigned int sixteenthPoints = num_points / 16;
25 const float* aPtr = inputBuffer;
/* Per-lane running sum of the samples. */
29 __m128 accumulator = _mm_setzero_ps();
/* Per-lane running sum of squared samples. */
30 __m128 squareAccumulator = _mm_setzero_ps();
31 __m128 aVal1, aVal2, aVal3, aVal4;
32 __m128 cVal1, cVal2, cVal3, cVal4;
33 for(;number < sixteenthPoints; number++) {
/*
 * _mm_dp_ps mask: the high nibble (0xF) multiplies all four lanes; the low
 * nibble selects the single destination lane that receives the sum of the
 * products, and every other lane is zeroed. 0xF1 -> lane 0.
 */
34 aVal1 = _mm_load_ps(aPtr); aPtr += 4;
35 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
36 accumulator = _mm_add_ps(accumulator, aVal1);
/* Second group of four: sum of squares goes to lane 1 (0xF2). */
38 aVal2 = _mm_load_ps(aPtr); aPtr += 4;
39 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
40 accumulator = _mm_add_ps(accumulator, aVal2);
/* Third group of four: sum of squares goes to lane 2 (0xF4). */
42 aVal3 = _mm_load_ps(aPtr); aPtr += 4;
43 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
44 accumulator = _mm_add_ps(accumulator, aVal3);
/* Fourth group of four: sum of squares goes to lane 3 (0xF8). */
46 aVal4 = _mm_load_ps(aPtr); aPtr += 4;
47 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
48 accumulator = _mm_add_ps(accumulator, aVal4);
/*
 * Each cValN is non-zero in one distinct lane only, so a bitwise OR packs
 * the four partial sums into a single vector without disturbing them.
 */
50 cVal1 = _mm_or_ps(cVal1, cVal2);
51 cVal3 = _mm_or_ps(cVal3, cVal4);
52 cVal1 = _mm_or_ps(cVal1, cVal3);
54 squareAccumulator = _mm_add_ps(squareAccumulator, cVal1);
/*
 * Spill both accumulators to memory and reduce the four lanes to scalars.
 * _mm_store_ps requires 16-byte aligned destinations; the buffers'
 * declarations are not visible here - presumably aligned, TODO confirm.
 * NOTE(review): the vector loop's closing brace is not shown in this listing.
 */
56 _mm_store_ps(meanBuffer,accumulator);
57 _mm_store_ps(squareBuffer,squareAccumulator);
58 newMean = meanBuffer[0];
59 newMean += meanBuffer[1];
60 newMean += meanBuffer[2];
61 newMean += meanBuffer[3];
62 returnValue = squareBuffer[0];
63 returnValue += squareBuffer[1];
64 returnValue += squareBuffer[2];
65 returnValue += squareBuffer[3];
/* Scalar tail: handle the num_points % 16 samples left after the vector loop. */
67 number = sixteenthPoints * 16;
68 for(;number < num_points; number++){
/*
 * Only the sum-of-squares update is visible. NOTE(review): the matching
 * newMean update and the aPtr advance are presumably on the elided lines -
 * confirm against the full source.
 */
69 returnValue += (*aPtr) * (*aPtr);
/* Turn the sums into the mean and the mean of squares. */
72 newMean /= num_points;
73 returnValue /= num_points;
/* Variance = E[x^2] - mean^2; its square root is the standard deviation. */
74 returnValue -= (newMean * newMean);
75 returnValue = sqrtf(returnValue);
77 *stddev = returnValue;
83 #include <xmmintrin.h>
/*
 * SSE kernel: accumulates the sum and the sum of squares of a float buffer
 * four lanes at a time and derives the standard deviation as
 *     sqrt(sum(x^2) / num_points - (sum(x) / num_points)^2).
 *
 * stddev      - output; receives the computed standard deviation.
 * mean        - output pointer. NOTE(review): the store to *mean is not
 *               visible in this listing - confirm against the full source.
 * inputBuffer - input samples; read with _mm_load_ps, which requires a
 *               16-byte aligned address.
 * num_points  - number of floats to process. No guard for num_points == 0
 *               is visible; the divisions near the end would then divide
 *               by zero.
 *
 * NOTE(review): the embedded listing numbers skip, so some original lines
 * are not shown here. meanBuffer, squareBuffer and newMean are used without
 * a visible declaration, and the closing braces are also missing from this
 * view.
 */
91 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(
float* stddev,
float* mean,
const float* inputBuffer,
unsigned int num_points){
/* Holds the running sum of squares and, at the end, the standard deviation. */
92 float returnValue = 0;
95 unsigned int number = 0;
/* Each vector iteration consumes one 4-lane load. */
96 const unsigned int quarterPoints = num_points / 4;
98 const float* aPtr = inputBuffer;
/* Per-lane running sum of the samples. */
102 __m128 accumulator = _mm_setzero_ps();
/* Per-lane running sum of squared samples. */
103 __m128 squareAccumulator = _mm_setzero_ps();
104 __m128 aVal = _mm_setzero_ps();
105 for(;number < quarterPoints; number++) {
/*
 * Load four samples, add them to the running sum, then square them in place
 * and add the squares to the second accumulator.
 * NOTE(review): no aPtr advance is visible inside this loop; it is presumably
 * on an elided line - confirm against the full source.
 */
106 aVal = _mm_load_ps(aPtr);
107 accumulator = _mm_add_ps(accumulator, aVal);
108 aVal = _mm_mul_ps(aVal, aVal);
109 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
/*
 * Spill both accumulators to memory and reduce the four lanes to scalars.
 * _mm_store_ps requires 16-byte aligned destinations; the buffers'
 * declarations are not visible here - presumably aligned, TODO confirm.
 */
112 _mm_store_ps(meanBuffer,accumulator);
113 _mm_store_ps(squareBuffer,squareAccumulator);
114 newMean = meanBuffer[0];
115 newMean += meanBuffer[1];
116 newMean += meanBuffer[2];
117 newMean += meanBuffer[3];
118 returnValue = squareBuffer[0];
119 returnValue += squareBuffer[1];
120 returnValue += squareBuffer[2];
121 returnValue += squareBuffer[3];
/* Scalar tail: handle the num_points % 4 samples left after the vector loop. */
123 number = quarterPoints * 4;
124 for(;number < num_points; number++){
/*
 * Only the sum-of-squares update is visible. NOTE(review): the matching
 * newMean update and the aPtr advance are presumably on the elided lines -
 * confirm against the full source.
 */
125 returnValue += (*aPtr) * (*aPtr);
/* Turn the sums into the mean and the mean of squares. */
128 newMean /= num_points;
129 returnValue /= num_points;
/* Variance = E[x^2] - mean^2; its square root is the standard deviation. */
130 returnValue -= (newMean * newMean);
131 returnValue = sqrtf(returnValue);
133 *stddev = returnValue;
138 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: walks the buffer one float at a time and derives
 * the standard deviation as
 *     sqrt(sum(x^2) / num_points - (sum(x) / num_points)^2).
 *
 * stddev      - output; receives the computed standard deviation.
 * mean        - output pointer. NOTE(review): the store to *mean is not
 *               visible in this listing - confirm against the full source.
 * inputBuffer - input samples; only dereferenced element-wise here.
 * num_points  - number of floats to process. No guard for num_points == 0
 *               is visible; the divisions near the end would then divide
 *               by zero.
 *
 * NOTE(review): the embedded listing numbers skip, so some original lines
 * are not shown here. newMean is used without a visible declaration, and the
 * closing braces are also missing from this view.
 */
146 static inline void volk_32f_stddev_and_mean_32f_x2_generic(
float* stddev,
float* mean,
const float* inputBuffer,
unsigned int num_points){
/* Holds the running sum of squares and, at the end, the standard deviation. */
147 float returnValue = 0;
150 const float* aPtr = inputBuffer;
151 unsigned int number = 0;
153 for(number = 0; number < num_points; number++){
/*
 * Only the sum-of-squares update is visible. NOTE(review): the matching
 * newMean update and the aPtr advance are presumably on the elided lines -
 * confirm against the full source.
 */
154 returnValue += (*aPtr) * (*aPtr);
/* Turn the sums into the mean and the mean of squares. */
157 newMean /= num_points;
158 returnValue /= num_points;
/* Variance = E[x^2] - mean^2; its square root is the standard deviation. */
159 returnValue -= (newMean * newMean);
160 returnValue = sqrtf(returnValue);
162 *stddev = returnValue;
/*
 * As written on this line, the macro takes one argument and has an empty
 * replacement list.
 * NOTE(review): this line and the one after it look like a cross-reference
 * stub pointing at volk_common.h:27, where the real definition presumably
 * lives - confirm before relying on this expansion.
 */
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27