1 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
2 #define INCLUDED_volk_32f_s32f_stddev_32f_a_H
10 #include <smmintrin.h>
18 static inline void volk_32f_s32f_stddev_32f_a_sse4_1(
float* stddev,
const float* inputBuffer,
const float mean,
unsigned int num_points){
19 float returnValue = 0;
21 unsigned int number = 0;
22 const unsigned int sixteenthPoints = num_points / 16;
24 const float* aPtr = inputBuffer;
28 __m128 squareAccumulator = _mm_setzero_ps();
29 __m128 aVal1, aVal2, aVal3, aVal4;
30 __m128 cVal1, cVal2, cVal3, cVal4;
31 for(;number < sixteenthPoints; number++) {
32 aVal1 = _mm_load_ps(aPtr); aPtr += 4;
33 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
35 aVal2 = _mm_load_ps(aPtr); aPtr += 4;
36 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
38 aVal3 = _mm_load_ps(aPtr); aPtr += 4;
39 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
41 aVal4 = _mm_load_ps(aPtr); aPtr += 4;
42 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
44 cVal1 = _mm_or_ps(cVal1, cVal2);
45 cVal3 = _mm_or_ps(cVal3, cVal4);
46 cVal1 = _mm_or_ps(cVal1, cVal3);
48 squareAccumulator = _mm_add_ps(squareAccumulator, cVal1);
50 _mm_store_ps(squareBuffer,squareAccumulator);
51 returnValue = squareBuffer[0];
52 returnValue += squareBuffer[1];
53 returnValue += squareBuffer[2];
54 returnValue += squareBuffer[3];
56 number = sixteenthPoints * 16;
57 for(;number < num_points; number++){
58 returnValue += (*aPtr) * (*aPtr);
61 returnValue /= num_points;
62 returnValue -= (mean * mean);
63 returnValue = sqrtf(returnValue);
65 *stddev = returnValue;
70 #include <xmmintrin.h>
78 static inline void volk_32f_s32f_stddev_32f_a_sse(
float* stddev,
const float* inputBuffer,
const float mean,
unsigned int num_points){
79 float returnValue = 0;
81 unsigned int number = 0;
82 const unsigned int quarterPoints = num_points / 4;
84 const float* aPtr = inputBuffer;
88 __m128 squareAccumulator = _mm_setzero_ps();
89 __m128 aVal = _mm_setzero_ps();
90 for(;number < quarterPoints; number++) {
91 aVal = _mm_load_ps(aPtr);
92 aVal = _mm_mul_ps(aVal, aVal);
93 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
96 _mm_store_ps(squareBuffer,squareAccumulator);
97 returnValue = squareBuffer[0];
98 returnValue += squareBuffer[1];
99 returnValue += squareBuffer[2];
100 returnValue += squareBuffer[3];
102 number = quarterPoints * 4;
103 for(;number < num_points; number++){
104 returnValue += (*aPtr) * (*aPtr);
107 returnValue /= num_points;
108 returnValue -= (mean * mean);
109 returnValue = sqrtf(returnValue);
111 *stddev = returnValue;
115 #ifdef LV_HAVE_GENERIC
123 static inline void volk_32f_s32f_stddev_32f_generic(
float* stddev,
const float* inputBuffer,
const float mean,
unsigned int num_points){
124 float returnValue = 0;
126 const float* aPtr = inputBuffer;
127 unsigned int number = 0;
129 for(number = 0; number < num_points; number++){
130 returnValue += (*aPtr) * (*aPtr);
134 returnValue /= num_points;
135 returnValue -= (mean * mean);
136 returnValue = sqrtf(returnValue);
138 *stddev = returnValue;
/* Doxygen cross-reference: __VOLK_ATTR_ALIGNED(x) is defined in volk_common.h, line 27. */