summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h 51
-rw-r--r-- volk/lib/testqa.cc 2
2 files changed, 25 insertions, 28 deletions
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index ab33a25876..b24e8b1f79 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -48,14 +48,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
for(;number < sixteenthPoints; number++){
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
c0Val = _mm_mul_ps(a0Val, b0Val);
c1Val = _mm_mul_ps(a1Val, b1Val);
@@ -87,9 +87,6 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
number = sixteenthPoints*16;
for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
}
*result = dotProduct;
@@ -121,14 +118,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
for(;number < sixteenthPoints; number++){
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
c0Val = _mm_mul_ps(a0Val, b0Val);
c1Val = _mm_mul_ps(a1Val, b1Val);
@@ -187,15 +184,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
for(;number < sixteenthPoints; number++){
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
- bVal1 = _mm_load_ps(bPtr); bPtr += 4;
- bVal2 = _mm_load_ps(bPtr); bPtr += 4;
- bVal3 = _mm_load_ps(bPtr); bPtr += 4;
- bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index d1eb1cacbb..507c787772 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -54,7 +54,7 @@ VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1);
-//VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1);
//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000);
VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1);