diff options
author | Josh Blum <josh@joshknows.com> | 2012-07-04 11:17:48 -0700 |
---|---|---|
committer | Josh Blum <josh@joshknows.com> | 2012-07-04 11:17:48 -0700 |
commit | 61dd97409abde2412595fa762c0c4161863604f4 (patch) | |
tree | 557ffe8bb96e966efb57a4b81f48b2bfbb42779b /volk | |
parent | 7c0f5550d06e9c5e2ab413e3e2b95a843a435135 (diff) |
volk: use loadu for unaligned volk_32f_x2_dot_prod_32f_u_sse*
Diffstat (limited to 'volk')
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 48 |
1 files changed, 24 insertions, 24 deletions
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index c43d229df6..b24e8b1f79 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -48,14 +48,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* for(;number < sixteenthPoints; number++){ - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); c0Val = _mm_mul_ps(a0Val, b0Val); c1Val = _mm_mul_ps(a1Val, b1Val); @@ -118,14 +118,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * for(;number < sixteenthPoints; number++){ - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); c0Val = _mm_mul_ps(a0Val, b0Val); c1Val = _mm_mul_ps(a1Val, b1Val); @@ -184,15 +184,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float for(;number < sixteenthPoints; number++){ - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); |