diff options
| field | value | date |
|---|---|---|
| author | Tom Rondeau <tom@trondeau.com> | 2014-07-07 12:20:09 -0400 |
| committer | Tom Rondeau <tom@trondeau.com> | 2014-07-07 12:20:09 -0400 |
| commit | bbfc759914da80214fabc70fbbed1edaf39f9e4b (patch) | |
| tree | 712eb6d1d95445bb6535534ce86d7faf1bfe6f90 /volk/kernels | |
| parent | 3f469513b94ac992138360caca7e1b53f82214ae (diff) | |
| parent | 597b93798a804cde1783d6d2ab53b348d57c44cd (diff) | |
Merge branch 'maint'
Diffstat (limited to 'volk/kernels')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | volk/kernels/volk/volk_32f_invsqrt_32f.h | 6 |
| -rw-r--r-- | volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h | 8 |
| -rw-r--r-- | volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h | 8 |
| -rw-r--r-- | volk/kernels/volk/volk_32fc_32f_multiply_32fc.h | 4 |
| -rw-r--r-- | volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h | 18 |
| -rw-r--r-- | volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h | 150 |
6 files changed, 97 insertions, 97 deletions
diff --git a/volk/kernels/volk/volk_32f_invsqrt_32f.h b/volk/kernels/volk/volk_32f_invsqrt_32f.h index c1d28c1ab2..055370661a 100644 --- a/volk/kernels/volk/volk_32f_invsqrt_32f.h +++ b/volk/kernels/volk/volk_32f_invsqrt_32f.h @@ -20,7 +20,7 @@ static inline float Q_rsqrt( float number ) u.i = 0x5f3759df - ( u.i >> 1 ); // what the fuck? u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 1st iteration //u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be removed - + return u.f; } @@ -47,7 +47,7 @@ static inline void volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVect aPtr += 8; cPtr += 8; } - + number = eighthPoints * 8; for(;number < num_points; number++) *cPtr++ = Q_rsqrt(*aPtr++); @@ -130,7 +130,7 @@ static inline void volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVect aPtr += 8; cPtr += 8; } - + number = eighthPoints * 8; for(;number < num_points; number++) *cPtr++ = Q_rsqrt(*aPtr++); diff --git a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h index b6c39477c6..fdef68209e 100644 --- a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h +++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h @@ -103,7 +103,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 #ifdef LV_HAVE_AVX #include<immintrin.h> -static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) +static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) { const unsigned int eighth_points = num_points / 8; float fst = 0.0; @@ -166,7 +166,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, center_point_array[3] * frth); } - *target += ((float)(num_points)) * center_point_array[4]; + *target += ((float)(num_points)) * center_point_array[4]; } #endif // LV_HAVE_AVX @@ 
-225,7 +225,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src #ifdef LV_HAVE_AVX #include<immintrin.h> -static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) +static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) { const unsigned int eighth_points = num_points / 8; float fst = 0.0; @@ -288,7 +288,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, center_point_array[3] * frth); } - *target += ((float)(num_points)) * center_point_array[4]; + *target += ((float)(num_points)) * center_point_array[4]; } #endif // LV_HAVE_AVX diff --git a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h index f567ede516..44535da6d8 100644 --- a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h +++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h @@ -71,8 +71,8 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const l // TODO: it may be possible to rearrange swizzling to better pipeline data b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 - b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); - b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); + b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); + b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); c0Val = _mm256_mul_ps(a0Val, b0Val); c1Val = _mm256_mul_ps(a1Val, b1Val); @@ -239,8 +239,8 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const l // TODO: it may be possible to rearrange swizzling to better pipeline data b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 
0x31); // t4|t4|t5|t5|t6|t6|t7|t7 - b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); - b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); + b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); + b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); c0Val = _mm256_mul_ps(a0Val, b0Val); c1Val = _mm256_mul_ps(a1Val, b1Val); diff --git a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h index a7e81bdd7f..a12d078c68 100644 --- a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h +++ b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h @@ -24,14 +24,14 @@ static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const l __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2; - __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); + __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); for(;number < eighthPoints; number++){ aVal1 = _mm256_load_ps((float *)aPtr); aPtr += 4; - aVal2 = _mm256_load_ps((float *)aPtr); + aVal2 = _mm256_load_ps((float *)aPtr); aPtr += 4; bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7 diff --git a/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h index 3a1f8e471b..c7b46e7e99 100644 --- a/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h +++ b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h @@ -17,10 +17,10 @@ \param phase initial phase offset \param num_points The number of values in inVector to be rotated and stored into cVector */ -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc, 
phase, num_points); - + } #endif /* LV_HAVE_GENERIC */ @@ -29,10 +29,10 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVect #ifdef LV_HAVE_SSE4_1 #include <smmintrin.h> -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, phase, num_points); - + } #endif /* LV_HAVE_SSE4_1 */ @@ -40,10 +40,10 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec #ifdef LV_HAVE_SSE4_1 #include <smmintrin.h> -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(outVector, inVector, phase_inc, phase, num_points); - + } #endif /* LV_HAVE_SSE4_1 */ @@ -60,7 +60,7 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVec \param phase initial phase offset \param num_points The number of values in inVector to be rotated and stored into cVector */ -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, 
inVector, phase_inc, phase, num_points); } @@ -71,11 +71,11 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector #ifdef LV_HAVE_AVX #include <immintrin.h> -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; volk_32fc_s32fc_x2_rotator_32fc_u_avx(outVector, inVector, phase_inc, phase, num_points); } - + #endif /* LV_HAVE_AVX */ #endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h index 72bb3c04b9..0ed9d67cb5 100644 --- a/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h +++ b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h @@ -18,9 +18,9 @@ \param phase initial phase offset \param num_points The number of values in inVector to be rotated and stored into cVector */ -static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ - unsigned int i = 0; - int j = 0; +static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ + unsigned int i = 0; + int j = 0; for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) { for(j = 0; j < ROTATOR_RELOAD; ++j) { *outVector++ = *inVector++ * (*phase); @@ -36,7 +36,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, *outVector++ = *inVector++ * (*phase); (*phase) *= phase_inc; } - + } #endif /* LV_HAVE_GENERIC */ @@ -50,7 +50,7 @@ static inline void 
volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; - + unsigned int i, j = 0; for(i = 0; i < 2; ++i) { @@ -62,36 +62,36 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1])); printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - + phase_Val = _mm_loadu_ps((float*)phase_Ptr); inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); - + const unsigned int halfPoints = num_points / 2; - + for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) { for(j = 0; j < ROTATOR_RELOAD; ++j) { - + aVal = _mm_load_ps((float*)aPtr); - + yl = _mm_moveldup_ps(phase_Val); yh = _mm_movehdup_ps(phase_Val); ylp = _mm_moveldup_ps(inc_Val); yhp = _mm_movehdup_ps(inc_Val); - + tmp1 = _mm_mul_ps(aVal, yl); tmp1p = _mm_mul_ps(phase_Val, ylp); - + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); tmp2 = _mm_mul_ps(aVal, yh); tmp2p = _mm_mul_ps(phase_Val, yhp); - + z = _mm_addsub_ps(tmp1, tmp2); phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - + _mm_store_ps((float*)cPtr, z); - + aPtr += 2; cPtr += 2; } @@ -103,26 +103,26 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector } for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) { aVal = _mm_load_ps((float*)aPtr); - + yl = _mm_moveldup_ps(phase_Val); yh = _mm_movehdup_ps(phase_Val); ylp = _mm_moveldup_ps(inc_Val); yhp = _mm_movehdup_ps(inc_Val); - + tmp1 = _mm_mul_ps(aVal, yl); tmp1p = _mm_mul_ps(phase_Val, ylp); - + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); tmp2 = _mm_mul_ps(aVal, yh); tmp2p = _mm_mul_ps(phase_Val, yhp); - + z = _mm_addsub_ps(tmp1, tmp2); phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - + 
_mm_store_ps((float*)cPtr, z); - + aPtr += 2; cPtr += 2; } @@ -132,7 +132,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector *cPtr++ = *aPtr++ * phase_Ptr[0]; phase_Ptr[0] *= (phase_inc); } - + (*phase) = phase_Ptr[0]; } @@ -156,7 +156,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; - + unsigned int i, j = 0; for(i = 0; i < 2; ++i) { @@ -168,36 +168,36 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1])); printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - + phase_Val = _mm_loadu_ps((float*)phase_Ptr); inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); - + const unsigned int halfPoints = num_points / 2; - + for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) { for(j = 0; j < ROTATOR_RELOAD; ++j) { - + aVal = _mm_loadu_ps((float*)aPtr); - + yl = _mm_moveldup_ps(phase_Val); yh = _mm_movehdup_ps(phase_Val); ylp = _mm_moveldup_ps(inc_Val); yhp = _mm_movehdup_ps(inc_Val); - + tmp1 = _mm_mul_ps(aVal, yl); tmp1p = _mm_mul_ps(phase_Val, ylp); - + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); tmp2 = _mm_mul_ps(aVal, yh); tmp2p = _mm_mul_ps(phase_Val, yhp); - + z = _mm_addsub_ps(tmp1, tmp2); phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - + _mm_storeu_ps((float*)cPtr, z); - + aPtr += 2; cPtr += 2; } @@ -209,26 +209,26 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector } for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) { aVal = _mm_loadu_ps((float*)aPtr); - + yl = _mm_moveldup_ps(phase_Val); yh = _mm_movehdup_ps(phase_Val); ylp = _mm_moveldup_ps(inc_Val); yhp = _mm_movehdup_ps(inc_Val); - + tmp1 = 
_mm_mul_ps(aVal, yl); tmp1p = _mm_mul_ps(phase_Val, ylp); - + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); tmp2 = _mm_mul_ps(aVal, yh); tmp2p = _mm_mul_ps(phase_Val, yhp); - + z = _mm_addsub_ps(tmp1, tmp2); phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - + _mm_storeu_ps((float*)cPtr, z); - + aPtr += 2; cPtr += 2; } @@ -238,7 +238,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector *cPtr++ = *aPtr++ * phase_Ptr[0]; phase_Ptr[0] *= (phase_inc); } - + (*phase) = phase_Ptr[0]; } @@ -262,7 +262,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; - + unsigned int i, j = 0; for(i = 0; i < 4; ++i) { @@ -276,35 +276,35 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - + phase_Val = _mm256_loadu_ps((float*)phase_Ptr); inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); const unsigned int fourthPoints = num_points / 4; - + for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { for(j = 0; j < ROTATOR_RELOAD; ++j) { - + aVal = _mm256_load_ps((float*)aPtr); - + yl = _mm256_moveldup_ps(phase_Val); yh = _mm256_movehdup_ps(phase_Val); ylp = _mm256_moveldup_ps(inc_Val); yhp = _mm256_movehdup_ps(inc_Val); - + tmp1 = _mm256_mul_ps(aVal, yl); tmp1p = _mm256_mul_ps(phase_Val, ylp); - + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); tmp2 = _mm256_mul_ps(aVal, yh); tmp2p = _mm256_mul_ps(phase_Val, yhp); - + z = _mm256_addsub_ps(tmp1, tmp2); phase_Val = 
_mm256_addsub_ps(tmp1p, tmp2p); - + _mm256_store_ps((float*)cPtr, z); - + aPtr += 4; cPtr += 4; } @@ -316,26 +316,26 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c } for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { aVal = _mm256_load_ps((float*)aPtr); - + yl = _mm256_moveldup_ps(phase_Val); yh = _mm256_movehdup_ps(phase_Val); ylp = _mm256_moveldup_ps(inc_Val); yhp = _mm256_movehdup_ps(inc_Val); - + tmp1 = _mm256_mul_ps(aVal, yl); tmp1p = _mm256_mul_ps(phase_Val, ylp); - + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); tmp2 = _mm256_mul_ps(aVal, yh); tmp2p = _mm256_mul_ps(phase_Val, yhp); - + z = _mm256_addsub_ps(tmp1, tmp2); phase_Val = _mm256_addsub_ps(tmp1p, tmp2p); - + _mm256_store_ps((float*)cPtr, z); - + aPtr += 4; cPtr += 4; } @@ -345,7 +345,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c *cPtr++ = *aPtr++ * phase_Ptr[0]; phase_Ptr[0] *= (phase_inc); } - + (*phase) = phase_Ptr[0]; } @@ -369,7 +369,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; - + unsigned int i, j = 0; for(i = 0; i < 4; ++i) { @@ -383,35 +383,35 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - + phase_Val = _mm256_loadu_ps((float*)phase_Ptr); inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); const unsigned int fourthPoints = num_points / 4; - + for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { for(j = 0; j < ROTATOR_RELOAD; ++j) { - + aVal = 
_mm256_loadu_ps((float*)aPtr); - + yl = _mm256_moveldup_ps(phase_Val); yh = _mm256_movehdup_ps(phase_Val); ylp = _mm256_moveldup_ps(inc_Val); yhp = _mm256_movehdup_ps(inc_Val); - + tmp1 = _mm256_mul_ps(aVal, yl); tmp1p = _mm256_mul_ps(phase_Val, ylp); - + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); tmp2 = _mm256_mul_ps(aVal, yh); tmp2p = _mm256_mul_ps(phase_Val, yhp); - + z = _mm256_addsub_ps(tmp1, tmp2); phase_Val = _mm256_addsub_ps(tmp1p, tmp2p); - + _mm256_storeu_ps((float*)cPtr, z); - + aPtr += 4; cPtr += 4; } @@ -423,26 +423,26 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c } for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { aVal = _mm256_loadu_ps((float*)aPtr); - + yl = _mm256_moveldup_ps(phase_Val); yh = _mm256_movehdup_ps(phase_Val); ylp = _mm256_moveldup_ps(inc_Val); yhp = _mm256_movehdup_ps(inc_Val); - + tmp1 = _mm256_mul_ps(aVal, yl); tmp1p = _mm256_mul_ps(phase_Val, ylp); - + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); tmp2 = _mm256_mul_ps(aVal, yh); tmp2p = _mm256_mul_ps(phase_Val, yhp); - + z = _mm256_addsub_ps(tmp1, tmp2); phase_Val = _mm256_addsub_ps(tmp1p, tmp2p); - + _mm256_storeu_ps((float*)cPtr, z); - + aPtr += 4; cPtr += 4; } @@ -452,11 +452,11 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c *cPtr++ = *aPtr++ * phase_Ptr[0]; phase_Ptr[0] *= (phase_inc); } - + (*phase) = phase_Ptr[0]; } - + #endif /* LV_HAVE_AVX */ #endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */ |