author     Tom Rondeau <tom@trondeau.com>  2014-07-07 12:20:09 -0400
committer  Tom Rondeau <tom@trondeau.com>  2014-07-07 12:20:09 -0400
commit     bbfc759914da80214fabc70fbbed1edaf39f9e4b (patch)
tree       712eb6d1d95445bb6535534ce86d7faf1bfe6f90 /volk/kernels
parent     3f469513b94ac992138360caca7e1b53f82214ae (diff)
parent     597b93798a804cde1783d6d2ab53b348d57c44cd (diff)
Merge branch 'maint'
Diffstat (limited to 'volk/kernels')
-rw-r--r--  volk/kernels/volk/volk_32f_invsqrt_32f.h                |   6
-rw-r--r--  volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h         |   8
-rw-r--r--  volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h         |   8
-rw-r--r--  volk/kernels/volk/volk_32fc_32f_multiply_32fc.h         |   4
-rw-r--r--  volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h  |  18
-rw-r--r--  volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h     | 150
6 files changed, 97 insertions(+), 97 deletions(-)
diff --git a/volk/kernels/volk/volk_32f_invsqrt_32f.h b/volk/kernels/volk/volk_32f_invsqrt_32f.h
index c1d28c1ab2..055370661a 100644
--- a/volk/kernels/volk/volk_32f_invsqrt_32f.h
+++ b/volk/kernels/volk/volk_32f_invsqrt_32f.h
@@ -20,7 +20,7 @@ static inline float Q_rsqrt( float number )
u.i = 0x5f3759df - ( u.i >> 1 ); // what the fuck?
u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 1st iteration
//u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be removed
-
+
return u.f;
}
@@ -47,7 +47,7 @@ static inline void volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVect
aPtr += 8;
cPtr += 8;
}
-
+
number = eighthPoints * 8;
for(;number < num_points; number++)
*cPtr++ = Q_rsqrt(*aPtr++);
@@ -130,7 +130,7 @@ static inline void volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVect
aPtr += 8;
cPtr += 8;
}
-
+
number = eighthPoints * 8;
for(;number < num_points; number++)
*cPtr++ = Q_rsqrt(*aPtr++);
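
The hunks above only strip trailing whitespace, but for context: Q_rsqrt is the classic Quake III fast inverse square root, which this kernel uses as its scalar fallback for the leftover points after the vectorized loop. A minimal standalone sketch follows; it uses uint32_t for the bit pun rather than the original's long (which is only 32 bits on some platforms), otherwise the magic constant and single Newton-Raphson step match the code above.

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch of the fast inverse square root used above. */
static inline float q_rsqrt_sketch(float number)
{
    union { float f; uint32_t i; } u;
    const float x2 = number * 0.5f;
    const float threehalfs = 1.5f;

    u.f = number;
    u.i = 0x5f3759df - (u.i >> 1);               /* bit-level initial guess */
    u.f = u.f * (threehalfs - (x2 * u.f * u.f)); /* one Newton-Raphson step */
    return u.f;
}

int main(void)
{
    printf("%f (exact: 0.5)\n", q_rsqrt_sketch(4.0f));
    return 0;
}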
diff --git a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
index b6c39477c6..fdef68209e 100644
--- a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
+++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
@@ -103,7 +103,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0
#ifdef LV_HAVE_AVX
#include<immintrin.h>
-static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
{
const unsigned int eighth_points = num_points / 8;
float fst = 0.0;
@@ -166,7 +166,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0,
center_point_array[3] * frth);
}
- *target += ((float)(num_points)) * center_point_array[4];
+ *target += ((float)(num_points)) * center_point_array[4];
}
#endif // LV_HAVE_AVX
@@ -225,7 +225,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src
#ifdef LV_HAVE_AVX
#include<immintrin.h>
-static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
{
const unsigned int eighth_points = num_points / 8;
float fst = 0.0;
@@ -288,7 +288,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0,
center_point_array[3] * frth);
}
- *target += ((float)(num_points)) * center_point_array[4];
+ *target += ((float)(num_points)) * center_point_array[4];
}
#endif // LV_HAVE_AVX
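
For reference, here is a scalar sketch of what the AVX paths above compute: a degree-4 polynomial with no constant term is evaluated at every input and accumulated, and the constant coefficient center_point_array[4] is then added once per point, exactly as the `*target += ((float)(num_points)) * center_point_array[4];` lines show. The clamp against cutoff is an assumption inferred from the parameter name; check the generic kernel before relying on it.

/* Hedged scalar sketch of volk_32f_x3_sum_of_poly_32f; hypothetical name. */
static void sum_of_poly_sketch(float* target, const float* src0,
                               const float* c, const float* cutoff,
                               unsigned int num_points)
{
    float acc = 0.f;
    for (unsigned int i = 0; i < num_points; ++i) {
        float x = src0[i];
        if (x < *cutoff) x = *cutoff;  /* assumed clamp, from the name */
        float sq = x * x;
        acc += c[0] * x + c[1] * sq + c[2] * sq * x + c[3] * sq * sq;
    }
    *target = acc + (float)num_points * c[4];  /* constant term, per point */
}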
diff --git a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
index f567ede516..44535da6d8 100644
--- a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -71,8 +71,8 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const l
// TODO: it may be possible to rearrange swizzling to better pipeline data
b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
c0Val = _mm256_mul_ps(a0Val, b0Val);
c1Val = _mm256_mul_ps(a1Val, b1Val);
@@ -239,8 +239,8 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const l
// TODO: it may be possible to rearrange swizzling to better pipeline data
b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
c0Val = _mm256_mul_ps(a0Val, b0Val);
c1Val = _mm256_mul_ps(a1Val, b1Val);
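
The 0x20/0x31 pair in these hunks recombines the 128-bit lanes of two source registers: 0x20 concatenates the low lane of each source, 0x31 the high lanes. A small demo of just that control-byte behaviour:

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256 a  = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);      /* 0..7 low-to-high  */
    __m256 b  = _mm256_set_ps(15, 14, 13, 12, 11, 10, 9, 8); /* 8..15             */
    __m256 lo = _mm256_permute2f128_ps(a, b, 0x20);  /* 0 1 2 3 | 8 9 10 11   */
    __m256 hi = _mm256_permute2f128_ps(a, b, 0x31);  /* 4 5 6 7 | 12 13 14 15 */
    float out[8];
    _mm256_storeu_ps(out, lo);
    for (int i = 0; i < 8; i++) printf("%g ", out[i]);
    printf("\n");
    _mm256_storeu_ps(out, hi);
    for (int i = 0; i < 8; i++) printf("%g ", out[i]);
    printf("\n");
    return 0;
}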
diff --git a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
index a7e81bdd7f..a12d078c68 100644
--- a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
@@ -24,14 +24,14 @@ static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const l
__m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
- __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
+ __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
for(;number < eighthPoints; number++){
aVal1 = _mm256_load_ps((float *)aPtr);
aPtr += 4;
- aVal2 = _mm256_load_ps((float *)aPtr);
+ aVal2 = _mm256_load_ps((float *)aPtr);
aPtr += 4;
bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
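
Stripped of the SIMD plumbing, this kernel scales each complex sample by the real tap at the same index; the (3,3,2,2,1,1,0,0) permute mask above exists to duplicate each real value so it lines up with the interleaved re/im pairs of a 256-bit complex load. A scalar reference, using C99 complex and a hypothetical helper name:

#include <complex.h>

/* Scalar reference for what volk_32fc_32f_multiply_32fc computes. */
static void multiply_32fc_32f_scalar(float complex* c, const float complex* a,
                                     const float* b, unsigned int n)
{
    for (unsigned int i = 0; i < n; i++)
        c[i] = a[i] * b[i];  /* complex sample times real tap */
}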
diff --git a/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
index 3a1f8e471b..c7b46e7e99 100644
--- a/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
@@ -17,10 +17,10 @@
\param phase initial phase offset
\param num_points The number of values in inVector to be rotated and stored into cVector
*/
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc, phase, num_points);
-
+
}
#endif /* LV_HAVE_GENERIC */
@@ -29,10 +29,10 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVect
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, phase, num_points);
-
+
}
#endif /* LV_HAVE_SSE4_1 */
@@ -40,10 +40,10 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(outVector, inVector, phase_inc, phase, num_points);
-
+
}
#endif /* LV_HAVE_SSE4_1 */
@@ -60,7 +60,7 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVec
\param phase initial phase offset
\param num_points The number of values in inVector to be rotated and stored into cVector
*/
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc, phase, num_points);
}
@@ -71,11 +71,11 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector
#ifdef LV_HAVE_AVX
#include <immintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
volk_32fc_s32fc_x2_rotator_32fc_u_avx(outVector, inVector, phase_inc, phase, num_points);
}
-
+
#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
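
These puppet wrappers exist because the QA harness can only drive kernels of the form f(out, in, scalar, n); each one pins the rotator's extra phase-state argument to a fixed starting value and forwards to the matching rotator kernel. That fixed value lies very nearly on the unit circle, so the test rotation preserves amplitude, which this one-liner checks:

#include <math.h>
#include <stdio.h>

/* Magnitude of the fixed test phase lv_cmake(.3, .95393) used above;
 * it comes out at ~0.999991, i.e. effectively unit magnitude. */
int main(void)
{
    printf("|phase| = %.6f\n", hypot(0.3, 0.95393));
    return 0;
}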
diff --git a/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
index 72bb3c04b9..0ed9d67cb5 100644
--- a/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
@@ -18,9 +18,9 @@
\param phase initial phase offset
\param num_points The number of values in inVector to be rotated and stored into cVector
*/
-static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
- unsigned int i = 0;
- int j = 0;
+static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+ unsigned int i = 0;
+ int j = 0;
for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
for(j = 0; j < ROTATOR_RELOAD; ++j) {
*outVector++ = *inVector++ * (*phase);
@@ -36,7 +36,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
*outVector++ = *inVector++ * (*phase);
(*phase) *= phase_inc;
}
-
+
}
#endif /* LV_HAVE_GENERIC */
@@ -50,7 +50,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = 1;
lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
-
+
unsigned int i, j = 0;
for(i = 0; i < 2; ++i) {
@@ -62,36 +62,36 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
__m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
-
+
phase_Val = _mm_loadu_ps((float*)phase_Ptr);
inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
-
+
const unsigned int halfPoints = num_points / 2;
-
+
for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
for(j = 0; j < ROTATOR_RELOAD; ++j) {
-
+
aVal = _mm_load_ps((float*)aPtr);
-
+
yl = _mm_moveldup_ps(phase_Val);
yh = _mm_movehdup_ps(phase_Val);
ylp = _mm_moveldup_ps(inc_Val);
yhp = _mm_movehdup_ps(inc_Val);
-
+
tmp1 = _mm_mul_ps(aVal, yl);
tmp1p = _mm_mul_ps(phase_Val, ylp);
-
+
aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
tmp2 = _mm_mul_ps(aVal, yh);
tmp2p = _mm_mul_ps(phase_Val, yhp);
-
+
z = _mm_addsub_ps(tmp1, tmp2);
phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
-
+
_mm_store_ps((float*)cPtr, z);
-
+
aPtr += 2;
cPtr += 2;
}
@@ -103,26 +103,26 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
}
for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
aVal = _mm_load_ps((float*)aPtr);
-
+
yl = _mm_moveldup_ps(phase_Val);
yh = _mm_movehdup_ps(phase_Val);
ylp = _mm_moveldup_ps(inc_Val);
yhp = _mm_movehdup_ps(inc_Val);
-
+
tmp1 = _mm_mul_ps(aVal, yl);
tmp1p = _mm_mul_ps(phase_Val, ylp);
-
+
aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
tmp2 = _mm_mul_ps(aVal, yh);
tmp2p = _mm_mul_ps(phase_Val, yhp);
-
+
z = _mm_addsub_ps(tmp1, tmp2);
phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
-
+
_mm_store_ps((float*)cPtr, z);
-
+
aPtr += 2;
cPtr += 2;
}
@@ -132,7 +132,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
*cPtr++ = *aPtr++ * phase_Ptr[0];
phase_Ptr[0] *= (phase_inc);
}
-
+
(*phase) = phase_Ptr[0];
}
@@ -156,7 +156,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = 1;
lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
-
+
unsigned int i, j = 0;
for(i = 0; i < 2; ++i) {
@@ -168,36 +168,36 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
__m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
-
+
phase_Val = _mm_loadu_ps((float*)phase_Ptr);
inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
-
+
const unsigned int halfPoints = num_points / 2;
-
+
for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
for(j = 0; j < ROTATOR_RELOAD; ++j) {
-
+
aVal = _mm_loadu_ps((float*)aPtr);
-
+
yl = _mm_moveldup_ps(phase_Val);
yh = _mm_movehdup_ps(phase_Val);
ylp = _mm_moveldup_ps(inc_Val);
yhp = _mm_movehdup_ps(inc_Val);
-
+
tmp1 = _mm_mul_ps(aVal, yl);
tmp1p = _mm_mul_ps(phase_Val, ylp);
-
+
aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
tmp2 = _mm_mul_ps(aVal, yh);
tmp2p = _mm_mul_ps(phase_Val, yhp);
-
+
z = _mm_addsub_ps(tmp1, tmp2);
phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
-
+
_mm_storeu_ps((float*)cPtr, z);
-
+
aPtr += 2;
cPtr += 2;
}
@@ -209,26 +209,26 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
}
for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
aVal = _mm_loadu_ps((float*)aPtr);
-
+
yl = _mm_moveldup_ps(phase_Val);
yh = _mm_movehdup_ps(phase_Val);
ylp = _mm_moveldup_ps(inc_Val);
yhp = _mm_movehdup_ps(inc_Val);
-
+
tmp1 = _mm_mul_ps(aVal, yl);
tmp1p = _mm_mul_ps(phase_Val, ylp);
-
+
aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
tmp2 = _mm_mul_ps(aVal, yh);
tmp2p = _mm_mul_ps(phase_Val, yhp);
-
+
z = _mm_addsub_ps(tmp1, tmp2);
phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
-
+
_mm_storeu_ps((float*)cPtr, z);
-
+
aPtr += 2;
cPtr += 2;
}
@@ -238,7 +238,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
*cPtr++ = *aPtr++ * phase_Ptr[0];
phase_Ptr[0] *= (phase_inc);
}
-
+
(*phase) = phase_Ptr[0];
}
@@ -262,7 +262,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = 1;
lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
-
+
unsigned int i, j = 0;
for(i = 0; i < 4; ++i) {
@@ -276,35 +276,35 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
__m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
-
+
phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
const unsigned int fourthPoints = num_points / 4;
-
+
for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
for(j = 0; j < ROTATOR_RELOAD; ++j) {
-
+
aVal = _mm256_load_ps((float*)aPtr);
-
+
yl = _mm256_moveldup_ps(phase_Val);
yh = _mm256_movehdup_ps(phase_Val);
ylp = _mm256_moveldup_ps(inc_Val);
yhp = _mm256_movehdup_ps(inc_Val);
-
+
tmp1 = _mm256_mul_ps(aVal, yl);
tmp1p = _mm256_mul_ps(phase_Val, ylp);
-
+
aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
tmp2 = _mm256_mul_ps(aVal, yh);
tmp2p = _mm256_mul_ps(phase_Val, yhp);
-
+
z = _mm256_addsub_ps(tmp1, tmp2);
phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
-
+
_mm256_store_ps((float*)cPtr, z);
-
+
aPtr += 4;
cPtr += 4;
}
@@ -316,26 +316,26 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
}
for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
aVal = _mm256_load_ps((float*)aPtr);
-
+
yl = _mm256_moveldup_ps(phase_Val);
yh = _mm256_movehdup_ps(phase_Val);
ylp = _mm256_moveldup_ps(inc_Val);
yhp = _mm256_movehdup_ps(inc_Val);
-
+
tmp1 = _mm256_mul_ps(aVal, yl);
tmp1p = _mm256_mul_ps(phase_Val, ylp);
-
+
aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
tmp2 = _mm256_mul_ps(aVal, yh);
tmp2p = _mm256_mul_ps(phase_Val, yhp);
-
+
z = _mm256_addsub_ps(tmp1, tmp2);
phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
-
+
_mm256_store_ps((float*)cPtr, z);
-
+
aPtr += 4;
cPtr += 4;
}
@@ -345,7 +345,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
*cPtr++ = *aPtr++ * phase_Ptr[0];
phase_Ptr[0] *= (phase_inc);
}
-
+
(*phase) = phase_Ptr[0];
}
@@ -369,7 +369,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = 1;
lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
-
+
unsigned int i, j = 0;
for(i = 0; i < 4; ++i) {
@@ -383,35 +383,35 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
__m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
-
+
phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
const unsigned int fourthPoints = num_points / 4;
-
+
for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
for(j = 0; j < ROTATOR_RELOAD; ++j) {
-
+
aVal = _mm256_loadu_ps((float*)aPtr);
-
+
yl = _mm256_moveldup_ps(phase_Val);
yh = _mm256_movehdup_ps(phase_Val);
ylp = _mm256_moveldup_ps(inc_Val);
yhp = _mm256_movehdup_ps(inc_Val);
-
+
tmp1 = _mm256_mul_ps(aVal, yl);
tmp1p = _mm256_mul_ps(phase_Val, ylp);
-
+
aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
tmp2 = _mm256_mul_ps(aVal, yh);
tmp2p = _mm256_mul_ps(phase_Val, yhp);
-
+
z = _mm256_addsub_ps(tmp1, tmp2);
phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
-
+
_mm256_storeu_ps((float*)cPtr, z);
-
+
aPtr += 4;
cPtr += 4;
}
@@ -423,26 +423,26 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
}
for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
aVal = _mm256_loadu_ps((float*)aPtr);
-
+
yl = _mm256_moveldup_ps(phase_Val);
yh = _mm256_movehdup_ps(phase_Val);
ylp = _mm256_moveldup_ps(inc_Val);
yhp = _mm256_movehdup_ps(inc_Val);
-
+
tmp1 = _mm256_mul_ps(aVal, yl);
tmp1p = _mm256_mul_ps(phase_Val, ylp);
-
+
aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
tmp2 = _mm256_mul_ps(aVal, yh);
tmp2p = _mm256_mul_ps(phase_Val, yhp);
-
+
z = _mm256_addsub_ps(tmp1, tmp2);
phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
-
+
_mm256_storeu_ps((float*)cPtr, z);
-
+
aPtr += 4;
cPtr += 4;
}
@@ -452,11 +452,11 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
*cPtr++ = *aPtr++ * phase_Ptr[0];
phase_Ptr[0] *= (phase_inc);
}
-
+
(*phase) = phase_Ptr[0];
}
-
+
#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */
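
Taken together, these hunks are whitespace-only, but the kernel they touch deserves a summary: out[n] = in[n] * phase, with phase advancing by phase_inc every sample. Repeated complex multiplies let |phase| drift away from 1.0, which is why the loops are blocked into ROTATOR_RELOAD-sample chunks: the phase can be pulled back onto the unit circle at each block boundary. A hedged standalone sketch in C99 complex; the ROTATOR_RELOAD value and VOLK's exact renormalization step are assumptions here:

#include <complex.h>
#include <math.h>

#define ROTATOR_RELOAD 512  /* assumed; see the VOLK header for the real value */

/* Sketch of the rotator: multiply by an accumulating phase, renormalizing
 * the phase every ROTATOR_RELOAD samples to contain rounding error. */
static void rotator_sketch(float complex* out, const float complex* in,
                           float complex phase_inc, float complex* phase,
                           unsigned int num_points)
{
    unsigned int n = 0;
    while (n < num_points) {
        unsigned int block = num_points - n;
        if (block > ROTATOR_RELOAD) block = ROTATOR_RELOAD;
        for (unsigned int j = 0; j < block; ++j, ++n) {
            out[n] = in[n] * (*phase);
            *phase *= phase_inc;
        }
        *phase /= cabsf(*phase);  /* pull phase back onto the unit circle */
    }
}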