diff options
author | Abhishek Bhowmick <abhowmick22@gmail.com> | 2014-06-14 20:40:51 +0530 |
---|---|---|
committer | Tom Rondeau <tom@trondeau.com> | 2014-10-15 10:49:54 -0400 |
commit | ebeab0988c17b08bd1568a53a5a8713616024816 (patch) | |
tree | a2938c6ad1aa0773b621ba5e34522154748a6cc7 | |
parent | 2fdd530e75c345f6a89e2368c1ccced2fc1ef487 (diff) |
volk: Added atan, asin, acos kernels.
-rw-r--r-- | volk/apps/volk_profile.cc | 3 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_acos_32f.h | 186 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_asin_32f.h | 178 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_atan_32f.h | 176 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 3 |
5 files changed, 546 insertions, 0 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 46929b3df3..5030836d43 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -168,6 +168,9 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_cos_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_tan_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32f_atan_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32f_asin_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32f_acos_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204602, 50, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, benchmark_mode, kernel_regex); diff --git a/volk/kernels/volk/volk_32f_acos_32f.h b/volk/kernels/volk/volk_32f_acos_32f.h new file mode 100644 index 0000000000..deba615bc9 --- /dev/null +++ b/volk/kernels/volk/volk_32f_acos_32f.h @@ -0,0 +1,186 @@ +#include <stdio.h> +#include <math.h> +#include <inttypes.h> + +/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/ +#define TERMS 2 + +#ifndef INCLUDED_volk_32f_acos_32f_a_H +#define INCLUDED_volk_32f_acos_32f_a_H + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> +/*! 
+  \brief Computes arccosine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arccosine is to be computed
+*/
+static inline void volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4; /* number of complete groups of four floats */
+    int i, j;
+
+    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pi = _mm_set1_ps(3.14159265358979323846);
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_load_ps(aPtr); /* aligned load of four inputs */
+        d = aVal; /* keep the raw input: its sign selects the final +pi correction */
+        aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal); /* aVal = sqrt((1+a)*(1-a)) / a = tan(acos(a)); NOTE(review): a zero input divides by zero, and the -inf produced by -0.0f becomes NaN in the |z| step below (-inf - (-inf)) -- confirm */
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes); /* mask of negative lanes */
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); /* z = |z|, formed as z - 2*z in the negative lanes */
+        x = z;
+        condition = _mm_cmplt_ps(z, fones); /* mask of lanes with z < 1 */
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x = z + (1/z - z), i.e. 1/z, where z < 1, else z: x is the cotangent of an angle t no larger than pi/4 */
+
+        for(i = 0; i < 2; i++) x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); /* halve t twice: cot(t/2) = cot(t) + sqrt(1 + cot(t)^2) */
+        x = _mm_div_ps(fones, x); /* x = tan(t/4) */
+        y = fzeroes;
+        for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); /* Horner form of sum over j < TERMS of (-1)^j x^(2j) / (2j+1); NOTE(review): the pow()-based coefficients do not depend on the data and could be hoisted out of the outer loop */
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* y = 4 * x * series ~= 4 * atan(tan(t/4)) = t */
+        condition = _mm_cmpgt_ps(z, fones); /* mask of lanes with z > 1 */
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* where z > 1: y = pi/2 - y, since atan(z) = pi/2 - atan(1/z) */
+        arccosine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes); /* lanes whose tangent was negative */
+        arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); /* negate there, giving atan(aVal) */
+        condition = _mm_cmplt_ps(d, fzeroes); /* lanes whose input was negative */
+        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); /* shift those lanes by +pi */
+
+        _mm_store_ps(bPtr, arccosine); /* aligned store of four results */
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){ /* leftover num_points % 4 values */
+        *bPtr++ = acos(*aPtr++); /* libm acos(), result narrowed to float on store */
+    }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes arccosine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arccosine is to be computed
+  \note Scalar path: one libm acos() call per element, no SIMD.
+*/
+static inline void volk_32f_acos_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points){
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+        *bPtr++ = acos(*aPtr++);
+    }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_acos_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_acos_32f_u_H
+#define INCLUDED_volk_32f_acos_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arccosine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arccosine is to be computed
+  \note Unaligned variant: loads and stores use _mm_loadu_ps/_mm_storeu_ps.
+        Four values are processed per iteration; the last num_points % 4
+        values go through libm acos().
+*/
+static inline void volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4; /* number of complete groups of four floats */
+    int i, j;
+
+    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pi = _mm_set1_ps(3.14159265358979323846);
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_loadu_ps(aPtr); /* unaligned load of four inputs */
+        d = aVal; /* keep the raw input: its sign selects the final +pi correction */
+        aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal); /* aVal = sqrt((1+a)*(1-a)) / a = tan(acos(a)); NOTE(review): a zero input divides by zero, and the -inf produced by -0.0f becomes NaN in the |z| step below (-inf - (-inf)) -- confirm */
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes); /* mask of negative lanes */
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); /* z = |z|, formed as z - 2*z in the negative lanes */
+        x = z;
+        condition = _mm_cmplt_ps(z, fones); /* mask of lanes with z < 1 */
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x = z + (1/z - z), i.e. 1/z, where z < 1, else z: x is the cotangent of an angle t no larger than pi/4 */
+
+        for(i = 0; i < 2; i++) x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); /* halve t twice: cot(t/2) = cot(t) + sqrt(1 + cot(t)^2) */
+        x = _mm_div_ps(fones, x); /* x = tan(t/4) */
+        y = fzeroes;
+        for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); /* Horner form of sum over j < TERMS of (-1)^j x^(2j) / (2j+1); NOTE(review): the pow()-based coefficients do not depend on the data and could be hoisted out of the outer loop */
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* y = 4 * x * series ~= 4 * atan(tan(t/4)) = t */
+        condition = _mm_cmpgt_ps(z, fones); /* mask of lanes with z > 1 */
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* where z > 1: y = pi/2 - y, since atan(z) = pi/2 - atan(1/z) */
+        arccosine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes); /* lanes whose tangent was negative */
+        arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); /* negate there, giving atan(aVal) */
+        condition = _mm_cmplt_ps(d, fzeroes); /* lanes whose input was negative */
+        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); /* shift those lanes by +pi */
+
+        _mm_storeu_ps(bPtr, arccosine); /* unaligned store of four results */
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){ /* leftover num_points % 4 values */
+        *bPtr++ = acos(*aPtr++); /* libm acos(), result narrowed to float on store */
+    }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes arccosine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arccosine is to be computed
+*/
+static inline void volk_32f_acos_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+        *bPtr++ = acos(*aPtr++); /* scalar path: one libm acos() call per element */
+    }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_acos_32f_u_H */
diff --git a/volk/kernels/volk/volk_32f_asin_32f.h b/volk/kernels/volk/volk_32f_asin_32f.h
new file mode 100644
index 0000000000..976aabc258
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_asin_32f.h
@@ -0,0 +1,178 @@
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+/* Number of series terms the SSE kernels sum (j = 0 .. TERMS-1); increase this
+   for more accuracy. NOTE(review): defined before the include guard and under
+   a generic name, so every includer sees it -- confirm this is intended. */
+#define TERMS 2
+
+#ifndef INCLUDED_volk_32f_asin_32f_a_H
+#define INCLUDED_volk_32f_asin_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arcsine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arcsine is to be computed
+*/
+static inline void volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4; /* number of complete groups of four floats */
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arcsine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_load_ps(aPtr); /* aligned load of four inputs */
+        aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); /* aVal = a / sqrt((1+a)*(1-a)) = tan(asin(a)); NOTE(review): a = -1 yields -1/0 = -inf, which the |z| step below turns into NaN (-inf - (-inf)) -- confirm */
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes); /* mask of negative lanes */
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); /* z = |z|, formed as z - 2*z in the negative lanes */
+        x = z;
+        condition = _mm_cmplt_ps(z, fones); /* mask of lanes with z < 1 */
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x = z + (1/z - z), i.e. 1/z, where z < 1, else z: x is the cotangent of an angle t no larger than pi/4 */
+
+        for(i = 0; i < 2; i++) x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); /* halve t twice: cot(t/2) = cot(t) + sqrt(1 + cot(t)^2) */
+        x = _mm_div_ps(fones, x); /* x = tan(t/4) */
+        y = fzeroes;
+        for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); /* Horner form of sum over j < TERMS of (-1)^j x^(2j) / (2j+1); NOTE(review): the pow()-based coefficients do not depend on the data and could be hoisted out of the outer loop */
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* y = 4 * x * series ~= 4 * atan(tan(t/4)) = t */
+        condition = _mm_cmpgt_ps(z, fones); /* mask of lanes with z > 1 */
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* where z > 1: y = pi/2 - y, since atan(z) = pi/2 - atan(1/z) */
+        arcsine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes); /* lanes whose tangent (and hence input) was negative */
+        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); /* negate the result in those lanes */
+
+        _mm_store_ps(bPtr, arcsine); /* aligned store of four results */
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){ /* leftover num_points % 4 values */
+        *bPtr++ = asin(*aPtr++); /* libm asin(), result narrowed to float on store */
+    }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes arcsine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arcsine is to be computed
+*/
+static inline void volk_32f_asin_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points){
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+        *bPtr++ = asin(*aPtr++); /* scalar path: one libm asin() call per element */
+    }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_asin_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_asin_32f_u_H
+#define INCLUDED_volk_32f_asin_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arcsine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arcsine is to be computed
+  \note Unaligned variant: loads and stores use _mm_loadu_ps/_mm_storeu_ps.
+        Four values are processed per iteration; the last num_points % 4
+        values go through libm asin().
+*/
+static inline void volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4; /* number of complete groups of four floats */
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arcsine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_loadu_ps(aPtr); /* unaligned load of four inputs */
+        aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); /* aVal = a / sqrt((1+a)*(1-a)) = tan(asin(a)); NOTE(review): a = -1 yields -1/0 = -inf, which the |z| step below turns into NaN (-inf - (-inf)) -- confirm */
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes); /* mask of negative lanes */
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); /* z = |z|, formed as z - 2*z in the negative lanes */
+        x = z;
+        condition = _mm_cmplt_ps(z, fones); /* mask of lanes with z < 1 */
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x = z + (1/z - z), i.e. 1/z, where z < 1, else z: x is the cotangent of an angle t no larger than pi/4 */
+
+        for(i = 0; i < 2; i++) x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); /* halve t twice: cot(t/2) = cot(t) + sqrt(1 + cot(t)^2) */
+        x = _mm_div_ps(fones, x); /* x = tan(t/4) */
+        y = fzeroes;
+        for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); /* Horner form of sum over j < TERMS of (-1)^j x^(2j) / (2j+1); NOTE(review): the pow()-based coefficients do not depend on the data and could be hoisted out of the outer loop */
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* y = 4 * x * series ~= 4 * atan(tan(t/4)) = t */
+        condition = _mm_cmpgt_ps(z, fones); /* mask of lanes with z > 1 */
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* where z > 1: y = pi/2 - y, since atan(z) = pi/2 - atan(1/z) */
+        arcsine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes); /* lanes whose tangent (and hence input) was negative */
+        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); /* negate the result in those lanes */
+
+        _mm_storeu_ps(bPtr, arcsine); /* unaligned store of four results */
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){ /* leftover num_points % 4 values */
+        *bPtr++ = asin(*aPtr++); /* libm asin(), result narrowed to float on store */
+    }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes arcsine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arcsine is to be computed
+  \note Scalar path: one libm asin() call per element, no SIMD.
+*/
+static inline void volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+        *bPtr++ = asin(*aPtr++);
+    }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_asin_32f_u_H */
diff --git a/volk/kernels/volk/volk_32f_atan_32f.h b/volk/kernels/volk/volk_32f_atan_32f.h
new file mode 100644
index 0000000000..a60e2b8c5f
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_atan_32f.h
@@ -0,0 +1,176 @@
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+/* Number of series terms the SSE kernels sum (j = 0 .. TERMS-1); increase this
+   for more accuracy. NOTE(review): defined before the include guard and under
+   a generic name, so every includer sees it -- confirm this is intended. */
+#define TERMS 2
+
+#ifndef INCLUDED_volk_32f_atan_32f_a_H
+#define INCLUDED_volk_32f_atan_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arctangent of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arctangent is to be computed
+*/
+static inline void volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4; /* number of complete groups of four floats */
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arctangent;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_load_ps(aPtr); /* aligned load of four inputs */
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes); /* mask of negative lanes */
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); /* z = |z|, formed as z - 2*z in the negative lanes; NOTE(review): a -inf lane gives -inf - (-inf) = NaN here -- confirm */
+        x = z;
+        condition = _mm_cmplt_ps(z, fones); /* mask of lanes with z < 1 */
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x = z + (1/z - z), i.e. 1/z, where z < 1, else z: x is the cotangent of an angle t no larger than pi/4 */
+
+        for(i = 0; i < 2; i++) x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); /* halve t twice: cot(t/2) = cot(t) + sqrt(1 + cot(t)^2) */
+        x = _mm_div_ps(fones, x); /* x = tan(t/4) */
+        y = fzeroes;
+        for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); /* Horner form of sum over j < TERMS of (-1)^j x^(2j) / (2j+1); NOTE(review): the pow()-based coefficients do not depend on the data and could be hoisted out of the outer loop */
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* y = 4 * x * series ~= 4 * atan(tan(t/4)) = t */
+        condition = _mm_cmpgt_ps(z, fones); /* mask of lanes with z > 1 */
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* where z > 1: y = pi/2 - y, since atan(z) = pi/2 - atan(1/z) */
+        arctangent = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes); /* lanes whose input was negative */
+        arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); /* atan is odd: negate the result in those lanes */
+
+        _mm_store_ps(bPtr, arctangent); /* aligned store of four results */
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){ /* leftover num_points % 4 values */
+        *bPtr++ = atan(*aPtr++); /* libm atan(), result narrowed to float on store */
+    }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes arctangent of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arctangent is to be computed
+*/
+static inline void volk_32f_atan_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points){
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+        *bPtr++ = atan(*aPtr++); /* scalar path: one libm atan() call per element */
+    }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_atan_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_atan_32f_u_H
+#define INCLUDED_volk_32f_atan_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arctangent of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arctangent is to be computed
+  \note Unaligned variant: loads and stores use _mm_loadu_ps/_mm_storeu_ps.
+        Four values are processed per iteration; the last num_points % 4
+        values go through libm atan().
+*/
+static inline void volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4; /* number of complete groups of four floats */
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arctangent;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_loadu_ps(aPtr); /* unaligned load of four inputs */
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes); /* mask of negative lanes */
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); /* z = |z|, formed as z - 2*z in the negative lanes; NOTE(review): a -inf lane gives -inf - (-inf) = NaN here -- confirm */
+        x = z;
+        condition = _mm_cmplt_ps(z, fones); /* mask of lanes with z < 1 */
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x = z + (1/z - z), i.e. 1/z, where z < 1, else z: x is the cotangent of an angle t no larger than pi/4 */
+
+        for(i = 0; i < 2; i++) x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); /* halve t twice: cot(t/2) = cot(t) + sqrt(1 + cot(t)^2) */
+        x = _mm_div_ps(fones, x); /* x = tan(t/4) */
+        y = fzeroes;
+        for(j = TERMS - 1; j >= 0; j--) y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); /* Horner form of sum over j < TERMS of (-1)^j x^(2j) / (2j+1); NOTE(review): the pow()-based coefficients do not depend on the data and could be hoisted out of the outer loop */
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* y = 4 * x * series ~= 4 * atan(tan(t/4)) = t */
+        condition = _mm_cmpgt_ps(z, fones); /* mask of lanes with z > 1 */
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* where z > 1: y = pi/2 - y, since atan(z) = pi/2 - atan(1/z) */
+        arctangent = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes); /* lanes whose input was negative */
+        arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); /* atan is odd: negate the result in those lanes */
+
+        _mm_storeu_ps(bPtr, arctangent); /* unaligned store of four results */
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){ /* leftover num_points % 4 values */
+        *bPtr++ = atan(*aPtr++); /* libm atan(), result narrowed to float on store */
+    }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes arctangent of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which arctangent is to be computed
+  \note Scalar path: one libm atan() call per element, no SIMD.
+*/
+static inline void volk_32f_atan_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+        *bPtr++ = atan(*aPtr++);
+    }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_atan_32f_u_H */
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 971c82fbb5..7d1826b719 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -50,6 +50,9 @@ VOLK_RUN_TESTS(volk_32f_x2_pow_32f, 1e-2, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_sin_32f, 1e-6, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_cos_32f, 1e-6, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_tan_32f, 1e-6, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_atan_32f, 1e-3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_asin_32f, 1e-3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_acos_32f, 1e-3, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20462, 1);
VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20462, 1); |