diff options
author | Abhishek Bhowmick <abhowmick22@gmail.com> | 2014-06-14 18:31:03 +0530 |
---|---|---|
committer | Tom Rondeau <tom@trondeau.com> | 2014-10-15 10:49:54 -0400 |
commit | 2fdd530e75c345f6a89e2368c1ccced2fc1ef487 (patch) | |
tree | c389ea7f4759e91afa852bbc2fc32ce17087a830 | |
parent | 991d1f2fa25b15ec04ab1ab6f5c78e76aac6a818 (diff) |
volk: Added tan kernel.
-rw-r--r-- | volk/apps/volk_profile.cc | 1 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_tan_32f.h | 215 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 1 |
3 files changed, 217 insertions, 0 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 1c31addb84..46929b3df3 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -167,6 +167,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_cos_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32f_tan_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204602, 50, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, benchmark_mode, kernel_regex); diff --git a/volk/kernels/volk/volk_32f_tan_32f.h b/volk/kernels/volk/volk_32f_tan_32f.h new file mode 100644 index 0000000000..48611b0698 --- /dev/null +++ b/volk/kernels/volk/volk_32f_tan_32f.h @@ -0,0 +1,215 @@ +#include <stdio.h> +#include <math.h> +#include <inttypes.h> + +#ifndef INCLUDED_volk_32f_tan_32f_a_H +#define INCLUDED_volk_32f_tan_32f_a_H + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> +/*! + \brief Computes tangent of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which tangent is to be computed +*/ +static inline void volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){ + + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + unsigned int i = 0; + + __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; + __m128 sine, cosine, tangent, condition1, condition2, condition3; + __m128i q, r, ones, twos, fours; + + m4pi = _mm_set1_ps(1.273239545); + pio4A = _mm_set1_ps(0.78515625); + pio4B = _mm_set1_ps(0.241876e-3); + ffours = _mm_set1_ps(4.0); + ftwos = _mm_set1_ps(2.0); + fones = _mm_set1_ps(1.0); + fzeroes = _mm_setzero_ps(); + ones = _mm_set1_epi32(1); + twos = _mm_set1_epi32(2); + fours = _mm_set1_epi32(4); + + cp1 = _mm_set1_ps(1.0); + cp2 = _mm_set1_ps(0.83333333e-1); + cp3 = _mm_set1_ps(0.2777778e-2); + cp4 = _mm_set1_ps(0.49603e-4); + cp5 = _mm_set1_ps(0.551e-6); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); + q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi)); + r = _mm_add_epi32(q, _mm_and_si128(q, ones)); + + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); + + s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_mul_ps(s, s); + // Evaluate Taylor series + s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); + + for(i = 0; i < 3; i++) s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + s = _mm_div_ps(s, ftwos); + + sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); + cosine = _mm_sub_ps(fones, s); + + condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); + condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); + condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); + + __m128 temp = cosine; + cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); + sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); + sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); + cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); + tangent = _mm_div_ps(sine, cosine); + _mm_store_ps(bPtr, tangent); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *bPtr++ = tan(*aPtr++); + } +} + +#endif /* LV_HAVE_SSE4_1 for aligned */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Computes tangent of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which tangent is to be computed +*/ +static inline void volk_32f_tan_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points){ + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(; number < num_points; number++){ + *bPtr++ = tan(*aPtr++); + } + +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_tan_32f_a_H */ + +#ifndef INCLUDED_volk_32f_tan_32f_u_H +#define INCLUDED_volk_32f_tan_32f_u_H + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> +/*! + \brief Computes tangent of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which tangent is to be computed +*/ +static inline void volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){ + + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + unsigned int i = 0; + + __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; + __m128 sine, cosine, tangent, condition1, condition2, condition3; + __m128i q, r, ones, twos, fours; + + m4pi = _mm_set1_ps(1.273239545); + pio4A = _mm_set1_ps(0.78515625); + pio4B = _mm_set1_ps(0.241876e-3); + ffours = _mm_set1_ps(4.0); + ftwos = _mm_set1_ps(2.0); + fones = _mm_set1_ps(1.0); + fzeroes = _mm_setzero_ps(); + ones = _mm_set1_epi32(1); + twos = _mm_set1_epi32(2); + fours = _mm_set1_epi32(4); + + cp1 = _mm_set1_ps(1.0); + cp2 = _mm_set1_ps(0.83333333e-1); + cp3 = _mm_set1_ps(0.2777778e-2); + cp4 = _mm_set1_ps(0.49603e-4); + cp5 = _mm_set1_ps(0.551e-6); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); + q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi)); + r = _mm_add_epi32(q, _mm_and_si128(q, ones)); + + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); + + s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_mul_ps(s, s); + // Evaluate Taylor series + s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); + + for(i = 0; i < 3; i++) s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + s = _mm_div_ps(s, ftwos); + + sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); + cosine = _mm_sub_ps(fones, s); + + condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); + condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); + condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); + + __m128 temp = cosine; + cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); + sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); + sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); + cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); + tangent = _mm_div_ps(sine, cosine); + _mm_storeu_ps(bPtr, tangent); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *bPtr++ = tan(*aPtr++); + } +} + +#endif /* LV_HAVE_SSE4_1 for unaligned */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Computes tangent of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which tangent is to be computed +*/ +static inline void volk_32f_tan_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){ + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(; number < num_points; number++){ + *bPtr++ = tan(*aPtr++); + } + +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_tan_32f_u_H */ diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 3510f3b1ef..971c82fbb5 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -49,6 +49,7 @@ VOLK_RUN_TESTS(volk_32f_expfast_32f, 1e-1, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_x2_pow_32f, 1e-2, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_sin_32f, 1e-6, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_cos_32f, 1e-6, 0, 20462, 1); +VOLK_RUN_TESTS(volk_32f_tan_32f, 1e-6, 0, 20462, 1); VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20462, 1); VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20462, 1); |