diff options
author | Tom Rondeau <tom@trondeau.com> | 2014-09-02 11:36:11 -0400 |
---|---|---|
committer | Tom Rondeau <tom@trondeau.com> | 2014-09-02 11:36:11 -0400 |
commit | 01deede32858ef9e2fe4cc937f3245b5b0e6d7c9 (patch) | |
tree | b59617bd53d6fc63b4982656982db3baa46cbaa6 | |
parent | 1a1f502b5dc4fa10d471a49795dddd54a9b4e444 (diff) |
volk: adding tanh kernel with support for sse and avx.
There are two generic kernels, one using tanh and another using a series approximation. The SIMD code uses the series approximation.
-rw-r--r-- | volk/apps/volk_profile.cc | 1 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_tanh_32f.h | 296 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 1 |
3 files changed, 298 insertions, 0 deletions
/*
 * volk/kernels/volk/volk_32f_tanh_32f.h -- tanh kernels for 32-bit floats.
 *
 * Registration lines carried by the same patch:
 *   volk/apps/volk_profile.cc:
 *     VOLK_PROFILE(volk_32f_tanh_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
 *   volk/lib/testqa.cc:
 *     VOLK_RUN_TESTS(volk_32f_tanh_32f, 1e-6, 0, 20462, 1);
 */
#ifndef INCLUDED_volk_32f_tanh_32f_a_H
#define INCLUDED_volk_32f_tanh_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

/*!
  \brief Rational approximation of tanh(x) for a single sample.

  Evaluates
    x * (135135 + 17325 x^2 + 378 x^4 + x^6) / (135135 + 62370 x^2 + 3150 x^4 + 28 x^6)
  and saturates to +1 for x > 4.97 and to -1 for x <= -4.97. The original
  kernel documentation states the result is within 1e-6 of tanh.

  \param x Input sample
  \return Approximated tanh(x)
*/
static inline float volk_tanh_rational_32f(float x)
{
  /* Outside this range the ratio of polynomials drifts away from +/-1
     (it grows like x/28), so the result is pinned instead. */
  if(x > 4.97)
    return 1.0f;
  if(x <= -4.97)
    return -1.0f;

  const float x2 = x * x;
  const float num = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
  const float den = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
  return num / den;
}

/*!
  \brief Applies volk_tanh_rational_32f() to a run of samples.

  Shared by the series kernel and by the scalar tails of the SIMD kernels.
  Input and output are indexed in lock step, so a saturated sample can never
  stall the input position.

  \param cVector The vector where the results will be stored
  \param aVector Input vector
  \param num_points The number of values to calculate
*/
static inline void volk_tanh_rational_run_32f(float* cVector, const float* aVector,
                                              unsigned int num_points)
{
  unsigned int i;
  for(i = 0; i < num_points; i++) {
    cVector[i] = volk_tanh_rational_32f(aVector[i]);
  }
}


#ifdef LV_HAVE_GENERIC
/*!
  \brief Calculates tanh(x) using the C library tanh()
  \param cVector The vector where the results will be stored
  \param aVector Input vector
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
                                             unsigned int num_points)
{
  unsigned int number;
  for(number = 0; number < num_points; number++) {
    cVector[number] = (float)tanh(aVector[number]);
  }
}

#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_GENERIC
/*!
  \brief Calculates tanh(x) using a rational approximation, documented as good to within 1e-6 of the actual tanh.
  \param cVector The vector where the results will be stored
  \param aVector Input vector
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_series(float* cVector, const float* aVector,
                                            unsigned int num_points)
{
  volk_tanh_rational_run_32f(cVector, aVector, num_points);
}

#endif /* LV_HAVE_GENERIC */



#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
  \brief Calculates tanh(x) using a rational approximation (aligned SSE), documented as good to within 1e-6 of the actual tanh.

  Uses aligned loads and stores on aVector and cVector.

  \param cVector The vector where the results will be stored
  \param aVector Input vector
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  const __m128 const1 = _mm_set_ps1(135135.0f);
  const __m128 const2 = _mm_set_ps1(17325.0f);
  const __m128 const3 = _mm_set_ps1(378.0f);
  const __m128 const4 = _mm_set_ps1(62370.0f);
  const __m128 const5 = _mm_set_ps1(3150.0f);
  const __m128 const6 = _mm_set_ps1(28.0f);
  /* Saturation bounds and values (single-precision thresholds). */
  const __m128 upper = _mm_set_ps1(4.97f);
  const __m128 lower = _mm_set_ps1(-4.97f);
  const __m128 one = _mm_set_ps1(1.0f);
  const __m128 minusOne = _mm_set_ps1(-1.0f);

  for(; number < quarterPoints; number++) {
    const __m128 aVal = _mm_load_ps(aPtr);
    const __m128 x2 = _mm_mul_ps(aVal, aVal);
    const __m128 a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    const __m128 b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
    __m128 cVal = _mm_div_ps(a, b);

    /* Replace out-of-range lanes with +/-1 so the vector path agrees with
       the scalar path; SSE1 has no blend, so select with and/andnot/or. */
    const __m128 isHigh = _mm_cmpgt_ps(aVal, upper);
    const __m128 isLow = _mm_cmple_ps(aVal, lower);
    cVal = _mm_or_ps(_mm_and_ps(isHigh, one), _mm_andnot_ps(isHigh, cVal));
    cVal = _mm_or_ps(_mm_and_ps(isLow, minusOne), _mm_andnot_ps(isLow, cVal));

    _mm_store_ps(cPtr, cVal); // Store the results back into the C container

    aPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the last num_points % 4 samples. */
  number = quarterPoints * 4;
  volk_tanh_rational_run_32f(cPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_SSE */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
  \brief Calculates tanh(x) using a rational approximation (aligned AVX), documented as good to within 1e-6 of the actual tanh.

  Uses aligned loads and stores on aVector and cVector.

  \param cVector The vector where the results will be stored
  \param aVector Input vector
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  const __m256 const1 = _mm256_set1_ps(135135.0f);
  const __m256 const2 = _mm256_set1_ps(17325.0f);
  const __m256 const3 = _mm256_set1_ps(378.0f);
  const __m256 const4 = _mm256_set1_ps(62370.0f);
  const __m256 const5 = _mm256_set1_ps(3150.0f);
  const __m256 const6 = _mm256_set1_ps(28.0f);
  /* Saturation bounds and values (single-precision thresholds). */
  const __m256 upper = _mm256_set1_ps(4.97f);
  const __m256 lower = _mm256_set1_ps(-4.97f);
  const __m256 one = _mm256_set1_ps(1.0f);
  const __m256 minusOne = _mm256_set1_ps(-1.0f);

  for(; number < eighthPoints; number++) {
    const __m256 aVal = _mm256_load_ps(aPtr);
    const __m256 x2 = _mm256_mul_ps(aVal, aVal);
    const __m256 a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
    const __m256 b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
    __m256 cVal = _mm256_div_ps(a, b);

    /* Replace out-of-range lanes with +/-1 so the vector path agrees with
       the scalar path. */
    const __m256 isHigh = _mm256_cmp_ps(aVal, upper, _CMP_GT_OS);
    const __m256 isLow = _mm256_cmp_ps(aVal, lower, _CMP_LE_OS);
    cVal = _mm256_blendv_ps(cVal, one, isHigh);
    cVal = _mm256_blendv_ps(cVal, minusOne, isLow);

    _mm256_store_ps(cPtr, cVal); // Store the results back into the C container

    aPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail for the last num_points % 8 samples. */
  number = eighthPoints * 8;
  volk_tanh_rational_run_32f(cPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_AVX */




#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
  \brief Calculates tanh(x) using a rational approximation (unaligned SSE), documented as good to within 1e-6 of the actual tanh.

  Uses unaligned loads and stores on aVector and cVector.

  \param cVector The vector where the results will be stored
  \param aVector Input vector
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  const __m128 const1 = _mm_set_ps1(135135.0f);
  const __m128 const2 = _mm_set_ps1(17325.0f);
  const __m128 const3 = _mm_set_ps1(378.0f);
  const __m128 const4 = _mm_set_ps1(62370.0f);
  const __m128 const5 = _mm_set_ps1(3150.0f);
  const __m128 const6 = _mm_set_ps1(28.0f);
  /* Saturation bounds and values (single-precision thresholds). */
  const __m128 upper = _mm_set_ps1(4.97f);
  const __m128 lower = _mm_set_ps1(-4.97f);
  const __m128 one = _mm_set_ps1(1.0f);
  const __m128 minusOne = _mm_set_ps1(-1.0f);

  for(; number < quarterPoints; number++) {
    const __m128 aVal = _mm_loadu_ps(aPtr);
    const __m128 x2 = _mm_mul_ps(aVal, aVal);
    const __m128 a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    const __m128 b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
    __m128 cVal = _mm_div_ps(a, b);

    /* Replace out-of-range lanes with +/-1 so the vector path agrees with
       the scalar path; SSE1 has no blend, so select with and/andnot/or. */
    const __m128 isHigh = _mm_cmpgt_ps(aVal, upper);
    const __m128 isLow = _mm_cmple_ps(aVal, lower);
    cVal = _mm_or_ps(_mm_and_ps(isHigh, one), _mm_andnot_ps(isHigh, cVal));
    cVal = _mm_or_ps(_mm_and_ps(isLow, minusOne), _mm_andnot_ps(isLow, cVal));

    _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container

    aPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the last num_points % 4 samples. */
  number = quarterPoints * 4;
  volk_tanh_rational_run_32f(cPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_SSE */



#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
  \brief Calculates tanh(x) using a rational approximation (unaligned AVX), documented as good to within 1e-6 of the actual tanh.

  Uses unaligned loads and stores on aVector and cVector.

  \param cVector The vector where the results will be stored
  \param aVector Input vector
  \param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  const __m256 const1 = _mm256_set1_ps(135135.0f);
  const __m256 const2 = _mm256_set1_ps(17325.0f);
  const __m256 const3 = _mm256_set1_ps(378.0f);
  const __m256 const4 = _mm256_set1_ps(62370.0f);
  const __m256 const5 = _mm256_set1_ps(3150.0f);
  const __m256 const6 = _mm256_set1_ps(28.0f);
  /* Saturation bounds and values (single-precision thresholds). */
  const __m256 upper = _mm256_set1_ps(4.97f);
  const __m256 lower = _mm256_set1_ps(-4.97f);
  const __m256 one = _mm256_set1_ps(1.0f);
  const __m256 minusOne = _mm256_set1_ps(-1.0f);

  for(; number < eighthPoints; number++) {
    const __m256 aVal = _mm256_loadu_ps(aPtr);
    const __m256 x2 = _mm256_mul_ps(aVal, aVal);
    const __m256 a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
    const __m256 b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
    __m256 cVal = _mm256_div_ps(a, b);

    /* Replace out-of-range lanes with +/-1 so the vector path agrees with
       the scalar path. */
    const __m256 isHigh = _mm256_cmp_ps(aVal, upper, _CMP_GT_OS);
    const __m256 isLow = _mm256_cmp_ps(aVal, lower, _CMP_LE_OS);
    cVal = _mm256_blendv_ps(cVal, one, isHigh);
    cVal = _mm256_blendv_ps(cVal, minusOne, isLow);

    _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

    aPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail for the last num_points % 8 samples. */
  number = eighthPoints * 8;
  volk_tanh_rational_run_32f(cPtr, aPtr, num_points - number);
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_tanh_32f_a_H */