volk: adding tanh kernel with support for sse and avx.

There are two generic kernels, one using tanh and another using a series approximation. The SIMD code uses the series approximation.
author: Tom Rondeau <tom@trondeau.com> 2014-09-02 11:36:11 -0400
committer: Tom Rondeau <tom@trondeau.com> 2014-09-02 11:36:11 -0400
commit: 01deede32858ef9e2fe4cc937f3245b5b0e6d7c9 (patch)
tree: b59617bd53d6fc63b4982656982db3baa46cbaa6
parent: 1a1f502b5dc4fa10d471a49795dddd54a9b4e444 (diff)
3 files changed, 298 insertions, 0 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 074d1e7be4..1dd0f9ba89 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -178,6 +178,7 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 1.0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_binary_slicer_32i, 0, 1.0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_binary_slicer_8i, 0, 1.0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_32f_tanh_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
 
     // Until we can update the config on a kernel by kernel basis
     // do not overwrite volk_config when using a regex.
diff --git a/volk/kernels/volk/volk_32f_tanh_32f.h b/volk/kernels/volk/volk_32f_tanh_32f.h
new file mode 100644
index 0000000000..3f407d4656
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_tanh_32f.h
@@ -0,0 +1,296 @@
+#ifndef INCLUDED_volk_32f_tanh_32f_a_H
+#define INCLUDED_volk_32f_tanh_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+/*!
+\brief Computes tanh(x) for every input element by calling tanh() from <math.h>.
+\param cVector Output buffer that receives num_points results
+\param aVector Input buffer from which num_points values are read
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
+                                             unsigned int num_points)
+{
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++) {
+    // Each float input is handed to tanh() and the result is stored as float.
+    const float sample = aVector[idx];
+    cVector[idx] = tanh(sample);
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+\brief Calculates tanh(x) using a rational approximation, saturating to +/-1 beyond |x| = 4.97.
+Evaluates x*(135135 + 17325*x^2 + 378*x^4 + x^6) / (135135 + 62370*x^2 + 3150*x^4 + 28*x^6).
+\param cVector The vector where the results will be stored
+\param aVector Input vector
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_series(float* cVector, const float* aVector,
+                                            unsigned int num_points)
+{
+  unsigned int number;
+  for(number = 0; number < num_points; number++) {
+    // Read once and index by position so saturated inputs also advance the input.
+    const float x = aVector[number];
+    if(x > 4.97f)
+      cVector[number] = 1.0f;
+    else if(x < -4.97f)
+      cVector[number] = -1.0f;
+    else {
+      const float x2 = x * x;
+      const float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      const float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      cVector[number] = a / b;
+    }
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+\brief Calculates tanh(x) on aligned buffers with SSE, using a rational approximation.
+Evaluates x*(135135 + 17325*x^2 + 378*x^4 + x^6) / (135135 + 62370*x^2 + 3150*x^4 + 28*x^6)
+and saturates to +/-1 beyond |x| = 4.97 in both the vector loop and the scalar tail.
+\param cVector The vector where the results will be stored (written with aligned stores)
+\param aVector Input vector (read with aligned loads)
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  const unsigned int quarterPoints = num_points / 4;
+  unsigned int number;
+
+  const __m128 const1 = _mm_set_ps1(135135.0f);
+  const __m128 const2 = _mm_set_ps1(17325.0f);
+  const __m128 const3 = _mm_set_ps1(378.0f);
+  const __m128 const4 = _mm_set_ps1(62370.0f);
+  const __m128 const5 = _mm_set_ps1(3150.0f);
+  const __m128 const6 = _mm_set_ps1(28.0f);
+  const __m128 one = _mm_set_ps1(1.0f);
+  const __m128 minusOne = _mm_set_ps1(-1.0f);
+  const __m128 limit = _mm_set_ps1(4.97f);
+  const __m128 negLimit = _mm_set_ps1(-4.97f);
+
+  for(number = 0; number < quarterPoints; number++) {
+    const __m128 aVal = _mm_load_ps(aVector + 4 * number);
+    const __m128 x2 = _mm_mul_ps(aVal, aVal);
+    const __m128 a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+    const __m128 b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+    // The rational form keeps growing past the limit, so saturated lanes are replaced.
+    const __m128 hi = _mm_cmpgt_ps(aVal, limit);
+    const __m128 lo = _mm_cmplt_ps(aVal, negLimit);
+    const __m128 sat = _mm_or_ps(_mm_and_ps(hi, one), _mm_and_ps(lo, minusOne));
+    const __m128 cVal = _mm_or_ps(sat, _mm_andnot_ps(_mm_or_ps(hi, lo), _mm_div_ps(a, b)));
+    _mm_store_ps(cVector + 4 * number, cVal);
+  }
+
+  // Scalar tail; indexing by position advances the input on saturated samples too.
+  for(number = quarterPoints * 4; number < num_points; number++) {
+    const float x = aVector[number];
+    if(x > 4.97f)
+      cVector[number] = 1.0f;
+    else if(x < -4.97f)
+      cVector[number] = -1.0f;
+    else {
+      const float x2 = x * x;
+      const float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      const float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      cVector[number] = a / b;
+    }
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+\brief Calculates tanh(x) on aligned buffers with AVX, using a rational approximation.
+Evaluates x*(135135 + 17325*x^2 + 378*x^4 + x^6) / (135135 + 62370*x^2 + 3150*x^4 + 28*x^6)
+and saturates to +/-1 beyond |x| = 4.97 in both the vector loop and the scalar tail.
+\param cVector The vector where the results will be stored (written with aligned stores)
+\param aVector Input vector (read with aligned loads)
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  const unsigned int eighthPoints = num_points / 8;
+  unsigned int number;
+
+  const __m256 const1 = _mm256_set1_ps(135135.0f);
+  const __m256 const2 = _mm256_set1_ps(17325.0f);
+  const __m256 const3 = _mm256_set1_ps(378.0f);
+  const __m256 const4 = _mm256_set1_ps(62370.0f);
+  const __m256 const5 = _mm256_set1_ps(3150.0f);
+  const __m256 const6 = _mm256_set1_ps(28.0f);
+  const __m256 one = _mm256_set1_ps(1.0f);
+  const __m256 minusOne = _mm256_set1_ps(-1.0f);
+  const __m256 limit = _mm256_set1_ps(4.97f);
+  const __m256 negLimit = _mm256_set1_ps(-4.97f);
+
+  for(number = 0; number < eighthPoints; number++) {
+    const __m256 aVal = _mm256_load_ps(aVector + 8 * number);
+    const __m256 x2 = _mm256_mul_ps(aVal, aVal);
+    const __m256 a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+    const __m256 b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+    // The rational form keeps growing past the limit, so saturated lanes are replaced.
+    const __m256 hi = _mm256_cmp_ps(aVal, limit, _CMP_GT_OQ);
+    const __m256 lo = _mm256_cmp_ps(aVal, negLimit, _CMP_LT_OQ);
+    const __m256 sat = _mm256_or_ps(_mm256_and_ps(hi, one), _mm256_and_ps(lo, minusOne));
+    const __m256 cVal = _mm256_or_ps(sat, _mm256_andnot_ps(_mm256_or_ps(hi, lo), _mm256_div_ps(a, b)));
+    _mm256_store_ps(cVector + 8 * number, cVal);
+  }
+
+  // Scalar tail; indexing by position advances the input on saturated samples too.
+  for(number = eighthPoints * 8; number < num_points; number++) {
+    const float x = aVector[number];
+    if(x > 4.97f)
+      cVector[number] = 1.0f;
+    else if(x < -4.97f)
+      cVector[number] = -1.0f;
+    else {
+      const float x2 = x * x;
+      const float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      const float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      cVector[number] = a / b;
+    }
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+\brief Calculates tanh(x) on unaligned buffers with SSE, using a rational approximation.
+Evaluates x*(135135 + 17325*x^2 + 378*x^4 + x^6) / (135135 + 62370*x^2 + 3150*x^4 + 28*x^6)
+and saturates to +/-1 beyond |x| = 4.97 in both the vector loop and the scalar tail.
+\param cVector The vector where the results will be stored (written with unaligned stores)
+\param aVector Input vector (read with unaligned loads)
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  const unsigned int quarterPoints = num_points / 4;
+  unsigned int number;
+
+  const __m128 const1 = _mm_set_ps1(135135.0f);
+  const __m128 const2 = _mm_set_ps1(17325.0f);
+  const __m128 const3 = _mm_set_ps1(378.0f);
+  const __m128 const4 = _mm_set_ps1(62370.0f);
+  const __m128 const5 = _mm_set_ps1(3150.0f);
+  const __m128 const6 = _mm_set_ps1(28.0f);
+  const __m128 one = _mm_set_ps1(1.0f);
+  const __m128 minusOne = _mm_set_ps1(-1.0f);
+  const __m128 limit = _mm_set_ps1(4.97f);
+  const __m128 negLimit = _mm_set_ps1(-4.97f);
+
+  for(number = 0; number < quarterPoints; number++) {
+    const __m128 aVal = _mm_loadu_ps(aVector + 4 * number);
+    const __m128 x2 = _mm_mul_ps(aVal, aVal);
+    const __m128 a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+    const __m128 b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+    // The rational form keeps growing past the limit, so saturated lanes are replaced.
+    const __m128 hi = _mm_cmpgt_ps(aVal, limit);
+    const __m128 lo = _mm_cmplt_ps(aVal, negLimit);
+    const __m128 sat = _mm_or_ps(_mm_and_ps(hi, one), _mm_and_ps(lo, minusOne));
+    const __m128 cVal = _mm_or_ps(sat, _mm_andnot_ps(_mm_or_ps(hi, lo), _mm_div_ps(a, b)));
+    _mm_storeu_ps(cVector + 4 * number, cVal);
+  }
+
+  // Scalar tail; indexing by position advances the input on saturated samples too.
+  for(number = quarterPoints * 4; number < num_points; number++) {
+    const float x = aVector[number];
+    if(x > 4.97f)
+      cVector[number] = 1.0f;
+    else if(x < -4.97f)
+      cVector[number] = -1.0f;
+    else {
+      const float x2 = x * x;
+      const float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      const float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      cVector[number] = a / b;
+    }
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+\brief Calculates tanh(x) on unaligned buffers with AVX, using a rational approximation.
+Evaluates x*(135135 + 17325*x^2 + 378*x^4 + x^6) / (135135 + 62370*x^2 + 3150*x^4 + 28*x^6)
+and saturates to +/-1 beyond |x| = 4.97 in both the vector loop and the scalar tail.
+\param cVector The vector where the results will be stored (written with unaligned stores)
+\param aVector Input vector (read with unaligned loads)
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  const unsigned int eighthPoints = num_points / 8;
+  unsigned int number;
+
+  const __m256 const1 = _mm256_set1_ps(135135.0f);
+  const __m256 const2 = _mm256_set1_ps(17325.0f);
+  const __m256 const3 = _mm256_set1_ps(378.0f);
+  const __m256 const4 = _mm256_set1_ps(62370.0f);
+  const __m256 const5 = _mm256_set1_ps(3150.0f);
+  const __m256 const6 = _mm256_set1_ps(28.0f);
+  const __m256 one = _mm256_set1_ps(1.0f);
+  const __m256 minusOne = _mm256_set1_ps(-1.0f);
+  const __m256 limit = _mm256_set1_ps(4.97f);
+  const __m256 negLimit = _mm256_set1_ps(-4.97f);
+
+  for(number = 0; number < eighthPoints; number++) {
+    const __m256 aVal = _mm256_loadu_ps(aVector + 8 * number);
+    const __m256 x2 = _mm256_mul_ps(aVal, aVal);
+    const __m256 a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+    const __m256 b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+    // The rational form keeps growing past the limit, so saturated lanes are replaced.
+    const __m256 hi = _mm256_cmp_ps(aVal, limit, _CMP_GT_OQ);
+    const __m256 lo = _mm256_cmp_ps(aVal, negLimit, _CMP_LT_OQ);
+    const __m256 sat = _mm256_or_ps(_mm256_and_ps(hi, one), _mm256_and_ps(lo, minusOne));
+    const __m256 cVal = _mm256_or_ps(sat, _mm256_andnot_ps(_mm256_or_ps(hi, lo), _mm256_div_ps(a, b)));
+    _mm256_storeu_ps(cVector + 8 * number, cVal);
+  }
+
+  // Scalar tail; indexing by position advances the input on saturated samples too.
+  for(number = eighthPoints * 8; number < num_points; number++) {
+    const float x = aVector[number];
+    if(x > 4.97f)
+      cVector[number] = 1.0f;
+    else if(x < -4.97f)
+      cVector[number] = -1.0f;
+    else {
+      const float x2 = x * x;
+      const float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      const float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      cVector[number] = a / b;
+    }
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_tanh_32f_a_H */
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index bc97ad16e5..9d837517f1 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -114,3 +114,4 @@ VOLK_RUN_TESTS(volk_8u_conv_k7_r2puppet_8u, 0, 0, 2060, 1);
 VOLK_RUN_TESTS(volk_32f_invsqrt_32f, 1e-2, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_binary_slicer_32i, 0, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_binary_slicer_8i, 0, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_tanh_32f, 1e-6, 0, 20462, 1);
author	Tom Rondeau <tom@trondeau.com>	2014-09-02 11:36:11 -0400
committer	Tom Rondeau <tom@trondeau.com>	2014-09-02 11:36:11 -0400
commit	01deede32858ef9e2fe4cc937f3245b5b0e6d7c9 (patch)
tree	b59617bd53d6fc63b4982656982db3baa46cbaa6
parent	1a1f502b5dc4fa10d471a49795dddd54a9b4e444 (diff)