6 files changed, 431 insertions, 18 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 074d1e7be4..416884734d 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -37,6 +37,49 @@
 
 namespace fs = boost::filesystem;
 
+// Writes every profiling record in `results` to json_file as one JSON object holding a "volk_tests" array.
+// Strings are emitted verbatim (no JSON escaping); the stream is flushed once at the end and left open.
+void write_json(std::ofstream &json_file, const std::vector<volk_test_results_t> &results) {
+    json_file << "{\n";
+    json_file << " \"volk_tests\": [\n";
+    const size_t len = results.size();
+    size_t i = 0;
+    BOOST_FOREACH(const volk_test_results_t &result, results) {
+        json_file << "  {\n";
+        json_file << "   \"name\": \"" << result.name << "\",\n";
+        json_file << "   \"vlen\": " << result.vlen << ",\n";
+        json_file << "   \"iter\": " << result.iter << ",\n";
+        json_file << "   \"best_arch_a\": \"" << result.best_arch_a << "\",\n";
+        json_file << "   \"best_arch_u\": \"" << result.best_arch_u << "\",\n";
+        json_file << "   \"results\": {\n";
+        const size_t results_len = result.results.size();
+        size_t ri = 0;
+        typedef std::map<std::string, volk_test_time_t>::value_type tpair; // exact element type: no per-entry copy
+        BOOST_FOREACH(const tpair &pair, result.results) {
+            const volk_test_time_t &time = pair.second;
+            json_file << "    \"" << time.name << "\": {\n";
+            json_file << "     \"name\": \"" << time.name << "\",\n";
+            json_file << "     \"time\": " << time.time << ",\n";
+            json_file << "     \"units\": \"" << time.units << "\"\n";
+            json_file << "    }" ;
+            if(ri+1 != results_len) {
+                json_file << ","; // separator between entries, none after the last
+            }
+            json_file << "\n";
+            ri++;
+        }
+        json_file << "   }\n";
+        json_file << "  }";
+        if(i+1 != len) {
+            json_file << ","; // separator between kernels, none after the last
+        }
+        json_file << "\n";
+        i++;
+    }
+    json_file << " ]\n";
+    json_file << "}" << std::endl; // single flush for the whole document
+}
+
 int main(int argc, char *argv[]) {
     // Adding program options
     boost::program_options::options_description desc("Options");
@@ -49,6 +92,9 @@ int main(int argc, char *argv[]) {
       ("tests-regex,R",
             boost::program_options::value<std::string>(),
             "Run tests matching regular expression.")
+      ("json,j",
+            boost::program_options::value<std::string>(),
+            "JSON output file")
       ;
 
     // Handle the options that were given
@@ -56,6 +102,8 @@ int main(int argc, char *argv[]) {
     bool benchmark_mode;
     std::string kernel_regex;
     bool store_results = true;
+    std::ofstream json_file;
+
     try {
         boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm);
         boost::program_options::notify(vm);
@@ -83,9 +131,14 @@ int main(int argc, char *argv[]) {
       return 0;
     }
 
+    if ( vm.count("json") )
+    {
+        json_file.open( vm["json"].as<std::string>().c_str() );
+    }
+
 
     // Run tests
-    std::vector<std::string> results;
+    std::vector<volk_test_results_t> results;
 
     //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
     //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
@@ -178,6 +231,7 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 1.0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_binary_slicer_32i, 0, 1.0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_binary_slicer_8i, 0, 1.0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_32f_tanh_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
 
     // Until we can update the config on a kernel by kernel basis
     // do not overwrite volk_config when using a regex.
@@ -204,8 +258,10 @@ int main(int argc, char *argv[]) {
 #the function name is followed by the preferred architecture.\n\
 ";
 
-        BOOST_FOREACH(std::string result, results) {
-            config << result << std::endl;
+        BOOST_FOREACH(volk_test_results_t result, results) {
+            config << result.config_name << " "
+                << result.best_arch_a << " "
+                << result.best_arch_u << std::endl;
         }
         config.close();
     }
diff --git a/volk/kernels/volk/volk_32f_tanh_32f.h b/volk/kernels/volk/volk_32f_tanh_32f.h
new file mode 100644
index 0000000000..3f407d4656
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_tanh_32f.h
@@ -0,0 +1,296 @@
+#ifndef INCLUDED_volk_32f_tanh_32f_a_H
+#define INCLUDED_volk_32f_tanh_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+/*!
+\brief Computes tanh(x) for every input sample using the math library's tanh()
+\param cVector Output buffer that receives num_points results
+\param aVector Input buffer holding num_points samples
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
+                                             unsigned int num_points)
+{
+  // Element-wise: cVector[idx] = tanh(aVector[idx]) for idx in [0, num_points).
+  // Nothing is read or written when num_points is 0.
+  unsigned int idx;
+  for(idx = 0; idx < num_points; ++idx) {
+    cVector[idx] = tanh(aVector[idx]);
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+\brief Approximates tanh(x) with a ratio of polynomials (author-stated accuracy 1e-6); returns +/-1 beyond +/-4.97.
+\param cVector The vector where the results will be stored
+\param aVector Input vector
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_series(float* cVector, const float* aVector,
+                                            unsigned int num_points)
+{
+  unsigned int number = 0;
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+  for(; number < num_points; number++) {
+    const float x = *aPtr++; // consume one input per output on every branch, including the saturated ones
+    if(x > 4.97)
+      *cPtr++ = 1;
+    else if(x <= -4.97)
+      *cPtr++ = -1;
+    else {
+      float x2 = x * x;
+      float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      *cPtr++ = a / b;
+    }
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+\brief Approximates tanh(x) with a ratio of polynomials, 4 floats per step (SSE, aligned loads/stores).
+\param cVector The vector where the results will be stored (written with _mm_store_ps, so 16-byte aligned)
+\param aVector Input vector (read with _mm_load_ps, so 16-byte aligned)
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+  __m128 aVal, cVal, x2, a, b;
+  __m128 const1, const2, const3, const4, const5, const6;
+  const1 = _mm_set_ps1(135135.0f);
+  const2 = _mm_set_ps1(17325.0f);
+  const3 = _mm_set_ps1(378.0f);
+  const4 = _mm_set_ps1(62370.0f);
+  const5 = _mm_set_ps1(3150.0f);
+  const6 = _mm_set_ps1(28.0f);
+  for(;number < quarterPoints; number++){
+    // Clamp inputs to [-4.97, 4.97]: outside it the polynomial ratio keeps growing instead of saturating like the scalar tail.
+    aVal = _mm_max_ps(_mm_set_ps1(-4.97f), _mm_min_ps(_mm_set_ps1(4.97f), _mm_load_ps(aPtr)));
+    x2 = _mm_mul_ps(aVal, aVal);
+    a  = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+    b  = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+    cVal = _mm_div_ps(a, b);
+
+    _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++) {
+    const float x = *aPtr++; // consume one input per output on every branch, including the saturated ones
+    if(x > 4.97)
+      *cPtr++ = 1;
+    else if(x <= -4.97)
+      *cPtr++ = -1;
+    else {
+      float x2 = x * x;
+      float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      *cPtr++ = a / b;
+    }
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+\brief Approximates tanh(x) with a ratio of polynomials, 8 floats per step (AVX, aligned loads/stores).
+\param cVector The vector where the results will be stored (written with _mm256_store_ps, so 32-byte aligned)
+\param aVector Input vector (read with _mm256_load_ps, so 32-byte aligned)
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+  __m256 aVal, cVal, x2, a, b;
+  __m256 const1, const2, const3, const4, const5, const6;
+  const1 = _mm256_set1_ps(135135.0f);
+  const2 = _mm256_set1_ps(17325.0f);
+  const3 = _mm256_set1_ps(378.0f);
+  const4 = _mm256_set1_ps(62370.0f);
+  const5 = _mm256_set1_ps(3150.0f);
+  const6 = _mm256_set1_ps(28.0f);
+  for(;number < eighthPoints; number++){
+    // Clamp inputs to [-4.97, 4.97]: outside it the polynomial ratio keeps growing instead of saturating like the scalar tail.
+    aVal = _mm256_max_ps(_mm256_set1_ps(-4.97f), _mm256_min_ps(_mm256_set1_ps(4.97f), _mm256_load_ps(aPtr)));
+    x2 = _mm256_mul_ps(aVal, aVal);
+    a  = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+    b  = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+    cVal = _mm256_div_ps(a, b);
+
+    _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+    aPtr += 8;
+    cPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(;number < num_points; number++) {
+    const float x = *aPtr++; // consume one input per output on every branch, including the saturated ones
+    if(x > 4.97)
+      *cPtr++ = 1;
+    else if(x <= -4.97)
+      *cPtr++ = -1;
+    else {
+      float x2 = x * x;
+      float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      *cPtr++ = a / b;
+    }
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+\brief Approximates tanh(x) with a ratio of polynomials, 4 floats per step (SSE, unaligned loads/stores).
+\param cVector The vector where the results will be stored (written with _mm_storeu_ps)
+\param aVector Input vector (read with _mm_loadu_ps)
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+  __m128 aVal, cVal, x2, a, b;
+  __m128 const1, const2, const3, const4, const5, const6;
+  const1 = _mm_set_ps1(135135.0f);
+  const2 = _mm_set_ps1(17325.0f);
+  const3 = _mm_set_ps1(378.0f);
+  const4 = _mm_set_ps1(62370.0f);
+  const5 = _mm_set_ps1(3150.0f);
+  const6 = _mm_set_ps1(28.0f);
+  for(;number < quarterPoints; number++){
+    // Clamp inputs to [-4.97, 4.97]: outside it the polynomial ratio keeps growing instead of saturating like the scalar tail.
+    aVal = _mm_max_ps(_mm_set_ps1(-4.97f), _mm_min_ps(_mm_set_ps1(4.97f), _mm_loadu_ps(aPtr)));
+    x2 = _mm_mul_ps(aVal, aVal);
+    a  = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+    b  = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+    cVal = _mm_div_ps(a, b);
+
+    _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++) {
+    const float x = *aPtr++; // consume one input per output on every branch, including the saturated ones
+    if(x > 4.97)
+      *cPtr++ = 1;
+    else if(x <= -4.97)
+      *cPtr++ = -1;
+    else {
+      float x2 = x * x;
+      float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      *cPtr++ = a / b;
+    }
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+\brief Approximates tanh(x) with a ratio of polynomials, 8 floats per step (AVX, unaligned loads/stores).
+\param cVector The vector where the results will be stored (written with _mm256_storeu_ps)
+\param aVector Input vector (read with _mm256_loadu_ps)
+\param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+  __m256 aVal, cVal, x2, a, b;
+  __m256 const1, const2, const3, const4, const5, const6;
+  const1 = _mm256_set1_ps(135135.0f);
+  const2 = _mm256_set1_ps(17325.0f);
+  const3 = _mm256_set1_ps(378.0f);
+  const4 = _mm256_set1_ps(62370.0f);
+  const5 = _mm256_set1_ps(3150.0f);
+  const6 = _mm256_set1_ps(28.0f);
+  for(;number < eighthPoints; number++){
+    // Clamp inputs to [-4.97, 4.97]: outside it the polynomial ratio keeps growing instead of saturating like the scalar tail.
+    aVal = _mm256_max_ps(_mm256_set1_ps(-4.97f), _mm256_min_ps(_mm256_set1_ps(4.97f), _mm256_loadu_ps(aPtr)));
+    x2 = _mm256_mul_ps(aVal, aVal);
+    a  = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+    b  = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+    cVal = _mm256_div_ps(a, b);
+
+    _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+    aPtr += 8;
+    cPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(;number < num_points; number++) {
+    const float x = *aPtr++; // consume one input per output on every branch, including the saturated ones
+    if(x > 4.97)
+      *cPtr++ = 1;
+    else if(x <= -4.97)
+      *cPtr++ = -1;
+    else {
+      float x2 = x * x;
+      float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      *cPtr++ = a / b;
+    }
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_tanh_32f_a_H */
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
index 6cc4504cfa..37915e5552 100644
--- a/volk/lib/CMakeLists.txt
+++ b/volk/lib/CMakeLists.txt
@@ -406,10 +406,8 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
   # if we find one that matches our current system architecture
   # set up the assembler flags and include the source files
   foreach(ARCH ${ASM_ARCHS_AVAILABLE})
-    message(STATUS "--==>> -CFLAGS1: ${FULL_C_FLAGS}")
     string(REGEX MATCH "${ARCH}" ASM_ARCH "${FULL_C_FLAGS}")
     if( ASM_ARCH STREQUAL "armv7" )
-      set(ASM-ATT $ENV{ASM})
       message(STATUS "---- Adding ASM files") # we always use ATT syntax
       message(STATUS "-- Detected armv7 architecture; enabling ASM")
       # setup architecture specific assembler flags
@@ -422,13 +420,20 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
         message(STATUS "Adding source file: ${asm_file}")
       endforeach(asm_file)
     endif()
-    set(CMAKE_ASM-ATT_FLAGS_INIT ${ARCH_ASM_FLAGS})
-    enable_language(ASM-ATT) # this must be after flags_init
-    message(STATUS "asm flags: ${CMAKE_ASM-ATT_FLAGS}")
+    enable_language(ASM)
+    set(CMAKE_ASM_FLAGS ${ARCH_ASM_FLAGS})
+    message(STATUS "c flags: ${FULL_C_FLAGS}")
+    message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}")
   endforeach(ARCH)
 
 else(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
   message(STATUS "Not enabling ASM support. CMake >= 2.8.10 required.")
+  foreach(machine_name ${available_machines})
+    string(REGEX MATCH "neon" NEON_MACHINE ${machine_name})
+    if( NEON_MACHINE STREQUAL "neon")
+      message(FATAL_ERROR "CMake >= 2.8.10 is required for ARM NEON support")
+    endif()
+  endforeach()
 endif(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
 
 ########################################################################
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index f30f0097ae..3ab4a9970c 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -5,7 +5,9 @@
 #include <boost/tokenizer.hpp>
 #include <boost/xpressive/xpressive.hpp>
 #include <iostream>
+#include <fstream>
 #include <vector>
+#include <map>
 #include <list>
 #include <ctime>
 #include <cmath>
@@ -328,9 +330,9 @@ bool run_volk_tests(volk_func_desc_t desc,
                     lv_32fc_t scalar,
                     int vlen,
                     int iter,
-                    std::vector<std::string> *best_arch_vector = 0,
-                    std::string puppet_master_name = "NULL",
-                    bool benchmark_mode,
+                    std::vector<volk_test_results_t> *results,
+                    std::string puppet_master_name,
+                    bool benchmark_mode, 
                     std::string kernel_regex
                    ) {
     boost::xpressive::sregex kernel_expression = boost::xpressive::sregex::compile(kernel_regex);
@@ -338,6 +340,12 @@ bool run_volk_tests(volk_func_desc_t desc,
         // in this case we have a regex and are only looking to test one kernel
         return false;
     }
+    if(results) {
+        results->push_back(volk_test_results_t()); 
+        results->back().name = name;
+        results->back().vlen = vlen;
+        results->back().iter = iter;
+    }
     std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl;
 
     // The multiply and lv_force_cast_hf are work arounds for GNU Radio bugs 582 and 583
@@ -453,6 +461,13 @@ bool run_volk_tests(volk_func_desc_t desc,
         end = clock();
         double arch_time = 1000.0 * (double)(end-start)/(double)CLOCKS_PER_SEC;
         std::cout << arch_list[i] << " completed in " << arch_time << "ms" << std::endl;
+        if(results) {
+            volk_test_time_t result;
+            result.name = arch_list[i];
+            result.time = arch_time;
+            result.units = "ms";
+            results->back().results[result.name] = result;
+        }
 
         profile_times.push_back(arch_time);
     }
@@ -553,13 +568,14 @@ bool run_volk_tests(volk_func_desc_t desc,
 
     std::cout << "Best aligned arch: " << best_arch_a << std::endl;
     std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
-    if(best_arch_vector) {
+    if(results) {
         if(puppet_master_name == "NULL") {
-            best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u);
-        }
-        else {
-            best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u);
+            results->back().config_name = name;
+        } else {
+            results->back().config_name = puppet_master_name;
         }
+        results->back().best_arch_a = best_arch_a;
+        results->back().best_arch_u = best_arch_u;
     }
 
     return fail_global;
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
index fc1a0239eb..7ca8b8d1e8 100644
--- a/volk/lib/qa_utils.h
+++ b/volk/lib/qa_utils.h
@@ -3,7 +3,10 @@
 
 #include <cstdlib>
 #include <string>
+#include <iostream>
+#include <fstream>
 #include <vector>
+#include <map>
 #include <volk/volk.h>
 #include <volk/volk_common.h>
 
@@ -21,10 +24,46 @@ volk_type_t volk_type_from_string(std::string);
 float uniform(void);
 void random_floats(float *buf, unsigned n);
 
-bool run_volk_tests(volk_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string, bool benchmark_mode=false, std::string kernel_regex="");
+class volk_test_time_t { // One timing measurement recorded by run_volk_tests for a single implementation
+    public:
+        std::string name;  // implementation name (run_volk_tests stores arch_list[i] here)
+        double time;       // measured duration, expressed in `units`
+        std::string units; // unit label for `time`; run_volk_tests writes "ms"
+};
+
+class volk_test_results_t { // Per-kernel profiling record appended and filled in by run_volk_tests
+    public: 
+        std::string name;        // kernel name passed to run_volk_tests
+        std::string config_name; // name written to the config file: the kernel name, or the puppet master name when one is given
+        int vlen;                // vlen argument of the run
+        int iter;                // iter argument of the run
+        std::map<std::string, volk_test_time_t> results; // timings keyed by volk_test_time_t::name
+        std::string best_arch_a; // reported by run_volk_tests as "Best aligned arch"
+        std::string best_arch_u; // reported by run_volk_tests as "Best unaligned arch"
+};
+
+bool run_volk_tests( // Runs one kernel's tests/timings; the definition returns its fail_global flag
+    volk_func_desc_t, // kernel descriptor (the macros below pass func##_get_func_desc())
+    void(*)(), // kernel entry point (the macros below pass func##_manual cast to void(*)())
+    std::string, // kernel name
+    float, // tolerance (tol in the macros below)
+    lv_32fc_t, // scalar test parameter
+    int, // vlen
+    int, // iter
+    std::vector<volk_test_results_t> *results = NULL, // optional sink: when non-NULL a record is appended and filled in
+    std::string puppet_master_name = "NULL", // the literal "NULL" means: use the kernel name as config_name
+    bool benchmark_mode=false, // NOTE(review): effect is not visible in this change; confirm in qa_utils.cc
+    std::string kernel_regex="" // compiled to a regex by the definition; presumably filters kernels by name, confirm in qa_utils.cc
+    );
 
 
-#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); }
+#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) /* Boost test case <func>_test: run_volk_tests must return 0 */ \
+    BOOST_AUTO_TEST_CASE(func##_test) { \
+        BOOST_CHECK_EQUAL(run_volk_tests( \
+            func##_get_func_desc(), (void (*)())func##_manual, \
+            std::string(#func), tol, scalar, len, iter, 0, "NULL"), /* 0: no results sink; "NULL": no puppet master */ \
+          0); \
+    }
 #define VOLK_PROFILE(func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL", bnmode, kernel_regex)
 #define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func), bnmode, kernel_regex)
 typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index bc97ad16e5..9d837517f1 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -114,3 +114,4 @@ VOLK_RUN_TESTS(volk_8u_conv_k7_r2puppet_8u, 0, 0, 2060, 1);
 VOLK_RUN_TESTS(volk_32f_invsqrt_32f, 1e-2, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_binary_slicer_32i, 0, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_binary_slicer_8i, 0, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_tanh_32f, 1e-6, 0, 20462, 1);