diff options
Diffstat (limited to 'volk')
-rw-r--r-- | volk/apps/volk_profile.cc | 62 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_tanh_32f.h | 296 | ||||
-rw-r--r-- | volk/lib/CMakeLists.txt | 15 | ||||
-rw-r--r-- | volk/lib/qa_utils.cc | 32 | ||||
-rw-r--r-- | volk/lib/qa_utils.h | 43 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 1 |
6 files changed, 431 insertions, 18 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 074d1e7be4..416884734d 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -37,6 +37,49 @@ namespace fs = boost::filesystem; +void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> results) { + json_file << "{" << std::endl; + json_file << " \"volk_tests\": [" << std::endl; + size_t len = results.size(); + size_t i = 0; + BOOST_FOREACH(volk_test_results_t &result, results) { + json_file << " {" << std::endl; + json_file << " \"name\": \"" << result.name << "\"," << std::endl; + json_file << " \"vlen\": " << result.vlen << "," << std::endl; + json_file << " \"iter\": " << result.iter << "," << std::endl; + json_file << " \"best_arch_a\": \"" << result.best_arch_a + << "\"," << std::endl; + json_file << " \"best_arch_u\": \"" << result.best_arch_u + << "\"," << std::endl; + json_file << " \"results\": {" << std::endl; + size_t results_len = result.results.size(); + size_t ri = 0; + typedef std::pair<std::string, volk_test_time_t> tpair; + BOOST_FOREACH(tpair pair, result.results) { + volk_test_time_t time = pair.second; + json_file << " \"" << time.name << "\": {" << std::endl; + json_file << " \"name\": \"" << time.name << "\"," << std::endl; + json_file << " \"time\": " << time.time << "," << std::endl; + json_file << " \"units\": \"" << time.units << "\"" << std::endl; + json_file << " }" ; + if(ri+1 != results_len) { + json_file << ","; + } + json_file << std::endl; + ri++; + } + json_file << " }" << std::endl; + json_file << " }"; + if(i+1 != len) { + json_file << ","; + } + json_file << std::endl; + i++; + } + json_file << " ]" << std::endl; + json_file << "}" << std::endl; +} + int main(int argc, char *argv[]) { // Adding program options boost::program_options::options_description desc("Options"); @@ -49,6 +92,9 @@ int main(int argc, char *argv[]) { ("tests-regex,R", boost::program_options::value<std::string>(), "Run tests matching regular 
expression.") + ("json,j", + boost::program_options::value<std::string>(), + "JSON output file") ; // Handle the options that were given @@ -56,6 +102,8 @@ int main(int argc, char *argv[]) { bool benchmark_mode; std::string kernel_regex; bool store_results = true; + std::ofstream json_file; + try { boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm); boost::program_options::notify(vm); @@ -83,9 +131,14 @@ int main(int argc, char *argv[]) { return 0; } + if ( vm.count("json") ) + { + json_file.open( vm["json"].as<std::string>().c_str() ); + } + // Run tests - std::vector<std::string> results; + std::vector<volk_test_results_t> results; //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); @@ -178,6 +231,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 1.0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_binary_slicer_32i, 0, 1.0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_binary_slicer_8i, 0, 1.0, 204602, 10000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32f_tanh_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); // Until we can update the config on a kernel by kernel basis // do not overwrite volk_config when using a regex. 
#ifndef INCLUDED_volk_32f_tanh_32f_a_H
#define INCLUDED_volk_32f_tanh_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

/*!
\brief Evaluates the series approximation of tanh(x) for a single value.

Inputs above 4.97 saturate to 1 and inputs at or below -4.97 saturate to -1.
Everything else is evaluated with the rational polynomial used by all the
approximating kernels in this file.
\param x Input value
\return Approximation of tanh(x)
*/
static inline float lv_tanh_32f_series_point(float x)
{
  float x2, num, den;

  if(x > 4.97)
    return 1.0f;
  if(x <= -4.97)
    return -1.0f;

  x2 = x * x;
  num = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
  den = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
  return num / den;
}

/*!
\brief Applies the scalar series approximation to a run of values.

Shared by the series kernel and by the SIMD kernels for the points left over
after the last full vector. Input and output advance together for every
point, including the saturated ones.
\param out The vector where the results will be stored
\param in Input vector
\param count The number of values to calculate
*/
static inline void lv_tanh_32f_series_block(float* out, const float* in,
                                            unsigned int count)
{
  unsigned int i;
  for(i = 0; i < count; i++) {
    out[i] = lv_tanh_32f_series_point(in[i]);
  }
}

#ifdef LV_HAVE_GENERIC
/*!
\brief Calculates tanh(x)
\param cVector The vector where the results will be stored
\param aVector Input vector
\param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
                                             unsigned int num_points)
{
  unsigned int number = 0;
  float* cPtr = cVector;
  const float* aPtr = aVector;
  for(; number < num_points; number++) {
    *cPtr++ = tanh(*aPtr++);
  }
}

#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_GENERIC
/*!
\brief Calculates tanh(x) using a series approximation, intended to be good to within 1e-6 of the actual tanh.
\param cVector The vector where the results will be stored
\param aVector Input vector
\param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_series(float* cVector, const float* aVector,
                                            unsigned int num_points)
{
  lv_tanh_32f_series_block(cVector, aVector, num_points);
}

#endif /* LV_HAVE_GENERIC */



#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Calculates tanh(x) using a series approximation, intended to be good to within 1e-6 of the actual tanh.

Requires both vectors to be 16-byte aligned (aligned load/store are used).
\param cVector The vector where the results will be stored
\param aVector Input vector
\param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m128 aVal, cVal, x2, a, b, aboveMask, belowMask;
  const __m128 const1 = _mm_set_ps1(135135.0f);
  const __m128 const2 = _mm_set_ps1(17325.0f);
  const __m128 const3 = _mm_set_ps1(378.0f);
  const __m128 const4 = _mm_set_ps1(62370.0f);
  const __m128 const5 = _mm_set_ps1(3150.0f);
  const __m128 const6 = _mm_set_ps1(28.0f);
  /* 4.97f rounds just below 4.97, so the strict float comparisons below pick
     exactly the inputs that the scalar path's "> 4.97" / "<= -4.97" pick. */
  const __m128 upperLimit = _mm_set_ps1(4.97f);
  const __m128 lowerLimit = _mm_set_ps1(-4.97f);
  const __m128 plusOne = _mm_set_ps1(1.0f);
  const __m128 minusOne = _mm_set_ps1(-1.0f);

  for(; number < quarterPoints; number++) {
    aVal = _mm_load_ps(aPtr);
    x2 = _mm_mul_ps(aVal, aVal);
    a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

    cVal = _mm_div_ps(a, b);

    /* Saturate out-of-range lanes to +/-1, matching the scalar tail. */
    aboveMask = _mm_cmpgt_ps(aVal, upperLimit);
    belowMask = _mm_cmplt_ps(aVal, lowerLimit);
    cVal = _mm_or_ps(_mm_and_ps(belowMask, minusOne), _mm_andnot_ps(belowMask, cVal));
    cVal = _mm_or_ps(_mm_and_ps(aboveMask, plusOne), _mm_andnot_ps(aboveMask, cVal));

    _mm_store_ps(cPtr, cVal); /* Store the results back into the C container */

    aPtr += 4;
    cPtr += 4;
  }

  /* Remaining (num_points % 4) values. */
  lv_tanh_32f_series_block(cPtr, aPtr, num_points - quarterPoints * 4);
}
#endif /* LV_HAVE_SSE */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
\brief Calculates tanh(x) using a series approximation, intended to be good to within 1e-6 of the actual tanh.

Requires both vectors to be 32-byte aligned (aligned load/store are used).
\param cVector The vector where the results will be stored
\param aVector Input vector
\param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m256 aVal, cVal, x2, a, b, aboveMask, belowMask;
  const __m256 const1 = _mm256_set1_ps(135135.0f);
  const __m256 const2 = _mm256_set1_ps(17325.0f);
  const __m256 const3 = _mm256_set1_ps(378.0f);
  const __m256 const4 = _mm256_set1_ps(62370.0f);
  const __m256 const5 = _mm256_set1_ps(3150.0f);
  const __m256 const6 = _mm256_set1_ps(28.0f);
  /* 4.97f rounds just below 4.97, so the strict float comparisons below pick
     exactly the inputs that the scalar path's "> 4.97" / "<= -4.97" pick. */
  const __m256 upperLimit = _mm256_set1_ps(4.97f);
  const __m256 lowerLimit = _mm256_set1_ps(-4.97f);
  const __m256 plusOne = _mm256_set1_ps(1.0f);
  const __m256 minusOne = _mm256_set1_ps(-1.0f);

  for(; number < eighthPoints; number++) {
    aVal = _mm256_load_ps(aPtr);
    x2 = _mm256_mul_ps(aVal, aVal);
    a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
    b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

    cVal = _mm256_div_ps(a, b);

    /* Saturate out-of-range lanes to +/-1, matching the scalar tail. */
    aboveMask = _mm256_cmp_ps(aVal, upperLimit, _CMP_GT_OQ);
    belowMask = _mm256_cmp_ps(aVal, lowerLimit, _CMP_LT_OQ);
    cVal = _mm256_blendv_ps(cVal, minusOne, belowMask);
    cVal = _mm256_blendv_ps(cVal, plusOne, aboveMask);

    _mm256_store_ps(cPtr, cVal); /* Store the results back into the C container */

    aPtr += 8;
    cPtr += 8;
  }

  /* Remaining (num_points % 8) values. */
  lv_tanh_32f_series_block(cPtr, aPtr, num_points - eighthPoints * 8);
}
#endif /* LV_HAVE_AVX */




#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Calculates tanh(x) using a series approximation, intended to be good to within 1e-6 of the actual tanh.

Uses unaligned load/store, so the vectors need no particular alignment.
\param cVector The vector where the results will be stored
\param aVector Input vector
\param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m128 aVal, cVal, x2, a, b, aboveMask, belowMask;
  const __m128 const1 = _mm_set_ps1(135135.0f);
  const __m128 const2 = _mm_set_ps1(17325.0f);
  const __m128 const3 = _mm_set_ps1(378.0f);
  const __m128 const4 = _mm_set_ps1(62370.0f);
  const __m128 const5 = _mm_set_ps1(3150.0f);
  const __m128 const6 = _mm_set_ps1(28.0f);
  /* 4.97f rounds just below 4.97, so the strict float comparisons below pick
     exactly the inputs that the scalar path's "> 4.97" / "<= -4.97" pick. */
  const __m128 upperLimit = _mm_set_ps1(4.97f);
  const __m128 lowerLimit = _mm_set_ps1(-4.97f);
  const __m128 plusOne = _mm_set_ps1(1.0f);
  const __m128 minusOne = _mm_set_ps1(-1.0f);

  for(; number < quarterPoints; number++) {
    aVal = _mm_loadu_ps(aPtr);
    x2 = _mm_mul_ps(aVal, aVal);
    a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

    cVal = _mm_div_ps(a, b);

    /* Saturate out-of-range lanes to +/-1, matching the scalar tail. */
    aboveMask = _mm_cmpgt_ps(aVal, upperLimit);
    belowMask = _mm_cmplt_ps(aVal, lowerLimit);
    cVal = _mm_or_ps(_mm_and_ps(belowMask, minusOne), _mm_andnot_ps(belowMask, cVal));
    cVal = _mm_or_ps(_mm_and_ps(aboveMask, plusOne), _mm_andnot_ps(aboveMask, cVal));

    _mm_storeu_ps(cPtr, cVal); /* Store the results back into the C container */

    aPtr += 4;
    cPtr += 4;
  }

  /* Remaining (num_points % 4) values. */
  lv_tanh_32f_series_block(cPtr, aPtr, num_points - quarterPoints * 4);
}
#endif /* LV_HAVE_SSE */



#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
\brief Calculates tanh(x) using a series approximation, intended to be good to within 1e-6 of the actual tanh.

Uses unaligned load/store, so the vectors need no particular alignment.
\param cVector The vector where the results will be stored
\param aVector Input vector
\param num_points The number of values to calculate
*/
static inline void volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
                                           unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m256 aVal, cVal, x2, a, b, aboveMask, belowMask;
  const __m256 const1 = _mm256_set1_ps(135135.0f);
  const __m256 const2 = _mm256_set1_ps(17325.0f);
  const __m256 const3 = _mm256_set1_ps(378.0f);
  const __m256 const4 = _mm256_set1_ps(62370.0f);
  const __m256 const5 = _mm256_set1_ps(3150.0f);
  const __m256 const6 = _mm256_set1_ps(28.0f);
  /* 4.97f rounds just below 4.97, so the strict float comparisons below pick
     exactly the inputs that the scalar path's "> 4.97" / "<= -4.97" pick. */
  const __m256 upperLimit = _mm256_set1_ps(4.97f);
  const __m256 lowerLimit = _mm256_set1_ps(-4.97f);
  const __m256 plusOne = _mm256_set1_ps(1.0f);
  const __m256 minusOne = _mm256_set1_ps(-1.0f);

  for(; number < eighthPoints; number++) {
    aVal = _mm256_loadu_ps(aPtr);
    x2 = _mm256_mul_ps(aVal, aVal);
    a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
    b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

    cVal = _mm256_div_ps(a, b);

    /* Saturate out-of-range lanes to +/-1, matching the scalar tail. */
    aboveMask = _mm256_cmp_ps(aVal, upperLimit, _CMP_GT_OQ);
    belowMask = _mm256_cmp_ps(aVal, lowerLimit, _CMP_LT_OQ);
    cVal = _mm256_blendv_ps(cVal, minusOne, belowMask);
    cVal = _mm256_blendv_ps(cVal, plusOne, aboveMask);

    _mm256_storeu_ps(cPtr, cVal); /* Store the results back into the C container */

    aPtr += 8;
    cPtr += 8;
  }

  /* Remaining (num_points % 8) values. */
  lv_tanh_32f_series_block(cPtr, aPtr, num_points - eighthPoints * 8);
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_tanh_32f_a_H */
"2.8.9") # if we find one that matches our current system architecture # set up the assembler flags and include the source files foreach(ARCH ${ASM_ARCHS_AVAILABLE}) - message(STATUS "--==>> -CFLAGS1: ${FULL_C_FLAGS}") string(REGEX MATCH "${ARCH}" ASM_ARCH "${FULL_C_FLAGS}") if( ASM_ARCH STREQUAL "armv7" ) - set(ASM-ATT $ENV{ASM}) message(STATUS "---- Adding ASM files") # we always use ATT syntax message(STATUS "-- Detected armv7 architecture; enabling ASM") # setup architecture specific assembler flags @@ -422,13 +420,20 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8.9") message(STATUS "Adding source file: ${asm_file}") endforeach(asm_file) endif() - set(CMAKE_ASM-ATT_FLAGS_INIT ${ARCH_ASM_FLAGS}) - enable_language(ASM-ATT) # this must be after flags_init - message(STATUS "asm flags: ${CMAKE_ASM-ATT_FLAGS}") + enable_language(ASM) + set(CMAKE_ASM_FLAGS ${ARCH_ASM_FLAGS}) + message(STATUS "c flags: ${FULL_C_FLAGS}") + message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}") endforeach(ARCH) else(${CMAKE_VERSION} VERSION_GREATER "2.8.9") message(STATUS "Not enabling ASM support. 
CMake >= 2.8.10 required.") + foreach(machine_name ${available_machines}) + string(REGEX MATCH "neon" NEON_MACHINE ${machine_name}) + if( NEON_MACHINE STREQUAL "neon") + message(FATAL_ERROR "CMake >= 2.8.10 is required for ARM NEON support") + endif() + endforeach() endif(${CMAKE_VERSION} VERSION_GREATER "2.8.9") ######################################################################## diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index f30f0097ae..3ab4a9970c 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -5,7 +5,9 @@ #include <boost/tokenizer.hpp> #include <boost/xpressive/xpressive.hpp> #include <iostream> +#include <fstream> #include <vector> +#include <map> #include <list> #include <ctime> #include <cmath> @@ -328,9 +330,9 @@ bool run_volk_tests(volk_func_desc_t desc, lv_32fc_t scalar, int vlen, int iter, - std::vector<std::string> *best_arch_vector = 0, - std::string puppet_master_name = "NULL", - bool benchmark_mode, + std::vector<volk_test_results_t> *results, + std::string puppet_master_name, + bool benchmark_mode, std::string kernel_regex ) { boost::xpressive::sregex kernel_expression = boost::xpressive::sregex::compile(kernel_regex); @@ -338,6 +340,12 @@ bool run_volk_tests(volk_func_desc_t desc, // in this case we have a regex and are only looking to test one kernel return false; } + if(results) { + results->push_back(volk_test_results_t()); + results->back().name = name; + results->back().vlen = vlen; + results->back().iter = iter; + } std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl; // The multiply and lv_force_cast_hf are work arounds for GNU Radio bugs 582 and 583 @@ -453,6 +461,13 @@ bool run_volk_tests(volk_func_desc_t desc, end = clock(); double arch_time = 1000.0 * (double)(end-start)/(double)CLOCKS_PER_SEC; std::cout << arch_list[i] << " completed in " << arch_time << "ms" << std::endl; + if(results) { + volk_test_time_t result; + result.name = arch_list[i]; + result.time = 
arch_time; + result.units = "ms"; + results->back().results[result.name] = result; + } profile_times.push_back(arch_time); } @@ -553,13 +568,14 @@ bool run_volk_tests(volk_func_desc_t desc, std::cout << "Best aligned arch: " << best_arch_a << std::endl; std::cout << "Best unaligned arch: " << best_arch_u << std::endl; - if(best_arch_vector) { + if(results) { if(puppet_master_name == "NULL") { - best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u); - } - else { - best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u); + results->back().config_name = name; + } else { + results->back().config_name = puppet_master_name; } + results->back().best_arch_a = best_arch_a; + results->back().best_arch_u = best_arch_u; } return fail_global; diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h index fc1a0239eb..7ca8b8d1e8 100644 --- a/volk/lib/qa_utils.h +++ b/volk/lib/qa_utils.h @@ -3,7 +3,10 @@ #include <cstdlib> #include <string> +#include <iostream> +#include <fstream> #include <vector> +#include <map> #include <volk/volk.h> #include <volk/volk_common.h> @@ -21,10 +24,46 @@ volk_type_t volk_type_from_string(std::string); float uniform(void); void random_floats(float *buf, unsigned n); -bool run_volk_tests(volk_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string, bool benchmark_mode=false, std::string kernel_regex=""); +class volk_test_time_t { + public: + std::string name; + double time; + std::string units; +}; + +class volk_test_results_t { + public: + std::string name; + std::string config_name; + int vlen; + int iter; + std::map<std::string, volk_test_time_t> results; + std::string best_arch_a; + std::string best_arch_u; +}; + +bool run_volk_tests( + volk_func_desc_t, + void(*)(), + std::string, + float, + lv_32fc_t, + int, + int, + std::vector<volk_test_results_t> *results = NULL, + std::string puppet_master_name = "NULL", + bool benchmark_mode=false, + 
std::string kernel_regex="" + ); -#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); } +#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \ + BOOST_AUTO_TEST_CASE(func##_test) { \ + BOOST_CHECK_EQUAL(run_volk_tests( \ + func##_get_func_desc(), (void (*)())func##_manual, \ + std::string(#func), tol, scalar, len, iter, 0, "NULL"), \ + 0); \ + } #define VOLK_PROFILE(func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL", bnmode, kernel_regex) #define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func), bnmode, kernel_regex) typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index bc97ad16e5..9d837517f1 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -114,3 +114,4 @@ VOLK_RUN_TESTS(volk_8u_conv_k7_r2puppet_8u, 0, 0, 2060, 1); VOLK_RUN_TESTS(volk_32f_invsqrt_32f, 1e-2, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_binary_slicer_32i, 0, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_binary_slicer_8i, 0, 0, 20462, 1); +VOLK_RUN_TESTS(volk_32f_tanh_32f, 1e-6, 0, 20462, 1); |