diff options
author | Abhishek Bhowmick <abhowmick22@gmail.com> | 2014-06-09 09:51:05 +0530 |
---|---|---|
committer | Tom Rondeau <tom@trondeau.com> | 2014-10-15 10:29:19 -0400 |
commit | a43d53348d5fabdaeccca1c3ad2c661e1d292e79 (patch) | |
tree | d009f0402191e1015019fbf1319a10dc567214ed | |
parent | d0c0e856c0618d87ff88ca53fd63584679e2fb56 (diff) |
volk: Added log2
volk: Added fast inaccurate exp kernel.
-rw-r--r-- | volk/apps/volk_profile.cc | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_expfast_32f.h | 138 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_log2_32f.h | 210 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_log2_32f_mine.cc | 169 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc | 115 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 2 |
6 files changed, 352 insertions, 284 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 416884734d..08c31cb7d9 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -162,6 +162,8 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32f_log2_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32f_expfast_32f, 1e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204602, 50, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, benchmark_mode, kernel_regex); diff --git a/volk/kernels/volk/volk_32f_expfast_32f.h b/volk/kernels/volk/volk_32f_expfast_32f.h new file mode 100644 index 0000000000..0826527feb --- /dev/null +++ b/volk/kernels/volk/volk_32f_expfast_32f.h @@ -0,0 +1,138 @@ +#include <stdio.h> +#include <math.h> +#include <inttypes.h> +#include <immintrin.h> + +#define Mln2 0.6931471805f +#define A 8388608.0f +#define B 1065353216.0f +#define C 60801.0f + + +#ifndef INCLUDED_volk_32f_expfast_32f_a_H +#define INCLUDED_volk_32f_expfast_32f_a_H + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> +/*! + \brief Computes fast exp (max 7% error) of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which log is to be computed +*/ +static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){ + + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 aVal, bVal, a, b; + __m128i exp; + a = _mm_set1_ps(A/Mln2); + b = _mm_set1_ps(B-C); + + for(;number < quarterPoints; number++){ + aVal = _mm_load_ps(aPtr); + exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b)); + bVal = _mm_castsi128_ps(exp); + + _mm_store_ps(bPtr, bVal); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *bPtr++ = expf(*aPtr++); + } +} + +#endif /* LV_HAVE_SSE4_1 for aligned */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Computes fast exp (max 7% error) of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which log is to be computed +*/ +static inline void volk_32f_expfast_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points){ + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *bPtr++ = expf(*aPtr++); + } + +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_expfast_32f_a_H */ + +#ifndef INCLUDED_volk_32f_expfast_32f_u_H +#define INCLUDED_volk_32f_expfast_32f_u_H + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> +/*! + \brief Computes fast exp (max 7% error) of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which log is to be computed +*/ +static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){ + + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 aVal, bVal, a, b; + __m128i exp; + a = _mm_set1_ps(A/Mln2); + b = _mm_set1_ps(B-C); + + for(;number < quarterPoints; number++){ + aVal = _mm_loadu_ps(aPtr); + exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b)); + bVal = _mm_castsi128_ps(exp); + + _mm_storeu_ps(bPtr, bVal); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *bPtr++ = expf(*aPtr++); + } +} + +#endif /* LV_HAVE_SSE4_1 for unaligned */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Computes fast exp (max 7% error) of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which log is to be computed +*/ +static inline void volk_32f_expfast_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){ + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *bPtr++ = expf(*aPtr++); + } + +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_expfast_32f_u_H */ diff --git a/volk/kernels/volk/volk_32f_log2_32f.h b/volk/kernels/volk/volk_32f_log2_32f.h new file mode 100644 index 0000000000..c6caceb789 --- /dev/null +++ b/volk/kernels/volk/volk_32f_log2_32f.h @@ -0,0 +1,210 @@ +/* + * This kernel was adapted from Jose Fonseca's Fast SSE2 log implementation + * http://jrfonseca.blogspot.in/2008/09/fast-sse2-pow-tables-or-polynomials.htm + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + * This is the MIT License (MIT) + +*/ + + +#include <stdio.h> +#include <stdlib.h> +#include <inttypes.h> +#include <math.h> + +#define POLY0(x, c0) _mm_set1_ps(c0) +#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) +#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) +#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) + +#define LOG_POLY_DEGREE 3 + + +#ifndef INCLUDED_volk_32f_log2_32f_a_H +#define INCLUDED_volk_32f_log2_32f_a_H + +#ifdef LV_HAVE_GENERIC +/*! + \brief Computes base 2 log of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which log is to be computed +*/ +static inline void volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points){ + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *bPtr++ = log2(*aPtr++); + } + +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> +/*! + \brief Computes base 2 log of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which log is to be computed +*/ +static inline void volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){ + + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 aVal, bVal, mantissa, frac, leadingOne; + __m128i bias, exp; + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bias = _mm_set1_epi32(127); + leadingOne = _mm_set1_ps(1.0f); + exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); + bVal = _mm_cvtepi32_ps(exp); + + // Now to extract mantissa + frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); + + #if LOG_POLY_DEGREE == 6 + mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + #elif LOG_POLY_DEGREE == 5 + mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + #elif LOG_POLY_DEGREE == 4 + mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + #elif LOG_POLY_DEGREE == 3 + mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + #else + #error + #endif + + bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); + _mm_store_ps(bPtr, bVal); + + + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *bPtr++ = log2(*aPtr++); + } +} + +#endif /* LV_HAVE_SSE4_1 for aligned */ + +#endif /* INCLUDED_volk_32f_log2_32f_a_H */ + +#ifndef INCLUDED_volk_32f_log2_32f_u_H +#define INCLUDED_volk_32f_log2_32f_u_H + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Computes base 2 log of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which log is to be computed +*/ +static inline void volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){ + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *bPtr++ = log2(*aPtr++); + } + +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> +/*! + \brief Computes base 2 log of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which log is to be computed +*/ +static inline void volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){ + + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 aVal, bVal, mantissa, frac, leadingOne; + __m128i bias, exp; + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bias = _mm_set1_epi32(127); + leadingOne = _mm_set1_ps(1.0f); + exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); + bVal = _mm_cvtepi32_ps(exp); + + // Now to extract mantissa + frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); + + #if LOG_POLY_DEGREE == 6 + mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + #elif LOG_POLY_DEGREE == 5 + mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + #elif LOG_POLY_DEGREE == 4 + mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + #elif LOG_POLY_DEGREE == 3 + mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + #else + #error + #endif + + bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); + _mm_storeu_ps(bPtr, bVal); + + + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *bPtr++ = log2(*aPtr++); + } +} + +#endif /* LV_HAVE_SSE4_1 for unaligned */ + +#endif /* INCLUDED_volk_32f_log2_32f_u_H */ diff --git a/volk/kernels/volk/volk_32f_log2_32f_mine.cc b/volk/kernels/volk/volk_32f_log2_32f_mine.cc deleted file mode 100644 index 94d3daafa3..0000000000 --- a/volk/kernels/volk/volk_32f_log2_32f_mine.cc +++ /dev/null @@ -1,169 +0,0 @@ - -#include <stdio.h> -#include <math.h> -#include <inttypes.h> -#include <iostream> -#include <immintrin.h> - -void print128_num(__m128i var) -{ - uint32_t *val = (uint32_t*) &var;//can also use uint32_t instead of 16_t - printf("Vector of Ints: %x %x %d %d \n", - val[3], val[2], val[1], val[0]); -} - -void print128f_num(__m128 var) -{ - float *val = (float*) &var;//can also use uint32_t instead of 16_t - printf("Vector of Floats: %f %f %f %f \n", - val[3], val[2], val[1], val[0]); -} - -static inline void volk_32f_log2_32f_a_avx(float* bVector, float* aVector, unsigned int num_points){ - - float* bPtr = bVector; - float* aPtr = aVector; - int exp1 = 0x7f800000; - - __m128 aVal, bVal, charac, mantissa; - __m128i bias = _mm_set1_epi32(127); - __m128i aVali = _mm_castps_si128(aVal); - __m128i exp = _mm_set1_epi32(exp1); - aVal = _mm_load_ps(aPtr); - exp = _mm_and_si128(aVali, exp); - exp = _mm_srli_epi32(exp, 23); - exp = _mm_sub_epi32(exp, bias); - charac = _mm_cvtepi32_ps(exp); - - // Now to extract mantissa - int frac1 = 0x007fffff; - __m128i fracMask = _mm_set1_epi32(frac1); - __m128 frac = _mm_castsi128_ps(fracMask); - frac = _mm_and_ps(aVal, frac); - __m128 leadingOne = _mm_set1_ps(1.0f); - frac = _mm_or_ps(leadingOne, frac); - - - // Add the decimal bits one by one, can use LUT to speed up - __m128 compareResults, compareResultsNot, compareTo, fracByTwo, allOnes, a, b, c; - __m128i allOnesi; - allOnesi = _mm_set1_epi32(0xffffffff); - allOnes = _mm_castsi128_ps(allOnesi); - compareTo = _mm_set1_ps(2.0); - - mantissa = _mm_set1_ps(0.5); - frac = _mm_mul_ps(frac, frac); - compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare - compareResultsNot = _mm_andnot_ps(compareResults, allOnes); - fracByTwo = _mm_div_ps(frac, compareTo); - a = _mm_and_ps(compareResults, fracByTwo); - //b = _mm_and_ps(compareResultsNot, frac); - frac = _mm_sub_ps(frac, a); - c = _mm_and_ps(compareResults, mantissa); - charac = _mm_add_ps(charac, c); - - - mantissa = _mm_set1_ps(0.25); - frac = _mm_mul_ps(frac, frac); - compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare - compareResultsNot = _mm_andnot_ps(compareResults, allOnes); - fracByTwo = _mm_div_ps(frac, compareTo); - a = _mm_and_ps(compareResults, fracByTwo); - //b = _mm_and_ps(compareResultsNot, frac); - frac = _mm_sub_ps(frac, a); - charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); - - - mantissa = _mm_set1_ps(0.125); - frac = _mm_mul_ps(frac, frac); - compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare - compareResultsNot = _mm_andnot_ps(compareResults, allOnes); - fracByTwo = _mm_div_ps(frac, compareTo); - a = _mm_and_ps(compareResults, fracByTwo); - b = _mm_and_ps(compareResultsNot, frac); - frac = _mm_add_ps(a, b); - charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); - -/* - mantissa = _mm_set1_ps(0.0625); - frac = _mm_mul_ps(frac, frac); - compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare - compareResultsNot = _mm_andnot_ps(compareResults, allOnes); - fracByTwo = _mm_div_ps(frac, compareTo); - a = _mm_and_ps(compareResults, fracByTwo); - b = _mm_and_ps(compareResultsNot, frac); - frac = _mm_add_ps(a, b); - charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); - - mantissa = _mm_set1_ps(0.03125); - frac = _mm_mul_ps(frac, frac); - compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare - compareResultsNot = _mm_andnot_ps(compareResults, allOnes); - fracByTwo = _mm_div_ps(frac, compareTo); - a = _mm_and_ps(compareResults, fracByTwo); - b = _mm_and_ps(compareResultsNot, frac); - frac = _mm_add_ps(a, b); - charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); - - mantissa = _mm_set1_ps(0.015625); - frac = _mm_mul_ps(frac, frac); - compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare - compareResultsNot = _mm_andnot_ps(compareResults, allOnes); - fracByTwo = _mm_div_ps(frac, compareTo); - a = _mm_and_ps(compareResults, fracByTwo); - b = _mm_and_ps(compareResultsNot, frac); - frac = _mm_add_ps(a, b); - charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); - - mantissa = _mm_set1_ps(0.0078125); - frac = _mm_mul_ps(frac, frac); - compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare - compareResultsNot = _mm_andnot_ps(compareResults, allOnes); - fracByTwo = _mm_div_ps(frac, compareTo); - a = _mm_and_ps(compareResults, fracByTwo); - b = _mm_and_ps(compareResultsNot, frac); - frac = _mm_add_ps(a, b); - charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); -*/ - _mm_store_ps(bPtr, charac); -} - - -int main(){ - -clock_t start, end; -double time; -float dummy; -//float input[] = {0.5, 1.0, 4.0, 9.5}; -float* input = new float(4*sizeof(float)); -input[0] = 0.125; -input[1] = 2.4; -input[2] = 8.0; -input[3] = 16.0; -float* output = new float(4*sizeof(float)); - -start = clock(); -for(int i = 0; i < 10000000; i++) - volk_32f_log2_32f_a_avx(output, input, 4); -end = clock(); -time = 1000 * (double)(end-start); -std::cout << "my time is " << time << std::endl; - -start = clock(); -for(int j = 0; j < 40000000; j++) - dummy = log2(input[0]); -end = clock(); -time = 1000 * (double)(end-start); -std::cout << "cmath time is " << time << std::endl; - - -for(int i =0; i < 4; i ++){ - std::cout << input[i] << "\t" << output[i]; - std::cout << " cmath : \t " << log2(input[i]) << std::endl; -} - -delete input; -delete output; -return 0; - -} diff --git a/volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc b/volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc deleted file mode 100644 index 7ee18cc46b..0000000000 --- a/volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc +++ /dev/null @@ -1,115 +0,0 @@ - -#include <stdio.h> -#include <math.h> -#include <inttypes.h> -#include <iostream> -#include <immintrin.h> - - -#define POLY0(x, c0) _mm_set1_ps(c0) -#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) -#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) -#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) -#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) -#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) - -#define LOG_POLY_DEGREE 3 - -void print128_num(__m128i var) -{ - uint32_t *val = (uint32_t*) &var;//can also use uint32_t instead of 16_t - printf("Vector of Ints: %x %x %d %d \n", - val[3], val[2], val[1], val[0]); -} - -void print128f_num(__m128 var) -{ - float *val = (float*) &var;//can also use uint32_t instead of 16_t - printf("Vector of Floats: %f %f %f %f \n", - val[3], val[2], val[1], val[0]); -} - -static inline void volk_32f_log2_32f_a_avx(float* bVector, float* aVector, unsigned int num_points){ - - float* bPtr = bVector; - float* aPtr = aVector; - int exp1 = 0x7f800000; - - __m128 aVal, bVal, charac, mantissa, frac; - __m128i bias = _mm_set1_epi32(127); - __m128i aVali = _mm_castps_si128(aVal); - __m128i exp = _mm_set1_epi32(exp1); - aVal = _mm_load_ps(aPtr); - exp = _mm_and_si128(aVali, exp); - exp = _mm_srli_epi32(exp, 23); - exp = _mm_sub_epi32(exp, bias); - charac = _mm_cvtepi32_ps(exp); - - // Now to extract mantissa - int frac1 = 0x007fffff; - __m128i fracMask = _mm_set1_epi32(frac1); - frac = _mm_castsi128_ps(fracMask); - frac = _mm_and_ps(aVal, frac); - __m128 leadingOne = _mm_set1_ps(1.0f); - frac = _mm_or_ps(leadingOne, frac); - - - // This is where MIT Licensed code starts - #if LOG_POLY_DEGREE == 6 - mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); -#elif LOG_POLY_DEGREE == 5 - mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); -#elif LOG_POLY_DEGREE == 4 - mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); -#elif LOG_POLY_DEGREE == 3 - mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); -#else -#error -#endif - - mantissa = _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)); - - bVal = _mm_add_ps(charac, mantissa); - - _mm_store_ps(bPtr, bVal); -} - - -int main(){ - -clock_t start, end; -double time; -float dummy; -//float input[] = {0.5, 1.0, 4.0, 9.5}; -float* input = new float(4*sizeof(float)); -input[0] = 0.125; -input[1] = 2.4; -input[2] = 8.0; -input[3] = 16.0; -float* output = new float(4*sizeof(float)); - -start = clock(); -for(int i = 0; i < 10000000; i++) - volk_32f_log2_32f_a_avx(output, input, 4); -end = clock(); -time = 1000 * (double)(end-start); -std::cout << "my time is " << time << std::endl; - -start = clock(); -for(int j = 0; j < 40000000; j++) - dummy = log2(input[0]); -end = clock(); -time = 1000 * (double)(end-start); -std::cout << "cmath time is " << time << std::endl; - - -for(int i =0; i < 4; i ++){ - std::cout << input[i] << "\t" << output[i]; - std::cout << " cmath : \t " << log2(input[i]) << std::endl; -} - -delete input; -delete output; -return 0; - -} diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 9d837517f1..bdea584370 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -44,6 +44,8 @@ VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_32f_log2_32f, 1e-3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_32f_expfast_32f, 1e-1, 0, 20462, 1); VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20462, 1); VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20462, 1); |