diff options
author | Tom Rondeau <tom@trondeau.com> | 2014-10-15 10:49:45 -0400 |
---|---|---|
committer | Tom Rondeau <tom@trondeau.com> | 2014-10-15 10:49:45 -0400 |
commit | b2632afd71f06ec6da1aef8ab8cc9a137d5134ba (patch) | |
tree | d0c01e327f0a9ea7377ed8c674c3f8bea464f4ec | |
parent | add88cac3a8512fc534b1f725ea8b4e21fbbef1b (diff) |
volk: added power kernel.
-rw-r--r-- | volk/apps/volk_profile.cc | 1 | ||||
-rwxr-xr-x | volk/kernels/volk/volk_32f_x2_pow_32f.h | 299 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 1 |
3 files changed, 301 insertions, 0 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 08c31cb7d9..f4701579ac 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -164,6 +164,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_log2_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_expfast_32f, 1e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204602, 50, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, benchmark_mode, kernel_regex); diff --git a/volk/kernels/volk/volk_32f_x2_pow_32f.h b/volk/kernels/volk/volk_32f_x2_pow_32f.h new file mode 100755 index 0000000000..cc11dafeee --- /dev/null +++ b/volk/kernels/volk/volk_32f_x2_pow_32f.h @@ -0,0 +1,299 @@ +#include <stdio.h> +#include <stdlib.h> +#include <inttypes.h> +#include <math.h> + +#define POLY0(x, c0) _mm_set1_ps(c0) +#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) +#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) +#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) + +#define LOG_POLY_DEGREE 3 + +#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H +#define INCLUDED_volk_32f_x2_pow_32f_a_H + +#ifdef LV_HAVE_GENERIC +/*! + \brief Computes pow(x,y) by using exp and log + \param cVector The vector where results will be stored + \param aVector The input vector of bases + \param bVector The input vector of indices + \param num_points Number of points for which pow is to be computed +*/ +static inline void volk_32f_x2_pow_32f_a_generic(float* cVector, const float* bVector, const float* aVector, unsigned int num_points){ + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = pow(*aPtr++, *bPtr++); + } + +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> +/*! + \brief Computes pow(x,y) by using exp and log + \param cVector The vector where results will be stored + \param aVector The input vector of bases + \param bVector The input vector of indices + \param num_points Number of points for which pow is to be computed +*/ +static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, const float* aVector, unsigned int num_points){ + + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; + __m128 tmp, fx, mask, pow2n, z, y; + __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; + __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; + __m128i bias, exp, emm0, pi32_0x7f; + + one = _mm_set1_ps(1.0); + exp_hi = _mm_set1_ps(88.3762626647949); + exp_lo = _mm_set1_ps(-88.3762626647949); + ln2 = _mm_set1_ps(0.6931471805); + log2EF = _mm_set1_ps(1.44269504088896341); + half = _mm_set1_ps(0.5); + exp_C1 = _mm_set1_ps(0.693359375); + exp_C2 = _mm_set1_ps(-2.12194440e-4); + pi32_0x7f = _mm_set1_epi32(0x7f); + + exp_p0 = _mm_set1_ps(1.9875691500e-4); + exp_p1 = _mm_set1_ps(1.3981999507e-3); + exp_p2 = _mm_set1_ps(8.3334519073e-3); + exp_p3 = _mm_set1_ps(4.1665795894e-2); + exp_p4 = _mm_set1_ps(1.6666665459e-1); + exp_p5 = _mm_set1_ps(5.0000001201e-1); + + for(;number < quarterPoints; number++){ + + // First compute the logarithm + aVal = _mm_load_ps(aPtr); + bias = _mm_set1_epi32(127); + leadingOne = _mm_set1_ps(1.0f); + exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); + logarithm = _mm_cvtepi32_ps(exp); + + frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); + + #if LOG_POLY_DEGREE == 6 + mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + #elif LOG_POLY_DEGREE == 5 + mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + #elif LOG_POLY_DEGREE == 4 + mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + #elif LOG_POLY_DEGREE == 3 + mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + #else + #error + #endif + + logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); + logarithm = _mm_mul_ps(logarithm, ln2); + + + // Now calculate b*lna + bVal = _mm_load_ps(bPtr); + bVal = _mm_mul_ps(bVal, logarithm); + + // Now compute exp(b*lna) + tmp = _mm_setzero_ps(); + + bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); + + fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); + + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); + + mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); + fx = _mm_sub_ps(tmp, mask); + + tmp = _mm_mul_ps(fx, exp_C1); + z = _mm_mul_ps(fx, exp_C2); + bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); + z = _mm_mul_ps(bVal, bVal); + + y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); + y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); + y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); + y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); + y = _mm_add_ps(y, one); + + emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); + + pow2n = _mm_castsi128_ps(emm0); + cVal = _mm_mul_ps(y, pow2n); + + _mm_store_ps(cPtr, cVal); + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = pow(*aPtr++, *bPtr++); + } +} + +#endif /* LV_HAVE_SSE4_1 for aligned */ + +#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */ + +#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H +#define INCLUDED_volk_32f_x2_pow_32f_u_H + +#ifdef LV_HAVE_GENERIC +/*! + \brief Computes pow(x,y) by using exp and log + \param cVector The vector where results will be stored + \param aVector The input vector of bases + \param bVector The input vector of indices + \param num_points Number of points for which pow is to be computed +*/ +static inline void volk_32f_x2_pow_32f_u_generic(float* cVector, const float* bVector, const float* aVector, unsigned int num_points){ + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = pow(*aPtr++, *bPtr++); + } + +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> +/*! + \brief Computes pow(x,y) by using exp and log + \param cVector The vector where results will be stored + \param aVector The input vector of bases + \param bVector The input vector of indices + \param num_points Number of points for which pow is to be computed +*/ +static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector, const float* aVector, unsigned int num_points){ + + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; + __m128 tmp, fx, mask, pow2n, z, y; + __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; + __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; + __m128i bias, exp, emm0, pi32_0x7f; + + one = _mm_set1_ps(1.0); + exp_hi = _mm_set1_ps(88.3762626647949); + exp_lo = _mm_set1_ps(-88.3762626647949); + ln2 = _mm_set1_ps(0.6931471805); + log2EF = _mm_set1_ps(1.44269504088896341); + half = _mm_set1_ps(0.5); + exp_C1 = _mm_set1_ps(0.693359375); + exp_C2 = _mm_set1_ps(-2.12194440e-4); + pi32_0x7f = _mm_set1_epi32(0x7f); + + exp_p0 = _mm_set1_ps(1.9875691500e-4); + exp_p1 = _mm_set1_ps(1.3981999507e-3); + exp_p2 = _mm_set1_ps(8.3334519073e-3); + exp_p3 = _mm_set1_ps(4.1665795894e-2); + exp_p4 = _mm_set1_ps(1.6666665459e-1); + exp_p5 = _mm_set1_ps(5.0000001201e-1); + + for(;number < quarterPoints; number++){ + + // First compute the logarithm + aVal = _mm_loadu_ps(aPtr); + bias = _mm_set1_epi32(127); + leadingOne = _mm_set1_ps(1.0f); + exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); + logarithm = _mm_cvtepi32_ps(exp); + + frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); + + #if LOG_POLY_DEGREE == 6 + mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + #elif LOG_POLY_DEGREE == 5 + mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + #elif LOG_POLY_DEGREE == 4 + mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + #elif LOG_POLY_DEGREE == 3 + mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + #else + #error + #endif + + logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); + logarithm = _mm_mul_ps(logarithm, ln2); + + + // Now calculate b*lna + bVal = _mm_loadu_ps(bPtr); + bVal = _mm_mul_ps(bVal, logarithm); + + // Now compute exp(b*lna) + tmp = _mm_setzero_ps(); + + bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); + + fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); + + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); + + mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); + fx = _mm_sub_ps(tmp, mask); + + tmp = _mm_mul_ps(fx, exp_C1); + z = _mm_mul_ps(fx, exp_C2); + bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); + z = _mm_mul_ps(bVal, bVal); + + y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); + y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); + y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); + y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); + y = _mm_add_ps(y, one); + + emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); + + pow2n = _mm_castsi128_ps(emm0); + cVal = _mm_mul_ps(y, pow2n); + + _mm_storeu_ps(cPtr, cVal); + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = pow(*aPtr++, *bPtr++); + } +} + +#endif /* LV_HAVE_SSE4_1 for unaligned */ + +#endif /* INCLUDED_volk_32f_x2_log2_32f_u_H */ diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index bdea584370..a8099347aa 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -46,6 +46,7 @@ VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_log2_32f, 1e-3, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_expfast_32f, 1e-1, 0, 20462, 1); +VOLK_RUN_TESTS(volk_32f_x2_pow_32f, 1e-2, 0, 20462, 1); VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20462, 1); VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20462, 1); |