diff options
author | Nathan West <nathan.west@okstate.edu> | 2014-10-24 19:49:19 -0500 |
---|---|---|
committer | Nathan West <nathan.west@okstate.edu> | 2014-10-26 17:41:13 -0500 |
commit | eba094a3010ca326da747390be51da8326fff0c1 (patch) | |
tree | b8310aa62d11b8b21e0a84f5c5275eebe1f5aec3 /volk | |
parent | 86a2cf03996a2950d3c7cc4467311b1506f5182a (diff) |
volk: add neon log2 implementation and fix QA to properly test
The implementation adds the float exponent to a 6th order minimax fit
of log2(significand) which has domain [1,2).
Diffstat (limited to 'volk')
-rw-r--r-- | volk/apps/volk_profile.cc | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_log2_32f.h | 120 | ||||
-rw-r--r-- | volk/lib/qa_utils.cc | 2 |
3 files changed, 112 insertions, 12 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 4167f4de1f..9bc1842c63 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -162,7 +162,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_32f_log2_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32f_log2_32f, 1.5e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_expfast_32f, 1e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); diff --git a/volk/kernels/volk/volk_32f_log2_32f.h b/volk/kernels/volk/volk_32f_log2_32f.h index 52c1b60549..9452d358f8 100644 --- a/volk/kernels/volk/volk_32f_log2_32f.h +++ b/volk/kernels/volk/volk_32f_log2_32f.h @@ -145,30 +145,130 @@ static inline void volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVect #endif /* LV_HAVE_SSE4_1 for aligned */ -#endif /* INCLUDED_volk_32f_log2_32f_a_H */ - -#ifndef INCLUDED_volk_32f_log2_32f_u_H -#define INCLUDED_volk_32f_log2_32f_u_H +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + +/* these macros allow us to embed logs in other kernels */ +#define VLOG2Q_NEON_PREAMBLE() \ + int32x4_t one = vdupq_n_s32(0x000800000); \ + /* minimax polynomial */ \ + float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \ + float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \ + float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \ + float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \ + float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \ + float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \ + float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \ + int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \ + int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \ + int32x4_t exp_bias = vdupq_n_s32(127); + + +#define VLOG2Q_NEON_F32(log2_approx, aval) \ + int32x4_t exponent_i = vandq_s32(aval, exp_mask); \ + int32x4_t significand_i = vandq_s32(aval, sig_mask); \ + exponent_i = vshrq_n_s32(exponent_i, 23); \ + \ + /* extract the exponent and significand \ + we can treat this as fixed point to save ~9% on the \ + conversion + float add */ \ + significand_i = vorrq_s32(one, significand_i); \ + float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \ + /* debias the exponent and convert to float */ \ + exponent_i = vsubq_s32(exponent_i, exp_bias); \ + float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \ + \ + /* put the significand through a polynomial fit of log2(x) [1,2]\ + add the result to the exponent */ \ + log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \ + float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \ + tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + \ + float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \ + tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \ + tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \ + tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \ + tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); -#ifdef LV_HAVE_GENERIC /*! \brief Computes base 2 log of input vector and stores results in output vector \param bVector The vector where results will be stored \param aVector The input vector of floats \param num_points Number of points for which log is to be computed */ -static inline void volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){ +static inline void volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points){ float* bPtr = bVector; const float* aPtr = aVector; - unsigned int number = 0; + unsigned int number; + const unsigned int quarterPoints = num_points / 4; - for(number = 0; number < num_points; number++){ - *bPtr++ = log2(*aPtr++); + int32x4_t aval; + float32x4_t log2_approx; + + VLOG2Q_NEON_PREAMBLE() + // lms + //p0 = vdupq_n_f32(-1.649132280361871); + //p1 = vdupq_n_f32(1.995047138579499); + //p2 = vdupq_n_f32(-0.336914839219728); + + // keep in mind a single precision float is represented as + // (-1)^sign * 2^exp * 1.significand, so the log2 is + // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23) + for(number = 0; number < quarterPoints; ++number){ + // load float in to an int register without conversion + aval = vld1q_s32((int*)aPtr); + + VLOG2Q_NEON_F32(log2_approx, aval) + + vst1q_f32(bPtr, log2_approx); + + aPtr += 4; + bPtr += 4; } + for(number = quarterPoints * 4; number < num_points; number++){ + *bPtr++ = log2(*aPtr++); + } } -#endif /* LV_HAVE_GENERIC */ + +#endif /* LV_HAVE_NEON */ + + +#endif /* INCLUDED_volk_32f_log2_32f_a_H */ + +#ifndef INCLUDED_volk_32f_log2_32f_u_H +#define INCLUDED_volk_32f_log2_32f_u_H + + +//#ifdef LV_HAVE_GENERIC +///*! +// \brief Computes base 2 log of input vector and stores results in output vector +// \param bVector The vector where results will be stored +// \param aVector The input vector of floats +// \param num_points Number of points for which log is to be computed +//*/ +//static inline void volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){ +// float* bPtr = bVector; +// const float* aPtr = aVector; +// unsigned int number = 0; +// +// for(number = 0; number < num_points; number++){ +// *bPtr++ = log2(*aPtr++); +// } +// +//} +//#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE4_1 diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 3ab4a9970c..be20ed6585 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -247,7 +247,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { } } // the primary test is the percent different greater than given tol - else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) { + else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) { fail=true; if(print_max_errs-- > 0) { std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl; |