volk: add neon log2 implementation and fix QA to properly test

The implementation adds the float exponent to a 6th order minimax fit of log2(significand) which has domain [1,2).
author: Nathan West <nathan.west@okstate.edu> 2014-10-24 19:49:19 -0500
committer: Nathan West <nathan.west@okstate.edu> 2014-10-26 17:41:13 -0500
commit: eba094a3010ca326da747390be51da8326fff0c1 (patch)
tree: b8310aa62d11b8b21e0a84f5c5275eebe1f5aec3 /volk
parent: 86a2cf03996a2950d3c7cc4467311b1506f5182a (diff)
3 files changed, 112 insertions, 12 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 4167f4de1f..9bc1842c63 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -162,7 +162,7 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
-    VOLK_PROFILE(volk_32f_log2_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_32f_log2_32f, 1.5e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_expfast_32f, 1e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
diff --git a/volk/kernels/volk/volk_32f_log2_32f.h b/volk/kernels/volk/volk_32f_log2_32f.h
index 52c1b60549..9452d358f8 100644
--- a/volk/kernels/volk/volk_32f_log2_32f.h
+++ b/volk/kernels/volk/volk_32f_log2_32f.h
@@ -145,30 +145,130 @@ static inline void volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVect
 
 #endif /* LV_HAVE_SSE4_1 for aligned */
 
-#endif /* INCLUDED_volk_32f_log2_32f_a_H */
-
-#ifndef INCLUDED_volk_32f_log2_32f_u_H
-#define INCLUDED_volk_32f_log2_32f_u_H
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+/* these macros allow us to embed logs in other kernels */
+#define VLOG2Q_NEON_PREAMBLE()                                  \
+    int32x4_t one = vdupq_n_s32(0x000800000);                   \
+    /* minimax polynomial */                                    \
+    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585);          \
+    float32x4_t p1 = vdupq_n_f32(6.1129631282966113);           \
+    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207);          \
+    float32x4_t p3 = vdupq_n_f32(3.2865287703753912);           \
+    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635);          \
+    float32x4_t p5 = vdupq_n_f32(0.2751487703421256);           \
+    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);          \
+    int32x4_t exp_mask = vdupq_n_s32(0x7f800000);               \
+    int32x4_t sig_mask = vdupq_n_s32(0x007fffff);               \
+    int32x4_t exp_bias = vdupq_n_s32(127);
+
+
+#define VLOG2Q_NEON_F32(log2_approx, aval)                              \
+        int32x4_t exponent_i = vandq_s32(aval, exp_mask);               \
+        int32x4_t significand_i = vandq_s32(aval, sig_mask);            \
+        exponent_i = vshrq_n_s32(exponent_i, 23);                       \
+                                                                        \
+        /* extract the exponent and significand                         \
+         we can treat this as fixed point to save ~9% on the            \
+         conversion + float add */                                      \
+        significand_i = vorrq_s32(one, significand_i);                  \
+        float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23);  \
+        /* debias the exponent and convert to float */                  \
+        exponent_i = vsubq_s32(exponent_i, exp_bias);                   \
+        float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);             \
+                                                                        \
+        /* put the significand through a polynomial fit of log2(x) [1,2]\
+         add the result to the exponent */                              \
+        log2_approx = vaddq_f32(exponent_f, p0); /* p0 */               \
+        float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */   \
+        log2_approx = vaddq_f32(log2_approx, tmp1);                     \
+        float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
+        tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */                     \
+        log2_approx = vaddq_f32(log2_approx, tmp1);                     \
+                                                                        \
+        float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */  \
+        tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */                     \
+        log2_approx = vaddq_f32(log2_approx, tmp1);                     \
+        float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */          \
+        tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */                     \
+        log2_approx = vaddq_f32(log2_approx, tmp1);                     \
+        float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */          \
+        tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */                     \
+        log2_approx = vaddq_f32(log2_approx, tmp1);                     \
+        float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */          \
+        tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */                     \
+        log2_approx = vaddq_f32(log2_approx, tmp1);
 
-#ifdef LV_HAVE_GENERIC
 /*!
   \brief Computes base 2 log of input vector and stores results in output vector
   \param bVector The vector where results will be stored
   \param aVector The input vector of floats
   \param num_points Number of points for which log is to be computed
 */
-static inline void volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
+static inline void volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points){
     float* bPtr = bVector;
     const float* aPtr = aVector;
-    unsigned int number = 0;
+    unsigned int number;
+    const unsigned int quarterPoints = num_points / 4;
 
-    for(number = 0; number < num_points; number++){
-      *bPtr++ = log2(*aPtr++);
+    int32x4_t aval;
+    float32x4_t log2_approx;
+
+    VLOG2Q_NEON_PREAMBLE()
+    // lms
+    //p0 = vdupq_n_f32(-1.649132280361871);
+    //p1 = vdupq_n_f32(1.995047138579499);
+    //p2 = vdupq_n_f32(-0.336914839219728);
+
+    // keep in mind a single precision float is represented as
+    //   (-1)^sign * 2^exp * 1.significand, so the log2 is
+    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
+    for(number = 0; number < quarterPoints; ++number){
+        // load float in to an int register without conversion
+        aval = vld1q_s32((int*)aPtr);
+
+        VLOG2Q_NEON_F32(log2_approx, aval)
+
+        vst1q_f32(bPtr, log2_approx);
+
+        aPtr += 4;
+        bPtr += 4;
     }
 
+    for(number = quarterPoints * 4; number < num_points; number++){
+       *bPtr++ = log2(*aPtr++);
+    }
 }
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_log2_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_log2_32f_u_H
+#define INCLUDED_volk_32f_log2_32f_u_H
+
+
+//#ifdef LV_HAVE_GENERIC
+///*!
+//  \brief Computes base 2 log of input vector and stores results in output vector
+//  \param bVector The vector where results will be stored
+//  \param aVector The input vector of floats
+//  \param num_points Number of points for which log is to be computed
+//*/
+//static inline void volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
+//    float* bPtr = bVector;
+//    const float* aPtr = aVector;
+//    unsigned int number = 0;
+//
+//    for(number = 0; number < num_points; number++){
+//      *bPtr++ = log2(*aPtr++);
+//    }
+//
+//}
+//#endif /* LV_HAVE_GENERIC */
 
 
 #ifdef LV_HAVE_SSE4_1
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index 3ab4a9970c..be20ed6585 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -247,7 +247,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
             }
         }
         // the primary test is the percent different greater than given tol
-        else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) {
+        else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) {
             fail=true;
             if(print_max_errs-- > 0) {
                 std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl;
author	Nathan West <nathan.west@okstate.edu>	2014-10-24 19:49:19 -0500
committer	Nathan West <nathan.west@okstate.edu>	2014-10-26 17:41:13 -0500
commit	eba094a3010ca326da747390be51da8326fff0c1 (patch)
tree	b8310aa62d11b8b21e0a84f5c5275eebe1f5aec3 /volk
parent	86a2cf03996a2950d3c7cc4467311b1506f5182a (diff)