summaryrefslogtreecommitdiff
path: root/volk
diff options
context:
space:
mode:
authorNathan West <nathan.west@okstate.edu>2014-10-24 19:49:19 -0500
committerNathan West <nathan.west@okstate.edu>2014-10-26 17:41:13 -0500
commiteba094a3010ca326da747390be51da8326fff0c1 (patch)
treeb8310aa62d11b8b21e0a84f5c5275eebe1f5aec3 /volk
parent86a2cf03996a2950d3c7cc4467311b1506f5182a (diff)
volk: add neon log2 implementation and fix QA to properly test
The implementation adds the float exponent to a 6th order minimax fit of log2(significand) which has domain [1,2).
Diffstat (limited to 'volk')
-rw-r--r--volk/apps/volk_profile.cc2
-rw-r--r--volk/kernels/volk/volk_32f_log2_32f.h120
-rw-r--r--volk/lib/qa_utils.cc2
3 files changed, 112 insertions, 12 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 4167f4de1f..9bc1842c63 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -162,7 +162,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
- VOLK_PROFILE(volk_32f_log2_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_32f_log2_32f, 1.5e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_expfast_32f, 1e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
diff --git a/volk/kernels/volk/volk_32f_log2_32f.h b/volk/kernels/volk/volk_32f_log2_32f.h
index 52c1b60549..9452d358f8 100644
--- a/volk/kernels/volk/volk_32f_log2_32f.h
+++ b/volk/kernels/volk/volk_32f_log2_32f.h
@@ -145,30 +145,130 @@ static inline void volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVect
#endif /* LV_HAVE_SSE4_1 for aligned */
-#endif /* INCLUDED_volk_32f_log2_32f_a_H */
-
-#ifndef INCLUDED_volk_32f_log2_32f_u_H
-#define INCLUDED_volk_32f_log2_32f_u_H
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+/* these macros allow us to embed logs in other kernels */
+#define VLOG2Q_NEON_PREAMBLE() \
+ int32x4_t one = vdupq_n_s32(0x000800000); \
+ /* minimax polynomial */ \
+ float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
+ float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
+ float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
+ float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
+ float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
+ float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
+ float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
+ int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
+ int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
+ int32x4_t exp_bias = vdupq_n_s32(127);
+
+
+#define VLOG2Q_NEON_F32(log2_approx, aval) \
+ int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
+ int32x4_t significand_i = vandq_s32(aval, sig_mask); \
+ exponent_i = vshrq_n_s32(exponent_i, 23); \
+ \
+ /* extract the exponent and significand \
+ we can treat this as fixed point to save ~9% on the \
+ conversion + float add */ \
+ significand_i = vorrq_s32(one, significand_i); \
+ float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
+ /* debias the exponent and convert to float */ \
+ exponent_i = vsubq_s32(exponent_i, exp_bias); \
+ float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
+ \
+ /* put the significand through a polynomial fit of log2(x) [1,2]\
+ add the result to the exponent */ \
+ log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
+ float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
+ tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ \
+ float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
+ tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
+ tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
+ tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
+ tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1);
-#ifdef LV_HAVE_GENERIC
/*!
\brief Computes base 2 log of input vector and stores results in output vector
\param bVector The vector where results will be stored
\param aVector The input vector of floats
\param num_points Number of points for which log is to be computed
*/
-static inline void volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
+static inline void volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
- unsigned int number = 0;
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
- for(number = 0; number < num_points; number++){
- *bPtr++ = log2(*aPtr++);
+ int32x4_t aval;
+ float32x4_t log2_approx;
+
+ VLOG2Q_NEON_PREAMBLE()
+ // lms
+ //p0 = vdupq_n_f32(-1.649132280361871);
+ //p1 = vdupq_n_f32(1.995047138579499);
+ //p2 = vdupq_n_f32(-0.336914839219728);
+
+ // keep in mind a single precision float is represented as
+ // (-1)^sign * 2^exp * 1.significand, so the log2 is
+ // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
+ for(number = 0; number < quarterPoints; ++number){
+ // load float in to an int register without conversion
+ aval = vld1q_s32((int*)aPtr);
+
+ VLOG2Q_NEON_F32(log2_approx, aval)
+
+ vst1q_f32(bPtr, log2_approx);
+
+ aPtr += 4;
+ bPtr += 4;
}
+ for(number = quarterPoints * 4; number < num_points; number++){
+ *bPtr++ = log2(*aPtr++);
+ }
}
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_log2_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_log2_32f_u_H
+#define INCLUDED_volk_32f_log2_32f_u_H
+
+
+//#ifdef LV_HAVE_GENERIC
+///*!
+// \brief Computes base 2 log of input vector and stores results in output vector
+// \param bVector The vector where results will be stored
+// \param aVector The input vector of floats
+// \param num_points Number of points for which log is to be computed
+//*/
+//static inline void volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
+// float* bPtr = bVector;
+// const float* aPtr = aVector;
+// unsigned int number = 0;
+//
+// for(number = 0; number < num_points; number++){
+// *bPtr++ = log2(*aPtr++);
+// }
+//
+//}
+//#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE4_1
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index 3ab4a9970c..be20ed6585 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -247,7 +247,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
}
}
// the primary test is the percent different greater than given tol
- else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) {
+ else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) {
fail=true;
if(print_max_errs-- > 0) {
std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl;