Merge commit '0c92479f'

This is one commit shy of the nwest/neon tip
author: Johnathan Corgan <johnathan@corganlabs.com> 2014-10-31 12:12:09 -0700
committer: Johnathan Corgan <johnathan@corganlabs.com> 2014-10-31 12:12:09 -0700
commit: 4869607eb840318595edda85ca20b872579b27e3 (patch)
tree: d1bf8c14e957fcddb11df1b550ef9d211d1affe1 /volk
parent: b3bbe5659a20a8377e9a6278d0c77f2649399dc9 (diff)
parent: 0c92479f10274e9e5f4e1506dafe4d515576ee8a (diff)
26 files changed, 987 insertions, 24 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 5030836d43..9bc1842c63 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -157,12 +157,12 @@ int main(int argc, char *argv[]) {
     //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
     //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
-    VOLK_PROFILE(volk_16u_byteswap, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    VOLK_PUPPET_PROFILE(volk_16u_byteswappuppet_16u, volk_16u_byteswap, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
-    VOLK_PROFILE(volk_32f_log2_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_32f_log2_32f, 1.5e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_expfast_32f, 1e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
@@ -175,7 +175,7 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, benchmark_mode, kernel_regex);
     //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
-    VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204602, 100, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 204602, 10000, &results, benchmark_mode, kernel_regex);
@@ -219,13 +219,13 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32i_x2_and_32i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32i_s32f_convert_32f, 1e-4, 100, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32i_x2_or_32i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
-    VOLK_PROFILE(volk_32u_byteswap, 0, 0, 204602, 2000, &results, benchmark_mode, kernel_regex);
-    //VOLK_PROFILE(volk_32u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    VOLK_PUPPET_PROFILE(volk_32u_byteswappuppet_32u, volk_32u_byteswap, 0, 0, 204602, 2000, &results, benchmark_mode, kernel_regex);
+    VOLK_PUPPET_PROFILE(volk_32u_popcntpuppet_32u, volk32u_popcnt_32u,  0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
-    VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
-    //VOLK_PROFILE(volk_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    VOLK_PUPPET_PROFILE(volk_64u_byteswappuppet_64u, volk_64u_byteswap, 0, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PUPPET_PROFILE(volk_64u_popcntpuppet_64u, volk_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204602, 3000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204602, 3000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_8ic_deinterleave_real_16i, 0, 256, 204602, 3000, &results, benchmark_mode, kernel_regex);
diff --git a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
index 27f0bf6df7..2656d766b8 100644
--- a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
@@ -29,7 +29,6 @@
 
 #ifdef LV_HAVE_GENERIC
 
-
 static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
 
   static const int N_UNROLL = 4;
@@ -58,7 +57,54 @@ static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const
 
 #endif /*LV_HAVE_GENERIC*/
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+
+  unsigned ii;
+  unsigned quarter_points = num_points / 4;
+  lv_32fc_t* tapsPtr = (lv_32fc_t*) taps;
+  short* inputPtr = (short*) input;
+  lv_32fc_t accumulator_vec[4];
+
+  float32x4x2_t tapsVal, accumulator_val;
+  int16x4_t input16;
+  int32x4_t input32;
+  float32x4_t input_float, prod_re, prod_im;
+
+  accumulator_val.val[0] = vdupq_n_f32(0.0);
+  accumulator_val.val[1] = vdupq_n_f32(0.0);
+
+  for(ii = 0; ii < quarter_points; ++ii) {
+    tapsVal = vld2q_f32((float*)tapsPtr);
+    input16 = vld1_s16(inputPtr);
+    // widen 16-bit int to 32-bit int
+    input32 = vmovl_s16(input16);
+    // convert 32-bit int to float with scale
+    input_float = vcvtq_f32_s32(input32);
+
+    prod_re = vmulq_f32(input_float, tapsVal.val[0]);
+    prod_im = vmulq_f32(input_float, tapsVal.val[1]);
+
+    accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
+    accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
+
+    tapsPtr += 4;
+    inputPtr += 4;
+  }
+  vst2q_f32((float*)accumulator_vec, accumulator_val);
+  accumulator_vec[0] += accumulator_vec[1];
+  accumulator_vec[2] += accumulator_vec[3];
+  accumulator_vec[0] += accumulator_vec[2];
+
+  for(ii = quarter_points * 4; ii < num_points; ++ii) {
+    accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
+  }
+
+  *result = accumulator_vec[0];
+}
 
+#endif /*LV_HAVE_NEON*/
 
 #if LV_HAVE_SSE && LV_HAVE_MMX
 
diff --git a/volk/kernels/volk/volk_16i_convert_8i.h b/volk/kernels/volk/volk_16i_convert_8i.h
index eb29949417..6f16fa45d3 100644
--- a/volk/kernels/volk/volk_16i_convert_8i.h
+++ b/volk/kernels/volk/volk_16i_convert_8i.h
@@ -138,6 +138,46 @@ static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_
 }
 #endif /* LV_HAVE_SSE2 */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  unsigned int sixteenth_points = num_points / 16;
+
+  int16x8_t inputVal0;
+  int16x8_t inputVal1;
+  int8x8_t outputVal0;
+  int8x8_t outputVal1;
+  int8x16_t outputVal;
+  
+  for(number = 0; number < sixteenth_points; number++){
+    // load two input vectors
+    inputVal0 = vld1q_s16(inputVectorPtr);
+    inputVal1 = vld1q_s16(inputVectorPtr+8);
+    // shift right
+    outputVal0 = vshrn_n_s16(inputVal0, 8);
+    outputVal1 = vshrn_n_s16(inputVal1, 8);
+    // squash two vectors and write output
+    outputVal = vcombine_s8(outputVal0, outputVal1);
+    vst1q_s8(outputVectorPtr, outputVal);
+    inputVectorPtr += 16;
+    outputVectorPtr += 16;
+  }
+
+  for(number = sixteenth_points * 16; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+  }
+}
+#endif /* LV_HAVE_NEON */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Converts the input 16 bit integer data into 8 bit integer data
@@ -156,7 +196,4 @@ static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int
 }
 #endif /* LV_HAVE_GENERIC */
 
-
-
-
 #endif /* INCLUDED_volk_16i_convert_8i_a_H */
diff --git a/volk/kernels/volk/volk_16i_s32f_convert_32f.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
index 24134c80cc..6ea28f0e90 100644
--- a/volk/kernels/volk/volk_16i_s32f_convert_32f.h
+++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -195,7 +195,53 @@ static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const
 }
 #endif /* LV_HAVE_GENERIC */
 
+#ifdef LV_HAVE_NEON
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputPtr = outputVector;
+  const int16_t* inputPtr = inputVector;
+  unsigned int number = 0;
+  unsigned int eighth_points = num_points / 8;
+
+  int16x4x2_t input16;
+  int32x4_t input32_0, input32_1;
+  float32x4_t input_float_0, input_float_1;
+  float32x4x2_t output_float;
+  float32x4_t inv_scale;
+
+  inv_scale = vdupq_n_f32(1.0/scalar);
+
+  // the generic disassembles to a 128-bit load
+  // and duplicates every instruction to operate on 64-bits
+  // at a time. This is only possible with lanes, which is faster
+  // than just doing a vld1_s16, but still slower.
+  for(number = 0; number < eighth_points; number++){
+    input16 = vld2_s16(inputPtr);
+    // widen 16-bit int to 32-bit int
+    input32_0 = vmovl_s16(input16.val[0]);
+    input32_1 = vmovl_s16(input16.val[1]);
+    // convert 32-bit int to float with scale
+    input_float_0 = vcvtq_f32_s32(input32_0);
+    input_float_1 = vcvtq_f32_s32(input32_1);
+    output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
+    output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
+    vst2q_f32(outputPtr, output_float);
+    inputPtr += 8;
+    outputPtr += 8;
+  }
 
+  for(number = eighth_points*8; number < num_points; number++){
+    *outputPtr++ = ((float)(*inputPtr++)) / scalar;
+  }
+}
+#endif /* LV_HAVE_NEON */
 
 
 #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
diff --git a/volk/kernels/volk/volk_16u_byteswap.h b/volk/kernels/volk/volk_16u_byteswap.h
index bffdeed185..3b2f9e2c91 100644
--- a/volk/kernels/volk/volk_16u_byteswap.h
+++ b/volk/kernels/volk/volk_16u_byteswap.h
@@ -158,6 +158,58 @@ static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num
 }
 #endif /* LV_HAVE_NEON */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int32_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){
+  uint16_t* inputPtr = intsToSwap;
+  unsigned int number = 0;
+  unsigned int n16points = num_points / 16;
+
+  uint8x8x4_t input_table;
+  uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+  uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+  /* these magic numbers are used as byte-indeces in the LUT.
+     they are pre-computed to save time. A simple C program
+     can calculate them; for example for lookup01:
+    uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+    for(ii=0; ii < 8; ++ii) {
+        index += ((uint64_t)(*(chars+ii))) << (ii*8);
+    }
+  */
+  int_lookup01 = vcreate_u8(1232017111498883080);
+  int_lookup23 = vcreate_u8(1376697457175036426);
+  int_lookup45 = vcreate_u8(1521377802851189772);
+  int_lookup67 = vcreate_u8(1666058148527343118);
+
+  for(number = 0; number < n16points; ++number){
+    input_table = vld4_u8((uint8_t*) inputPtr);
+    swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+    swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+    swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+    swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+    vst1_u8((uint8_t*)inputPtr, swapped_int01);
+    vst1_u8((uint8_t*)(inputPtr+4), swapped_int23);
+    vst1_u8((uint8_t*)(inputPtr+8), swapped_int45);
+    vst1_u8((uint8_t*)(inputPtr+12), swapped_int67);
+
+    inputPtr += 16;
+  }
+
+  for(number = n16points * 16; number < num_points; ++number){
+    uint16_t output = *inputPtr;
+    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Byteswaps (in-place) an aligned vector of int16_t's.
diff --git a/volk/kernels/volk/volk_16u_byteswappuppet_16u.h b/volk/kernels/volk/volk_16u_byteswappuppet_16u.h
new file mode 100644
index 0000000000..74745d328c
--- /dev/null
+++ b/volk/kernels/volk/volk_16u_byteswappuppet_16u.h
@@ -0,0 +1,55 @@
+#ifndef INCLUDED_volk_16u_byteswappuppet_16u_H
+#define INCLUDED_volk_16u_byteswappuppet_16u_H
+
+
+#include <stdint.h>
+#include <volk/volk_16u_byteswap.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+
+    volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+
+    volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+
+    volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+
+    volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+
+    volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+
+#endif
diff --git a/volk/kernels/volk/volk_32f_binary_slicer_8i.h b/volk/kernels/volk/volk_32f_binary_slicer_8i.h
index 88a25b7a05..ae4420b6e1 100644
--- a/volk/kernels/volk/volk_32f_binary_slicer_8i.h
+++ b/volk/kernels/volk/volk_32f_binary_slicer_8i.h
@@ -206,4 +206,84 @@ volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
 #endif /* LV_HAVE_SSE2 */
 
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Returns integer 1 if float input is greater than or equal to 0, 1 otherwise
+  \param cVector The char (int8_t) output (either 0 or 1)
+  \param aVector The float input
+  \param num_points The number of values in aVector and stored into cVector
+*/
+static inline void
+volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
+                                  unsigned int num_points)
+{
+  int8_t* cPtr = cVector;
+  const float* aPtr = aVector;
+  unsigned int number = 0;
+  unsigned int n16points = num_points / 16;
+
+  float32x4x2_t input_val0, input_val1;
+  float32x4_t zero_val;
+  uint32x4x2_t res0_u32, res1_u32;
+  uint16x4x2_t res0_u16x4, res1_u16x4;
+  uint16x8x2_t res_u16x8;
+  uint8x8x2_t res_u8;
+  uint8x8_t one;
+
+  zero_val = vdupq_n_f32(0.0);
+  one = vdup_n_u8(0x01);
+  
+  // TODO: this is a good candidate for asm because the vcombines
+  // can be eliminated simply by picking dst registers that are
+  // adjacent.
+  for(number = 0; number < n16points; number++) {
+    input_val0 = vld2q_f32(aPtr);
+    input_val1 = vld2q_f32(aPtr+8);
+
+    // test against 0; return uint32
+    res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
+    res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
+    res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
+    res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
+
+    // narrow uint32 -> uint16 followed by combine to 8-element vectors
+    res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
+    res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
+    res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
+    res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
+
+    res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
+    res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
+    
+    // narrow uint16x8 -> uint8x8
+    res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
+    res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
+    // we *could* load twice as much data and do another vcombine here
+    // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
+    // but that turns out to be ~16% slower than this version on zc702
+    // it's possible register contention in GCC scheduler slows it down
+    // and a hand-written asm with quad-word u8 registers is much faster.
+
+    res_u8.val[0] = vand_u8(one, res_u8.val[0]);
+    res_u8.val[1] = vand_u8(one, res_u8.val[1]);
+
+    vst2_u8((unsigned char*)cPtr, res_u8);
+    cPtr += 16;
+    aPtr += 16;
+
+  }
+
+  for(number = n16points * 16; number < num_points; number++) {
+    if(*aPtr++ >= 0) {
+      *cPtr++ = 1;
+    }
+    else {
+      *cPtr++ = 0;
+    }
+  }
+}
+#endif /* LV_HAVE_NEON */
+
+
 #endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
diff --git a/volk/kernels/volk/volk_32f_log2_32f.h b/volk/kernels/volk/volk_32f_log2_32f.h
index 52c1b60549..892eeb1685 100644
--- a/volk/kernels/volk/volk_32f_log2_32f.h
+++ b/volk/kernels/volk/volk_32f_log2_32f.h
@@ -61,7 +61,7 @@
 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
 
-#define LOG_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 6
 
 
 #ifndef INCLUDED_volk_32f_log2_32f_a_H
@@ -145,6 +145,106 @@ static inline void volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVect
 
 #endif /* LV_HAVE_SSE4_1 for aligned */
 
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+/* these macros allow us to embed logs in other kernels */
+#define VLOG2Q_NEON_PREAMBLE()                                  \
+    int32x4_t one = vdupq_n_s32(0x000800000);                   \
+    /* minimax polynomial */                                    \
+    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585);          \
+    float32x4_t p1 = vdupq_n_f32(6.1129631282966113);           \
+    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207);          \
+    float32x4_t p3 = vdupq_n_f32(3.2865287703753912);           \
+    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635);          \
+    float32x4_t p5 = vdupq_n_f32(0.2751487703421256);           \
+    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);          \
+    int32x4_t exp_mask = vdupq_n_s32(0x7f800000);               \
+    int32x4_t sig_mask = vdupq_n_s32(0x007fffff);               \
+    int32x4_t exp_bias = vdupq_n_s32(127);
+
+
+#define VLOG2Q_NEON_F32(log2_approx, aval)                              \
+        int32x4_t exponent_i = vandq_s32(aval, exp_mask);               \
+        int32x4_t significand_i = vandq_s32(aval, sig_mask);            \
+        exponent_i = vshrq_n_s32(exponent_i, 23);                       \
+                                                                        \
+        /* extract the exponent and significand                         \
+         we can treat this as fixed point to save ~9% on the            \
+         conversion + float add */                                      \
+        significand_i = vorrq_s32(one, significand_i);                  \
+        float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23);  \
+        /* debias the exponent and convert to float */                  \
+        exponent_i = vsubq_s32(exponent_i, exp_bias);                   \
+        float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);             \
+                                                                        \
+        /* put the significand through a polynomial fit of log2(x) [1,2]\
+         add the result to the exponent */                              \
+        log2_approx = vaddq_f32(exponent_f, p0); /* p0 */               \
+        float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */   \
+        log2_approx = vaddq_f32(log2_approx, tmp1);                     \
+        float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
+        tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */                     \
+        log2_approx = vaddq_f32(log2_approx, tmp1);                     \
+                                                                        \
+        float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */  \
+        tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */                     \
+        log2_approx = vaddq_f32(log2_approx, tmp1);                     \
+        float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */          \
+        tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */                     \
+        log2_approx = vaddq_f32(log2_approx, tmp1);                     \
+        float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */          \
+        tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */                     \
+        log2_approx = vaddq_f32(log2_approx, tmp1);                     \
+        float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */          \
+        tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */                     \
+        log2_approx = vaddq_f32(log2_approx, tmp1);
+
+/*!
+  \brief Computes base 2 log of input vector and stores results in output vector
+  \param bVector The vector where results will be stored
+  \param aVector The input vector of floats
+  \param num_points Number of points for which log is to be computed
+*/
+static inline void volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points){
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+    unsigned int number;
+    const unsigned int quarterPoints = num_points / 4;
+
+    int32x4_t aval;
+    float32x4_t log2_approx;
+
+    VLOG2Q_NEON_PREAMBLE()
+    // lms
+    //p0 = vdupq_n_f32(-1.649132280361871);
+    //p1 = vdupq_n_f32(1.995047138579499);
+    //p2 = vdupq_n_f32(-0.336914839219728);
+
+    // keep in mind a single precision float is represented as
+    //   (-1)^sign * 2^exp * 1.significand, so the log2 is
+    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
+    for(number = 0; number < quarterPoints; ++number){
+        // load float in to an int register without conversion
+        aval = vld1q_s32((int*)aPtr);
+
+        VLOG2Q_NEON_F32(log2_approx, aval)
+
+        vst1q_f32(bPtr, log2_approx);
+
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    for(number = quarterPoints * 4; number < num_points; number++){
+       *bPtr++ = log2(*aPtr++);
+    }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
 #endif /* INCLUDED_volk_32f_log2_32f_a_H */
 
 #ifndef INCLUDED_volk_32f_log2_32f_u_H
diff --git a/volk/kernels/volk/volk_32f_x2_pow_32f.h b/volk/kernels/volk/volk_32f_x2_pow_32f.h
index 431c4c7021..431c4c7021 100755..100644
--- a/volk/kernels/volk/volk_32f_x2_pow_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_pow_32f.h
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
index 5b485ec542..68749665da 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
@@ -122,6 +122,39 @@ static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qB
 }
 #endif /* LV_HAVE_SSE */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Deinterleaves the complex vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  unsigned int quarter_points = num_points / 4;
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  float32x4x2_t complexInput;
+
+  for(number = 0; number < quarter_points; number++){
+    complexInput = vld2q_f32(complexVectorPtr);
+    vst1q_f32( iBufferPtr, complexInput.val[0] );
+    vst1q_f32( qBufferPtr, complexInput.val[1] );
+    complexVectorPtr += 8;
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  for(number = quarter_points*4; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Deinterleaves the complex vector into I & Q vector data
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
index 6fabeddcf3..f80265decd 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
@@ -110,6 +110,35 @@ static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const l
 }
 #endif /* LV_HAVE_SSE */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Deinterleaves the complex vector into Q vector data
+  \param complexVector The complex input vector
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  unsigned int quarter_points = num_points / 4;
+  const float* complexVectorPtr = (float*)complexVector;
+  float* qBufferPtr = qBuffer;
+  float32x4x2_t complexInput;
+
+  for(number = 0; number < quarter_points; number++){
+    complexInput = vld2q_f32(complexVectorPtr);
+    vst1q_f32( qBufferPtr, complexInput.val[1] );
+    complexVectorPtr += 8;
+    qBufferPtr += 4;
+  }
+
+  for(number = quarter_points*4; number < num_points; number++){
+    complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Deinterleaves the complex vector into Q vector data
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
index 9200206dc7..c0e8d8fb34 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
@@ -84,7 +84,33 @@ static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const
 }
 #endif /* LV_HAVE_GENERIC */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Deinterleaves the complex vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  unsigned int quarter_points = num_points / 4;
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float32x4x2_t complexInput;
 
+  for(number = 0; number < quarter_points; number++){
+    complexInput = vld2q_f32(complexVectorPtr);
+    vst1q_f32( iBufferPtr, complexInput.val[0] );
+    complexVectorPtr += 8;
+    iBufferPtr += 4;
+  }
 
+  for(number = quarter_points*4; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_NEON */
 
 #endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
index 945b4b5a2c..474b982887 100644
--- a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -252,6 +252,43 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons
 }
 #endif /* LV_HAVE_SSE */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    unsigned int number = num_points;
+    unsigned int quarter_points = num_points / 4;
+
+    float32x4x2_t a_val, scalar_val;
+    float32x4x2_t tmp_imag;
+
+    scalar_val = vld2q_f32((const float*)&scalar);
+    for(number = 0; number < quarter_points; ++number) {
+        a_val = vld2q_f32((float*)aPtr);
+        tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
+        tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
+
+        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
+        tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);
+
+        vst2q_f32((float*)cVector, tmp_imag);
+        aPtr += 4;
+        cVector += 4;
+    }
+
+    for(number = quarter_points*4; number < num_points; number++){
+      *cPtr++ = *aPtr++ * scalar;
+    }
+}
+#endif /* LV_HAVE_NEON */
 
 #ifdef LV_HAVE_GENERIC
   /*!
@@ -285,8 +322,4 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c
 }
 #endif /* LV_HAVE_GENERIC */
 
-
-
-
-
 #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
index 750f508b7e..8964434bef 100644
--- a/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
@@ -164,10 +164,57 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result
 
 #endif /*LV_HAVE_SSE3*/
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+    unsigned int quarter_points = num_points / 4;
+    unsigned int number;
+
+    lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+    lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+    // for 2-lane vectors, 1st lane holds the real part,
+    // 2nd lane holds the imaginary part
+    float32x4x2_t a_val, b_val, accumulator;
+    float32x4x2_t tmp_imag;
+    accumulator.val[0] = vdupq_n_f32(0);
+    accumulator.val[1] = vdupq_n_f32(0);
+
+    for(number = 0; number < quarter_points; ++number) {
+        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+        __builtin_prefetch(a_ptr+8);
+        __builtin_prefetch(b_ptr+8);
+
+        // do the first multiply
+        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+        // use multiply accumulate/subtract to get result
+        tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+        tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+        accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
+        accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
+
+        // increment pointers
+        a_ptr += 4;
+        b_ptr += 4;
+    }
+    lv_32fc_t accum_result[4];
+    vst2q_f32((float*)accum_result, accumulator);
+    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
 
-#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
+    // tail case
+    for(number = quarter_points*4; number < num_points; ++number) {
+      *result += (*a_ptr++) * lv_conj(*b_ptr++);
+    }
+    *result = lv_conj(*result);
 
+}
+#endif /*LV_HAVE_NEON*/
 
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
 
 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index d0a09f989f..c65d0984c5 100644
--- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -896,7 +896,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
 #endif /*LV_HAVE_NEON*/
 
 #ifdef LV_HAVE_NEON
-
+#include <arm_neon.h>
 static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
 
     unsigned int quarter_points = num_points / 4;
diff --git a/volk/kernels/volk/volk_32i_x2_and_32i.h b/volk/kernels/volk/volk_32i_x2_and_32i.h
index b33a60e951..c138540e69 100644
--- a/volk/kernels/volk/volk_32i_x2_and_32i.h
+++ b/volk/kernels/volk/volk_32i_x2_and_32i.h
@@ -65,6 +65,40 @@ static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aV
 }
 #endif /* LV_HAVE_SSE */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_and_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr=  bVector;
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+
+    int32x4_t a_val, b_val, c_val;
+
+    for(number = 0; number < quarter_points; number++){
+        a_val = vld1q_s32(aPtr);
+        b_val = vld1q_s32(bPtr);
+        c_val = vandq_s32(a_val, b_val);
+        vst1q_s32(cPtr, c_val);
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
+
+    for(number = quarter_points * 4; number < num_points; number++){
+      *cPtr++ = (*aPtr++) & (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_NEON */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Ands the two input vectors and store their results in the third vector
diff --git a/volk/kernels/volk/volk_32i_x2_or_32i.h b/volk/kernels/volk/volk_32i_x2_or_32i.h
index a8556a3e74..544a71c67c 100644
--- a/volk/kernels/volk/volk_32i_x2_or_32i.h
+++ b/volk/kernels/volk/volk_32i_x2_or_32i.h
@@ -65,6 +65,40 @@ static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVe
 }
 #endif /* LV_HAVE_SSE */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr=  bVector;
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
+
+    int32x4_t a_val, b_val, c_val;
+
+    for(number = 0; number < quarter_points; number++){
+        a_val = vld1q_s32(aPtr);
+        b_val = vld1q_s32(bPtr);
+        c_val = vorrq_s32(a_val, b_val);
+        vst1q_s32(cPtr, c_val);
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
+    }
+
+    for(number = quarter_points * 4; number < num_points; number++){
+      *cPtr++ = (*aPtr++) | (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_NEON */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Ors the two input vectors and store their results in the third vector
diff --git a/volk/kernels/volk/volk_32u_byteswap.h b/volk/kernels/volk/volk_32u_byteswap.h
index 74d9a0bc3a..0194efc12c 100644
--- a/volk/kernels/volk/volk_32u_byteswap.h
+++ b/volk/kernels/volk/volk_32u_byteswap.h
@@ -73,6 +73,59 @@ static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int n
 }
 #endif /* LV_HAVE_SSE2 */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int32_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = intsToSwap;
+  unsigned int number = 0;
+  unsigned int n8points = num_points / 8;
+
+  uint8x8x4_t input_table;
+  uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+  uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+  /* these magic numbers are used as byte-indeces in the LUT.
+     they are pre-computed to save time. A simple C program
+     can calculate them; for example for lookup01:
+    uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+    for(ii=0; ii < 8; ++ii) {
+        index += ((uint64_t)(*(chars+ii))) << (ii*8);
+    }
+  */
+  int_lookup01 = vcreate_u8(74609667900706840);
+  int_lookup23 = vcreate_u8(219290013576860186);
+  int_lookup45 = vcreate_u8(363970359253013532);
+  int_lookup67 = vcreate_u8(508650704929166878);
+  
+  for(number = 0; number < n8points; ++number){
+    input_table = vld4_u8((uint8_t*) inputPtr);
+    swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+    swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+    swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+    swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+    vst1_u8((uint8_t*) inputPtr, swapped_int01);
+    vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
+    vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
+    vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
+
+    inputPtr += 8;
+  }
+
+  for(number = n8points * 8; number < num_points; ++number){
+    uint32_t output = *inputPtr;
+    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Byteswaps (in-place) an aligned vector of int32_t's.
@@ -94,8 +147,6 @@ static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int
 #endif /* LV_HAVE_GENERIC */
 
 
-
-
 #endif /* INCLUDED_volk_32u_byteswap_u_H */
 #ifndef INCLUDED_volk_32u_byteswap_a_H
 #define INCLUDED_volk_32u_byteswap_a_H
diff --git a/volk/kernels/volk/volk_32u_byteswappuppet_32u.h b/volk/kernels/volk/volk_32u_byteswappuppet_32u.h
new file mode 100644
index 0000000000..bf7055e241
--- /dev/null
+++ b/volk/kernels/volk/volk_32u_byteswappuppet_32u.h
@@ -0,0 +1,45 @@
+#ifndef INCLUDED_volk_32u_byteswappuppet_32u_H
+#define INCLUDED_volk_32u_byteswappuppet_32u_H
+
+
+#include <stdint.h>
+#include <volk/volk_32u_byteswap.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+
+    volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+
+    volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t *output, uint32_t* intsToSwap, unsigned int num_points){
+
+    volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+
+    volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+
+#endif
diff --git a/volk/kernels/volk/volk_32u_popcntpuppet_32u.h b/volk/kernels/volk/volk_32u_popcntpuppet_32u.h
new file mode 100644
index 0000000000..056983e817
--- /dev/null
+++ b/volk/kernels/volk/volk_32u_popcntpuppet_32u.h
@@ -0,0 +1,47 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_volk_32u_popcntpuppet_32u_H
+#define INCLUDED_volk_32u_popcntpuppet_32u_H
+
+#include <stdint.h>
+#include <volk/volk_32u_popcnt.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
+    unsigned int ii;
+    for(ii=0; ii < num_points; ++ii) {
+        volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) );
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_SSE_4_2
+static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
+    unsigned int ii;
+    for(ii=0; ii < num_points; ++ii) {
+        volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) );
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
diff --git a/volk/kernels/volk/volk_64u_byteswap.h b/volk/kernels/volk/volk_64u_byteswap.h
index df71f0ee5e..dce883278d 100644
--- a/volk/kernels/volk/volk_64u_byteswap.h
+++ b/volk/kernels/volk/volk_64u_byteswap.h
@@ -104,7 +104,62 @@ static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int
 }
 #endif /* LV_HAVE_GENERIC */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Byteswaps (in-place) a vector of int64_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = (uint32_t*)intsToSwap;
+  unsigned int number = 0;
+  unsigned int n8points = num_points / 4;
+
+  uint8x8x4_t input_table;
+  uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+  uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+  /* these magic numbers are used as byte-indeces in the LUT.
+     they are pre-computed to save time. A simple C program
+     can calculate them; for example for lookup01:
+    uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+    for(ii=0; ii < 8; ++ii) {
+        index += ((uint64_t)(*(chars+ii))) << (ii*8);
+    }
+  */
+  int_lookup01 = vcreate_u8(2269495096316185);
+  int_lookup23 = vcreate_u8(146949840772469531);
+  int_lookup45 = vcreate_u8(291630186448622877);
+  int_lookup67 = vcreate_u8(436310532124776223);
+
+  for(number = 0; number < n8points; ++number){
+    input_table = vld4_u8((uint8_t*) inputPtr);
+    swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+    swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+    swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+    swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+    vst1_u8((uint8_t*) inputPtr, swapped_int01);
+    vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
+    vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
+    vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
+
+    inputPtr += 4;
+  }
+
+  for(number = n8points * 4; number < num_points; ++number){
+    uint32_t output1 = *inputPtr;
+    uint32_t output2 = inputPtr[1];
 
+    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+    *inputPtr++ = output2;
+    *inputPtr++ = output1;
+  }
+
+}
+#endif /* LV_HAVE_NEON */
 
 
 #endif /* INCLUDED_volk_64u_byteswap_u_H */
diff --git a/volk/kernels/volk/volk_64u_byteswappuppet_64u.h b/volk/kernels/volk/volk_64u_byteswappuppet_64u.h
new file mode 100644
index 0000000000..ac5b16e212
--- /dev/null
+++ b/volk/kernels/volk/volk_64u_byteswappuppet_64u.h
@@ -0,0 +1,45 @@
+#ifndef INCLUDED_volk_64u_byteswappuppet_64u_H
+#define INCLUDED_volk_64u_byteswappuppet_64u_H
+
+
+#include <stdint.h>
+#include <volk/volk_64u_byteswap.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+
+    volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+
+    volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+    volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+    volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points);
+    memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+
+#endif
diff --git a/volk/kernels/volk/volk_64u_popcnt.h b/volk/kernels/volk/volk_64u_popcnt.h
index d425cd5206..0ec72e3404 100644
--- a/volk/kernels/volk/volk_64u_popcnt.h
+++ b/volk/kernels/volk/volk_64u_popcnt.h
@@ -71,4 +71,24 @@ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value)
 
 #endif /*LV_HAVE_SSE4_2*/
 
+#if LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value) {
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+    uint64x1_t count64x1_val;
+
+    input_val = vld1_u8((unsigned char *) &value);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+    count64x1_val = vpaddl_u32(count32x2_val);
+    vst1_u64(ret, count64x1_val);
+
+    //*ret = _mm_popcnt_u64(value);
+
+}
+#endif /*LV_HAVE_NEON*/
+
 #endif /*INCLUDED_volk_64u_popcnt_a_H*/
diff --git a/volk/kernels/volk/volk_64u_popcntpuppet_64u.h b/volk/kernels/volk/volk_64u_popcntpuppet_64u.h
new file mode 100644
index 0000000000..3903e0d561
--- /dev/null
+++ b/volk/kernels/volk/volk_64u_popcntpuppet_64u.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_volk_64u_popcntpuppet_64u_H
+#define INCLUDED_volk_64u_popcntpuppet_64u_H
+
+#include <stdint.h>
+#include <volk/volk_64u_popcnt.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+    unsigned int ii;
+    for(ii=0; ii < num_points; ++ii) {
+        volk_64u_popcnt_generic(outVector+ii, num_points );
+        
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEON
+static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+    unsigned int ii;
+    for(ii=0; ii < num_points; ++ii) {
+        volk_64u_popcnt_neon(outVector+ii, num_points );
+    }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index 3ab4a9970c..be20ed6585 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -247,7 +247,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
             }
         }
         // the primary test is the percent different greater than given tol
-        else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) {
+        else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) {
             fail=true;
             if(print_max_errs-- > 0) {
                 std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl;
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 7d1826b719..7807ce40a6 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -44,7 +44,7 @@ VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20462, 1);
-VOLK_RUN_TESTS(volk_32f_log2_32f, 1e-3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_log2_32f, 1.5e-1, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_expfast_32f, 1e-1, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_x2_pow_32f, 1e-2, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_sin_32f, 1e-6, 0, 20462, 1);
@@ -97,12 +97,12 @@ VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20462, 1);
 VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20462, 1);
-//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_32u_popcntpuppet_32u, 0, 0, 2046, 10000);
 VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20462, 1);
-//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_64u_popcntpuppet_64u, 0, 0, 2046, 10000);
 VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20462, 1);
 VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20462, 1);
author	Johnathan Corgan <johnathan@corganlabs.com>	2014-10-31 12:12:09 -0700
committer	Johnathan Corgan <johnathan@corganlabs.com>	2014-10-31 12:12:09 -0700
commit	4869607eb840318595edda85ca20b872579b27e3 (patch)
tree	d1bf8c14e957fcddb11df1b550ef9d211d1affe1 /volk
parent	b3bbe5659a20a8377e9a6278d0c77f2649399dc9 (diff)
parent	0c92479f10274e9e5f4e1506dafe4d515576ee8a (diff)