summaryrefslogtreecommitdiff
path: root/volk
diff options
context:
space:
mode:
authorJohnathan Corgan <johnathan@corganlabs.com>2014-10-31 12:12:09 -0700
committerJohnathan Corgan <johnathan@corganlabs.com>2014-10-31 12:12:09 -0700
commit4869607eb840318595edda85ca20b872579b27e3 (patch)
treed1bf8c14e957fcddb11df1b550ef9d211d1affe1 /volk
parentb3bbe5659a20a8377e9a6278d0c77f2649399dc9 (diff)
parent0c92479f10274e9e5f4e1506dafe4d515576ee8a (diff)
Merge commit '0c92479f'
This is one commit shy of the nwest/neon tip
Diffstat (limited to 'volk')
-rw-r--r--volk/apps/volk_profile.cc14
-rw-r--r--volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h48
-rw-r--r--volk/kernels/volk/volk_16i_convert_8i.h43
-rw-r--r--volk/kernels/volk/volk_16i_s32f_convert_32f.h46
-rw-r--r--volk/kernels/volk/volk_16u_byteswap.h52
-rw-r--r--volk/kernels/volk/volk_16u_byteswappuppet_16u.h55
-rw-r--r--volk/kernels/volk/volk_32f_binary_slicer_8i.h80
-rw-r--r--volk/kernels/volk/volk_32f_log2_32f.h102
-rw-r--r--[-rwxr-xr-x]volk/kernels/volk/volk_32f_x2_pow_32f.h0
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h33
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h29
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_real_32f.h26
-rw-r--r--volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h41
-rw-r--r--volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h49
-rw-r--r--volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h2
-rw-r--r--volk/kernels/volk/volk_32i_x2_and_32i.h34
-rw-r--r--volk/kernels/volk/volk_32i_x2_or_32i.h34
-rw-r--r--volk/kernels/volk/volk_32u_byteswap.h55
-rw-r--r--volk/kernels/volk/volk_32u_byteswappuppet_32u.h45
-rw-r--r--volk/kernels/volk/volk_32u_popcntpuppet_32u.h47
-rw-r--r--volk/kernels/volk/volk_64u_byteswap.h55
-rw-r--r--volk/kernels/volk/volk_64u_byteswappuppet_64u.h45
-rw-r--r--volk/kernels/volk/volk_64u_popcnt.h20
-rw-r--r--volk/kernels/volk/volk_64u_popcntpuppet_64u.h48
-rw-r--r--volk/lib/qa_utils.cc2
-rw-r--r--volk/lib/testqa.cc6
26 files changed, 987 insertions, 24 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 5030836d43..9bc1842c63 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -157,12 +157,12 @@ int main(int argc, char *argv[]) {
//VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
- VOLK_PROFILE(volk_16u_byteswap, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+ VOLK_PUPPET_PROFILE(volk_16u_byteswappuppet_16u, volk_16u_byteswap, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
- VOLK_PROFILE(volk_32f_log2_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_32f_log2_32f, 1.5e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_expfast_32f, 1e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
@@ -175,7 +175,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
- VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204602, 100, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 204602, 10000, &results, benchmark_mode, kernel_regex);
@@ -219,13 +219,13 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32i_x2_and_32i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32i_s32f_convert_32f, 1e-4, 100, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32i_x2_or_32i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
- VOLK_PROFILE(volk_32u_byteswap, 0, 0, 204602, 2000, &results, benchmark_mode, kernel_regex);
- //VOLK_PROFILE(volk_32u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+ VOLK_PUPPET_PROFILE(volk_32u_byteswappuppet_32u, volk_32u_byteswap, 0, 0, 204602, 2000, &results, benchmark_mode, kernel_regex);
+ VOLK_PUPPET_PROFILE(volk_32u_popcntpuppet_32u, volk32u_popcnt_32u, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
- VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
- //VOLK_PROFILE(volk_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+ VOLK_PUPPET_PROFILE(volk_64u_byteswappuppet_64u, volk_64u_byteswap, 0, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PUPPET_PROFILE(volk_64u_popcntpuppet_64u, volk_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204602, 3000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204602, 3000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_8ic_deinterleave_real_16i, 0, 256, 204602, 3000, &results, benchmark_mode, kernel_regex);
diff --git a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
index 27f0bf6df7..2656d766b8 100644
--- a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
@@ -29,7 +29,6 @@
#ifdef LV_HAVE_GENERIC
-
static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
static const int N_UNROLL = 4;
@@ -58,7 +57,54 @@ static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const
#endif /*LV_HAVE_GENERIC*/
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+
+ unsigned ii;
+ unsigned quarter_points = num_points / 4;
+ lv_32fc_t* tapsPtr = (lv_32fc_t*) taps;
+ short* inputPtr = (short*) input;
+ lv_32fc_t accumulator_vec[4];
+
+ float32x4x2_t tapsVal, accumulator_val;
+ int16x4_t input16;
+ int32x4_t input32;
+ float32x4_t input_float, prod_re, prod_im;
+
+ accumulator_val.val[0] = vdupq_n_f32(0.0);
+ accumulator_val.val[1] = vdupq_n_f32(0.0);
+
+ for(ii = 0; ii < quarter_points; ++ii) {
+ tapsVal = vld2q_f32((float*)tapsPtr);
+ input16 = vld1_s16(inputPtr);
+ // widen 16-bit int to 32-bit int
+ input32 = vmovl_s16(input16);
+ // convert 32-bit int to float with scale
+ input_float = vcvtq_f32_s32(input32);
+
+ prod_re = vmulq_f32(input_float, tapsVal.val[0]);
+ prod_im = vmulq_f32(input_float, tapsVal.val[1]);
+
+ accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
+ accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
+
+ tapsPtr += 4;
+ inputPtr += 4;
+ }
+ vst2q_f32((float*)accumulator_vec, accumulator_val);
+ accumulator_vec[0] += accumulator_vec[1];
+ accumulator_vec[2] += accumulator_vec[3];
+ accumulator_vec[0] += accumulator_vec[2];
+
+ for(ii = quarter_points * 4; ii < num_points; ++ii) {
+ accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
+ }
+
+ *result = accumulator_vec[0];
+}
+#endif /*LV_HAVE_NEON*/
#if LV_HAVE_SSE && LV_HAVE_MMX
diff --git a/volk/kernels/volk/volk_16i_convert_8i.h b/volk/kernels/volk/volk_16i_convert_8i.h
index eb29949417..6f16fa45d3 100644
--- a/volk/kernels/volk/volk_16i_convert_8i.h
+++ b/volk/kernels/volk/volk_16i_convert_8i.h
@@ -138,6 +138,46 @@ static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_
}
#endif /* LV_HAVE_SSE2 */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ unsigned int sixteenth_points = num_points / 16;
+
+ int16x8_t inputVal0;
+ int16x8_t inputVal1;
+ int8x8_t outputVal0;
+ int8x8_t outputVal1;
+ int8x16_t outputVal;
+
+ for(number = 0; number < sixteenth_points; number++){
+ // load two input vectors
+ inputVal0 = vld1q_s16(inputVectorPtr);
+ inputVal1 = vld1q_s16(inputVectorPtr+8);
+ // shift right
+ outputVal0 = vshrn_n_s16(inputVal0, 8);
+ outputVal1 = vshrn_n_s16(inputVal1, 8);
+ // squash two vectors and write output
+ outputVal = vcombine_s8(outputVal0, outputVal1);
+ vst1q_s8(outputVectorPtr, outputVal);
+ inputVectorPtr += 16;
+ outputVectorPtr += 16;
+ }
+
+ for(number = sixteenth_points * 16; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Converts the input 16 bit integer data into 8 bit integer data
@@ -156,7 +196,4 @@ static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int
}
#endif /* LV_HAVE_GENERIC */
-
-
-
#endif /* INCLUDED_volk_16i_convert_8i_a_H */
diff --git a/volk/kernels/volk/volk_16i_s32f_convert_32f.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
index 24134c80cc..6ea28f0e90 100644
--- a/volk/kernels/volk/volk_16i_s32f_convert_32f.h
+++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -195,7 +195,53 @@ static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_NEON
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputPtr = outputVector;
+ const int16_t* inputPtr = inputVector;
+ unsigned int number = 0;
+ unsigned int eighth_points = num_points / 8;
+
+ int16x4x2_t input16;
+ int32x4_t input32_0, input32_1;
+ float32x4_t input_float_0, input_float_1;
+ float32x4x2_t output_float;
+ float32x4_t inv_scale;
+
+ inv_scale = vdupq_n_f32(1.0/scalar);
+
+ // the generic disassembles to a 128-bit load
+ // and duplicates every instruction to operate on 64-bits
+ // at a time. This is only possible with lanes, which is faster
+ // than just doing a vld1_s16, but still slower.
+ for(number = 0; number < eighth_points; number++){
+ input16 = vld2_s16(inputPtr);
+ // widen 16-bit int to 32-bit int
+ input32_0 = vmovl_s16(input16.val[0]);
+ input32_1 = vmovl_s16(input16.val[1]);
+ // convert 32-bit int to float with scale
+ input_float_0 = vcvtq_f32_s32(input32_0);
+ input_float_1 = vcvtq_f32_s32(input32_1);
+ output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
+ output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
+ vst2q_f32(outputPtr, output_float);
+ inputPtr += 8;
+ outputPtr += 8;
+ }
+ for(number = eighth_points*8; number < num_points; number++){
+ *outputPtr++ = ((float)(*inputPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
diff --git a/volk/kernels/volk/volk_16u_byteswap.h b/volk/kernels/volk/volk_16u_byteswap.h
index bffdeed185..3b2f9e2c91 100644
--- a/volk/kernels/volk/volk_16u_byteswap.h
+++ b/volk/kernels/volk/volk_16u_byteswap.h
@@ -158,6 +158,58 @@ static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Byteswaps (in-place) an aligned vector of int32_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){
+ uint16_t* inputPtr = intsToSwap;
+ unsigned int number = 0;
+ unsigned int n16points = num_points / 16;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* these magic numbers are used as byte-indeces in the LUT.
+ they are pre-computed to save time. A simple C program
+ can calculate them; for example for lookup01:
+ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+ for(ii=0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars+ii))) << (ii*8);
+ }
+ */
+ int_lookup01 = vcreate_u8(1232017111498883080);
+ int_lookup23 = vcreate_u8(1376697457175036426);
+ int_lookup45 = vcreate_u8(1521377802851189772);
+ int_lookup67 = vcreate_u8(1666058148527343118);
+
+ for(number = 0; number < n16points; ++number){
+ input_table = vld4_u8((uint8_t*) inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*)inputPtr, swapped_int01);
+ vst1_u8((uint8_t*)(inputPtr+4), swapped_int23);
+ vst1_u8((uint8_t*)(inputPtr+8), swapped_int45);
+ vst1_u8((uint8_t*)(inputPtr+12), swapped_int67);
+
+ inputPtr += 16;
+ }
+
+ for(number = n16points * 16; number < num_points; ++number){
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Byteswaps (in-place) an aligned vector of int16_t's.
diff --git a/volk/kernels/volk/volk_16u_byteswappuppet_16u.h b/volk/kernels/volk/volk_16u_byteswappuppet_16u.h
new file mode 100644
index 0000000000..74745d328c
--- /dev/null
+++ b/volk/kernels/volk/volk_16u_byteswappuppet_16u.h
@@ -0,0 +1,55 @@
+#ifndef INCLUDED_volk_16u_byteswappuppet_16u_H
+#define INCLUDED_volk_16u_byteswappuppet_16u_H
+
+
+#include <stdint.h>
+#include <volk/volk_16u_byteswap.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
+
+ volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+
+}
+#endif
+
+
+#endif
diff --git a/volk/kernels/volk/volk_32f_binary_slicer_8i.h b/volk/kernels/volk/volk_32f_binary_slicer_8i.h
index 88a25b7a05..ae4420b6e1 100644
--- a/volk/kernels/volk/volk_32f_binary_slicer_8i.h
+++ b/volk/kernels/volk/volk_32f_binary_slicer_8i.h
@@ -206,4 +206,84 @@ volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
#endif /* LV_HAVE_SSE2 */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Returns integer 1 if float input is greater than or equal to 0, 1 otherwise
+ \param cVector The char (int8_t) output (either 0 or 1)
+ \param aVector The float input
+ \param num_points The number of values in aVector and stored into cVector
+*/
+static inline void
+volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n16points = num_points / 16;
+
+ float32x4x2_t input_val0, input_val1;
+ float32x4_t zero_val;
+ uint32x4x2_t res0_u32, res1_u32;
+ uint16x4x2_t res0_u16x4, res1_u16x4;
+ uint16x8x2_t res_u16x8;
+ uint8x8x2_t res_u8;
+ uint8x8_t one;
+
+ zero_val = vdupq_n_f32(0.0);
+ one = vdup_n_u8(0x01);
+
+ // TODO: this is a good candidate for asm because the vcombines
+ // can be eliminated simply by picking dst registers that are
+ // adjacent.
+ for(number = 0; number < n16points; number++) {
+ input_val0 = vld2q_f32(aPtr);
+ input_val1 = vld2q_f32(aPtr+8);
+
+ // test against 0; return uint32
+ res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
+ res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
+ res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
+ res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
+
+ // narrow uint32 -> uint16 followed by combine to 8-element vectors
+ res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
+ res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
+ res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
+ res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
+
+ res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
+ res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
+
+ // narrow uint16x8 -> uint8x8
+ res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
+ res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
+ // we *could* load twice as much data and do another vcombine here
+ // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
+ // but that turns out to be ~16% slower than this version on zc702
+ // it's possible register contention in GCC scheduler slows it down
+ // and a hand-written asm with quad-word u8 registers is much faster.
+
+ res_u8.val[0] = vand_u8(one, res_u8.val[0]);
+ res_u8.val[1] = vand_u8(one, res_u8.val[1]);
+
+ vst2_u8((unsigned char*)cPtr, res_u8);
+ cPtr += 16;
+ aPtr += 16;
+
+ }
+
+ for(number = n16points * 16; number < num_points; number++) {
+ if(*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
diff --git a/volk/kernels/volk/volk_32f_log2_32f.h b/volk/kernels/volk/volk_32f_log2_32f.h
index 52c1b60549..892eeb1685 100644
--- a/volk/kernels/volk/volk_32f_log2_32f.h
+++ b/volk/kernels/volk/volk_32f_log2_32f.h
@@ -61,7 +61,7 @@
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
-#define LOG_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 6
#ifndef INCLUDED_volk_32f_log2_32f_a_H
@@ -145,6 +145,106 @@ static inline void volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVect
#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+/* these macros allow us to embed logs in other kernels */
+#define VLOG2Q_NEON_PREAMBLE() \
+ int32x4_t one = vdupq_n_s32(0x000800000); \
+ /* minimax polynomial */ \
+ float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
+ float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
+ float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
+ float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
+ float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
+ float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
+ float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
+ int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
+ int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
+ int32x4_t exp_bias = vdupq_n_s32(127);
+
+
+#define VLOG2Q_NEON_F32(log2_approx, aval) \
+ int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
+ int32x4_t significand_i = vandq_s32(aval, sig_mask); \
+ exponent_i = vshrq_n_s32(exponent_i, 23); \
+ \
+ /* extract the exponent and significand \
+ we can treat this as fixed point to save ~9% on the \
+ conversion + float add */ \
+ significand_i = vorrq_s32(one, significand_i); \
+ float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
+ /* debias the exponent and convert to float */ \
+ exponent_i = vsubq_s32(exponent_i, exp_bias); \
+ float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
+ \
+ /* put the significand through a polynomial fit of log2(x) [1,2]\
+ add the result to the exponent */ \
+ log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
+ float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
+ tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ \
+ float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
+ tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
+ tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
+ tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
+ tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1);
+
+/*!
+ \brief Computes base 2 log of input vector and stores results in output vector
+ \param bVector The vector where results will be stored
+ \param aVector The input vector of floats
+ \param num_points Number of points for which log is to be computed
+*/
+static inline void volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points){
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
+
+ int32x4_t aval;
+ float32x4_t log2_approx;
+
+ VLOG2Q_NEON_PREAMBLE()
+ // lms
+ //p0 = vdupq_n_f32(-1.649132280361871);
+ //p1 = vdupq_n_f32(1.995047138579499);
+ //p2 = vdupq_n_f32(-0.336914839219728);
+
+ // keep in mind a single precision float is represented as
+ // (-1)^sign * 2^exp * 1.significand, so the log2 is
+ // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
+ for(number = 0; number < quarterPoints; ++number){
+ // load float in to an int register without conversion
+ aval = vld1q_s32((int*)aPtr);
+
+ VLOG2Q_NEON_F32(log2_approx, aval)
+
+ vst1q_f32(bPtr, log2_approx);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ for(number = quarterPoints * 4; number < num_points; number++){
+ *bPtr++ = log2(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
#endif /* INCLUDED_volk_32f_log2_32f_a_H */
#ifndef INCLUDED_volk_32f_log2_32f_u_H
diff --git a/volk/kernels/volk/volk_32f_x2_pow_32f.h b/volk/kernels/volk/volk_32f_x2_pow_32f.h
index 431c4c7021..431c4c7021 100755..100644
--- a/volk/kernels/volk/volk_32f_x2_pow_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_pow_32f.h
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
index 5b485ec542..68749665da 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
@@ -122,6 +122,39 @@ static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qB
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Deinterleaves the complex vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ float32x4x2_t complexInput;
+
+ for(number = 0; number < quarter_points; number++){
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32( iBufferPtr, complexInput.val[0] );
+ vst1q_f32( qBufferPtr, complexInput.val[1] );
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Deinterleaves the complex vector into I & Q vector data
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
index 6fabeddcf3..f80265decd 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
@@ -110,6 +110,35 @@ static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const l
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Deinterleaves the complex vector into Q vector data
+ \param complexVector The complex input vector
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* qBufferPtr = qBuffer;
+ float32x4x2_t complexInput;
+
+ for(number = 0; number < quarter_points; number++){
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32( qBufferPtr, complexInput.val[1] );
+ complexVectorPtr += 8;
+ qBufferPtr += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Deinterleaves the complex vector into Q vector data
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
index 9200206dc7..c0e8d8fb34 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
@@ -84,7 +84,33 @@ static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Deinterleaves the complex vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float32x4x2_t complexInput;
+ for(number = 0; number < quarter_points; number++){
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32( iBufferPtr, complexInput.val[0] );
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ }
+ for(number = quarter_points*4; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */
diff --git a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
index 945b4b5a2c..474b982887 100644
--- a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -252,6 +252,43 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4x2_t a_val, scalar_val;
+ float32x4x2_t tmp_imag;
+
+ scalar_val = vld2q_f32((const float*)&scalar);
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)aPtr);
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
+
+ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
+ tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);
+
+ vst2q_f32((float*)cVector, tmp_imag);
+ aPtr += 4;
+ cVector += 4;
+ }
+
+ for(number = quarter_points*4; number < num_points; number++){
+ *cPtr++ = *aPtr++ * scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
/*!
@@ -285,8 +322,4 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c
}
#endif /* LV_HAVE_GENERIC */
-
-
-
-
#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
diff --git a/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
index 750f508b7e..8964434bef 100644
--- a/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
@@ -164,10 +164,57 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result
#endif /*LV_HAVE_SSE3*/
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, accumulator;
+ float32x4x2_t tmp_imag;
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __builtin_prefetch(a_ptr+8);
+ __builtin_prefetch(b_ptr+8);
+
+ // do the first multiply
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result
+ tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+ tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+ accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
+ accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
+
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
-#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
+ // tail case
+ for(number = quarter_points*4; number < num_points; ++number) {
+ *result += (*a_ptr++) * lv_conj(*b_ptr++);
+ }
+ *result = lv_conj(*result);
+}
+#endif /*LV_HAVE_NEON*/
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index d0a09f989f..c65d0984c5 100644
--- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -896,7 +896,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_NEON
-
+#include <arm_neon.h>
static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
unsigned int quarter_points = num_points / 4;
diff --git a/volk/kernels/volk/volk_32i_x2_and_32i.h b/volk/kernels/volk/volk_32i_x2_and_32i.h
index b33a60e951..c138540e69 100644
--- a/volk/kernels/volk/volk_32i_x2_and_32i.h
+++ b/volk/kernels/volk/volk_32i_x2_and_32i.h
@@ -65,6 +65,40 @@ static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aV
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Ands the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors
+ \param bVector One of the vectors
+ \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_and_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr= bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ int32x4_t a_val, b_val, c_val;
+
+ for(number = 0; number < quarter_points; number++){
+ a_val = vld1q_s32(aPtr);
+ b_val = vld1q_s32(bPtr);
+ c_val = vandq_s32(a_val, b_val);
+ vst1q_s32(cPtr, c_val);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for(number = quarter_points * 4; number < num_points; number++){
+ *cPtr++ = (*aPtr++) & (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Ands the two input vectors and store their results in the third vector
diff --git a/volk/kernels/volk/volk_32i_x2_or_32i.h b/volk/kernels/volk/volk_32i_x2_or_32i.h
index a8556a3e74..544a71c67c 100644
--- a/volk/kernels/volk/volk_32i_x2_or_32i.h
+++ b/volk/kernels/volk/volk_32i_x2_or_32i.h
@@ -65,6 +65,40 @@ static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVe
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Ands the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors
+ \param bVector One of the vectors
+ \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr= bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ int32x4_t a_val, b_val, c_val;
+
+ for(number = 0; number < quarter_points; number++){
+ a_val = vld1q_s32(aPtr);
+ b_val = vld1q_s32(bPtr);
+ c_val = vorrq_s32(a_val, b_val);
+ vst1q_s32(cPtr, c_val);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for(number = quarter_points * 4; number < num_points; number++){
+ *cPtr++ = (*aPtr++) | (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Ors the two input vectors and store their results in the third vector
diff --git a/volk/kernels/volk/volk_32u_byteswap.h b/volk/kernels/volk/volk_32u_byteswap.h
index 74d9a0bc3a..0194efc12c 100644
--- a/volk/kernels/volk/volk_32u_byteswap.h
+++ b/volk/kernels/volk/volk_32u_byteswap.h
@@ -73,6 +73,59 @@ static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int n
}
#endif /* LV_HAVE_SSE2 */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Byteswaps (in-place) an aligned vector of int32_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param numDataPoints The number of data points
+*/
+static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = intsToSwap;
+ unsigned int number = 0;
+ unsigned int n8points = num_points / 8;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* these magic numbers are used as byte-indeces in the LUT.
+ they are pre-computed to save time. A simple C program
+ can calculate them; for example for lookup01:
+ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+ for(ii=0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars+ii))) << (ii*8);
+ }
+ */
+ int_lookup01 = vcreate_u8(74609667900706840);
+ int_lookup23 = vcreate_u8(219290013576860186);
+ int_lookup45 = vcreate_u8(363970359253013532);
+ int_lookup67 = vcreate_u8(508650704929166878);
+
+ for(number = 0; number < n8points; ++number){
+ input_table = vld4_u8((uint8_t*) inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*) inputPtr, swapped_int01);
+ vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
+ vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
+ vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
+
+ inputPtr += 8;
+ }
+
+ for(number = n8points * 8; number < num_points; ++number){
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Byteswaps (in-place) an aligned vector of int32_t's.
@@ -94,8 +147,6 @@ static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32u_byteswap_u_H */
#ifndef INCLUDED_volk_32u_byteswap_a_H
#define INCLUDED_volk_32u_byteswap_a_H
diff --git a/volk/kernels/volk/volk_32u_byteswappuppet_32u.h b/volk/kernels/volk/volk_32u_byteswappuppet_32u.h
new file mode 100644
index 0000000000..bf7055e241
--- /dev/null
+++ b/volk/kernels/volk/volk_32u_byteswappuppet_32u.h
@@ -0,0 +1,45 @@
+#ifndef INCLUDED_volk_32u_byteswappuppet_32u_H
+#define INCLUDED_volk_32u_byteswappuppet_32u_H
+
+
+#include <stdint.h>
+#include <volk/volk_32u_byteswap.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+
+ volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+
+ volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t *output, uint32_t* intsToSwap, unsigned int num_points){
+
+ volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
+
+ volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+
+#endif
diff --git a/volk/kernels/volk/volk_32u_popcntpuppet_32u.h b/volk/kernels/volk/volk_32u_popcntpuppet_32u.h
new file mode 100644
index 0000000000..056983e817
--- /dev/null
+++ b/volk/kernels/volk/volk_32u_popcntpuppet_32u.h
@@ -0,0 +1,47 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_volk_32u_popcntpuppet_32u_H
+#define INCLUDED_volk_32u_popcntpuppet_32u_H
+
+#include <stdint.h>
+#include <volk/volk_32u_popcnt.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
+ unsigned int ii;
+ for(ii=0; ii < num_points; ++ii) {
+ volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) );
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_SSE_4_2
+static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
+ unsigned int ii;
+ for(ii=0; ii < num_points; ++ii) {
+ volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) );
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
diff --git a/volk/kernels/volk/volk_64u_byteswap.h b/volk/kernels/volk/volk_64u_byteswap.h
index df71f0ee5e..dce883278d 100644
--- a/volk/kernels/volk/volk_64u_byteswap.h
+++ b/volk/kernels/volk/volk_64u_byteswap.h
@@ -104,7 +104,62 @@ static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Byteswaps (in-place) a vector of int64_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param numDataPoints The number of data points
+*/
+static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int number = 0;
+ unsigned int n8points = num_points / 4;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* these magic numbers are used as byte-indeces in the LUT.
+ they are pre-computed to save time. A simple C program
+ can calculate them; for example for lookup01:
+ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+ for(ii=0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars+ii))) << (ii*8);
+ }
+ */
+ int_lookup01 = vcreate_u8(2269495096316185);
+ int_lookup23 = vcreate_u8(146949840772469531);
+ int_lookup45 = vcreate_u8(291630186448622877);
+ int_lookup67 = vcreate_u8(436310532124776223);
+
+ for(number = 0; number < n8points; ++number){
+ input_table = vld4_u8((uint8_t*) inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*) inputPtr, swapped_int01);
+ vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
+ vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
+ vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
+
+ inputPtr += 4;
+ }
+
+ for(number = n8points * 4; number < num_points; ++number){
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+
+}
+#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_64u_byteswap_u_H */
diff --git a/volk/kernels/volk/volk_64u_byteswappuppet_64u.h b/volk/kernels/volk/volk_64u_byteswappuppet_64u.h
new file mode 100644
index 0000000000..ac5b16e212
--- /dev/null
+++ b/volk/kernels/volk/volk_64u_byteswappuppet_64u.h
@@ -0,0 +1,45 @@
+#ifndef INCLUDED_volk_64u_byteswappuppet_64u_H
+#define INCLUDED_volk_64u_byteswappuppet_64u_H
+
+
+#include <stdint.h>
+#include <volk/volk_64u_byteswap.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
+
+ volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+
+}
+#endif
+
+
+#endif
diff --git a/volk/kernels/volk/volk_64u_popcnt.h b/volk/kernels/volk/volk_64u_popcnt.h
index d425cd5206..0ec72e3404 100644
--- a/volk/kernels/volk/volk_64u_popcnt.h
+++ b/volk/kernels/volk/volk_64u_popcnt.h
@@ -71,4 +71,24 @@ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value)
#endif /*LV_HAVE_SSE4_2*/
+#if LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value) {
+ uint8x8_t input_val, count8x8_val;
+ uint16x4_t count16x4_val;
+ uint32x2_t count32x2_val;
+ uint64x1_t count64x1_val;
+
+ input_val = vld1_u8((unsigned char *) &value);
+ count8x8_val = vcnt_u8(input_val);
+ count16x4_val = vpaddl_u8(count8x8_val);
+ count32x2_val = vpaddl_u16(count16x4_val);
+ count64x1_val = vpaddl_u32(count32x2_val);
+ vst1_u64(ret, count64x1_val);
+
+ //*ret = _mm_popcnt_u64(value);
+
+}
+#endif /*LV_HAVE_NEON*/
+
#endif /*INCLUDED_volk_64u_popcnt_a_H*/
diff --git a/volk/kernels/volk/volk_64u_popcntpuppet_64u.h b/volk/kernels/volk/volk_64u_popcntpuppet_64u.h
new file mode 100644
index 0000000000..3903e0d561
--- /dev/null
+++ b/volk/kernels/volk/volk_64u_popcntpuppet_64u.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_volk_64u_popcntpuppet_64u_H
+#define INCLUDED_volk_64u_popcntpuppet_64u_H
+
+#include <stdint.h>
+#include <volk/volk_64u_popcnt.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+ unsigned int ii;
+ for(ii=0; ii < num_points; ++ii) {
+ volk_64u_popcnt_generic(outVector+ii, num_points );
+
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEON
+static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
+ unsigned int ii;
+ for(ii=0; ii < num_points; ++ii) {
+ volk_64u_popcnt_neon(outVector+ii, num_points );
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index 3ab4a9970c..be20ed6585 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -247,7 +247,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
}
}
// the primary test is the percent different greater than given tol
- else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) {
+ else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) {
fail=true;
if(print_max_errs-- > 0) {
std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl;
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 7d1826b719..7807ce40a6 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -44,7 +44,7 @@ VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20462, 1);
-VOLK_RUN_TESTS(volk_32f_log2_32f, 1e-3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_log2_32f, 1.5e-1, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_expfast_32f, 1e-1, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_x2_pow_32f, 1e-2, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_sin_32f, 1e-6, 0, 20462, 1);
@@ -97,12 +97,12 @@ VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20462, 1);
VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20462, 1);
-//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_32u_popcntpuppet_32u, 0, 0, 2046, 10000);
VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20462, 1);
-//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_64u_popcntpuppet_64u, 0, 0, 2046, 10000);
VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20462, 1);
VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20462, 1);
VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20462, 1);