diff options
author | Johnathan Corgan <johnathan@corganlabs.com> | 2014-10-31 12:12:09 -0700 |
---|---|---|
committer | Johnathan Corgan <johnathan@corganlabs.com> | 2014-10-31 12:12:09 -0700 |
commit | 4869607eb840318595edda85ca20b872579b27e3 (patch) | |
tree | d1bf8c14e957fcddb11df1b550ef9d211d1affe1 /volk | |
parent | b3bbe5659a20a8377e9a6278d0c77f2649399dc9 (diff) | |
parent | 0c92479f10274e9e5f4e1506dafe4d515576ee8a (diff) |
Merge commit '0c92479f'
This is one commit shy of the nwest/neon tip
Diffstat (limited to 'volk')
26 files changed, 987 insertions, 24 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 5030836d43..9bc1842c63 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -157,12 +157,12 @@ int main(int argc, char *argv[]) { //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_16u_byteswap, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); + VOLK_PUPPET_PROFILE(volk_16u_byteswappuppet_16u, volk_16u_byteswap, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_32f_log2_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32f_log2_32f, 1.5e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_expfast_32f, 1e-1, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); @@ -175,7 +175,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, benchmark_mode, kernel_regex); //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204602, 100, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 204602, 10000, &results, benchmark_mode, kernel_regex); @@ -219,13 +219,13 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32i_x2_and_32i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32i_s32f_convert_32f, 1e-4, 100, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32i_x2_or_32i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_32u_byteswap, 0, 0, 204602, 2000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_32u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); + VOLK_PUPPET_PROFILE(volk_32u_byteswappuppet_32u, volk_32u_byteswap, 0, 0, 204602, 2000, &results, benchmark_mode, kernel_regex); + VOLK_PUPPET_PROFILE(volk_32u_popcntpuppet_32u, volk32u_popcnt_32u, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); + VOLK_PUPPET_PROFILE(volk_64u_byteswappuppet_64u, volk_64u_byteswap, 0, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PUPPET_PROFILE(volk_64u_popcntpuppet_64u, volk_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204602, 3000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204602, 3000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_8ic_deinterleave_real_16i, 0, 256, 204602, 3000, &results, benchmark_mode, kernel_regex); diff --git a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h index 27f0bf6df7..2656d766b8 100644 --- a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h +++ b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h @@ -29,7 +29,6 @@ #ifdef LV_HAVE_GENERIC - static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { static const int N_UNROLL = 4; @@ -58,7 +57,54 @@ static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const #endif /*LV_HAVE_GENERIC*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { + + unsigned ii; + unsigned quarter_points = num_points / 4; + lv_32fc_t* tapsPtr = (lv_32fc_t*) taps; + short* inputPtr = (short*) input; + lv_32fc_t accumulator_vec[4]; + + float32x4x2_t tapsVal, accumulator_val; + int16x4_t input16; + int32x4_t input32; + float32x4_t input_float, prod_re, prod_im; + + accumulator_val.val[0] = vdupq_n_f32(0.0); + accumulator_val.val[1] = vdupq_n_f32(0.0); + + for(ii = 0; ii < quarter_points; ++ii) { + tapsVal = vld2q_f32((float*)tapsPtr); + input16 = vld1_s16(inputPtr); + // widen 16-bit int to 32-bit int + input32 = vmovl_s16(input16); + // convert 32-bit int to float with scale + input_float = vcvtq_f32_s32(input32); + + prod_re = vmulq_f32(input_float, tapsVal.val[0]); + prod_im = vmulq_f32(input_float, tapsVal.val[1]); + + accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]); + accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]); + + tapsPtr += 4; + inputPtr += 4; + } + vst2q_f32((float*)accumulator_vec, accumulator_val); + accumulator_vec[0] += accumulator_vec[1]; + accumulator_vec[2] += accumulator_vec[3]; + accumulator_vec[0] += accumulator_vec[2]; + + for(ii = quarter_points * 4; ii < num_points; ++ii) { + accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++)); + } + + *result = accumulator_vec[0]; +} +#endif /*LV_HAVE_NEON*/ #if LV_HAVE_SSE && LV_HAVE_MMX diff --git a/volk/kernels/volk/volk_16i_convert_8i.h b/volk/kernels/volk/volk_16i_convert_8i.h index eb29949417..6f16fa45d3 100644 --- a/volk/kernels/volk/volk_16i_convert_8i.h +++ b/volk/kernels/volk/volk_16i_convert_8i.h @@ -138,6 +138,46 @@ static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_ } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + + int16x8_t inputVal0; + int16x8_t inputVal1; + int8x8_t outputVal0; + int8x8_t outputVal1; + int8x16_t outputVal; + + for(number = 0; number < sixteenth_points; number++){ + // load two input vectors + inputVal0 = vld1q_s16(inputVectorPtr); + inputVal1 = vld1q_s16(inputVectorPtr+8); + // shift right + outputVal0 = vshrn_n_s16(inputVal0, 8); + outputVal1 = vshrn_n_s16(inputVal1, 8); + // squash two vectors and write output + outputVal = vcombine_s8(outputVal0, outputVal1); + vst1q_s8(outputVectorPtr, outputVal); + inputVectorPtr += 16; + outputVectorPtr += 16; + } + + for(number = sixteenth_points * 16; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Converts the input 16 bit integer data into 8 bit integer data @@ -156,7 +196,4 @@ static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int } #endif /* LV_HAVE_GENERIC */ - - - #endif /* INCLUDED_volk_16i_convert_8i_a_H */ diff --git a/volk/kernels/volk/volk_16i_s32f_convert_32f.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h index 24134c80cc..6ea28f0e90 100644 --- a/volk/kernels/volk/volk_16i_s32f_convert_32f.h +++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h @@ -195,7 +195,53 @@ static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputPtr = outputVector; + const int16_t* inputPtr = inputVector; + unsigned int number = 0; + unsigned int eighth_points = num_points / 8; + + int16x4x2_t input16; + int32x4_t input32_0, input32_1; + float32x4_t input_float_0, input_float_1; + float32x4x2_t output_float; + float32x4_t inv_scale; + + inv_scale = vdupq_n_f32(1.0/scalar); + + // the generic disassembles to a 128-bit load + // and duplicates every instruction to operate on 64-bits + // at a time. This is only possible with lanes, which is faster + // than just doing a vld1_s16, but still slower. + for(number = 0; number < eighth_points; number++){ + input16 = vld2_s16(inputPtr); + // widen 16-bit int to 32-bit int + input32_0 = vmovl_s16(input16.val[0]); + input32_1 = vmovl_s16(input16.val[1]); + // convert 32-bit int to float with scale + input_float_0 = vcvtq_f32_s32(input32_0); + input_float_1 = vcvtq_f32_s32(input32_1); + output_float.val[0] = vmulq_f32(input_float_0, inv_scale); + output_float.val[1] = vmulq_f32(input_float_1, inv_scale); + vst2q_f32(outputPtr, output_float); + inputPtr += 8; + outputPtr += 8; + } + for(number = eighth_points*8; number < num_points; number++){ + *outputPtr++ = ((float)(*inputPtr++)) / scalar; + } +} +#endif /* LV_HAVE_NEON */ #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */ diff --git a/volk/kernels/volk/volk_16u_byteswap.h b/volk/kernels/volk/volk_16u_byteswap.h index bffdeed185..3b2f9e2c91 100644 --- a/volk/kernels/volk/volk_16u_byteswap.h +++ b/volk/kernels/volk/volk_16u_byteswap.h @@ -158,6 +158,58 @@ static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){ + uint16_t* inputPtr = intsToSwap; + unsigned int number = 0; + unsigned int n16points = num_points / 16; + + uint8x8x4_t input_table; + uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; + uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; + + /* these magic numbers are used as byte-indeces in the LUT. + they are pre-computed to save time. A simple C program + can calculate them; for example for lookup01: + uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; + for(ii=0; ii < 8; ++ii) { + index += ((uint64_t)(*(chars+ii))) << (ii*8); + } + */ + int_lookup01 = vcreate_u8(1232017111498883080); + int_lookup23 = vcreate_u8(1376697457175036426); + int_lookup45 = vcreate_u8(1521377802851189772); + int_lookup67 = vcreate_u8(1666058148527343118); + + for(number = 0; number < n16points; ++number){ + input_table = vld4_u8((uint8_t*) inputPtr); + swapped_int01 = vtbl4_u8(input_table, int_lookup01); + swapped_int23 = vtbl4_u8(input_table, int_lookup23); + swapped_int45 = vtbl4_u8(input_table, int_lookup45); + swapped_int67 = vtbl4_u8(input_table, int_lookup67); + vst1_u8((uint8_t*)inputPtr, swapped_int01); + vst1_u8((uint8_t*)(inputPtr+4), swapped_int23); + vst1_u8((uint8_t*)(inputPtr+8), swapped_int45); + vst1_u8((uint8_t*)(inputPtr+12), swapped_int67); + + inputPtr += 16; + } + + for(number = n16points * 16; number < num_points; ++number){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Byteswaps (in-place) an aligned vector of int16_t's. diff --git a/volk/kernels/volk/volk_16u_byteswappuppet_16u.h b/volk/kernels/volk/volk_16u_byteswappuppet_16u.h new file mode 100644 index 0000000000..74745d328c --- /dev/null +++ b/volk/kernels/volk/volk_16u_byteswappuppet_16u.h @@ -0,0 +1,55 @@ +#ifndef INCLUDED_volk_16u_byteswappuppet_16u_H +#define INCLUDED_volk_16u_byteswappuppet_16u_H + + +#include <stdint.h> +#include <volk/volk_16u_byteswap.h> +#include <string.h> + +#ifdef LV_HAVE_GENERIC +static inline void volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ + + volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); + +} +#endif + +#ifdef LV_HAVE_NEON +static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ + + volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); + +} +#endif + +#ifdef LV_HAVE_NEON +static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ + + volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); + +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ + + volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); + +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ + + volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); + +} +#endif + + +#endif diff --git a/volk/kernels/volk/volk_32f_binary_slicer_8i.h b/volk/kernels/volk/volk_32f_binary_slicer_8i.h index 88a25b7a05..ae4420b6e1 100644 --- a/volk/kernels/volk/volk_32f_binary_slicer_8i.h +++ b/volk/kernels/volk/volk_32f_binary_slicer_8i.h @@ -206,4 +206,84 @@ volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector, #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Returns integer 1 if float input is greater than or equal to 0, 1 otherwise + \param cVector The char (int8_t) output (either 0 or 1) + \param aVector The float input + \param num_points The number of values in aVector and stored into cVector +*/ +static inline void +volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector, + unsigned int num_points) +{ + int8_t* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + unsigned int n16points = num_points / 16; + + float32x4x2_t input_val0, input_val1; + float32x4_t zero_val; + uint32x4x2_t res0_u32, res1_u32; + uint16x4x2_t res0_u16x4, res1_u16x4; + uint16x8x2_t res_u16x8; + uint8x8x2_t res_u8; + uint8x8_t one; + + zero_val = vdupq_n_f32(0.0); + one = vdup_n_u8(0x01); + + // TODO: this is a good candidate for asm because the vcombines + // can be eliminated simply by picking dst registers that are + // adjacent. + for(number = 0; number < n16points; number++) { + input_val0 = vld2q_f32(aPtr); + input_val1 = vld2q_f32(aPtr+8); + + // test against 0; return uint32 + res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val); + res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val); + res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val); + res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val); + + // narrow uint32 -> uint16 followed by combine to 8-element vectors + res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]); + res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]); + res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]); + res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]); + + res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]); + res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]); + + // narrow uint16x8 -> uint8x8 + res_u8.val[0] = vmovn_u16(res_u16x8.val[0]); + res_u8.val[1] = vmovn_u16(res_u16x8.val[1]); + // we *could* load twice as much data and do another vcombine here + // to get a uint8x16x2 vector, still only do 2 vandqs and a single store + // but that turns out to be ~16% slower than this version on zc702 + // it's possible register contention in GCC scheduler slows it down + // and a hand-written asm with quad-word u8 registers is much faster. + + res_u8.val[0] = vand_u8(one, res_u8.val[0]); + res_u8.val[1] = vand_u8(one, res_u8.val[1]); + + vst2_u8((unsigned char*)cPtr, res_u8); + cPtr += 16; + aPtr += 16; + + } + + for(number = n16points * 16; number < num_points; number++) { + if(*aPtr++ >= 0) { + *cPtr++ = 1; + } + else { + *cPtr++ = 0; + } + } +} +#endif /* LV_HAVE_NEON */ + + #endif /* INCLUDED_volk_32f_binary_slicer_8i_H */ diff --git a/volk/kernels/volk/volk_32f_log2_32f.h b/volk/kernels/volk/volk_32f_log2_32f.h index 52c1b60549..892eeb1685 100644 --- a/volk/kernels/volk/volk_32f_log2_32f.h +++ b/volk/kernels/volk/volk_32f_log2_32f.h @@ -61,7 +61,7 @@ #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) -#define LOG_POLY_DEGREE 3 +#define LOG_POLY_DEGREE 6 #ifndef INCLUDED_volk_32f_log2_32f_a_H @@ -145,6 +145,106 @@ static inline void volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVect #endif /* LV_HAVE_SSE4_1 for aligned */ + +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + +/* these macros allow us to embed logs in other kernels */ +#define VLOG2Q_NEON_PREAMBLE() \ + int32x4_t one = vdupq_n_s32(0x000800000); \ + /* minimax polynomial */ \ + float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \ + float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \ + float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \ + float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \ + float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \ + float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \ + float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \ + int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \ + int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \ + int32x4_t exp_bias = vdupq_n_s32(127); + + +#define VLOG2Q_NEON_F32(log2_approx, aval) \ + int32x4_t exponent_i = vandq_s32(aval, exp_mask); \ + int32x4_t significand_i = vandq_s32(aval, sig_mask); \ + exponent_i = vshrq_n_s32(exponent_i, 23); \ + \ + /* extract the exponent and significand \ + we can treat this as fixed point to save ~9% on the \ + conversion + float add */ \ + significand_i = vorrq_s32(one, significand_i); \ + float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \ + /* debias the exponent and convert to float */ \ + exponent_i = vsubq_s32(exponent_i, exp_bias); \ + float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \ + \ + /* put the significand through a polynomial fit of log2(x) [1,2]\ + add the result to the exponent */ \ + log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \ + float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \ + tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + \ + float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \ + tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \ + tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \ + tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \ + tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); + +/*! + \brief Computes base 2 log of input vector and stores results in output vector + \param bVector The vector where results will be stored + \param aVector The input vector of floats + \param num_points Number of points for which log is to be computed +*/ +static inline void volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points){ + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number; + const unsigned int quarterPoints = num_points / 4; + + int32x4_t aval; + float32x4_t log2_approx; + + VLOG2Q_NEON_PREAMBLE() + // lms + //p0 = vdupq_n_f32(-1.649132280361871); + //p1 = vdupq_n_f32(1.995047138579499); + //p2 = vdupq_n_f32(-0.336914839219728); + + // keep in mind a single precision float is represented as + // (-1)^sign * 2^exp * 1.significand, so the log2 is + // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23) + for(number = 0; number < quarterPoints; ++number){ + // load float in to an int register without conversion + aval = vld1q_s32((int*)aPtr); + + VLOG2Q_NEON_F32(log2_approx, aval) + + vst1q_f32(bPtr, log2_approx); + + aPtr += 4; + bPtr += 4; + } + + for(number = quarterPoints * 4; number < num_points; number++){ + *bPtr++ = log2(*aPtr++); + } +} + +#endif /* LV_HAVE_NEON */ + + #endif /* INCLUDED_volk_32f_log2_32f_a_H */ #ifndef INCLUDED_volk_32f_log2_32f_u_H diff --git a/volk/kernels/volk/volk_32f_x2_pow_32f.h b/volk/kernels/volk/volk_32f_x2_pow_32f.h index 431c4c7021..431c4c7021 100755..100644 --- a/volk/kernels/volk/volk_32f_x2_pow_32f.h +++ b/volk/kernels/volk/volk_32f_x2_pow_32f.h diff --git a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h index 5b485ec542..68749665da 100644 --- a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h @@ -122,6 +122,39 @@ static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qB } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + float32x4x2_t complexInput; + + for(number = 0; number < quarter_points; number++){ + complexInput = vld2q_f32(complexVectorPtr); + vst1q_f32( iBufferPtr, complexInput.val[0] ); + vst1q_f32( qBufferPtr, complexInput.val[1] ); + complexVectorPtr += 8; + iBufferPtr += 4; + qBufferPtr += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex vector into I & Q vector data diff --git a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h index 6fabeddcf3..f80265decd 100644 --- a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h @@ -110,6 +110,35 @@ static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const l } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Deinterleaves the complex vector into Q vector data + \param complexVector The complex input vector + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* qBufferPtr = qBuffer; + float32x4x2_t complexInput; + + for(number = 0; number < quarter_points; number++){ + complexInput = vld2q_f32(complexVectorPtr); + vst1q_f32( qBufferPtr, complexInput.val[1] ); + complexVectorPtr += 8; + qBufferPtr += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Deinterleaves the complex vector into Q vector data diff --git a/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h index 9200206dc7..c0e8d8fb34 100644 --- a/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h @@ -84,7 +84,33 @@ static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float32x4x2_t complexInput; + for(number = 0; number < quarter_points; number++){ + complexInput = vld2q_f32(complexVectorPtr); + vst1q_f32( iBufferPtr, complexInput.val[0] ); + complexVectorPtr += 8; + iBufferPtr += 4; + } + for(number = quarter_points*4; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_NEON */ #endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */ diff --git a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h index 945b4b5a2c..474b982887 100644 --- a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h +++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h @@ -252,6 +252,43 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + unsigned int quarter_points = num_points / 4; + + float32x4x2_t a_val, scalar_val; + float32x4x2_t tmp_imag; + + scalar_val = vld2q_f32((const float*)&scalar); + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)aPtr); + tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); + tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); + + tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]); + tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]); + + vst2q_f32((float*)cVector, tmp_imag); + aPtr += 4; + cVector += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + *cPtr++ = *aPtr++ * scalar; + } +} +#endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC /*! @@ -285,8 +322,4 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c } #endif /* LV_HAVE_GENERIC */ - - - - #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */ diff --git a/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h index 750f508b7e..8964434bef 100644 --- a/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h +++ b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h @@ -164,10 +164,57 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result #endif /*LV_HAVE_SSE3*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 4; + unsigned int number; + + lv_32fc_t* a_ptr = (lv_32fc_t*) taps; + lv_32fc_t* b_ptr = (lv_32fc_t*) input; + // for 2-lane vectors, 1st lane holds the real part, + // 2nd lane holds the imaginary part + float32x4x2_t a_val, b_val, accumulator; + float32x4x2_t tmp_imag; + accumulator.val[0] = vdupq_n_f32(0); + accumulator.val[1] = vdupq_n_f32(0); + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+8); + __builtin_prefetch(b_ptr+8); + + // do the first multiply + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + + // use multiply accumulate/subtract to get result + tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); + tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); + + accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]); + accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]); + + // increment pointers + a_ptr += 4; + b_ptr += 4; + } + lv_32fc_t accum_result[4]; + vst2q_f32((float*)accum_result, accumulator); + *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; -#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ + // tail case + for(number = quarter_points*4; number < num_points; ++number) { + *result += (*a_ptr++) * lv_conj(*b_ptr++); + } + *result = lv_conj(*result); +} +#endif /*LV_HAVE_NEON*/ +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index d0a09f989f..c65d0984c5 100644 --- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -896,7 +896,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3 #endif /*LV_HAVE_NEON*/ #ifdef LV_HAVE_NEON - +#include <arm_neon.h> static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { unsigned int quarter_points = num_points / 4; diff --git a/volk/kernels/volk/volk_32i_x2_and_32i.h b/volk/kernels/volk/volk_32i_x2_and_32i.h index b33a60e951..c138540e69 100644 --- a/volk/kernels/volk/volk_32i_x2_and_32i.h +++ b/volk/kernels/volk/volk_32i_x2_and_32i.h @@ -65,6 +65,40 @@ static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aV } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32i_x2_and_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + int32x4_t a_val, b_val, c_val; + + for(number = 0; number < quarter_points; number++){ + a_val = vld1q_s32(aPtr); + b_val = vld1q_s32(bPtr); + c_val = vandq_s32(a_val, b_val); + vst1q_s32(cPtr, c_val); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for(number = quarter_points * 4; number < num_points; number++){ + *cPtr++ = (*aPtr++) & (*bPtr++); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Ands the two input vectors and store their results in the third vector diff --git a/volk/kernels/volk/volk_32i_x2_or_32i.h b/volk/kernels/volk/volk_32i_x2_or_32i.h index a8556a3e74..544a71c67c 100644 --- a/volk/kernels/volk/volk_32i_x2_or_32i.h +++ b/volk/kernels/volk/volk_32i_x2_or_32i.h @@ -65,6 +65,40 @@ static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVe } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + int32x4_t a_val, b_val, c_val; + + for(number = 0; number < quarter_points; number++){ + a_val = vld1q_s32(aPtr); + b_val = vld1q_s32(bPtr); + c_val = vorrq_s32(a_val, b_val); + vst1q_s32(cPtr, c_val); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for(number = quarter_points * 4; number < num_points; number++){ + *cPtr++ = (*aPtr++) | (*bPtr++); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Ors the two input vectors and store their results in the third vector diff --git a/volk/kernels/volk/volk_32u_byteswap.h b/volk/kernels/volk/volk_32u_byteswap.h index 74d9a0bc3a..0194efc12c 100644 --- a/volk/kernels/volk/volk_32u_byteswap.h +++ b/volk/kernels/volk/volk_32u_byteswap.h @@ -73,6 +73,59 @@ static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int n } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + unsigned int number = 0; + unsigned int n8points = num_points / 8; + + uint8x8x4_t input_table; + uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; + uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; + + /* these magic numbers are used as byte-indeces in the LUT. + they are pre-computed to save time. A simple C program + can calculate them; for example for lookup01: + uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; + for(ii=0; ii < 8; ++ii) { + index += ((uint64_t)(*(chars+ii))) << (ii*8); + } + */ + int_lookup01 = vcreate_u8(74609667900706840); + int_lookup23 = vcreate_u8(219290013576860186); + int_lookup45 = vcreate_u8(363970359253013532); + int_lookup67 = vcreate_u8(508650704929166878); + + for(number = 0; number < n8points; ++number){ + input_table = vld4_u8((uint8_t*) inputPtr); + swapped_int01 = vtbl4_u8(input_table, int_lookup01); + swapped_int23 = vtbl4_u8(input_table, int_lookup23); + swapped_int45 = vtbl4_u8(input_table, int_lookup45); + swapped_int67 = vtbl4_u8(input_table, int_lookup67); + vst1_u8((uint8_t*) inputPtr, swapped_int01); + vst1_u8((uint8_t*) (inputPtr+2), swapped_int23); + vst1_u8((uint8_t*) (inputPtr+4), swapped_int45); + vst1_u8((uint8_t*) (inputPtr+6), swapped_int67); + + inputPtr += 8; + } + + for(number = n8points * 8; number < num_points; ++number){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Byteswaps (in-place) an aligned vector of int32_t's. @@ -94,8 +147,6 @@ static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32u_byteswap_u_H */ #ifndef INCLUDED_volk_32u_byteswap_a_H #define INCLUDED_volk_32u_byteswap_a_H diff --git a/volk/kernels/volk/volk_32u_byteswappuppet_32u.h b/volk/kernels/volk/volk_32u_byteswappuppet_32u.h new file mode 100644 index 0000000000..bf7055e241 --- /dev/null +++ b/volk/kernels/volk/volk_32u_byteswappuppet_32u.h @@ -0,0 +1,45 @@ +#ifndef INCLUDED_volk_32u_byteswappuppet_32u_H +#define INCLUDED_volk_32u_byteswappuppet_32u_H + + +#include <stdint.h> +#include <volk/volk_32u_byteswap.h> + +#ifdef LV_HAVE_GENERIC +static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ + + volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); + +} +#endif + +#ifdef LV_HAVE_NEON +static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ + + volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); + +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t *output, uint32_t* intsToSwap, unsigned int num_points){ + + volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); + +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){ + + volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); + +} +#endif + + +#endif diff --git a/volk/kernels/volk/volk_32u_popcntpuppet_32u.h b/volk/kernels/volk/volk_32u_popcntpuppet_32u.h new file mode 100644 index 0000000000..056983e817 --- /dev/null +++ b/volk/kernels/volk/volk_32u_popcntpuppet_32u.h @@ -0,0 +1,47 @@ +/* -*- c++ -*- */ +/* + * Copyright 2014 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_volk_32u_popcntpuppet_32u_H +#define INCLUDED_volk_32u_popcntpuppet_32u_H + +#include <stdint.h> +#include <volk/volk_32u_popcnt.h> + +#ifdef LV_HAVE_GENERIC +static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){ + unsigned int ii; + for(ii=0; ii < num_points; ++ii) { + volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) ); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_SSE_4_2 +static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){ + unsigned int ii; + for(ii=0; ii < num_points; ++ii) { + volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) ); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/volk/kernels/volk/volk_64u_byteswap.h b/volk/kernels/volk/volk_64u_byteswap.h index df71f0ee5e..dce883278d 100644 --- a/volk/kernels/volk/volk_64u_byteswap.h +++ b/volk/kernels/volk/volk_64u_byteswap.h @@ -104,7 +104,62 @@ static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Byteswaps (in-place) a vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int number = 0; + unsigned int n8points = num_points / 4; + + uint8x8x4_t input_table; + uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; + uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; + + /* these magic numbers are used as byte-indeces in the LUT. + they are pre-computed to save time. A simple C program + can calculate them; for example for lookup01: + uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; + for(ii=0; ii < 8; ++ii) { + index += ((uint64_t)(*(chars+ii))) << (ii*8); + } + */ + int_lookup01 = vcreate_u8(2269495096316185); + int_lookup23 = vcreate_u8(146949840772469531); + int_lookup45 = vcreate_u8(291630186448622877); + int_lookup67 = vcreate_u8(436310532124776223); + + for(number = 0; number < n8points; ++number){ + input_table = vld4_u8((uint8_t*) inputPtr); + swapped_int01 = vtbl4_u8(input_table, int_lookup01); + swapped_int23 = vtbl4_u8(input_table, int_lookup23); + swapped_int45 = vtbl4_u8(input_table, int_lookup45); + swapped_int67 = vtbl4_u8(input_table, int_lookup67); + vst1_u8((uint8_t*) inputPtr, swapped_int01); + vst1_u8((uint8_t*) (inputPtr+2), swapped_int23); + vst1_u8((uint8_t*) (inputPtr+4), swapped_int45); + vst1_u8((uint8_t*) (inputPtr+6), swapped_int67); + + inputPtr += 4; + } + + for(number = n8points * 4; number < num_points; ++number){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } + +} +#endif /* LV_HAVE_NEON */ #endif /* INCLUDED_volk_64u_byteswap_u_H */ diff --git a/volk/kernels/volk/volk_64u_byteswappuppet_64u.h b/volk/kernels/volk/volk_64u_byteswappuppet_64u.h new file mode 100644 index 0000000000..ac5b16e212 --- /dev/null +++ b/volk/kernels/volk/volk_64u_byteswappuppet_64u.h @@ -0,0 +1,45 @@ +#ifndef INCLUDED_volk_64u_byteswappuppet_64u_H +#define INCLUDED_volk_64u_byteswappuppet_64u_H + + +#include <stdint.h> +#include <volk/volk_64u_byteswap.h> + +#ifdef LV_HAVE_GENERIC +static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ + + volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); + +} +#endif + +#ifdef LV_HAVE_NEON +static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ + + volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); + +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ + + volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); + +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ + + volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); + +} +#endif + + +#endif diff --git a/volk/kernels/volk/volk_64u_popcnt.h b/volk/kernels/volk/volk_64u_popcnt.h index d425cd5206..0ec72e3404 100644 --- a/volk/kernels/volk/volk_64u_popcnt.h +++ b/volk/kernels/volk/volk_64u_popcnt.h @@ -71,4 +71,24 @@ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) #endif /*LV_HAVE_SSE4_2*/ +#if LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value) { + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((unsigned char *) &value); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(ret, count64x1_val); + + //*ret = _mm_popcnt_u64(value); + +} +#endif /*LV_HAVE_NEON*/ + #endif /*INCLUDED_volk_64u_popcnt_a_H*/ diff --git a/volk/kernels/volk/volk_64u_popcntpuppet_64u.h b/volk/kernels/volk/volk_64u_popcntpuppet_64u.h new file mode 100644 index 0000000000..3903e0d561 --- /dev/null +++ b/volk/kernels/volk/volk_64u_popcntpuppet_64u.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- */ +/* + * Copyright 2014 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_volk_64u_popcntpuppet_64u_H +#define INCLUDED_volk_64u_popcntpuppet_64u_H + +#include <stdint.h> +#include <volk/volk_64u_popcnt.h> + +#ifdef LV_HAVE_GENERIC +static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ + unsigned int ii; + for(ii=0; ii < num_points; ++ii) { + volk_64u_popcnt_generic(outVector+ii, num_points ); + + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_NEON +static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ + unsigned int ii; + for(ii=0; ii < num_points; ++ii) { + volk_64u_popcnt_neon(outVector+ii, num_points ); + } +} +#endif /* LV_HAVE_NEON */ + +#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 3ab4a9970c..be20ed6585 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -247,7 +247,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { } } // the primary test is the percent different greater than given tol - else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) { + else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) { fail=true; if(print_max_errs-- > 0) { std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl; diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 7d1826b719..7807ce40a6 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -44,7 +44,7 @@ VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_32f_log2_32f, 1e-3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_32f_log2_32f, 1.5e-1, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_expfast_32f, 1e-1, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_x2_pow_32f, 1e-2, 0, 20462, 1); VOLK_RUN_TESTS(volk_32f_sin_32f, 1e-6, 0, 20462, 1); @@ -97,12 +97,12 @@ VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20462, 1); VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20462, 1); VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20462, 1); VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20462, 1); -//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_32u_popcntpuppet_32u, 0, 0, 2046, 10000); VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20462, 1); -//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_64u_popcntpuppet_64u, 0, 0, 2046, 10000); VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20462, 1); VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20462, 1); VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20462, 1); |