diff options
-rw-r--r-- | volk/kernels/volk/volk_32u_byteswap.h | 55 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32u_byteswappuppet_32u.h | 45 |
2 files changed, 98 insertions, 2 deletions
diff --git a/volk/kernels/volk/volk_32u_byteswap.h b/volk/kernels/volk/volk_32u_byteswap.h
index 74d9a0bc3a..0194efc12c 100644
--- a/volk/kernels/volk/volk_32u_byteswap.h
+++ b/volk/kernels/volk/volk_32u_byteswap.h
@@ -73,6 +73,59 @@ static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int n
 }
 #endif /* LV_HAVE_SSE2 */
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Byteswaps (in-place) a vector of uint32_t's using NEON table lookups.
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = intsToSwap;
+  unsigned int number = 0;
+  unsigned int n8points = num_points / 8;
+
+  uint8x8x4_t input_table;
+  uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+  uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+  /* vld4_u8 de-interleaves 8 words so that input_table.val[k] holds byte k
+     of every word; vtbl4_u8 then indexes the 32 table bytes as val[0] ->
+     0..7, val[1] -> 8..15, val[2] -> 16..23, val[3] -> 24..31.  Each
+     lookup vector lists, lowest lane first, the table indices of two
+     byte-reversed words, e.g. lookup01 = {24, 16, 8, 0, 25, 17, 9, 1}.
+     The constants are spelled in hex with a ULL suffix so every lane is
+     readable and the literal is explicitly a 64-bit value.
+  */
+  int_lookup01 = vcreate_u8(0x0109111900081018ULL);
+  int_lookup23 = vcreate_u8(0x030B131B020A121AULL);
+  int_lookup45 = vcreate_u8(0x050D151D040C141CULL);
+  int_lookup67 = vcreate_u8(0x070F171F060E161EULL);
+  /* vector body: 8 words (32 bytes) are swapped per iteration */
+  for(number = 0; number < n8points; ++number){
+    input_table = vld4_u8((uint8_t*) inputPtr);
+    swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+    swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+    swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+    swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+    vst1_u8((uint8_t*) inputPtr, swapped_int01);
+    vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
+    vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
+    vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
+
+    inputPtr += 8;
+  }
+  /* scalar tail: swap the remaining num_points % 8 words one at a time */
+  for(number = n8points * 8; number < num_points; ++number){
+    uint32_t output = *inputPtr;
+    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Byteswaps (in-place) an aligned vector of int32_t's.
@@ -94,8 +147,6 @@ static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int
 #endif /* LV_HAVE_GENERIC */
 
 
-
-
 #endif /* INCLUDED_volk_32u_byteswap_u_H */
 #ifndef INCLUDED_volk_32u_byteswap_a_H
 #define INCLUDED_volk_32u_byteswap_a_H
diff --git a/volk/kernels/volk/volk_32u_byteswappuppet_32u.h b/volk/kernels/volk/volk_32u_byteswappuppet_32u.h
new file mode 100644
index 0000000000..e50ad1d4f3
--- /dev/null
+++ b/volk/kernels/volk/volk_32u_byteswappuppet_32u.h
@@ -0,0 +1,45 @@
+#ifndef INCLUDED_volk_32u_byteswappuppet_32u_H
+#define INCLUDED_volk_32u_byteswappuppet_32u_H
+/* Out-of-place (output, input) wrappers around the in-place volk_32u_byteswap kernels. */
+#include <stdint.h>
+#include <string.h>
+#include <volk/volk_32u_byteswap.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+    /* Byteswaps intsToSwap in place (generic kernel), then copies num_points words to output. */
+    volk_32u_byteswap_generic(intsToSwap, num_points);
+    memcpy(output, intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+    /* Byteswaps intsToSwap in place (NEON kernel), then copies num_points words to output. */
+    volk_32u_byteswap_neon(intsToSwap, num_points);
+    memcpy(output, intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+    /* Byteswaps intsToSwap in place (volk_32u_byteswap_u_sse2), then copies num_points words to output. */
+    volk_32u_byteswap_u_sse2(intsToSwap, num_points);
+    memcpy(output, intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
+    /* Byteswaps intsToSwap in place (volk_32u_byteswap_a_sse2), then copies num_points words to output. */
+    volk_32u_byteswap_a_sse2(intsToSwap, num_points);
+    memcpy(output, intsToSwap, num_points * sizeof(uint32_t));
+
+}
+#endif
+
+
+#endif /* INCLUDED_volk_32u_byteswappuppet_32u_H */