diff options
author | Nathan West <nathan.west@okstate.edu> | 2014-10-22 14:09:44 -0500 |
---|---|---|
committer | Nathan West <nathan.west@okstate.edu> | 2014-10-22 14:09:44 -0500 |
commit | 2c4c371885c31222362f70a1cd714415d1398021 (patch) | |
tree | acdb73791d5a10249dd5e30513bfe5a12496492f /volk | |
parent | 398f96a94e2b0b0df8870ad4ccc956bce4c9633b (diff) |
volk: add neon kernel for 64u_byteswap and puppets for 64/16 byteswap
Diffstat (limited to 'volk')
-rw-r--r-- | volk/apps/volk_profile.cc | 6 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16u_byteswappuppet_16u.h | 46 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64u_byteswap.h | 55 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64u_byteswappuppet_64u.h | 45 |
4 files changed, 149 insertions, 3 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index b1f10f2405..4167f4de1f 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -157,7 +157,7 @@ int main(int argc, char *argv[]) { //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_16u_byteswap, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); + VOLK_PUPPET_PROFILE(volk_16u_byteswappuppet_16u, volk_16u_byteswap, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); @@ -175,7 +175,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, benchmark_mode, kernel_regex); //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204602, 100, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 204602, 10000, &results, benchmark_mode, kernel_regex); @@ -224,7 +224,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PUPPET_PROFILE(volk_64u_byteswappuppet_64u, volk_64u_byteswap, 0, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PUPPET_PROFILE(volk_64u_popcntpuppet_64u, volk_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204602, 3000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204602, 3000, &results, benchmark_mode, kernel_regex); diff --git a/volk/kernels/volk/volk_16u_byteswappuppet_16u.h b/volk/kernels/volk/volk_16u_byteswappuppet_16u.h new file mode 100644 index 0000000000..699a7586bd --- /dev/null +++ b/volk/kernels/volk/volk_16u_byteswappuppet_16u.h @@ -0,0 +1,46 @@ +#ifndef INCLUDED_volk_16u_byteswappuppet_16u_H +#define INCLUDED_volk_16u_byteswappuppet_16u_H + + +#include <stdint.h> +#include <volk/volk_16u_byteswap.h> +#include <string.h> + +#ifdef LV_HAVE_GENERIC +static inline void volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ + + volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); + +} +#endif + +#ifdef LV_HAVE_NEON +static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ + + volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); + +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ + + volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); + +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t* intsToSwap, unsigned int num_points){ + + volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); + +} +#endif + + +#endif diff --git a/volk/kernels/volk/volk_64u_byteswap.h b/volk/kernels/volk/volk_64u_byteswap.h index df71f0ee5e..dce883278d 100644 --- a/volk/kernels/volk/volk_64u_byteswap.h +++ b/volk/kernels/volk/volk_64u_byteswap.h @@ -104,7 +104,62 @@ static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Byteswaps (in-place) a vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int number = 0; + unsigned int n8points = num_points / 4; + + uint8x8x4_t input_table; + uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; + uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; + + /* these magic numbers are used as byte-indeces in the LUT. + they are pre-computed to save time. A simple C program + can calculate them; for example for lookup01: + uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; + for(ii=0; ii < 8; ++ii) { + index += ((uint64_t)(*(chars+ii))) << (ii*8); + } + */ + int_lookup01 = vcreate_u8(2269495096316185); + int_lookup23 = vcreate_u8(146949840772469531); + int_lookup45 = vcreate_u8(291630186448622877); + int_lookup67 = vcreate_u8(436310532124776223); + + for(number = 0; number < n8points; ++number){ + input_table = vld4_u8((uint8_t*) inputPtr); + swapped_int01 = vtbl4_u8(input_table, int_lookup01); + swapped_int23 = vtbl4_u8(input_table, int_lookup23); + swapped_int45 = vtbl4_u8(input_table, int_lookup45); + swapped_int67 = vtbl4_u8(input_table, int_lookup67); + vst1_u8((uint8_t*) inputPtr, swapped_int01); + vst1_u8((uint8_t*) (inputPtr+2), swapped_int23); + vst1_u8((uint8_t*) (inputPtr+4), swapped_int45); + vst1_u8((uint8_t*) (inputPtr+6), swapped_int67); + + inputPtr += 4; + } + + for(number = n8points * 4; number < num_points; ++number){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } + +} +#endif /* LV_HAVE_NEON */ #endif /* INCLUDED_volk_64u_byteswap_u_H */ diff --git a/volk/kernels/volk/volk_64u_byteswappuppet_64u.h b/volk/kernels/volk/volk_64u_byteswappuppet_64u.h new file mode 100644 index 0000000000..591b223e32 --- /dev/null +++ b/volk/kernels/volk/volk_64u_byteswappuppet_64u.h @@ -0,0 +1,45 @@ +#ifndef INCLUDED_volk_64u_byteswappuppet_64u_H +#define INCLUDED_volk_64u_byteswappuppet_64u_H + + +#include <stdint.h> +#include <volk/volk_64u_byteswap.h> + +#ifdef LV_HAVE_GENERIC +static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ + + volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); + +} +#endif + +#ifdef LV_HAVE_NEON +static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ + + volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); + +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* intsToSwap, unsigned int num_points){ + + volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); + +} +#endif + +#ifdef LV_HAVE_SSE2 +static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* intsToSwap, unsigned int num_points){ + + volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); + +} +#endif + + +#endif |