diff options
author | Nathan West <nathan.west@okstate.edu> | 2014-10-18 17:57:55 -0500 |
---|---|---|
committer | Nathan West <nathan.west@okstate.edu> | 2014-10-19 18:25:24 -0500 |
commit | 068bc75e2a2254f1ea8a8607d22b38bc41eeaefd (patch) | |
tree | 8982b343f3de2901f687ded6b18ad51505bbd304 | |
parent | 520ac293c30d725225c5f984b8bf55e6f1caecf3 (diff) |
volk: popcnt support
Add a neon protokernel for 64-bit popcnt, and puppets so 64-bit and
32-bit versions can be tested with volk_profile
-rw-r--r-- | volk/apps/volk_profile.cc | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32u_popcntpuppet_32u.h | 47 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64u_popcnt.h | 56 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64u_popcntpuppet_64u.h | 47 |
4 files changed, 152 insertions, 2 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 5030836d43..e3f0ba73c5 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -220,12 +220,12 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32i_s32f_convert_32f, 1e-4, 100, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32i_x2_or_32i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_32u_byteswap, 0, 0, 204602, 2000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_32u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); + VOLK_PUPPET_PROFILE(volk_32u_popcntpuppet_32u, volk32u_popcnt_32u, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); + VOLK_PUPPET_PROFILE(volk_64u_popcntpuppet_64u, volk_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204602, 3000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204602, 3000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_8ic_deinterleave_real_16i, 0, 256, 204602, 3000, &results, benchmark_mode, kernel_regex); diff --git a/volk/kernels/volk/volk_32u_popcntpuppet_32u.h b/volk/kernels/volk/volk_32u_popcntpuppet_32u.h new file mode 100644 index 0000000000..056983e817 --- /dev/null +++ b/volk/kernels/volk/volk_32u_popcntpuppet_32u.h @@ -0,0 +1,47 @@ +/* -*- c++ -*- */ +/* + * Copyright 2014 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_volk_32u_popcntpuppet_32u_H +#define INCLUDED_volk_32u_popcntpuppet_32u_H + +#include <stdint.h> +#include <volk/volk_32u_popcnt.h> + +#ifdef LV_HAVE_GENERIC +static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){ + unsigned int ii; + for(ii=0; ii < num_points; ++ii) { + volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) ); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_SSE_4_2 +static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){ + unsigned int ii; + for(ii=0; ii < num_points; ++ii) { + volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) ); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/volk/kernels/volk/volk_64u_popcnt.h b/volk/kernels/volk/volk_64u_popcnt.h index d425cd5206..5eb28c7898 100644 --- a/volk/kernels/volk/volk_64u_popcnt.h +++ b/volk/kernels/volk/volk_64u_popcnt.h @@ -71,4 +71,60 @@ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) #endif /*LV_HAVE_SSE4_2*/ +#if LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value) { + /* TABLE LUP + unsigned char table[] = {0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 2, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6}; + + // we're stuck with a 64-element table, so treat the MSBs + // of each byte as 0 and sum them individually. + uint64_t input_7bit = values & 0x7F7F7F7F7F7F7F7F; + uint64_t input_msbs = value & 0x8080808080808080; + uint64_t sum = (input_msbs >> 8) ; + sum += (input_msbs >> 16); + sum += (input_msbs >> 24); + sum += (input_msbs >> 32); + sum += (input_msbs >> 40); + sum += (input_msbs >> 48); + sum += (input_msbs >> 56); + sum += (input_msbs >> 64); + + uint8x8x4_t table_val; + uint8x8_t input_val; + uint16x8x2_t intermediate_sum; + uint32x8_t intermediate_sum; + + // load the table and input value + table_val = vld4q_u8(table); + input_val = vld1_u8((unsigned char *) &value); + + // perform the lookup, output is uint8x8_t + input_val = vtbl4_u8(table_val, input_val); + */ + + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((unsigned char *) &value); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(ret, count64x1_val); + + //*ret = _mm_popcnt_u64(value); + +} +#endif /*LV_HAVE_NEON*/ + #endif /*INCLUDED_volk_64u_popcnt_a_H*/ diff --git a/volk/kernels/volk/volk_64u_popcntpuppet_64u.h b/volk/kernels/volk/volk_64u_popcntpuppet_64u.h new file mode 100644 index 0000000000..5837d0f3fb --- /dev/null +++ b/volk/kernels/volk/volk_64u_popcntpuppet_64u.h @@ -0,0 +1,47 @@ +/* -*- c++ -*- */ +/* + * Copyright 2014 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_volk_64u_popcntpuppet_64u_H +#define INCLUDED_volk_64u_popcntpuppet_64u_H + +#include <stdint.h> +#include <volk/volk_64u_popcnt.h> + +#ifdef LV_HAVE_GENERIC +static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ + unsigned int ii; + for(ii=0; ii < num_points; ++ii) { + volk_64u_popcnt_generic(outVector+ii, *(inVector+ii) ); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_NEON +static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ + unsigned int ii; + for(ii=0; ii < num_points; ++ii) { + volk_64u_popcnt_neon(outVector+ii, *(inVector+ii) ); + } +} +#endif /* LV_HAVE_NEON */ + +#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ |