diff options
author | Nathan West <nathan.west@okstate.edu> | 2014-10-18 17:59:43 -0500 |
---|---|---|
committer | Nathan West <nathan.west@okstate.edu> | 2014-10-20 19:11:04 -0500 |
commit | 085ab2179d77aa7f7e00c95200fb0c4f3c36fca4 (patch) | |
tree | 3f3dad6c76997e0f8f3b6e1e53d9c5a06af2cda5 | |
parent | 068bc75e2a2254f1ea8a8607d22b38bc41eeaefd (diff) | |
volk: adding popcnt puppets to qa
-rw-r--r-- | volk/kernels/volk/volk_64u_popcnt.h | 36 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64u_popcntpuppet_64u.h | 5 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 4 |
3 files changed, 5 insertions, 40 deletions
diff --git a/volk/kernels/volk/volk_64u_popcnt.h b/volk/kernels/volk/volk_64u_popcnt.h index 5eb28c7898..0ec72e3404 100644 --- a/volk/kernels/volk/volk_64u_popcnt.h +++ b/volk/kernels/volk/volk_64u_popcnt.h @@ -74,42 +74,6 @@ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) #if LV_HAVE_NEON #include <arm_neon.h> static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value) { - /* TABLE LUP - unsigned char table[] = {0, 1, 1, 2, 1, 2, 2, 3, - 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 2, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6}; - - // we're stuck with a 64-element table, so treat the MSBs - // of each byte as 0 and sum them individually. - uint64_t input_7bit = values & 0x7F7F7F7F7F7F7F7F; - uint64_t input_msbs = value & 0x8080808080808080; - uint64_t sum = (input_msbs >> 8) ; - sum += (input_msbs >> 16); - sum += (input_msbs >> 24); - sum += (input_msbs >> 32); - sum += (input_msbs >> 40); - sum += (input_msbs >> 48); - sum += (input_msbs >> 56); - sum += (input_msbs >> 64); - - uint8x8x4_t table_val; - uint8x8_t input_val; - uint16x8x2_t intermediate_sum; - uint32x8_t intermediate_sum; - - // load the table and input value - table_val = vld4q_u8(table); - input_val = vld1_u8((unsigned char *) &value); - - // perform the lookup, output is uint8x8_t - input_val = vtbl4_u8(table_val, input_val); - */ - uint8x8_t input_val, count8x8_val; uint16x4_t count16x4_val; uint32x2_t count32x2_val; diff --git a/volk/kernels/volk/volk_64u_popcntpuppet_64u.h b/volk/kernels/volk/volk_64u_popcntpuppet_64u.h index 5837d0f3fb..3903e0d561 100644 --- a/volk/kernels/volk/volk_64u_popcntpuppet_64u.h +++ b/volk/kernels/volk/volk_64u_popcntpuppet_64u.h @@ -30,7 +30,8 @@ static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ unsigned int ii; for(ii=0; ii < num_points; ++ii) { - 
volk_64u_popcnt_generic(outVector+ii, *(inVector+ii) ); + volk_64u_popcnt_generic(outVector+ii, num_points ); + } } #endif /* LV_HAVE_GENERIC */ @@ -39,7 +40,7 @@ static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ unsigned int ii; for(ii=0; ii < num_points; ++ii) { - volk_64u_popcnt_neon(outVector+ii, *(inVector+ii) ); + volk_64u_popcnt_neon(outVector+ii, num_points ); } } #endif /* LV_HAVE_NEON */ diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 7d1826b719..a3d8766378 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -97,12 +97,12 @@ VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20462, 1); VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20462, 1); VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20462, 1); VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20462, 1); -//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_32u_popcntpuppet_32u, 0, 0, 2046, 10000); VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20462, 1); -//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_64u_popcntpuppet_64u, 0, 0, 2046, 10000); VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20462, 1); VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20462, 1); VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20462, 1); |