diff options
Diffstat (limited to 'volk')
-rw-r--r-- | volk/CMakeLists.txt | 6 | ||||
-rw-r--r-- | volk/apps/CMakeLists.txt | 4 | ||||
-rw-r--r-- | volk/apps/volk_profile.cc | 201 | ||||
-rw-r--r-- | volk/cmake/CMakeParseArgumentsCopy.cmake | 138 | ||||
-rw-r--r-- | volk/cmake/msvc/stdbool.h | 45 | ||||
-rw-r--r-- | volk/gen/archs.xml | 1 | ||||
-rw-r--r-- | volk/gen/volk_arch_defs.py | 7 | ||||
-rw-r--r-- | volk/gen/volk_kernel_defs.py | 343 | ||||
-rw-r--r-- | volk/gen/volk_machine_defs.py | 4 | ||||
-rw-r--r-- | volk/include/volk/volk_16i_convert_8i_a.h | 69 | ||||
-rw-r--r-- | volk/include/volk/volk_16i_s32f_convert_32f_a.h | 119 | ||||
-rw-r--r-- | volk/include/volk/volk_16u_byteswap_u.h | 63 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_convert_64f_a.h | 70 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_convert_64f_u.h | 70 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_s32f_convert_16i_a.h | 150 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_s32f_convert_32i_u.h | 142 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_s32f_convert_8i_a.h | 155 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_s32f_multiply_32f_u.h | 102 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_add_32f_u.h | 66 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 290 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 290 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_multiply_32f_u.h | 106 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_conjugate_32fc_u.h | 64 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h | 78 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_magnitude_32f_u.h | 118 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_magnitude_squared_32f_a.h | 114 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_magnitude_squared_32f_u.h | 114 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 87 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 145 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h | 116 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_multiply_32fc_u.h | 77 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h | 81 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h | 81 | ||||
-rw-r--r-- | volk/include/volk/volk_32i_s32f_convert_32f_a.h | 73 | ||||
-rw-r--r-- | volk/include/volk/volk_32i_s32f_convert_32f_u.h | 75 | ||||
-rw-r--r-- | volk/include/volk/volk_32u_byteswap_u.h | 77 | ||||
-rw-r--r-- | volk/include/volk/volk_64f_convert_32f_a.h | 67 | ||||
-rw-r--r-- | volk/include/volk/volk_64f_convert_32f_u.h | 67 | ||||
-rw-r--r-- | volk/include/volk/volk_64u_byteswap_u.h | 88 | ||||
-rw-r--r-- | volk/include/volk/volk_8i_convert_16i_u.h | 73 | ||||
-rw-r--r-- | volk/include/volk/volk_8i_s32f_convert_32f_u.h | 94 | ||||
-rw-r--r-- | volk/include/volk/volk_prefs.h | 15 | ||||
-rw-r--r-- | volk/kernels/README.txt | 67 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h (renamed from volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_branch_4_state_8.h (renamed from volk/include/volk/volk_16i_branch_4_state_8_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_convert_8i.h (renamed from volk/include/volk/volk_16i_convert_8i_u.h) | 71 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_max_star_16i.h (renamed from volk/include/volk/volk_16i_max_star_16i_a.h) | 8 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_max_star_horizontal_16i.h (renamed from volk/include/volk/volk_16i_max_star_horizontal_16i_a.h) | 8 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_permute_and_scalar_add.h (renamed from volk/include/volk/volk_16i_permute_and_scalar_add_a.h) | 7 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_s32f_convert_32f.h (renamed from volk/include/volk/volk_16i_s32f_convert_32f_u.h) | 121 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h (renamed from volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h) | 9 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h (renamed from volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h) | 8 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h (renamed from volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_deinterleave_real_16i.h (renamed from volk/include/volk/volk_16ic_deinterleave_real_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_deinterleave_real_8i.h (renamed from volk/include/volk/volk_16ic_deinterleave_real_8i_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_magnitude_16i.h (renamed from volk/include/volk/volk_16ic_magnitude_16i_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h (renamed from volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h (renamed from volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h (renamed from volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16u_byteswap.h (renamed from volk/include/volk/volk_16u_byteswap_a.h) | 65 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_accumulator_s32f.h (renamed from volk/include/volk/volk_32f_accumulator_s32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_convert_64f.h | 140 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_index_max_16u.h (renamed from volk/include/volk/volk_32f_index_max_16u_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h (renamed from volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h (renamed from volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_convert_16i.h (renamed from volk/include/volk/volk_32f_s32f_convert_16i_u.h) | 152 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_convert_32i.h (renamed from volk/include/volk/volk_32f_s32f_convert_32i_a.h) | 142 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_convert_8i.h (renamed from volk/include/volk/volk_32f_s32f_convert_8i_u.h) | 157 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_multiply_32f.h (renamed from volk/include/volk/volk_32f_s32f_multiply_32f_a.h) | 104 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_normalize.h (renamed from volk/include/volk/volk_32f_s32f_normalize_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_power_32f.h (renamed from volk/include/volk/volk_32f_s32f_power_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_stddev_32f.h (renamed from volk/include/volk/volk_32f_s32f_stddev_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_sqrt_32f.h (renamed from volk/include/volk/volk_32f_sqrt_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h (renamed from volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_add_32f.h (renamed from volk/include/volk/volk_32f_x2_add_32f_a.h) | 68 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_divide_32f.h (renamed from volk/include/volk/volk_32f_x2_divide_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_dot_prod_16i.h (renamed from volk/include/volk/volk_32f_x2_dot_prod_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_dot_prod_32f.h | 580 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_interleave_32fc.h (renamed from volk/include/volk/volk_32f_x2_interleave_32fc_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_max_32f.h (renamed from volk/include/volk/volk_32f_x2_max_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_min_32f.h (renamed from volk/include/volk/volk_32f_x2_min_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_multiply_32f.h (renamed from volk/include/volk/volk_32f_x2_multiply_32f_a.h) | 108 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h (renamed from volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_subtract_32f.h (renamed from volk/include/volk/volk_32f_x2_subtract_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h (renamed from volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h) | 7 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h (renamed from volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_32f_multiply_32fc.h (renamed from volk/include/volk/volk_32fc_32f_multiply_32fc_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_conjugate_32fc.h (renamed from volk/include/volk/volk_32fc_conjugate_32fc_a.h) | 64 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h (renamed from volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h | 156 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h (renamed from volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_real_32f.h (renamed from volk/include/volk/volk_32fc_deinterleave_real_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_real_64f.h (renamed from volk/include/volk/volk_32fc_deinterleave_real_64f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_index_max_16u.h (renamed from volk/include/volk/volk_32fc_index_max_16u_a.h) | 9 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_magnitude_32f.h (renamed from volk/include/volk/volk_32fc_magnitude_32f_a.h) | 120 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_magnitude_squared_32f.h | 228 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_atan2_32f.h (renamed from volk/include/volk/volk_32fc_s32f_atan2_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h (renamed from volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h (renamed from volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_power_32fc.h (renamed from volk/include/volk/volk_32fc_s32f_power_32fc_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h (renamed from volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h (renamed from volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h (renamed from volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h) | 87 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h (renamed from volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h) | 10 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h (renamed from volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h) | 6 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h (renamed from volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h) | 161 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h (renamed from volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h) | 136 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_multiply_32fc.h (renamed from volk/include/volk/volk_32fc_x2_multiply_32fc_a.h) | 79 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h | 162 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h (renamed from volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h) | 8 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_square_dist_32f.h (renamed from volk/include/volk/volk_32fc_x2_square_dist_32f_a.h) | 8 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32i_s32f_convert_32f.h | 148 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32i_x2_and_32i.h (renamed from volk/include/volk/volk_32i_x2_and_32i_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32i_x2_or_32i.h (renamed from volk/include/volk/volk_32i_x2_or_32i_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32u_byteswap.h (renamed from volk/include/volk/volk_32u_byteswap_a.h) | 77 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32u_popcnt.h (renamed from volk/include/volk/volk_32u_popcnt_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64f_convert_32f.h | 134 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64f_x2_max_64f.h (renamed from volk/include/volk/volk_64f_x2_max_64f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64f_x2_min_64f.h (renamed from volk/include/volk/volk_64f_x2_min_64f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64u_byteswap.h (renamed from volk/include/volk/volk_64u_byteswap_a.h) | 88 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64u_popcnt.h (renamed from volk/include/volk/volk_64u_popcnt_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8i_convert_16i.h (renamed from volk/include/volk/volk_8i_convert_16i_a.h) | 75 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8i_s32f_convert_32f.h (renamed from volk/include/volk/volk_8i_s32f_convert_32f_a.h) | 96 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h (renamed from volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_deinterleave_real_16i.h (renamed from volk/include/volk/volk_8ic_deinterleave_real_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_deinterleave_real_8i.h (renamed from volk/include/volk/volk_8ic_deinterleave_real_8i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h (renamed from volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h (renamed from volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h (renamed from volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h (renamed from volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h) | 2 | ||||
-rw-r--r-- | volk/lib/CMakeLists.txt | 3 | ||||
-rw-r--r-- | volk/lib/qa_utils.cc | 36 | ||||
-rw-r--r-- | volk/lib/qa_utils.h | 2 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 190 | ||||
-rw-r--r-- | volk/lib/volk_prefs.c | 39 | ||||
-rw-r--r-- | volk/lib/volk_rank_archs.c | 111 | ||||
-rw-r--r-- | volk/lib/volk_rank_archs.h | 40 | ||||
-rw-r--r-- | volk/python/volk_modtool/CMakeLists.txt | 39 | ||||
-rw-r--r-- | volk/python/volk_modtool/README | 114 | ||||
-rw-r--r-- | volk/python/volk_modtool/__init__.py | 24 | ||||
-rw-r--r-- | volk/python/volk_modtool/cfg.py | 104 | ||||
-rwxr-xr-x | volk/python/volk_modtool/volk_modtool | 128 | ||||
-rw-r--r-- | volk/python/volk_modtool/volk_modtool_generate.py | 310 | ||||
-rw-r--r-- | volk/tmpl/volk.tmpl.c | 121 | ||||
-rw-r--r-- | volk/tmpl/volk.tmpl.h | 55 | ||||
-rw-r--r-- | volk/tmpl/volk_machine_xxx.tmpl.c | 30 | ||||
-rw-r--r-- | volk/tmpl/volk_machines.tmpl.h | 14 | ||||
-rw-r--r-- | volk/tmpl/volk_typedefs.tmpl.h | 2 |
148 files changed, 5290 insertions, 4021 deletions
diff --git a/volk/CMakeLists.txt b/volk/CMakeLists.txt index 99f7052560..a04c2adefd 100644 --- a/volk/CMakeLists.txt +++ b/volk/CMakeLists.txt @@ -115,12 +115,15 @@ install( # Install all headers in the include directories ######################################################################## install( - DIRECTORY ${CMAKE_SOURCE_DIR}/include/volk + DIRECTORY ${CMAKE_SOURCE_DIR}/kernels/volk DESTINATION include COMPONENT "volk_devel" FILES_MATCHING PATTERN "*.h" ) install(FILES + ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h + ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h + ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h ${CMAKE_BINARY_DIR}/include/volk/volk.h ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h @@ -138,6 +141,7 @@ add_subdirectory(lib) # And the utility apps ######################################################################## add_subdirectory(apps) +add_subdirectory(python/volk_modtool) ######################################################################## # Print summary diff --git a/volk/apps/CMakeLists.txt b/volk/apps/CMakeLists.txt index 03ad92b792..577b7ef137 100644 --- a/volk/apps/CMakeLists.txt +++ b/volk/apps/CMakeLists.txt @@ -25,11 +25,11 @@ if(MSVC) endif(MSVC) include_directories( + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR}/include ${CMAKE_BINARY_DIR}/include ${CMAKE_SOURCE_DIR}/lib - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_BINARY_DIR} ${Boost_INCLUDE_DIRS} ) diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index e0919a278a..edc32183a2 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -18,117 +18,100 @@ int main(int argc, char *argv[]) { std::vector<std::string> results; - //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000, &results); - //VOLK_PROFILE(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000, &results); - VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc_a, 
volk_32fc_s32fc_x2_rotator_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results); - VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 204600, 10000, &results); - VOLK_PROFILE(volk_16ic_deinterleave_real_8i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16ic_deinterleave_16i_x2_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 204600, 1000, &results); - VOLK_PROFILE(volk_16ic_deinterleave_real_16i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16ic_magnitude_16i_a, 1, 0, 204600, 100, &results); - VOLK_PROFILE(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 204600, 1000, &results); - VOLK_PROFILE(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 204600, 10000, &results); - VOLK_PROFILE(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 204600, 10000, &results); - VOLK_PROFILE(volk_16i_convert_8i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16i_convert_8i_u, 0, 0, 204600, 10000, &results); - //VOLK_PROFILE(volk_16i_max_star_16i_a, 0, 0, 204600, 10000, &results); - //VOLK_PROFILE(volk_16i_max_star_horizontal_16i_a, 0, 0, 204600, 10000, &results); - //VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results); - //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results); - VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16u_byteswap_u, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_x2_add_32f_u, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 204600, 50, &results); - VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 
204600, 1000, &results); - VOLK_PROFILE(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 204600, 100, &results); - //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000, &results); - VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_imag_32f_a, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_index_max_16u_a, 3, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100, &results); - VOLK_PROFILE(volk_32fc_magnitude_32f_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_magnitude_32f_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_magnitude_squared_32f_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_magnitude_squared_32f_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32f_s32f_convert_16i_a, 1, 32768, 204600, 10000, &results); - 
VOLK_PROFILE(volk_32f_s32f_convert_16i_u, 1, 32768, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_convert_32i_a, 1, 2<<31, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_convert_32i_u, 1, 2<<31, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_convert_64f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_convert_64f_u, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_convert_8i_a, 1, 128, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_convert_8i_u, 1, 128, 204600, 10000, &results); - //VOLK_PROFILE(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000, &results); - VOLK_PROFILE(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 20460, 100, &results); - VOLK_PROFILE(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_x2_divide_32f_a, 1e-4, 0, 204600, 2000, &results); - VOLK_PROFILE(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 5000, &results); - //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000, &results); - VOLK_PROFILE(volk_32f_index_max_16u_a, 3, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 204600, 3000, &results); - VOLK_PROFILE(volk_32f_x2_interleave_32fc_a, 0, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32f_x2_max_32f_a, 1e-4, 0, 204600, 2000, &results); - VOLK_PROFILE(volk_32f_x2_min_32f_a, 1e-4, 0, 204600, 2000, &results); - VOLK_PROFILE(volk_32f_x2_multiply_32f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_x2_multiply_32f_u, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_normalize_a, 1e-4, 100, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_power_32f_a, 1e-4, 4, 204600, 100, &results); - 
VOLK_PROFILE(volk_32f_sqrt_32f_a, 1e-4, 0, 204600, 100, &results); - VOLK_PROFILE(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 204600, 3000, &results); - VOLK_PROFILE(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 204600, 3000, &results); - VOLK_PROFILE(volk_32f_x2_subtract_32f_a, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32i_x2_and_32i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32i_s32f_convert_32f_a, 1e-4, 100, 204600, 10000, &results); - VOLK_PROFILE(volk_32i_s32f_convert_32f_u, 1e-4, 100, 204600, 10000, &results); - VOLK_PROFILE(volk_32i_x2_or_32i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32u_byteswap_a, 0, 0, 204600, 2000, &results); - //VOLK_PROFILE(volk_32u_popcnt_a, 0, 0, 2046, 10000, &results); - VOLK_PROFILE(volk_64f_convert_32f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_64f_convert_32f_u, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_64f_x2_max_64f_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_64f_x2_min_64f_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_64u_byteswap_a, 0, 0, 204600, 1000, &results); - //VOLK_PROFILE(volk_64u_popcnt_a, 0, 0, 2046, 10000, &results); - VOLK_PROFILE(volk_8ic_deinterleave_16i_x2_a, 0, 0, 204600, 3000, &results); - VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 204600, 3000, &results); - VOLK_PROFILE(volk_8ic_deinterleave_real_16i_a, 0, 256, 204600, 3000, &results); - VOLK_PROFILE(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 204600, 3000, &results); - VOLK_PROFILE(volk_8ic_deinterleave_real_8i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 204600, 400, &results); - VOLK_PROFILE(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 204600, 400, &results); - VOLK_PROFILE(volk_8i_convert_16i_a, 0, 0, 204600, 20000, &results); - VOLK_PROFILE(volk_8i_convert_16i_u, 0, 0, 204600, 2000, &results); - 
VOLK_PROFILE(volk_8i_s32f_convert_32f_a, 1e-4, 100, 204600, 2000, &results); - VOLK_PROFILE(volk_8i_s32f_convert_32f_u, 1e-4, 100, 204600, 2000, &results); - //VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_a, 1e-4, lv_32fc_t(1.0, 0.5), 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32f_s32f_multiply_32f_a, 1e-4, 1.0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 204600, 1000, &results); + //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results); + //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results); + VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results); + VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 204600, 10000, &results); + VOLK_PROFILE(volk_16ic_deinterleave_real_8i, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16ic_deinterleave_16i_x2, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 204600, 1000, &results); + VOLK_PROFILE(volk_16ic_deinterleave_real_16i, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16ic_magnitude_16i, 1, 0, 204600, 100, &results); + VOLK_PROFILE(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 204600, 1000, &results); + VOLK_PROFILE(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 204600, 10000, &results); + VOLK_PROFILE(volk_16i_convert_8i, 0, 0, 204600, 10000, &results); + //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204600, 10000, &results); + //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204600, 10000, &results); + //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results); + //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results); + VOLK_PROFILE(volk_16u_byteswap, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results); + 
VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204600, 50, &results); + VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204600, 100, &results); + //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results); + VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_real_32f, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_real_64f, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_index_max_16u, 3, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_s32f_magnitude_16i, 1, 32768, 204600, 100, &results); + VOLK_PROFILE(volk_32fc_magnitude_32f, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_magnitude_squared_32f, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_x2_multiply_32fc, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_conjugate_32fc, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32f_s32f_convert_16i, 1, 32768, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_s32f_convert_32i, 1, 2<<31, 204600, 10000, 
&results); + VOLK_PROFILE(volk_32f_convert_64f, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_s32f_convert_8i, 1, 128, 204600, 10000, &results); + //VOLK_PROFILE(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000, &results); + VOLK_PROFILE(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 20460, 100, &results); + VOLK_PROFILE(volk_32fc_x2_square_dist_32f, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_x2_divide_32f, 1e-4, 0, 204600, 2000, &results); + VOLK_PROFILE(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 5000, &results); + //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000, &results); + VOLK_PROFILE(volk_32f_index_max_16u, 3, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic, 1, 32768, 204600, 3000, &results); + VOLK_PROFILE(volk_32f_x2_interleave_32fc, 0, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32f_x2_max_32f, 1e-4, 0, 204600, 2000, &results); + VOLK_PROFILE(volk_32f_x2_min_32f, 1e-4, 0, 204600, 2000, &results); + VOLK_PROFILE(volk_32f_x2_multiply_32f, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_s32f_normalize, 1e-4, 100, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_s32f_power_32f, 1e-4, 4, 204600, 100, &results); + VOLK_PROFILE(volk_32f_sqrt_32f, 1e-4, 0, 204600, 100, &results); + VOLK_PROFILE(volk_32f_s32f_stddev_32f, 1e-4, 100, 204600, 3000, &results); + VOLK_PROFILE(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 204600, 3000, &results); + VOLK_PROFILE(volk_32f_x2_subtract_32f, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32i_x2_and_32i, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32i_s32f_convert_32f, 1e-4, 100, 204600, 10000, &results); + VOLK_PROFILE(volk_32i_x2_or_32i, 0, 0, 204600, 10000, 
&results); + VOLK_PROFILE(volk_32u_byteswap, 0, 0, 204600, 2000, &results); + //VOLK_PROFILE(volk_32u_popcnt, 0, 0, 2046, 10000, &results); + VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204600, 1000, &results); + //VOLK_PROFILE(volk_64u_popcnt, 0, 0, 2046, 10000, &results); + VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204600, 3000, &results); + VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204600, 3000, &results); + VOLK_PROFILE(volk_8ic_deinterleave_real_16i, 0, 256, 204600, 3000, &results); + VOLK_PROFILE(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 204600, 3000, &results); + VOLK_PROFILE(volk_8ic_deinterleave_real_8i, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 204600, 400, &results); + VOLK_PROFILE(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 204600, 400, &results); + VOLK_PROFILE(volk_8i_convert_16i, 0, 0, 204600, 20000, &results); + VOLK_PROFILE(volk_8i_convert_16i, 0, 0, 204600, 2000, &results); + VOLK_PROFILE(volk_8i_s32f_convert_32f, 1e-4, 100, 204600, 2000, &results); + //VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, lv_32fc_t(1.0, 0.5), 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 1.0, 204600, 10000, &results); + char path[1024]; - get_config_path(path); + volk_get_config_path(path); + const fs::path config_path(path); if (not fs::exists(config_path.branch_path())) diff --git a/volk/cmake/CMakeParseArgumentsCopy.cmake b/volk/cmake/CMakeParseArgumentsCopy.cmake new file mode 100644 index 0000000000..7ce4c49ae5 --- /dev/null +++ b/volk/cmake/CMakeParseArgumentsCopy.cmake @@ -0,0 +1,138 @@ +# CMAKE_PARSE_ARGUMENTS(<prefix> <options> <one_value_keywords> 
<multi_value_keywords> args...) +# +# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions for +# parsing the arguments given to that macro or function. +# It processes the arguments and defines a set of variables which hold the +# values of the respective options. +# +# The <options> argument contains all options for the respective macro, +# i.e. keywords which can be used when calling the macro without any value +# following, like e.g. the OPTIONAL keyword of the install() command. +# +# The <one_value_keywords> argument contains all keywords for this macro +# which are followed by one value, like e.g. the DESTINATION keyword of the +# install() command. +# +# The <multi_value_keywords> argument contains all keywords for this macro +# which can be followed by more than one value, like e.g. the TARGETS or +# FILES keywords of the install() command. +# +# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the +# keywords listed in <options>, <one_value_keywords> and +# <multi_value_keywords> a variable composed of the given <prefix> +# followed by "_" and the name of the respective keyword. +# These variables will then hold the respective value from the argument list. +# For the <options> keywords this will be TRUE or FALSE. +# +# All remaining arguments are collected in a variable +# <prefix>_UNPARSED_ARGUMENTS; this can be checked afterwards to see whether +# your macro was called with unrecognized parameters. +# +# As an example, here is a my_install() macro, which takes similar arguments as the +# real install() command: +# +# function(MY_INSTALL) +# set(options OPTIONAL FAST) +# set(oneValueArgs DESTINATION RENAME) +# set(multiValueArgs TARGETS CONFIGURATIONS) +# cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) +# ... 
+# +# Assume my_install() has been called like this: +# my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub) +# +# After the cmake_parse_arguments() call the macro will have set the following +# variables: +# MY_INSTALL_OPTIONAL = TRUE +# MY_INSTALL_FAST = FALSE (this option was not used when calling my_install()) +# MY_INSTALL_DESTINATION = "bin" +# MY_INSTALL_RENAME = "" (was not used) +# MY_INSTALL_TARGETS = "foo;bar" +# MY_INSTALL_CONFIGURATIONS = "" (was not used) +# MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value expected after "OPTIONAL") +# +# You can then continue and process these variables. +# +# Keywords terminate lists of values, e.g. if directly after a one_value_keyword +# another recognized keyword follows, this is interpreted as the beginning of +# the new option. +# E.g. my_install(TARGETS foo DESTINATION OPTIONAL) would not result in +# MY_INSTALL_DESTINATION set to "OPTIONAL"; instead MY_INSTALL_DESTINATION would +# be empty and MY_INSTALL_OPTIONAL would therefore be set to TRUE. + +#============================================================================= +# Copyright 2010 Alexander Neundorf <neundorf@kde.org> +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) 
+ + +if(__CMAKE_PARSE_ARGUMENTS_INCLUDED) + return() +endif() +set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE) + + +function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames) + # first set all result variables to empty/FALSE + foreach(arg_name ${_singleArgNames} ${_multiArgNames}) + set(${prefix}_${arg_name}) + endforeach(arg_name) + + foreach(option ${_optionNames}) + set(${prefix}_${option} FALSE) + endforeach(option) + + set(${prefix}_UNPARSED_ARGUMENTS) + + set(insideValues FALSE) + set(currentArgName) + + # now iterate over all arguments and fill the result variables + foreach(currentArg ${ARGN}) + list(FIND _optionNames "${currentArg}" optionIndex) # ... then this marks the end of the arguments belonging to this keyword + list(FIND _singleArgNames "${currentArg}" singleArgIndex) # ... then this marks the end of the arguments belonging to this keyword + list(FIND _multiArgNames "${currentArg}" multiArgIndex) # ... then this marks the end of the arguments belonging to this keyword + + if(${optionIndex} EQUAL -1 AND ${singleArgIndex} EQUAL -1 AND ${multiArgIndex} EQUAL -1) + if(insideValues) + if("${insideValues}" STREQUAL "SINGLE") + set(${prefix}_${currentArgName} ${currentArg}) + set(insideValues FALSE) + elseif("${insideValues}" STREQUAL "MULTI") + list(APPEND ${prefix}_${currentArgName} ${currentArg}) + endif() + else(insideValues) + list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${currentArg}) + endif(insideValues) + else() + if(NOT ${optionIndex} EQUAL -1) + set(${prefix}_${currentArg} TRUE) + set(insideValues FALSE) + elseif(NOT ${singleArgIndex} EQUAL -1) + set(currentArgName ${currentArg}) + set(${prefix}_${currentArgName}) + set(insideValues "SINGLE") + elseif(NOT ${multiArgIndex} EQUAL -1) + set(currentArgName ${currentArg}) + set(${prefix}_${currentArgName}) + set(insideValues "MULTI") + endif() + endif() + + endforeach(currentArg) + + # propagate the result variables to the caller: + foreach(arg_name ${_singleArgNames} 
${_multiArgNames} ${_optionNames}) + set(${prefix}_${arg_name} ${${prefix}_${arg_name}} PARENT_SCOPE) + endforeach(arg_name) + set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE) + +endfunction(CMAKE_PARSE_ARGUMENTS _options _singleArgs _multiArgs) diff --git a/volk/cmake/msvc/stdbool.h b/volk/cmake/msvc/stdbool.h new file mode 100644 index 0000000000..ca4581d37a --- /dev/null +++ b/volk/cmake/msvc/stdbool.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2005, 2006 Apple Computer, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef STDBOOL_WIN32_H +#define STDBOOL_WIN32_H + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" +#endif // _MSC_VER ] + +#ifndef __cplusplus + +typedef unsigned char bool; + +#define true 1 +#define false 0 + +#ifndef CASSERT +#define CASSERT(exp, name) typedef int dummy##name [(exp) ? 
1 : -1]; +#endif + +CASSERT(sizeof(bool) == 1, bool_is_one_byte) +CASSERT(true, true_is_true) +CASSERT(!false, false_is_false) + +#endif + +#endif diff --git a/volk/gen/archs.xml b/volk/gen/archs.xml index a18455801d..2c9ab41a55 100644 --- a/volk/gen/archs.xml +++ b/volk/gen/archs.xml @@ -2,7 +2,6 @@ <grammar> <arch name="generic"> <!-- name is required--> - <alignment>1</alignment> </arch> <arch name="altivec"> diff --git a/volk/gen/volk_arch_defs.py b/volk/gen/volk_arch_defs.py index 41154d5a7a..3c75e1374e 100644 --- a/volk/gen/volk_arch_defs.py +++ b/volk/gen/volk_arch_defs.py @@ -18,9 +18,6 @@ archs = list() arch_dict = dict() -#TODO enable this when we are ready -create_unaligned_archs = False - class arch_class: def __init__(self, flags, checks, **kwargs): for key, cast, failval in ( @@ -49,10 +46,6 @@ def register_arch(**kwargs): arch = arch_class(**kwargs) archs.append(arch) arch_dict[arch.name] = arch - if arch.alignment > 1 and create_unaligned_archs: - kwargs['name'] += '_u' - kwargs['alignment'] = 1 - register_arch(**kwargs) ######################################################################## # register the arches diff --git a/volk/gen/volk_kernel_defs.py b/volk/gen/volk_kernel_defs.py index 52cdb684c2..f246db0f96 100644 --- a/volk/gen/volk_kernel_defs.py +++ b/volk/gen/volk_kernel_defs.py @@ -24,201 +24,186 @@ import re import sys import glob -from volk_arch_defs import archs - -remove_after_underscore = re.compile("_.*"); -space_remove = re.compile(" "); -leading_space_remove = re.compile("^ *"); -replace_arch = re.compile(", const char\* arch"); -replace_bracket = re.compile(" {"); -replace_volk = re.compile("volk"); - -def strip_trailing(tostrip, stripstr): - lindex = tostrip.rfind(stripstr) - tostrip = tostrip[0:lindex] + tostrip[lindex:len(tostrip)].replace(stripstr, ""); - return tostrip +######################################################################## +# Strip comments from a c/cpp file. 
+# Input is code string, output is code string without comments. +# http://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments +######################################################################## +def comment_remover(text): + def replacer(match): + s = match.group(0) + if s.startswith('/'): + return "" + else: + return s + pattern = re.compile( + r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', + re.DOTALL | re.MULTILINE + ) + return re.sub(pattern, replacer, text) + +######################################################################## +# Split code into nested sections according to ifdef preprocessor macros +######################################################################## +def split_into_nested_ifdef_sections(code): + sections = list() + section = '' + header = 'text' + in_section_depth = 0 + for i, line in enumerate(code.splitlines()): + m = re.match('^(\s*)#(\s*)(\w+)(.*)$', line) + line_is = 'normal' + if m: + p0, p1, fcn, stuff = m.groups() + if fcn in ('if', 'ifndef', 'ifdef'): line_is = 'if' + if fcn in ('else', 'elif'): line_is = 'else' + if fcn in ('endif',): line_is = 'end' + + if line_is == 'if': in_section_depth += 1 + if line_is == 'end': in_section_depth -= 1 + + if in_section_depth == 1 and line_is == 'if': + sections.append((header, section)) + section = '' + header = line + continue -srcdir = os.path.dirname(os.path.dirname(__file__)) -hdr_files = glob.glob(os.path.join(srcdir, "include/volk/*.h")) - -datatypes = []; -functions = []; - -for line in hdr_files: - subline = re.search(".*_(a|u)\.h.*", os.path.basename(line)) - if subline: - subsubline = re.search("(?<=volk_).*", subline.group(0)); - if subsubline: - dtype = remove_after_underscore.sub("", subsubline.group(0)); - subdtype = re.search("[0-9]+[A-z]+", dtype); - if subdtype: - datatypes.append(subdtype.group(0)); - - -datatypes = set(datatypes); - -for line in hdr_files: - for dt in datatypes: - if dt in line: - subline = 
re.search("(volk_" + dt +"_.*(a|u).*\.h)", line); - if subline: - - subsubline = re.search(".+(?=\.h)", subline.group(0)); - functions.append(subsubline.group(0)); - -archs_or = "(" -for arch in archs: - archs_or = archs_or + arch.name.upper() + "|"; -archs_or = archs_or[0:len(archs_or)-1]; -archs_or = archs_or + ")"; - -taglist = []; -fcountlist = []; -arched_arglist = []; -retlist = []; -my_arglist = []; -my_argtypelist = []; -for func in functions: - tags = []; - fcount = []; - infile_source = open(os.path.join(srcdir, 'include', 'volk', func + ".h")) - begun_name = 0; - begun_paren = 0; - sourcefile = infile_source.readlines(); - infile_source.close(); - for line in sourcefile: -#FIXME: make it work for multiple #if define()s - archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line); - if archline: - arch = archline.group(0); - archline = re.findall(archs_or + "(?=( |\n|&))", line); - if archline: - archsublist = []; - for tup in archline: - archsublist.append(tup[0]); - fcount.append(archsublist); - testline = re.search("static inline.*?" 
+ func, line); - if (not testline): + if in_section_depth == 1 and line_is == 'else': + sections.append((header, section)) + section = '' + header = line continue - tagline = re.search(func + "_.+", line); - if tagline: - tag = re.search("(?<=" + func + "_)\w+(?= *\()",line); - if tag: - tag = re.search("\w+", tag.group(0)); - if tag: - tags.append(tag.group(0)); + if in_section_depth == 0 and line_is == 'end': + sections.append((header, section)) + section = '' + header = 'text' + continue - if begun_name == 0: - retline = re.search(".+(?=" + func + ")", line); - if retline: - ret = retline.group(0); + section += line + '\n' + sections.append((header, section)) #and pack remainder into sections + sections = [sec for sec in sections if sec[1].strip()] #filter empty sections + #recurse into non-text sections to fill subsections + for i, (header, section) in enumerate(sections): + if header == 'text': continue + sections[i] = (header, split_into_nested_ifdef_sections(section)) + return sections - subline = re.search(func + ".*", line); - if subline: - subsubline = re.search("\(.*?\)", subline.group(0)); - if subsubline: - args = subsubline.group(0); +######################################################################## +# Recursive print of sections to test code above +######################################################################## +def print_sections(sections, indent = ' '): + for header, body in sections: + if header == 'text': + print indent, ('\n'+indent).join(body.splitlines()) + continue + print indent.replace(' ', '-') + '>', header + print_sections(body, indent + ' ') + +######################################################################## +# Flatten a section to just body text +######################################################################## +def flatten_section_text(sections): + output = '' + for hdr, bdy in sections: + if hdr != 'text': output += flatten_section_text(bdy) + else: output += bdy + return output + 
+######################################################################## +# Extract kernel info from section, represent as an implementation +######################################################################## +class impl_class: + def __init__(self, kern_name, header, body): + #extract LV_HAVE_* + self.deps = set(map(str.lower, re.findall('LV_HAVE_(\w+)', header))) + #extract function suffix and args + body = flatten_section_text(body) + try: + fcn_matcher = re.compile('^.*(%s\\w*)\\s*\\((.*)$'%kern_name, re.DOTALL | re.MULTILINE) + body = body.split('{')[0].rsplit(')', 1)[0] #get the part before the open ){ bracket + m = fcn_matcher.match(body) + impl_name, the_rest = m.groups() + self.name = impl_name.replace(kern_name+'_', '') + self.args = list() + fcn_args = the_rest.split(',') + for fcn_arg in fcn_args: + arg_matcher = re.compile('^\s*(.*\\W)\s*(\w+)\s*$', re.DOTALL | re.MULTILINE) + m = arg_matcher.match(fcn_arg) + arg_type, arg_name = m.groups() + self.args.append((arg_type, arg_name)) + except Exception as ex: + raise Exception, 'I cant parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex) + + assert self.name + self.is_aligned = self.name.startswith('a_') - else: - begun_name = 1; - subsubline = re.search("\(.*", subline.group(0)); - if subsubline: - args = subsubline.group(0); - begun_paren = 1; - else: - if begun_paren == 1: - subline = re.search(".*?\)", line); - if subline: - args = args + subline.group(0); - begun_name = 0; - begun_paren = 0; - else: - subline = re.search(".*", line); - args = args + subline.group(0); - else: - subline = re.search("\(.*?\)", line); - if subline: - args = subline.group(0); - begun_name = 0; - else: - subline = re.search("\(.*", line); - if subline: - args = subline.group(0); - begun_paren = 1; - - replace = re.compile("static "); - ret = replace.sub("", ret); - replace = re.compile("inline "); - ret = replace.sub("", ret); - arched_args = args[args.find('(')+1:args.find(')')] - - remove = 
re.compile('\)|\(|{'); - rargs = remove.sub("", args); - sargs = rargs.split(','); - - - - margs = []; - atypes = []; - for arg in sargs: - temp = arg.split(" "); - margs.append(temp[-1]); - replace = re.compile(" " + temp[-1]); - atypes.append(replace.sub("", arg)); - - - my_args = "" - arg_types = "" - for arg in range(0, len(margs) - 1): - this_arg = leading_space_remove.sub("", margs[arg]); - my_args = my_args + this_arg + ", "; - this_type = leading_space_remove.sub("", atypes[arg]); - arg_types = arg_types + this_type + ", "; - - this_arg = leading_space_remove.sub("", margs[-1]); - my_args = my_args + this_arg; - this_type = leading_space_remove.sub("", atypes[-1]); - arg_types = arg_types + this_type; - my_argtypelist.append(arg_types); - - if(ret[-1] != ' '): - ret = ret + ' '; - - arched_arglist.append(arched_args); #!!!!!!!!!!! - my_arglist.append(my_args) #!!!!!!!!!!!!!!!!! - retlist.append(ret); - fcountlist.append(fcount); - taglist.append(tags); + def __repr__(self): + return self.name +######################################################################## +# Get sets of LV_HAVE_* from the code +######################################################################## +def extract_lv_haves(code): + haves = list() + for line in code.splitlines(): + if not line.strip().startswith('#'): continue + have_set = set(map(str.lower, re.findall('LV_HAVE_(\w+)', line))) + if have_set: haves.append(have_set) + return haves + +######################################################################## +# Represent a processing kernel, parse from file +######################################################################## class kernel_class: - def __init__(self, index): - self.name = functions[index] + def __init__(self, kernel_file): + self.name = os.path.splitext(os.path.basename(kernel_file))[0] self.pname = self.name.replace('volk_', 'p_') - self.rettype = retlist[index] - self.arglist_defs = my_argtypelist[index] - self.arglist_namedefs = 
arched_arglist[index] - self.arglist_names = my_arglist[index] - self._tagdeps = fcountlist[index] - self._taglist = taglist[index] - - def get_tags(self, archs): - def is_in(x): return x.lower() in archs - taglist = list() - tagdeps = list() - for i in range(len(self._tagdeps)): - if all(map(is_in, self._tagdeps[i])): - taglist.append(self._taglist[i]) - tagdeps.append(self._tagdeps[i]) - return taglist, tagdeps + code = open(kernel_file, 'r').read() + code = comment_remover(code) + sections = split_into_nested_ifdef_sections(code) + self._impls = list() + for header, section in sections: + if 'ifndef' not in header.lower(): continue + for sub_hdr, body in section: + if 'if' not in sub_hdr.lower(): continue + if 'LV_HAVE_' not in sub_hdr: continue + self._impls.append(impl_class( + kern_name=self.name, header=sub_hdr, body=body, + )) + assert(self._impls) + self.has_dispatcher = False + for impl in self._impls: + if impl.name == 'dispatcher': + self._impls.remove(impl) + self.has_dispatcher = True + break + self.args = self._impls[0].args + self.arglist_types = ', '.join([a[0] for a in self.args]) + self.arglist_full = ', '.join(['%s %s'%a for a in self.args]) + self.arglist_names = ', '.join([a[1] for a in self.args]) + + def get_impls(self, archs): + archs = set(archs) + impls = list() + for impl in self._impls: + if impl.deps.intersection(archs) == impl.deps: + impls.append(impl) + return impls def __repr__(self): return self.name -kernels = map(kernel_class, range(len(retlist))) +######################################################################## +# Extract information from the VOLK kernels +######################################################################## +__file__ = os.path.abspath(__file__) +srcdir = os.path.dirname(os.path.dirname(__file__)) +kernel_files = glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h")) +kernels = map(kernel_class, kernel_files) if __name__ == '__main__': print kernels diff --git a/volk/gen/volk_machine_defs.py 
b/volk/gen/volk_machine_defs.py index d1a8569818..7293d47462 100644 --- a/volk/gen/volk_machine_defs.py +++ b/volk/gen/volk_machine_defs.py @@ -30,10 +30,6 @@ class machine_class: arch = arch_dict[arch_name] self.archs.append(arch) self.arch_names.append(arch_name) - arch_name += '_u' - if arch.alignment > 1 and arch_dict.has_key(arch_name): - arch = arch_dict[arch_name] - self.archs.append(arch) self.alignment = max(map(lambda a: a.alignment, self.archs)) def __repr__(self): return self.name diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h deleted file mode 100644 index 84548c8c50..0000000000 --- a/volk/include/volk/volk_16i_convert_8i_a.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef INCLUDED_volk_16i_convert_8i_a_H -#define INCLUDED_volk_16i_convert_8i_a_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; 
number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16i_convert_8i_a_H */ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h deleted file mode 100644 index 7108ff6590..0000000000 --- a/volk/include/volk/volk_16i_s32f_convert_32f_a.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H -#define INCLUDED_volk_16i_s32f_convert_32f_a_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE4_1 -#include <smmintrin.h> - - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h deleted file mode 100644 index 8ef627a628..0000000000 --- a/volk/include/volk/volk_16u_byteswap_u.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef INCLUDED_volk_16u_byteswap_u_H -#define INCLUDED_volk_16u_byteswap_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - -/*! - \brief Byteswaps (in-place) an unaligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - uint16_t* inputPtr = intsToSwap; - __m128i input, left, right, output; - - const unsigned int eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ - // Load the 16t values, increment inputPtr later since we're doing it in-place. 
- input = _mm_loadu_si128((__m128i*)inputPtr); - // Do the two shifts - left = _mm_slli_epi16(input, 8); - right = _mm_srli_epi16(input, 8); - // Or the left and right halves together - output = _mm_or_si128(left, right); - // Store the results - _mm_storeu_si128((__m128i*)inputPtr, output); - inputPtr += 8; - } - - // Byteswap any remaining points: - number = eighthPoints*8; - for(; number < num_points; number++){ - uint16_t outputVal = *inputPtr; - outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an unaligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int point; - uint16_t* inputPtr = intsToSwap; - for(point = 0; point < num_points; point++){ - uint16_t output = *inputPtr; - output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_16u_byteswap_u_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h deleted file mode 100644 index 2c469ac421..0000000000 --- a/volk/include/volk/volk_32f_convert_64f_a.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_volk_32f_convert_64f_a_H -#define INCLUDED_volk_32f_convert_64f_a_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_convert_64f_a_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h deleted file mode 100644 index 10d8a4f6c0..0000000000 --- a/volk/include/volk/volk_32f_convert_64f_u.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_volk_32f_convert_64f_u_H -#define INCLUDED_volk_32f_convert_64f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_u_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_convert_64f_u_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h deleted file mode 100644 index 9df4946f24..0000000000 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ /dev/null @@ -1,150 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H -#define INCLUDED_volk_32f_s32f_convert_16i_a_H - -#include <volk/volk_common.h> -#include <inttypes.h> -#include <stdio.h> -#include <math.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = -32768; - float max_val = 32767; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - __m128 ret1, ret2; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - // Scale and clip - ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = -32768; - float max_val = 32767; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - // Scale and clip - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = -32768; - float max_val = 32767; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r < min_val) - r = min_val; - else if(r > max_val) - r = max_val; - *outputVectorPtr++ = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h deleted file mode 100644 index ee15edb464..0000000000 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ /dev/null @@ -1,142 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H -#define INCLUDED_volk_32f_s32f_convert_32i_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - - float min_val = -2147483647; - float max_val = 2147483647; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - intInputVal1 = _mm_cvtps_epi32(inputVal1); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int32_t)(r); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - - float min_val = -2147483647; - float max_val = 2147483647; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int32_t)(r); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = -2147483647; - float max_val = 2147483647; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - *outputVectorPtr++ = (int32_t)(r); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h deleted file mode 100644 index 800017d5da..0000000000 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H -#define INCLUDED_volk_32f_s32f_convert_8i_a_H - -#include <volk/volk_common.h> -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - - float min_val = -128; - float max_val = 127; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); - inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(inputVal1); - intInputVal2 = _mm_cvtps_epi32(inputVal2); - intInputVal3 = _mm_cvtps_epi32(inputVal3); - intInputVal4 = _mm_cvtps_epi32(inputVal4); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - 
intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int8_t)(r); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - - float min_val = -128; - float max_val = 127; - float r; - - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r 
= max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int8_t)(r); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = -128; - float max_val = 127; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - *outputVectorPtr++ = (int8_t)(r); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h deleted file mode 100644 index b3fae9b053..0000000000 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H -#define INCLUDED_volk_32f_s32f_multiply_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> -/*! 
- \brief Scalar float multiply - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param scalar the scalar value - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m128 aVal, bVal, cVal; - bVal = _mm_set_ps1(scalar); - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_AVX -#include <immintrin.h> -/*! 
- \brief Scalar float multiply - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param scalar the scalar value - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m256 aVal, bVal, cVal; - bVal = _mm256_set1_ps(scalar); - for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * scalar; - } -} -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Scalar float multiply - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param scalar the scalar value - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* inputPtr = aVector; - float* outputPtr = cVector; - for(number = 0; number < num_points; number++){ - *outputPtr = (*inputPtr) * scalar; - inputPtr++; - outputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h deleted file mode 100644 index 52e8286bc2..0000000000 --- a/volk/include/volk/volk_32f_x2_add_32f_u.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_add_32f_u_H -#define INCLUDED_volk_32f_x2_add_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> -/*! 
- \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - - _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h deleted file mode 100644 index 067c33ad89..0000000000 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ /dev/null @@ -1,290 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H -#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H - -#include <volk/volk_common.h> -#include<stdio.h> - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#ifdef LV_HAVE_SSE - - -static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, 
b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_SSE3 - -#include <pmmintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 
dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); - dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); - dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); - dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_SSE4_1 - -#include <smmintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number 
< sixteenthPoints; number++){ - - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#ifdef LV_HAVE_AVX - -#include <immintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m256 a0Val, a1Val; - __m256 b0Val, b1Val; - __m256 c0Val, c1Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm256_load_ps(aPtr); - a1Val = _mm256_load_ps(aPtr+8); - b0Val = _mm256_load_ps(bPtr); - b1Val = _mm256_load_ps(bPtr+8); - - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = _mm256_mul_ps(a1Val, b1Val); - - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 
= _mm256_add_ps(c1Val, dotProdVal1); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_AVX*/ - -#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h deleted file mode 100644 index b24e8b1f79..0000000000 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ /dev/null @@ -1,290 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H -#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H - -#include <volk/volk_common.h> -#include<stdio.h> - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#ifdef LV_HAVE_SSE - - -static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, 
a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_loadu_ps(aPtr); - a1Val = _mm_loadu_ps(aPtr+4); - a2Val = _mm_loadu_ps(aPtr+8); - a3Val = _mm_loadu_ps(aPtr+12); - b0Val = _mm_loadu_ps(bPtr); - b1Val = _mm_loadu_ps(bPtr+4); - b2Val = _mm_loadu_ps(bPtr+8); - b3Val = _mm_loadu_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_SSE3 - -#include <pmmintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, 
b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_loadu_ps(aPtr); - a1Val = _mm_loadu_ps(aPtr+4); - a2Val = _mm_loadu_ps(aPtr+8); - a3Val = _mm_loadu_ps(aPtr+12); - b0Val = _mm_loadu_ps(bPtr); - b1Val = _mm_loadu_ps(bPtr+4); - b2Val = _mm_loadu_ps(bPtr+8); - b3Val = _mm_loadu_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); - dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); - dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); - dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_SSE4_1 - -#include <smmintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, 
cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; - - bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#ifdef LV_HAVE_AVX - -#include <immintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m256 a0Val, a1Val; - __m256 b0Val, b1Val; - __m256 c0Val, c1Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm256_loadu_ps(aPtr); - a1Val = _mm256_loadu_ps(aPtr+8); - b0Val = _mm256_loadu_ps(bPtr); - b1Val = _mm256_loadu_ps(bPtr+8); - - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = 
_mm256_mul_ps(a1Val, b1Val); - - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_AVX*/ - -#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h deleted file mode 100644 index bfb896d602..0000000000 --- a/volk/include/volk/volk_32f_x2_multiply_32f_u.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H -#define INCLUDED_volk_32f_x2_multiply_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> -/*! 
- \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_AVX -#include <immintrin.h> -/*! 
- \brief Multiplies the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); - bVal = _mm256_loadu_ps(bPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 8; - bPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h deleted file mode 100644 index e0d79ea7bc..0000000000 --- a/volk/include/volk/volk_32fc_conjugate_32fc_u.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H -#define INCLUDED_volk_32fc_conjugate_32fc_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <volk/volk_complex.h> -#include <float.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! - \brief Takes the conjugate of a complex vector. 
- \param cVector The vector where the results will be stored - \param aVector Vector to be conjugated - \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi - - x = _mm_xor_ps(x, conjugator); // conjugate register - - _mm_storeu_ps((float*)c,x); // Store the results back into the C container - - a += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = lv_conj(*a); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Takes the conjugate of a complex vector. - \param cVector The vector where the results will be stored - \param aVector Vector to be conjugated - \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_32fc_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_conj(*aPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h deleted file mode 100644 index 77566e671d..0000000000 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H -#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H - -#include <inttypes.h> -#include <stdio.h> - 
-#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h deleted file mode 100644 index c8b3f0a088..0000000000 --- a/volk/include/volk/volk_32fc_magnitude_32f_u.h +++ /dev/null @@ -1,118 +0,0 @@ -#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H -#define INCLUDED_volk_32fc_magnitude_32f_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <math.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h deleted file mode 100644 index d3ac9717a8..0000000000 --- a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H -#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H - -#include <inttypes.h> -#include <stdio.h> -#include <math.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h deleted file mode 100644 index 53a4e68eb4..0000000000 --- a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H -#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <math.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h deleted file mode 100644 index 5c7d15b02f..0000000000 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h +++ /dev/null @@ -1,87 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H -#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <volk/volk_complex.h> -#include <float.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> -/*! 
- \brief Multiplies the input vector by a scalar and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be multiplied - \param scalar The complex scalar to multiply aVector - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm_set_ps1(lv_creal(scalar)); - yh = _mm_set_ps1(lv_cimag(scalar)); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_storeu_ps((float*)c,z); // Store the results back into the C container - - a += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Multiplies the input vector by a scalar and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be multiplied - \param scalar The complex scalar to multiply aVector - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = num_points; - - // unwrap loop - while (number >= 8){ - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - number -= 8; - } - - // clean up any remaining - while (number-- > 0) - *cPtr++ = *aPtr++ * scalar; -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h deleted file mode 100644 index e7493413f7..0000000000 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H -#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H - - -#include<volk/volk_complex.h> - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = 
{0,0}; - unsigned int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ -} - -#endif /*LV_HAVE_GENERIC*/ - -#ifdef LV_HAVE_SSE3 - -#include <xmmintrin.h> -#include <pmmintrin.h> -#include <mmintrin.h> - - -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - // Variable never used? - //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - union HalfMask { - uint32_t intRep[4]; - __m128 vec; - } halfMask; - - union NegMask { - int intRep[4]; - __m128 vec; - } negMask; - - unsigned int offset = 0; - float Rsum=0, Isum=0; - float Im,Re; - - __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; - __m128 zv = {0,0,0,0}; - - halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; - halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; - - negMask.intRep[0] = negMask.intRep[2] = 0x80000000; - negMask.intRep[1] = negMask.intRep[3] = 0; - - // main loop - while(num_bytes >= 4*sizeof(float)){ - - in1 = _mm_loadu_ps( (float*) (input+offset) ); - in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = _mm_mul_ps(in1, in2); - fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = _mm_mul_ps(in1, fehg); - Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); - Ivm = _mm_xor_ps( negMask.vec, Iv ); - Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); - _mm_store_ss( &Im, Is ); - _mm_store_ss( &Re, Rs ); - num_bytes -= 4*sizeof(float); - offset 
+= 2; - Rsum += Re; - Isum += Im; - } - - // handle the last complex case ... - if(num_bytes > 0){ - - if(num_bytes != 4){ - // bad things are happening - } - - in1 = _mm_loadu_ps( (float*) (input+offset) ); - in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec); - fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec); - Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); - Ivm = _mm_xor_ps( negMask.vec, Iv ); - Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); - _mm_store_ss( &Im, Is ); - _mm_store_ss( &Re, Rs ); - Rsum += Re; - Isum += Im; - } - - result[0] = lv_cmake(Rsum,Isum); - return; -} - -#endif /*LV_HAVE_SSE3*/ - - -#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ - - - diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h deleted file mode 100644 index 7c0dba7fd8..0000000000 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H -#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H - -#include <volk/volk_common.h> -#include <volk/volk_complex.h> -#include <stdio.h> -#include <string.h> - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32fc_x2_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_points/2; - unsigned int isodd = num_points &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - unsigned int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = 
sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[num_points - 1] * taps[num_points - 1]; - - } - -} - -#endif /*LV_HAVE_GENERIC*/ - -#ifdef LV_HAVE_SSE3 - -#include <pmmintrin.h> - -static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); - - unsigned int number = 0; - const unsigned int halfPoints = num_points/2; - - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; - - dotProdVal = _mm_setzero_ps(); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - - a += 2; - b += 2; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; - - _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - - if(num_points % 1 != 0) { - dotProduct += (*a) * (*b); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h deleted file mode 100644 index a998d6184e..0000000000 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h +++ 
/dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H -#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <volk/volk_complex.h> -#include <float.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_storeu_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * (*b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h deleted file mode 100644 index 2755192e96..0000000000 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H -#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H - -#include <inttypes.h> -#include <stdio.h> -#include <volk/volk_complex.h> -#include <float.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! 
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector First vector to be multiplied - \param bVector Second vector that is conjugated before being multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - y = _mm_xor_ps(y, conjugator); // conjugate y - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * lv_conj(*b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector First vector to be multiplied - \param bVector Second vector that is conjugated before being multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h deleted file mode 100644 index 09dcd635b9..0000000000 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H -#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <volk/volk_complex.h> -#include <float.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! 
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector First vector to be multiplied - \param bVector Second vector that is conjugated before being multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - y = _mm_xor_ps(y, conjugator); // conjugate y - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_storeu_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * lv_conj(*b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector First vector to be multiplied - \param bVector Second vector that is conjugated before being multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h deleted file mode 100644 index 8f4123d719..0000000000 --- a/volk/include/volk/volk_32i_s32f_convert_32f_a.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H -#define INCLUDED_volk_32i_s32f_convert_32f_a_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - - /*! 
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_load_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_store_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h deleted file mode 100644 index b3a8ab2015..0000000000 --- a/volk/include/volk/volk_32i_s32f_convert_32f_u.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H -#define INCLUDED_volk_32i_s32f_convert_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - - /*! 
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32i_s32f_convert_32f_u_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */ diff --git a/volk/include/volk/volk_32u_byteswap_u.h b/volk/include/volk/volk_32u_byteswap_u.h deleted file mode 100644 index e27d1f03dd..0000000000 --- a/volk/include/volk/volk_32u_byteswap_u.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_32u_byteswap_u_H -#define INCLUDED_volk_32u_byteswap_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. 
- \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - - uint32_t* inputPtr = intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_loadu_si128((__m128i*)inputPtr); - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - // Store the results - _mm_storeu_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = quarterPoints*4; - for(; number < num_points; number++){ - uint32_t outputVal = *inputPtr; - outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. 
- \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_u_generic(uint32_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = intsToSwap; - - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output = *inputPtr; - output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32u_byteswap_u_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h deleted file mode 100644 index 11d51702bc..0000000000 --- a/volk/include/volk/volk_64f_convert_32f_a.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_volk_64f_convert_32f_a_H -#define INCLUDED_volk_64f_convert_32f_a_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h deleted file mode 100644 index 31dc5b5fe9..0000000000 --- a/volk/include/volk/volk_64f_convert_32f_u.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_volk_64f_convert_32f_u_H -#define INCLUDED_volk_64f_convert_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_u_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64f_convert_32f_u_H */ diff --git a/volk/include/volk/volk_64u_byteswap_u.h b/volk/include/volk/volk_64u_byteswap_u.h deleted file mode 100644 index 41a4a3130f..0000000000 --- a/volk/include/volk/volk_64u_byteswap_u.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef INCLUDED_volk_64u_byteswap_u_H -#define INCLUDED_volk_64u_byteswap_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - uint64_t number = 0; - const unsigned int halfPoints = num_points / 2; - for(;number < halfPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. 
- input = _mm_loadu_si128((__m128i*)inputPtr); - - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - - // Reorder the two words - output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); - - // Store the results - _mm_storeu_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = halfPoints*2; - for(; number < num_points; number++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. 
- \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_u_generic(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64u_byteswap_u_H */ diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h deleted file mode 100644 index 7d7104f52b..0000000000 --- a/volk/include/volk/volk_8i_convert_16i_u.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_volk_8i_convert_16i_u_H -#define INCLUDED_volk_8i_convert_16i_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE4_1 -#include <smmintrin.h> - - /*! 
- \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8i_convert_16i_u_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h deleted file mode 100644 index 8bb2c0d1a4..0000000000 --- a/volk/include/volk/volk_8i_s32f_convert_32f_u.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H -#define INCLUDED_volk_8i_s32f_convert_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE4_1 -#include <smmintrin.h> - - /*! 
- \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1( iScalar ); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] 
= (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8i_s32f_convert_32f_u_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h index 83d9baf89d..690e5f99f6 100644 --- a/volk/include/volk/volk_prefs.h +++ b/volk/include/volk/volk_prefs.h @@ -2,23 +2,26 @@ #define INCLUDED_VOLK_PREFS_H #include <volk/volk_common.h> +#include <stdlib.h> __VOLK_DECL_BEGIN -struct volk_arch_pref { - char name[128]; - char arch[32]; -}; +typedef struct volk_arch_pref +{ + char name[128]; //name of the kernel + char impl_a[128]; //best aligned impl + char impl_u[128]; //best unaligned impl +} volk_arch_pref_t; //////////////////////////////////////////////////////////////////////// // get path to volk_config profiling info //////////////////////////////////////////////////////////////////////// -VOLK_API void get_config_path(char *); +VOLK_API void volk_get_config_path(char *); //////////////////////////////////////////////////////////////////////// // load prefs into global 
prefs struct //////////////////////////////////////////////////////////////////////// -VOLK_API int load_preferences(struct volk_arch_pref **); +VOLK_API size_t volk_load_preferences(volk_arch_pref_t **); __VOLK_DECL_END diff --git a/volk/kernels/README.txt b/volk/kernels/README.txt new file mode 100644 index 0000000000..5dd7434b54 --- /dev/null +++ b/volk/kernels/README.txt @@ -0,0 +1,67 @@ +######################################################################## +# How to create custom kernel dispatchers +######################################################################## +A kernel dispatcher is a kernel implementation that calls other kernel implementations. +By default, a dispatcher is generated by the build system for every kernel such that: + * the best aligned implementation is called when all pointer arguments are aligned, + * and otherwise the best unaligned implementation is called. + +The author of a VOLK kernel may create a custom dispatcher, +to be called in place of the automatically generated one. +A custom dispatcher may be useful to handle head and tail cases, +or to implement different alignment and bounds checking logic. 
+ +######################################################################## +# Code for an example dispatcher w/ tail case +######################################################################## +#include <volk/volk_common.h> + +#ifdef LV_HAVE_DISPATCHER + +static inline void volk_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) +{ + const unsigned int num_points_r = num_points%4; + const unsigned int num_points_x = num_points - num_points_r; + + if (volk_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector)))) + { + volk_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x); + } + else + { + volk_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x); + } + + volk_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r); +} + +#endif //LV_HAVE_DISPATCHER + +######################################################################## +# Code for an example dispatcher w/ tail case and accumulator +######################################################################## +#include <volk/volk_common.h> + +#ifdef LV_HAVE_DISPATCHER + +static inline void volk_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points) +{ + const unsigned int num_points_r = num_points%16; + const unsigned int num_points_x = num_points - num_points_r; + + if (volk_is_aligned(VOLK_OR_PTR(input, taps))) + { + volk_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x); + } + else + { + volk_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x); + } + + float result_tail = 0; + volk_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r); + + *result += result_tail; +} + +#endif //LV_HAVE_DISPATCHER diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h index 1f6554af8b..8bc1569f61 100644 --- 
a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h +++ b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { +static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { static const int N_UNROLL = 4; diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/kernels/volk/volk_16i_branch_4_state_8.h index 6338fbdd17..cdfbc7ba13 100644 --- a/volk/include/volk/volk_16i_branch_4_state_8_a.h +++ b/volk/kernels/volk/volk_16i_branch_4_state_8.h @@ -138,7 +138,7 @@ static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src #endif /*LV_HAVE_SSEs*/ #ifdef LV_HAVE_GENERIC -static inline void volk_16i_branch_4_state_8_a_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { +static inline void volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { int i = 0; int bound = 4; diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/kernels/volk/volk_16i_convert_8i.h index 80608a1412..3789b2e4ab 100644 --- a/volk/include/volk/volk_16i_convert_8i_u.h +++ b/volk/kernels/volk/volk_16i_convert_8i.h @@ -54,7 +54,7 @@ static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_ \param num_points The number of data values to be converted \note Input and output buffers do NOT need to be properly aligned */ -static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ +static inline void volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ int8_t* outputVectorPtr = outputVector; const int16_t* inputVectorPtr = inputVector; unsigned int number = 
0; @@ -69,3 +69,72 @@ static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int #endif /* INCLUDED_volk_16i_convert_8i_u_H */ +#ifndef INCLUDED_volk_16i_convert_8i_a_H +#define INCLUDED_volk_16i_convert_8i_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_convert_8i_a_H */ diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/kernels/volk/volk_16i_max_star_16i.h index edfff8a82b..c67351c5fa 100644 --- a/volk/include/volk/volk_16i_max_star_16i_a.h +++ b/volk/kernels/volk/volk_16i_max_star_16i.h @@ -12,9 +12,9 @@ #include<emmintrin.h> #include<tmmintrin.h> -static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_bytes) { - +static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points) { + const unsigned int num_bytes = num_points*2; short candidate = src0[0]; short cands[8]; @@ -87,7 +87,9 @@ static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, un #ifdef LV_HAVE_GENERIC -static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) { + + const unsigned int num_bytes = num_points*2; int i = 0; diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h index c1c9084256..ef88ec094f 100644 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h +++ b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h @@ -13,7 +13,9 @@ #include<emmintrin.h> 
#include<tmmintrin.h> -static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points) { + + const unsigned int num_bytes = num_points*2; const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; @@ -110,7 +112,9 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in #ifdef LV_HAVE_GENERIC -static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { +static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) { + + const unsigned int num_bytes = num_points*2; int i = 0; diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/kernels/volk/volk_16i_permute_and_scalar_add.h index 47e3cbf9cb..7a01d172a3 100644 --- a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h +++ b/volk/kernels/volk/volk_16i_permute_and_scalar_add.h @@ -13,8 +13,9 @@ #include<xmmintrin.h> #include<emmintrin.h> -static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { +static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) { + const unsigned int num_bytes = num_points*2; __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -117,7 +118,9 @@ static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short #ifdef LV_HAVE_GENERIC -static inline 
void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { +static inline void volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) { + + const unsigned int num_bytes = num_points*2; int i = 0; diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h index 4ce8e8f35b..a810a601a0 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_u.h +++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h @@ -105,7 +105,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const in \param num_points The number of data values to be converted \note Output buffer does NOT need to be properly aligned */ -static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ float* outputVectorPtr = outputVector; const int16_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -120,3 +120,122 @@ static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, cons #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */ +#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H +#define INCLUDED_volk_16i_s32f_convert_32f_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> + + /*! 
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + + /*! 
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h index 0d84985530..56b2cc07ab 100644 --- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h +++ b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h @@ -13,10 +13,9 @@ #include<emmintrin.h> -static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - +static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) { + const unsigned int num_bytes = num_points*2; int i = 0; @@ -168,7 +167,9 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s #ifdef LV_HAVE_GENERIC -static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { +static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) { + + const unsigned int 
num_bytes = num_points*2; int i = 0; diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h index 5560b92d92..9b6d19fd66 100644 --- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h +++ b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h @@ -13,7 +13,9 @@ #include<xmmintrin.h> #include<emmintrin.h> -static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { +static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) { + + const unsigned int num_bytes = num_points*2; __m128i xmm0, xmm1, xmm2, xmm3, xmm4; __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; @@ -113,7 +115,9 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta #ifdef LV_HAVE_GENERIC -static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { +static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) { + + const unsigned int num_bytes = num_points*2; int i = 0; diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h index f8aa30874f..9ce8012640 100644 --- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h +++ b/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h @@ -128,7 +128,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_ \param qBuffer The Q buffer output data \param 
num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; int16_t* qBufferPtr = qBuffer; @@ -149,7 +149,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int \param num_points The number of complex data values to be deinterleaved */ extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_16i_x2_a_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h index bac1f2e4b0..f6eccd77ee 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h +++ b/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h @@ -103,7 +103,7 @@ static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, cons \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int 
num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h b/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h index cd2fabb521..f3d0c83524 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h +++ b/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h @@ -66,7 +66,7 @@ static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; int16_t* complexVectorPtr = (int16_t*)complexVector; int8_t* iBufferPtr = iBuffer; @@ -85,7 +85,7 @@ static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, con \param num_points The number of complex data values to be deinterleaved */ extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_real_8i_a_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/kernels/volk/volk_16ic_magnitude_16i.h index 317075e85e..b33306a123 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a.h +++ b/volk/kernels/volk/volk_16ic_magnitude_16i.h @@ -161,7 +161,7 @@ static inline void volk_16ic_magnitude_16i_a_sse(int16_t* 
magnitudeVector, const \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -182,7 +182,7 @@ static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, c \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); -static inline void volk_16ic_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h index 1300395ff0..55243b4aa8 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h @@ -78,7 +78,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, floa \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* 
qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -100,7 +100,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, \param num_points The number of complex data values to be deinterleaved */ extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_deinterleave_32f_x2_a_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h index 5e2d82b947..57d078a595 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h +++ b/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h @@ -108,7 +108,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* 
complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h index d20eea1a79..27901cb9ac 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h +++ b/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h @@ -149,7 +149,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -171,7 +171,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_magnitude_32f_a_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); } #endif /* 
LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/kernels/volk/volk_16u_byteswap.h index fc3eb5fa7a..57f2008991 100644 --- a/volk/include/volk/volk_16u_byteswap_a.h +++ b/volk/kernels/volk/volk_16u_byteswap.h @@ -1,3 +1,66 @@ +#ifndef INCLUDED_volk_16u_byteswap_u_H +#define INCLUDED_volk_16u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. + input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. 
+ \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_16u_byteswap_u_H */ #ifndef INCLUDED_volk_16u_byteswap_a_H #define INCLUDED_volk_16u_byteswap_a_H @@ -68,7 +131,7 @@ static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned in \param numDataPoints The number of data points */ extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points); -static inline void volk_16u_byteswap_a_orc(uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){ volk_16u_byteswap_a_orc_impl(intsToSwap, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/kernels/volk/volk_32f_accumulator_s32f.h index 78364d0a01..a67d10f9b5 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a.h +++ b/volk/kernels/volk/volk_32f_accumulator_s32f.h @@ -50,7 +50,7 @@ static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* i \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_32f_accumulator_s32f_a_generic(float* result, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){ const float* aPtr = inputBuffer; unsigned int number = 0; float returnValue = 0; diff --git a/volk/kernels/volk/volk_32f_convert_64f.h 
b/volk/kernels/volk/volk_32f_convert_64f.h new file mode 100644 index 0000000000..2f036955dd --- /dev/null +++ b/volk/kernels/volk/volk_32f_convert_64f.h @@ -0,0 +1,140 @@ +#ifndef INCLUDED_volk_32f_convert_64f_u_H +#define INCLUDED_volk_32f_convert_64f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_u_H */ +#ifndef INCLUDED_volk_32f_convert_64f_a_H +#define INCLUDED_volk_32f_convert_64f_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_a_H */ diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/kernels/volk/volk_32f_index_max_16u.h index b9ca1dd3e7..dd1aed2459 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a.h +++ b/volk/kernels/volk/volk_32f_index_max_16u.h @@ -124,7 +124,7 @@ static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const floa #endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const float* src0, unsigned int num_points) { +static inline void volk_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) { if(num_points > 0){ float max = src0[0]; unsigned int index = 0; diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h index 43713f8b5a..71881c2d5f 100644 --- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h +++ b/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h @@ -87,7 +87,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. \param num_points The number of real values in the input vector. 
*/ -static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ +static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ if (num_points < 1) { return; } diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h index db61e359d6..bf05a882d5 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h +++ b/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h @@ -128,7 +128,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 \param noiseFloorAmplitude The noise floor of the input spectrum, in dB */ -static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ float sumMean = 0.0; unsigned int number; // find the sum (for mean), etc diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/kernels/volk/volk_32f_s32f_convert_16i.h index 56e42c9bd5..9fd758655f 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/kernels/volk/volk_32f_s32f_convert_16i.h @@ -127,7 +127,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const \param num_points The number of data values to be converted \note Input buffer does NOT need to be properly aligned */ -static inline 
void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; @@ -150,3 +150,153 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co #endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */ +#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H +#define INCLUDED_volk_32f_s32f_convert_16i_a_H + +#include <volk/volk_common.h> +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + // Scale and clip + ret1 = 
_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + 
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; + + for(number = 0; number < num_points; number++){ + r = *inputVectorPtr++ * scalar; + if(r < min_val) + r = min_val; + else if(r > max_val) + r = max_val; + *outputVectorPtr++ = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/kernels/volk/volk_32f_s32f_convert_32i.h index 38e6b2e745..1a46093ee2 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/kernels/volk/volk_32f_s32f_convert_32i.h @@ -1,3 +1,145 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H +#define INCLUDED_volk_32f_s32f_convert_32i_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = -2147483647; + float max_val = 2147483647; + float r; + + for(number = 0; number < num_points; number++){ + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)(r); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */ #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H #define INCLUDED_volk_32f_s32f_convert_32i_a_H diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/kernels/volk/volk_32f_s32f_convert_8i.h index 870e9419bb..b451505221 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/kernels/volk/volk_32f_s32f_convert_8i.h @@ -132,7 +132,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl \param num_points The number of data values to be converted \note Input buffer does NOT need to be properly aligned */ -static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = 
inputVector; unsigned int number = 0; @@ -155,3 +155,158 @@ static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, cons #endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */ +#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H +#define INCLUDED_volk_32f_s32f_convert_8i_a_H + +#include <volk/volk_common.h> +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), 
vmin_val); + inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + + float min_val = -128; + float max_val = 127; + float r; + + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), 
vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = -128; + float max_val = 127; + float r; + + for(number = 0; number < num_points; number++){ + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int8_t)(r); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h index 99b8e68c5b..2dd86a17c2 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h +++ b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h @@ -1,3 +1,105 @@ +#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H +#define INCLUDED_volk_32f_s32f_multiply_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE 
+#include <xmmintrin.h> +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + for(number = 0; number < num_points; number++){ + *outputPtr = (*inputPtr) * scalar; + inputPtr++; + outputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */ #ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H #define INCLUDED_volk_32f_s32f_multiply_32f_a_H @@ -108,7 +210,7 @@ static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, const fl \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_s32f_multiply_32f_a_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_normalize_a.h b/volk/kernels/volk/volk_32f_s32f_normalize.h index f5fd0d1dba..a0bd33c7dc 100644 --- a/volk/include/volk/volk_32f_s32f_normalize_a.h +++ b/volk/kernels/volk/volk_32f_s32f_normalize.h @@ -49,7 +49,7 @@ static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float s \param bVector One of the vectors to be normalizeed \param num_points The number of values in 
aVector and bVector to be normalizeed together and stored into cVector */ -static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){ unsigned int number = 0; float* inputPtr = vecBuffer; const float invScalar = 1.0 / scalar; @@ -69,7 +69,7 @@ static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const flo \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector */ extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_s32f_normalize_a_orc(float* vecBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){ float invscalar = 1.0 / scalar; volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); } diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/kernels/volk/volk_32f_s32f_power_32f.h index 633ad14b09..2822444686 100644 --- a/volk/include/volk/volk_32f_s32f_power_32f_a.h +++ b/volk/kernels/volk/volk_32f_s32f_power_32f.h @@ -127,7 +127,7 @@ static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aV \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32f_s32f_power_32f_a_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ +static inline void volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; unsigned int number = 0; diff --git 
a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/kernels/volk/volk_32f_s32f_stddev_32f.h index 98401b2d42..0622b278a6 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h +++ b/volk/kernels/volk/volk_32f_s32f_stddev_32f.h @@ -120,7 +120,7 @@ static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* in \param mean The mean of the input buffer \param num_points The number of values in input buffer to used in the stddev calculation */ -static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ +static inline void volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ float returnValue = 0; if(num_points > 0){ const float* aPtr = inputBuffer; diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/kernels/volk/volk_32f_sqrt_32f.h index d9b16fc0fb..ab9fffd7dc 100644 --- a/volk/include/volk/volk_32f_sqrt_32f_a.h +++ b/volk/kernels/volk/volk_32f_sqrt_32f.h @@ -47,7 +47,7 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, \param aVector One of the vectors to be sqrted \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector */ -static inline void volk_32f_sqrt_32f_a_generic(float* cVector, const float* aVector, unsigned int num_points){ +static inline void volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; unsigned int number = 0; @@ -66,7 +66,7 @@ extern void volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int); \param aVector One of the vectors to be sqrted \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector */ -static inline void volk_32f_sqrt_32f_a_orc(float* cVector, const float* aVector, unsigned int num_points){ +static inline void 
volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points){ volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points); } diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h index 7de32f7b18..9bded6713d 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h +++ b/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h @@ -143,7 +143,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m \param inputBuffer The buffer of points to calculate the std deviation for \param num_points The number of values in input buffer to used in the stddev and mean calculations */ -static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ float returnValue = 0; float newMean = 0; if(num_points > 0){ diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/kernels/volk/volk_32f_x2_add_32f.h index 51e63e54d2..42278f6068 100644 --- a/volk/include/volk/volk_32f_x2_add_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_add_32f.h @@ -1,3 +1,69 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_u_H +#define INCLUDED_volk_32f_x2_add_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! 
+ \brief Adds the two input vectors and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Adds the two input vectors and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */ #ifndef INCLUDED_volk_32f_x2_add_32f_a_H #define INCLUDED_volk_32f_x2_add_32f_a_H @@ -72,7 +138,7 @@ static inline void volk_32f_x2_add_32f_a_generic(float* cVector, const float* aV \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_add_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/kernels/volk/volk_32f_x2_divide_32f.h index 7b60fb22ef..d5a7c7d7c0 100644 --- a/volk/include/volk/volk_32f_x2_divide_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_divide_32f.h @@ -51,7 +51,7 @@ static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVe \param bVector The divisor vector \param num_points The number of values in aVector and bVector to be 
divideed together and stored into cVector */ -static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -72,7 +72,7 @@ static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float* \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector */ extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_divide_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h index 961c2418ca..8fcc7deaed 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h +++ b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { float dotProduct = 0; const float* aPtr = input; diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h new file mode 100644 index 0000000000..b91252e36f --- /dev/null +++ b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h @@ -0,0 +1,580 @@ +#ifndef 
INCLUDED_volk_32f_x2_dot_prod_32f_u_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H + +#include <volk/volk_common.h> +#include<stdio.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = 
_mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_SSE3 + +#include <pmmintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + 
__VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_SSE4_1 + +#include <smmintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; + + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; 
+ + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#ifdef LV_HAVE_AVX + +#include <immintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val; + __m256 b0Val, b1Val; + __m256 c0Val, c1Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr+8); + b0Val = _mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr+8); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_AVX*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/ +#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H + +#include <volk/volk_common.h> +#include<stdio.h> + + +#ifdef 
LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into 
the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_SSE3 + +#include <pmmintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = 
dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_SSE4_1 + +#include <smmintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif 
/*LV_HAVE_SSE4_1*/ + +#ifdef LV_HAVE_AVX + +#include <immintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val; + __m256 b0Val, b1Val; + __m256 c0Val, c1Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr+8); + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr+8); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_AVX*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h index 52d80b6bb3..0935cb32bd 100644 --- a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h +++ b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h @@ -56,7 +56,7 @@ static inline void 
volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, c \param complexVector The complex output vector \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_interleave_32fc_a_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ +static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ float* complexVectorPtr = (float*)complexVector; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/kernels/volk/volk_32f_x2_max_32f.h index 79f2d04b56..27633acae8 100644 --- a/volk/include/volk/volk_32f_x2_max_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_max_32f.h @@ -53,7 +53,7 @@ static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVecto \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_max_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -76,7 +76,7 @@ static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aV \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_max_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, const float* bVector, 
unsigned int num_points){ volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/kernels/volk/volk_32f_x2_min_32f.h index 42cac08339..4773d13211 100644 --- a/volk/include/volk/volk_32f_x2_min_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_min_32f.h @@ -53,7 +53,7 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVecto \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_min_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -76,7 +76,7 @@ static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aV \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_min_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/kernels/volk/volk_32f_x2_multiply_32f.h index 340e051657..9fdbec0a2c 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_multiply_32f.h @@ -1,3 +1,109 @@ +#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H +#define INCLUDED_volk_32f_x2_multiply_32f_u_H + 
+#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! + \brief Multiplies the two input vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! 
+ \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */ #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H #define INCLUDED_volk_32f_x2_multiply_32f_a_H @@ -111,7 +217,7 @@ static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, const floa \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_multiply_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h index 10fc267dcd..ce7b91a318 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h +++ b/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h @@ -137,7 +137,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* 
complexVect \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_s32f_interleave_16ic_a_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ int16_t* complexVectorPtr = (int16_t*)complexVector; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/kernels/volk/volk_32f_x2_subtract_32f.h index e2b8be797f..8ea491f988 100644 --- a/volk/include/volk/volk_32f_x2_subtract_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_subtract_32f.h @@ -51,7 +51,7 @@ static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* a \param bVector The vector to be subtracted \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector */ -static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -72,7 +72,7 @@ static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const floa \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector */ extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_subtract_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static 
inline void volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h index 3c530628c8..e975f14e92 100644 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h +++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h @@ -13,8 +13,9 @@ #include<xmmintrin.h> #include<pmmintrin.h> -static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { +static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) { + const unsigned int num_bytes = num_points*4; float result = 0.0; float fst = 0.0; @@ -100,9 +101,9 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - +static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) { + const unsigned int num_bytes = num_points*4; float result = 0.0; float fst = 0.0; diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h index 109b787e8c..e0a8a59ced 100644 --- a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) { +static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* 
input, const float * taps, unsigned int num_points) { float res[2]; float *realpt = &res[0], *imagpt = &res[1]; diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h index 28d584bf2c..104e3250e6 100644 --- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h @@ -64,7 +64,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const l \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; const float* bPtr= bVector; @@ -85,7 +85,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, con \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32fc_32f_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/kernels/volk/volk_32fc_conjugate_32fc.h index 919280d510..dce897ff57 100644 
--- a/volk/include/volk/volk_32fc_conjugate_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h @@ -1,3 +1,67 @@ +#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_conjugate_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_storeu_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. 
+ \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */ #ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H #define INCLUDED_volk_32fc_conjugate_32fc_a_H diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h index 4106f38513..0d33ed7e28 100644 --- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h @@ -57,7 +57,7 @@ static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qB \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; diff --git a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h new file mode 100644 index 0000000000..4a4c5509bd --- /dev/null +++ b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h @@ -0,0 +1,156 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_u_H +#define INCLUDED_volk_32fc_deinterleave_64f_x2_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef 
LV_HAVE_SSE2 +#include <emmintrin.h> +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_storeu_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_storeu_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_u_H */ +#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H +#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> +/*! 
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h index c88809bebd..b1968296f5 100644 --- a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h @@ -51,7 +51,7 @@ static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const l \param qBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_imag_32f_a_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; float* qBufferPtr = qBuffer; diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h index 0d6c6b7af4..3d57598135 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h @@ -51,7 +51,7 @@ static 
inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const l \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_32f_a_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; float* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h index 1e346bacaf..1fa66e8add 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h @@ -49,7 +49,7 @@ static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_64f_a_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; double* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/kernels/volk/volk_32fc_index_max_16u.h index 842a6a0420..c8d7212401 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a.h +++ b/volk/kernels/volk/volk_32fc_index_max_16u.h @@ -11,9 +11,9 @@ #include<pmmintrin.h> -static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - +static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) { + const unsigned int num_bytes = 
num_points*8; union bit128 holderf; union bit128 holderi; @@ -189,7 +189,10 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_ #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { +static inline void volk_32fc_index_max_16u_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; + float sq_dist = 0.0; float max = 0.0; unsigned int index = 0; diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h index efb84a904b..64e99cc1be 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_a.h +++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h @@ -1,3 +1,121 @@ +#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H +#define INCLUDED_volk_32fc_magnitude_32f_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H #define INCLUDED_volk_32fc_magnitude_32f_a_H @@ -123,7 +241,7 @@ static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, con \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); -static inline void volk_32fc_magnitude_32f_a_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h new file mode 100644 index 0000000000..0af81401a8 --- /dev/null +++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h @@ -0,0 +1,228 @@ +#ifndef 
INCLUDED_volk_32fc_magnitude_squared_32f_u_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector + */ +static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_u_H */ +#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector (SSE3, aligned loads and stores) + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square I and Q of the first two points + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square I and Q of the next two points + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add adjacent lanes: I2 + Q2 for each of the four points + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector (SSE, aligned loads and stores) + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h index d86bd63c1c..b076ab44ef 100644 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -139,7 +139,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv \param normalizeFactor The atan2 results will be divided by this normalization factor. \param num_points The number of complex values in the input vector. 
*/ -static inline void volk_32fc_s32f_atan2_32f_a_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ float* outPtr = outputVector; const float* inPtr = (float*)inputVector; const float invNormalizeFactor = 1.0 / normalizeFactor; diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h index 1c17fb70c6..9e10217a0f 100644 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h @@ -63,7 +63,7 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_s32f_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; int16_t* iBufferPtr = iBuffer; unsigned int number = 0; diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h b/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h index 38fd609d31..09abd967d6 100644 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h @@ -129,7 +129,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void 
volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -150,7 +150,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVect \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ extern void volk_32fc_s32f_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_32fc_s32f_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ volk_32fc_s32f_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/kernels/volk/volk_32fc_s32f_power_32fc.h index 3106edbefd..d4a1d17469 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_power_32fc.h @@ -94,7 +94,7 @@ static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_ \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32fc_s32f_power_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ +static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const 
lv_32fc_t* aVector, const float power, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; unsigned int number = 0; diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h index 30a77dbc18..f76d9d35e4 100644 --- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h @@ -96,7 +96,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu \param normalizationFactor This value is divided agains all the input values before the power is calculated \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_power_spectrum_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ // Calculate the Power of the complex point const float* inputPtr = (float*)complexFFTInput; float* realFFTDataPointsPtr = logPowerOutput; diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h index 27f755351d..e73eb09f8f 100644 --- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h @@ -103,7 +103,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo \param rbw The resolution bandwith of the fft spectrum \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ +static inline void 
volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ // Calculate the Power of the complex point const float* inputPtr = (float*)complexFFTInput; float* realFFTDataPointsPtr = logPowerOutput; diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h index f206c5e874..668a047609 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h @@ -1,3 +1,90 @@ +#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H +#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> +/*! + \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = 
_mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplies the input vector by a complex scalar and stores the results in cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector +*/ +static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unrolled loop: eight points per iteration + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up the remaining (fewer than eight) points + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */ #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H diff --git a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h index eee9f0064f..ab6b7fb1df 100644 --- a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h @@ -4,7 +4,7 @@ #include <volk/volk_complex.h> #include <stdio.h> -#include <volk/volk_32fc_s32fc_x2_rotator_32fc_a.h> +#include <volk/volk_32fc_s32fc_x2_rotator_32fc.h> #ifdef 
LV_HAVE_GENERIC @@ -19,9 +19,9 @@ */ -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; - volk_32fc_s32fc_x2_rotator_32fc_a_generic(outVector, inVector, phase_inc, phase, num_points); + volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc, phase, num_points); } #endif /* LV_HAVE_GENERIC */ @@ -32,7 +32,7 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVe static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; - volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, phase, num_points); + volk_32fc_s32fc_x2_rotator_32fc_sse4_1(outVector, inVector, phase_inc, phase, num_points); } @@ -58,7 +58,7 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; - volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc, phase, num_points); + volk_32fc_s32fc_x2_rotator_32fc_avx(outVector, inVector, phase_inc, phase, num_points); } diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h index 51b6041ec0..ffbbdff690 100644 --- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h @@ -20,7 +20,7 @@ */ -static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* 
outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ unsigned int i = 0; int j = 0; for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) { @@ -42,7 +42,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVecto #ifdef LV_HAVE_SSE4_1 #include <smmintrin.h> -static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; @@ -153,7 +153,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector -static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h index e3dedf2fcd..e6ccf5c384 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h @@ -1,3 +1,152 @@ +#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H +#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H + + +#include<volk/volk_complex.h> + + +#ifdef 
LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + unsigned int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE3 + +#include <xmmintrin.h> +#include <pmmintrin.h> +#include <mmintrin.h> + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int num_bytes = num_points*8; + + // Variable never used? 
+ //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + union HalfMask { + uint32_t intRep[4]; + __m128 vec; + } halfMask; + + union NegMask { + int intRep[4]; + __m128 vec; + } negMask; + + unsigned int offset = 0; + float Rsum=0, Isum=0; + float Im,Re; + + __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; + __m128 zv = {0,0,0,0}; + + halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; + halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; + + negMask.intRep[0] = negMask.intRep[2] = 0x80000000; + negMask.intRep[1] = negMask.intRep[3] = 0; + + // main loop + while(num_bytes >= 4*sizeof(float)){ + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = _mm_mul_ps(in1, in2); + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = _mm_mul_ps(in1, fehg); + Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + num_bytes -= 4*sizeof(float); + offset += 2; + Rsum += Re; + Isum += Im; + } + + // handle the last complex case ... 
+ if(num_bytes > 0){ + /* Odd num_points: exactly one complex value (two floats, 8 bytes) is left over. */ + if(num_bytes != 2*sizeof(float)){ + // bad things are happening + } + /* Load only the 8 valid bytes into the low half (upper lanes are taken from zv, i.e. zero); a full 16-byte load would read past the end of input and taps. */ + in1 = _mm_loadl_pi( zv, (const __m64*) (input+offset) ); + in2 = _mm_loadl_pi( zv, (const __m64*) (taps+offset) ); + Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec); + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec); + Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + Rsum += Re; + Isum += Im; + } + + result[0] = lv_cmake(Rsum,Isum); + return; +} + +#endif /*LV_HAVE_SSE3*/ + + +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ + + + #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H @@ -9,7 +158,9 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; float * res = (float*) result; float * in = (float*) input; @@ -63,7 +214,9 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res #if LV_HAVE_SSE && LV_HAVE_64 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; @@ -204,7 +357,9 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, 
#endif #if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index caef3e6f0d..066bed4439 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -1,3 +1,119 @@ +#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H +#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H + +#include <volk/volk_common.h> +#include <volk/volk_complex.h> +#include <stdio.h> +#include <string.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_points/2; + unsigned int isodd = num_points &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + unsigned int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[num_points - 1] * taps[num_points - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE3 + +#include <pmmintrin.h> + +static inline void 
volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + /* Complex dot product: *result = sum of input[i]*taps[i] over num_points complex samples. input and taps are read with unaligned loads (_mm_loadu_ps). */ + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_points/2; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + + _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + /* The loop above consumes two complex samples per iteration; when num_points is odd one sample is left over and is accumulated here. */ + if(num_points % 2 != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H @@ -10,7 +126,9 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + const unsigned int 
num_bytes = num_points*8; float * res = (float*) result; float * in = (float*) input; @@ -46,8 +164,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const #if LV_HAVE_SSE && LV_HAVE_64 -static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + const unsigned int num_bytes = num_points*8; asm ( @@ -175,11 +294,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const #if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_bytes); +static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); #if 0 + const unsigned int num_bytes = num_points*8; asm volatile ( " #pushl %%ebp\n\t" @@ -299,8 +418,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const #include <pmmintrin.h> -static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + const unsigned int num_bytes = num_points*8; lv_32fc_t dotProduct; memset(&dotProduct, 0x0, 2*sizeof(float)); @@ -356,7 +476,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv #include <smmintrin.h> -static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* 
taps, unsigned int num_bytes) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; float *p_input, *p_taps; diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h index f79ddb59bf..7db68c1bd8 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h @@ -1,3 +1,80 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = 
_mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors element by element and stores the products in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H @@ -81,7 +158,7 @@ static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, cons \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); -static inline void volk_32fc_x2_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* 
aVector, const lv_32fc_t* bVector, unsigned int num_points){ volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h new file mode 100644 index 0000000000..cfd6c007f1 --- /dev/null +++ b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h @@ -0,0 +1,162 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + y = _mm_xor_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // 
Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies vector a element by element by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! 
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + \note aVector, bVector and cVector are accessed with aligned loads/stores (_mm_load_ps / _mm_store_ps) + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + y = _mm_xor_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h index 75eb9173d5..cb2e945015 100644 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h +++ b/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h @@ -10,8 +10,9 @@ #include<xmmintrin.h> #include<pmmintrin.h> -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) { + const unsigned int num_bytes = num_points*8; __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; @@ -106,7 +107,10 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, 
float scalar, unsigned int num_bytes) { +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; + lv_32fc_t diff; float sq_dist; unsigned int i = 0; diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h index b819eaffd4..27a081b7cf 100644 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h +++ b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h @@ -9,8 +9,9 @@ #include<xmmintrin.h> #include<pmmintrin.h> -static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { +static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { + const unsigned int num_bytes = num_points*8; __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -92,7 +93,10 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { +static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; + lv_32fc_t diff; float sq_dist; unsigned int i = 0; diff --git a/volk/kernels/volk/volk_32i_s32f_convert_32f.h b/volk/kernels/volk/volk_32i_s32f_convert_32f.h new file mode 100644 index 0000000000..7a09883453 --- /dev/null +++ b/volk/kernels/volk/volk_32i_s32f_convert_32f.h @@ -0,0 +1,148 @@ +#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H +#define INCLUDED_volk_32i_s32f_convert_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + + /*! 
+ \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */ +#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H +#define INCLUDED_volk_32i_s32f_convert_32f_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + + /*! 
+ \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_store_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/kernels/volk/volk_32i_x2_and_32i.h index e5330847b3..54ecb79812 100644 --- a/volk/include/volk/volk_32i_x2_and_32i_a.h +++ b/volk/kernels/volk/volk_32i_x2_and_32i.h @@ -51,7 +51,7 @@ static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aV \param bVector One of the vectors \param num_points The number of values in aVector and bVector to be anded together and stored into cVector */ -static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ int32_t* cPtr = cVector; const int32_t* aPtr = aVector; const int32_t* bPtr= bVector; @@ -72,7 +72,7 @@ static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t \param num_points The number of values in aVector and bVector to be anded together and stored into cVector */ extern void 
volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_and_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/kernels/volk/volk_32i_x2_or_32i.h index 24045894c6..acadd5a57f 100644 --- a/volk/include/volk/volk_32i_x2_or_32i_a.h +++ b/volk/kernels/volk/volk_32i_x2_or_32i.h @@ -51,7 +51,7 @@ static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVe \param bVector One of the vectors to be ored \param num_points The number of values in aVector and bVector to be ored together and stored into cVector */ -static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ int32_t* cPtr = cVector; const int32_t* aPtr = aVector; const int32_t* bPtr= bVector; @@ -72,7 +72,7 @@ static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t* \param num_points The number of values in aVector and bVector to be ored together and stored into cVector */ extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_or_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ volk_32i_x2_or_32i_a_orc_impl(cVector, 
aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/kernels/volk/volk_32u_byteswap.h index 71ae027d37..8f6e3ad7b5 100644 --- a/volk/include/volk/volk_32u_byteswap_a.h +++ b/volk/kernels/volk/volk_32u_byteswap.h @@ -1,3 +1,80 @@ +#ifndef INCLUDED_volk_32u_byteswap_u_H +#define INCLUDED_volk_32u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) a vector of uint32_t's; the buffer does not need to be aligned (unaligned loads and stores are used). + \param intsToSwap The vector of data to byte swap + \param num_points The number of data points +*/ +static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints*4; + for(; number < num_points; number++){ + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) a vector of uint32_t's, one element at a time. + \param intsToSwap The vector of data to byte swap + \param num_points The number of data points +*/ +static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32u_byteswap_u_H */ #ifndef INCLUDED_volk_32u_byteswap_a_H #define INCLUDED_volk_32u_byteswap_a_H diff --git a/volk/include/volk/volk_32u_popcnt_a.h b/volk/kernels/volk/volk_32u_popcnt.h index b72d605c67..9783569729 100644 --- a/volk/include/volk/volk_32u_popcnt_a.h +++ b/volk/kernels/volk/volk_32u_popcnt.h @@ -7,7 +7,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32u_popcnt_a_generic(uint32_t* ret, const uint32_t value) { 
+static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) { // This is faster than a lookup table uint32_t retVal = value; diff --git a/volk/kernels/volk/volk_64f_convert_32f.h b/volk/kernels/volk/volk_64f_convert_32f.h new file mode 100644 index 0000000000..c27526ffaf --- /dev/null +++ b/volk/kernels/volk/volk_64f_convert_32f.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_volk_64f_convert_32f_u_H +#define INCLUDED_volk_64f_convert_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Converts the double values into float values + \param outputVector The converted float vector values + \param inputVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + \note Both buffers are accessed with unaligned loads/stores (_mm_loadu_pd / _mm_storeu_ps) + */ +static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC 
+/*! + \brief Converts the double values into float values + \param outputVector The converted float vector values + \param inputVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_u_H */ +#ifndef INCLUDED_volk_64f_convert_32f_a_H +#define INCLUDED_volk_64f_convert_32f_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Converts the double values into float values + \param outputVector The converted float vector values + \param inputVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + \note Both buffers are accessed with aligned loads/stores (_mm_load_pd / _mm_store_ps) + */ +static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC 
+/*! + \brief Converts the double values into float values + \param outputVector The converted float vector values + \param inputVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/kernels/volk/volk_64f_x2_max_64f.h index 33aae6d102..f9a04c2c40 100644 --- a/volk/include/volk/volk_64f_x2_max_64f_a.h +++ b/volk/kernels/volk/volk_64f_x2_max_64f.h @@ -53,7 +53,7 @@ static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVe \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_64f_x2_max_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ double* cPtr = cVector; const double* aPtr = aVector; const double* bPtr= bVector; diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/kernels/volk/volk_64f_x2_min_64f.h index 25d8b4c982..c77ca87fbd 100644 --- a/volk/include/volk/volk_64f_x2_min_64f_a.h +++ b/volk/kernels/volk/volk_64f_x2_min_64f.h @@ -53,7 +53,7 @@ static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVe \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline 
void volk_64f_x2_min_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ double* cPtr = cVector; const double* aPtr = aVector; const double* bPtr= bVector; diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/kernels/volk/volk_64u_byteswap.h index 3d1d87623e..e05daf6d5c 100644 --- a/volk/include/volk/volk_64u_byteswap_a.h +++ b/volk/kernels/volk/volk_64u_byteswap.h @@ -1,3 +1,91 @@ +#ifndef INCLUDED_volk_64u_byteswap_u_H +#define INCLUDED_volk_64u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) a vector of uint64_t's; the data need not be aligned. + \param intsToSwap The vector of data to byte swap + \param num_points The number of data points +*/ +static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + uint64_t number = 0; + const unsigned int halfPoints = num_points / 2; + for(;number < halfPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_loadu_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = halfPoints*2; + for(; number < num_points; number++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. 
+ \param intsToSwap The vector of data to byte swap + \param num_points The number of data points +*/ +static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64u_byteswap_u_H */ #ifndef INCLUDED_volk_64u_byteswap_a_H #define INCLUDED_volk_64u_byteswap_a_H diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/kernels/volk/volk_64u_popcnt.h index 5e68ed2083..466cfa5dad 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/kernels/volk/volk_64u_popcnt.h @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) { +static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) { //const uint32_t* valueVector = (const uint32_t*)&value; diff --git a/volk/include/volk/volk_8i_convert_16i_a.h b/volk/kernels/volk/volk_8i_convert_16i.h index 9104f90cb0..3e5c92723f 100644 --- a/volk/include/volk/volk_8i_convert_16i_a.h +++ b/volk/kernels/volk/volk_8i_convert_16i.h @@ -1,3 +1,76 @@ +#ifndef INCLUDED_volk_8i_convert_16i_u_H +#define INCLUDED_volk_8i_convert_16i_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> + + /*! 
+ \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8i_convert_16i_u_H */ #ifndef INCLUDED_volk_8i_convert_16i_a_H #define INCLUDED_volk_8i_convert_16i_a_H @@ -73,7 +146,7 @@ static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const in \param num_points The number of data values to be converted */ extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); -static inline void volk_8i_convert_16i_a_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ +static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/kernels/volk/volk_8i_s32f_convert_32f.h index 02a7f356e0..bd7ff82d9a 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a.h +++ b/volk/kernels/volk/volk_8i_s32f_convert_32f.h @@ -1,3 +1,97 @@ +#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H +#define INCLUDED_volk_8i_s32f_convert_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> + + /*! 
+ \brief Converts the input 8 bit integer data into floating point data, and divides each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1( iScalar ); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] 
= (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */ #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H #define INCLUDED_volk_8i_s32f_convert_32f_a_H @@ -95,7 +189,7 @@ static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, const \param num_points The number of data values to be converted */ extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); -static inline void volk_8i_s32f_convert_32f_a_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ float invscalar = 1.0 / scalar; volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points); } diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h 
b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h index 8f13da32ff..b59d22d186 100644 --- a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h +++ b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h @@ -59,7 +59,7 @@ static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16 \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ const int8_t* complexVectorPtr = (const int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; int16_t* qBufferPtr = qBuffer; diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h index d26b3d0d0d..82cedb2bb7 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h +++ b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h @@ -49,7 +49,7 @@ static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, con \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (const int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h index 21efed83e7..c8ff18e67b 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h +++ b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h @@ -50,7 +50,7 @@ 
static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int8_t* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h index d82da59fb1..9e244c8fc2 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h @@ -146,7 +146,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ const int8_t* complexVectorPtr = (const int8_t*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h index b2c15d3a30..56a1adcbb5 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h +++ b/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h @@ -116,7 +116,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con \param scalar The scaling value being 
multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (const int8_t*)complexVector; float* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h index f85fdb9995..685a21ddcd 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h +++ b/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h @@ -75,7 +75,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect \param bVector The complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ unsigned int number = 0; int16_t* c16Ptr = (int16_t*)cVector; int8_t* a8Ptr = (int8_t*)aVector; diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h index 4b16171cec..edb52ff509 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h +++ b/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h @@ -95,7 +95,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* \param bVector The 
complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ unsigned int number = 0; float* cPtr = (float*)cVector; const float invScalar = 1.0 / scalar; diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt index 79655f1bd2..68fadc35b0 100644 --- a/volk/lib/CMakeLists.txt +++ b/volk/lib/CMakeLists.txt @@ -202,7 +202,7 @@ message(STATUS "Available machines: ${available_machines}") #dependencies are all python, xml, and header implementation files file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml) file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py) -file(GLOB h_files ${CMAKE_SOURCE_DIR}/include/volk/*.h) +file(GLOB h_files ${CMAKE_SOURCE_DIR}/kernels/volk/*.h) macro(gen_template tmpl output) list(APPEND volk_gen_sources ${output}) @@ -253,6 +253,7 @@ endforeach(machine_name) include_directories( ${CMAKE_BINARY_DIR}/include ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/kernels ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 4e361aece2..e526eb2d01 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -63,12 +63,12 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) { } } -static std::vector<std::string> get_arch_list(struct volk_func_desc desc) { +static std::vector<std::string> get_arch_list(volk_func_desc_t desc) { std::vector<std::string> archlist; - for(int i = 0; i < desc.n_archs; i++) { + for(size_t i = 0; i < desc.n_impls; i++) { //if(!(archs[i+1] & 
volk_get_lvarch())) continue; //this arch isn't available on this pc - archlist.push_back(std::string(desc.indices[i])); + archlist.push_back(std::string(desc.impl_names[i])); } return archlist; @@ -256,7 +256,7 @@ public: private: std::list<std::vector<char> > _mems; }; -bool run_volk_tests(struct volk_func_desc desc, +bool run_volk_tests(volk_func_desc_t desc, void (*manual_func)(), std::string name, float tol, @@ -442,22 +442,32 @@ bool run_volk_tests(struct volk_func_desc desc, arch_results.push_back(!fail); } - double best_time = std::numeric_limits<double>::max(); - std::string best_arch = "generic"; - for(size_t i=0; i < arch_list.size(); i++) { - if((profile_times[i] < best_time) && arch_results[i]) { - best_time = profile_times[i]; - best_arch = arch_list[i]; + double best_time_a = std::numeric_limits<double>::max(); + double best_time_u = std::numeric_limits<double>::max(); + std::string best_arch_a = "generic"; + std::string best_arch_u = "generic"; + for(size_t i=0; i < arch_list.size(); i++) + { + if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) + { + best_time_u = profile_times[i]; + best_arch_u = arch_list[i]; + } + if((profile_times[i] < best_time_a) && arch_results[i]) + { + best_time_a = profile_times[i]; + best_arch_a = arch_list[i]; } } - std::cout << "Best arch: " << best_arch << std::endl; + std::cout << "Best aligned arch: " << best_arch_a << std::endl; + std::cout << "Best unaligned arch: " << best_arch_u << std::endl; if(best_arch_vector) { if(puppet_master_name == "NULL") { - best_arch_vector->push_back(name + std::string(" ") + best_arch); + best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u); } else { - best_arch_vector->push_back(puppet_master_name + std::string(" ") + best_arch); + best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u); } } diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h index 1e639ac3c6..0f17cdaa34 100644 --- 
a/volk/lib/qa_utils.h +++ b/volk/lib/qa_utils.h @@ -21,7 +21,7 @@ volk_type_t volk_type_from_string(std::string); float uniform(void); void random_floats(float *buf, unsigned n); -bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string); +bool run_volk_tests(volk_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string); #define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); } diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 2e41c25daf..f133897cba 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -2,107 +2,89 @@ #include <volk/volk.h> #include <boost/test/unit_test.hpp> -//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000); -//VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a, 1, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 20460, 1); -VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 20460, 1); -VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 20460, 1); -VOLK_RUN_TESTS(volk_16i_convert_8i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1); -//VOLK_RUN_TESTS(volk_16i_max_star_16i_a, 0, 0, 20460, 10000); -//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a, 0, 0, 20460, 10000); -//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 
1000); -//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000); -VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16u_byteswap_u, 0, 0, 20460, 1); -//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1); -VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 20460, 1); -//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); -VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1); -VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 20460, 1); -VOLK_RUN_TESTS(volk_32f_convert_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a, 1, 
128, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 20460, 1); -//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 2046, 1); -VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1); -//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32767, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 1); -VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32i_x2_and_32i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_32i_x2_or_32i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32u_byteswap_a, 0, 0, 20460, 1); -//VOLK_RUN_TESTS(volk_32u_popcnt_a, 0, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_64f_convert_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 20460, 1); 
-VOLK_RUN_TESTS(volk_64f_x2_max_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_64f_x2_min_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_64u_byteswap_a, 0, 0, 20460, 1); -//VOLK_RUN_TESTS(volk_64u_popcnt_a, 0, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a, 0, 256, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_conjugate_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_conjugate_32fc_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1); +//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000); 
+//VOLK_RUN_TESTS(volk_16i_branch_4_state_8, 1e-4, 2046, 10000); +VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_magnitude_16i, 1, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16i_convert_8i, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_16i_max_star_16i, 0, 0, 20460, 10000); +//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i, 0, 0, 20460, 10000); +//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 1000); +//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 1000); +VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20460, 1); +//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 20460, 1); 
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 2046000, 1); +VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32fc_index_max_16u, 3, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_magnitude_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_16i, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_32i, 1, 2<<31, 20460, 1); +VOLK_RUN_TESTS(volk_32f_convert_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_8i, 1, 128, 20460, 1); +//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000); +VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 2046, 1); +VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_divide_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 1); +//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000); +VOLK_RUN_TESTS(volk_32f_index_max_16u, 3, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic, 1, 32767, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_max_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_min_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_normalize, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_power_32f, 1e-4, 4, 20460, 1); +VOLK_RUN_TESTS(volk_32f_sqrt_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_subtract_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20460, 1); 
+VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8i_convert_16i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8i_s32f_convert_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_conjugate_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_multiply_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1); diff --git a/volk/lib/volk_prefs.c b/volk/lib/volk_prefs.c index 5e5c9dfff7..f787b5e2aa 100644 --- a/volk/lib/volk_prefs.c +++ b/volk/lib/volk_prefs.c @@ -7,7 +7,8 @@ //#include <Windows.h> //#endif -void get_config_path(char *path) { +void volk_get_config_path(char *path) +{ const char *suffix = "/.volk/volk_config"; char *home = NULL; if (home == NULL) home = getenv("HOME"); @@ 
-20,38 +21,30 @@ void get_config_path(char *path) { strcat(path, suffix); } -//passing by reference in C can (***********) -int load_preferences(struct volk_arch_pref **prefs) { +size_t volk_load_preferences(volk_arch_pref_t **prefs_res) +{ FILE *config_file; - char path[512], line[512], function[128], arch[32]; - int n_arch_prefs = 0; - struct volk_arch_pref *t_pref; + char path[512], line[512]; + size_t n_arch_prefs = 0; + volk_arch_pref_t *prefs = NULL; //get the config path - get_config_path(path); + volk_get_config_path(path); if (path == NULL) return n_arch_prefs; //no prefs found config_file = fopen(path, "r"); if(!config_file) return n_arch_prefs; //no prefs found - while(fgets(line, 512, config_file) != NULL) { - if(sscanf(line, "%s %s", function, arch) == 2 && !strncmp(function, "volk_", 5)) { - n_arch_prefs++; - } - } - - //now allocate the memory required for volk_arch_prefs - (*prefs) = (struct volk_arch_pref *) malloc(n_arch_prefs * sizeof(struct volk_arch_pref)); - t_pref = (*prefs); - //reset the file pointer and write the prefs into volk_arch_prefs - rewind(config_file); - while(fgets(line, 512, config_file) != NULL) { - if(sscanf(line, "%s %s", function, arch) == 2 && !strncmp(function, "volk_", 5)) { - strncpy(t_pref->name, function, 128); - strncpy(t_pref->arch, arch, 32); - t_pref++; + while(fgets(line, sizeof(line), config_file) != NULL) + { + prefs = (volk_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs)); + volk_arch_pref_t *p = prefs + n_arch_prefs; + if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5)) + { + n_arch_prefs++; } } fclose(config_file); + *prefs_res = prefs; return n_arch_prefs; } diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c index 865d60955c..6ab013f269 100644 --- a/volk/lib/volk_rank_archs.c +++ b/volk/lib/volk_rank_archs.c @@ -1,43 +1,112 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. 
+ * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + #include <volk_rank_archs.h> #include <volk/volk_prefs.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name) { +#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4 + #define __popcnt __builtin_popcount +#else + inline unsigned __popcnt(unsigned num) + { + unsigned pop = 0; + while(num) + { + if (num & 0x1) pop++; + num >>= 1; + } + return pop; + } +#endif + +int volk_get_index( + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find +){ unsigned int i; - for(i=0; i<n_archs; i++) { - if(!strncmp(indices[i], arch_name, 20)) { + for (i = 0; i < n_impls; i++) { + if(!strncmp(impl_names[i], impl_name, 20)) { return i; } } + //TODO return -1; //something terrible should happen here printf("Volk warning: no arch found, returning generic impl\n"); - return get_index(indices, n_archs, "generic"); //but we'll fake it for now + return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now } -unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsigned 
int n_archs, const char* name, unsigned int arch) { - unsigned int i; - unsigned int best_val = 0; - static struct volk_arch_pref *volk_arch_prefs; - static unsigned int n_arch_prefs = 0; +int volk_rank_archs( + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int* impl_deps, //requirement mask per implementation + const bool* alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations +){ + size_t i; + static volk_arch_pref_t *volk_arch_prefs; + static size_t n_arch_prefs = 0; static int prefs_loaded = 0; if(!prefs_loaded) { - n_arch_prefs = load_preferences(&volk_arch_prefs); + n_arch_prefs = volk_load_preferences(&volk_arch_prefs); prefs_loaded = 1; } - //now look for the function name in the prefs list - for(i=0; i < n_arch_prefs; i++) { - if(!strncmp(name, volk_arch_prefs[i].name, 128)) { //found it - return get_index(indices, n_archs, volk_arch_prefs[i].arch); - } - } + //now look for the function name in the prefs list + for(i = 0; i < n_arch_prefs; i++) + { + if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it + { + const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u; + return volk_get_index(impl_names, n_impls, impl_name); + } + } - for(i=1; i < n_archs; ++i) { - if((arch_defs[i]&(!arch)) == 0) { - best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? 
i-1 : best_val; + //return the best index with the largest deps + size_t best_index_a = 0; + size_t best_index_u = 0; + int best_value_a = -1; + int best_value_u = -1; + for(i = 0; i < n_impls; i++) + { + const signed val = __popcnt(impl_deps[i]); + if (alignment[i] && val > best_value_a) + { + best_index_a = i; + best_value_a = val; + } + if (!alignment[i] && val > best_value_u) + { + best_index_u = i; + best_value_u = val; + } } - } - return best_val; + + //when align and we found a best aligned, use it + if (align && best_value_a != -1) return best_index_a; + + //otherwise return the best unaligned + return best_index_u; } diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h index 546240d2c6..b3bf8ff17c 100644 --- a/volk/lib/volk_rank_archs.h +++ b/volk/lib/volk_rank_archs.h @@ -1,12 +1,48 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. 
+ */ + #ifndef INCLUDED_VOLK_RANK_ARCHS_H #define INCLUDED_VOLK_RANK_ARCHS_H +#include <stdlib.h> +#include <stdbool.h> + #ifdef __cplusplus extern "C" { #endif -unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name); -unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsigned int n_archs, const char *name, unsigned int arch); +int volk_get_index( + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find +); + +int volk_rank_archs( + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int* impl_deps, //requirement mask per implementation + const bool* alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations +); #ifdef __cplusplus } diff --git a/volk/python/volk_modtool/CMakeLists.txt b/volk/python/volk_modtool/CMakeLists.txt new file mode 100644 index 0000000000..6fb87f2668 --- /dev/null +++ b/volk/python/volk_modtool/CMakeLists.txt @@ -0,0 +1,39 @@ +# Copyright 2013 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. 
If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. + +######################################################################## +# Install python files and apps +######################################################################## +include(GrPython) + +GR_PYTHON_INSTALL( + FILES + __init__.py + cfg.py + volk_modtool_generate.py + DESTINATION ${GR_PYTHON_DIR}/volk_modtool + COMPONENT "volk" +) + +GR_PYTHON_INSTALL( + PROGRAMS + volk_modtool + DESTINATION ${GR_RUNTIME_DIR} + COMPONENT "volk" +) diff --git a/volk/python/volk_modtool/README b/volk/python/volk_modtool/README new file mode 100644 index 0000000000..5413ff1f1a --- /dev/null +++ b/volk/python/volk_modtool/README @@ -0,0 +1,114 @@ +The volk_modtool tool is installed along with VOLK as a way of helping +to construct, add to, and interrogate the VOLK library or companion +libraries. + +volk_modtool is installed into $prefix/bin. + +VOLK modtool enables creating standalone (out-of-tree) VOLK modules +and provides a few tools for sharing VOLK kernels between VOLK +modules. If you need to design or work with VOLK kernels away from +the canonical VOLK library, this is the tool. If you need to tailor +your own VOLK library for whatever reason, this is the tool. + +The canonical VOLK library installs a volk.h and a libvolk.so. Your +own library will install volk_$name.h and libvolk_$name.so. Ya Gronk? +Good. + +There isn't a substantial difference between the canonical VOLK +module and any other VOLK module. They're all peers. Any module +created via VOLK modtool will come complete with a default +volk_modtool.cfg file associating the module with the base from which +it came, its distinctive $name and its destination (or path). These +values (created from user input if VOLK modtool runs without a +user-supplied config file or a default config file) serve as default +values for some VOLK modtool actions. 
It's more or less intended for +the user to change directories to the top level of a created VOLK +module and then run volk_modtool to take advantage of the values +stored in the default volk_modtool.cfg file. + +Apart from creating new VOLK modules, VOLK modtool allows you to list +the names of kernels in other modules, list the names of kernels in +the current module, add kernels from another module into the current +module, and remove kernels from the current module. When moving +kernels between modules, VOLK modtool does its best to keep the qa +and profiling code for those kernels intact. If the base has a test +or a profiling call for some kernel, those calls will follow the +kernel when VOLK modtool adds that kernel. If QA or profiling +requires a puppet kernel, the puppet kernel will follow the original +kernel when VOLK modtool adds that original kernel. VOLK modtool +respects puppets. + +====================================================================== + +Installing a new VOLK Library: + +Run the command "volk_modtool -i". This will ask you three questions: + + name: // the name to give your VOLK library: volk_<name> + destination: // directory new source tree is built under -- must exist. + // It will create <directory>/volk_<name> + base: // the directory containing the original VOLK source code + +The name provided must be alphanumeric (and cannot start with a +number). No special characters including dashes and underscores are +allowed. + +This will build a new skeleton directory in the destination provided +with the name volk_<name>. It will contain the necessary structure to +build: + + mkdir build + cd build + cmake -DCMAKE_INSTALL_PREFIX=/opt/volk ../ + make + sudo make install + +Right now, the library is empty and contains no kernels. Kernels can +be added from another VOLK library using the '-a' option. If not +specified, the kernel will be extracted from the base VOLK +directory. 
Using the '-b' option allows us to specify another VOLK library to +use for this purpose. + + volk_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc + +This will put the code for the new kernel into +<destination>/volk_<name>/kernels/volk_<name>/ + +Other kernels must be added by hand. See the following webpages for +more information about creating VOLK kernels: + http://gnuradio.org/doc/doxygen/volk_guide.html + http://gnuradio.org/redmine/projects/gnuradio/wiki/Volk + + +====================================================================== + +OPTIONS + +Options for Adding and Removing Kernels: + -a, --add_kernel + Add kernel from existing VOLK module. Uses the base VOLK module + unless -b is used. Use -n to specify the kernel name. + Requires: -n. + Optional: -b + + -A, --add_all_kernels + Add all kernels from existing VOLK module. Uses the base VOLK + module unless -b is used. + Optional: -b + + -x, --remove_kernel + Remove kernel from module. + Required: -n. + Optional: -b + +Options for Listing Kernels: + -l, --list + Lists all kernels available in the base VOLK module. + + -k, --kernels + Lists all kernels in this VOLK module. + + -r, --remote_list + Lists all kernels in another VOLK module that is specified + using the -b option. + diff --git a/volk/python/volk_modtool/__init__.py b/volk/python/volk_modtool/__init__.py new file mode 100644 index 0000000000..6ddf48da05 --- /dev/null +++ b/volk/python/volk_modtool/__init__.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# +# Copyright 2013 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +from cfg import volk_modtool_config +from volk_modtool_generate import volk_modtool diff --git a/volk/python/volk_modtool/cfg.py b/volk/python/volk_modtool/cfg.py new file mode 100644 index 0000000000..c58dc59091 --- /dev/null +++ b/volk/python/volk_modtool/cfg.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# +# Copyright 2013 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. 
+# + +import ConfigParser +import sys +import os +import exceptions +import re + + +class volk_modtool_config: + def key_val_sub(self, num, stuff, section): + return re.sub('\$' + 'k' + str(num), stuff[num][0], (re.sub('\$' + str(num), stuff[num][1], section[1][num]))); + + def verify(self): + for i in self.verification: + self.verify_section(i) + def remap(self): + for i in self.remapification: + self.verify_section(i) + + def verify_section(self, section): + stuff = self.cfg.items(section[0]) + for i in range(len(section[1])): + eval(self.key_val_sub(i, stuff, section)) + try: + val = eval(self.key_val_sub(i, stuff, section)) + if val == False: + raise exceptions.ValueError + except ValueError: + raise exceptions.ValueError('Verification function returns False... key:%s, val:%s'%(stuff[i][0], stuff[i][1])) + except: + raise exceptions.IOError('bad configuration... key:%s, val:%s'%(stuff[i][0], stuff[i][1])) + + + def __init__(self, cfg=None): + self.config_name = 'config' + self.config_defaults = ['name', 'destination', 'base'] + self.config_defaults_remap = ['1', + 'self.cfg.set(self.config_name, \'$k1\', os.path.realpath(os.path.expanduser(\'$1\')))', + 'self.cfg.set(self.config_name, \'$k2\', os.path.realpath(os.path.expanduser(\'$2\')))'] + + self.config_defaults_verify = ['re.match(\'[a-zA-Z0-9]+$\', \'$0\')', + 'os.path.exists(\'$1\')', + 'os.path.exists(\'$2\')'] + self.remapification = [(self.config_name, self.config_defaults_remap)] + self.verification = [(self.config_name, self.config_defaults_verify)] + default = os.path.join(os.getcwd(), 'volk_modtool.cfg') + icfg = ConfigParser.RawConfigParser() + if cfg: + icfg.read(cfg) + elif os.path.exists(default): + icfg.read(default) + else: + print "Initializing config file..." 
+ icfg.add_section(self.config_name) + for kn in self.config_defaults: + rv = raw_input("%s: "%(kn)) + icfg.set(self.config_name, kn, rv) + self.cfg = icfg + self.remap() + self.verify() + + + + def read_map(self, name, inp): + if self.cfg.has_section(name): + self.cfg.remove_section(name) + self.cfg.add_section(name) + for i in inp: + self.cfg.set(name, i, inp[i]) + + def get_map(self, name): + retval = {} + stuff = self.cfg.items(name) + for i in stuff: + retval[i[0]] = i[1] + return retval + + + + + + + diff --git a/volk/python/volk_modtool/volk_modtool b/volk/python/volk_modtool/volk_modtool new file mode 100755 index 0000000000..74b71adde2 --- /dev/null +++ b/volk/python/volk_modtool/volk_modtool @@ -0,0 +1,128 @@ +#!/usr/bin/env python +# +# Copyright 2013 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. 
+# + +from volk_modtool import volk_modtool, volk_modtool_config +from optparse import OptionParser, OptionGroup + +import exceptions +import os +import sys + +if __name__ == '__main__': + parser = OptionParser(); + actions = OptionGroup(parser, 'Actions'); + actions.add_option('-i', '--install', action='store_true', + help='Create a new volk module.') + parser.add_option('-b', '--base_path', action='store', default=None, + help='Base path for action. By default, volk_modtool.cfg loads this value.') + parser.add_option('-n', '--kernel_name', action='store', default=None, + help='Kernel name for action. No default') + parser.add_option('-c', '--config', action='store', dest='config_file', default=None, + help='Config file for volk_modtool. By default, volk_modtool.cfg in the local directory will be used/created.') + actions.add_option('-a', '--add_kernel', action='store_true', + help='Add kernel from existing volk module. Requires: -n. Optional: -b') + actions.add_option('-A', '--add_all_kernels', action='store_true', + help='Add all kernels from existing volk module. Optional: -b') + actions.add_option('-x', '--remove_kernel', action='store_true', + help='Remove kernel from module. Required: -n. Optional: -b') + actions.add_option('-l', '--list', action='store_true', + help='List all kernels in the base.') + actions.add_option('-k', '--kernels', action='store_true', + help='List all kernels in the module.') + actions.add_option('-r', '--remote_list', action='store_true', + help='List all available kernels in remote volk module. 
Requires: -b.') + actions.add_option('-m', '--moo', action='store_true', + help='Have you mooed today?') + parser.add_option_group(actions) + + (options, args) = parser.parse_args(); + if len(sys.argv) < 2: + parser.print_help() + + elif options.moo: + print " (__) " + print " (oo) " + print " /------\/ " + print " / | || " + print " * /\---/\ " + print " ~~ ~~ " + + else: + my_cfg = volk_modtool_config(options.config_file); + + my_modtool = volk_modtool(my_cfg.get_map(my_cfg.config_name)); + + + if options.install: + my_modtool.make_module_skeleton(); + my_modtool.write_default_cfg(my_cfg.cfg); + + + if options.add_kernel: + if not options.kernel_name: + raise exceptions.IOError("This action requires the -n option."); + else: + name = options.kernel_name; + if options.base_path: + base = options.base_path; + else: + base = my_cfg.cfg.get(my_cfg.config_name, 'base'); + my_modtool.import_kernel(name, base); + + if options.remove_kernel: + if not options.kernel_name: + raise exceptions.IOError("This action requires the -n option."); + else: + name = options.kernel_name; + my_modtool.remove_kernel(name); + + if options.add_all_kernels: + + if options.base_path: + base = options.base_path; + else: + base = my_cfg.cfg.get(my_cfg.config_name, 'base'); + kernelset = my_modtool.get_current_kernels(base); + for i in kernelset: + my_modtool.import_kernel(i, base); + + if options.remote_list: + if not options.base_path: + raise exceptions.IOError("This action requires the -b option. 
Try -l or -k for listing kernels in the base or the module.") + else: + base = options.base_path; + kernelset = my_modtool.get_current_kernels(base); + for i in kernelset: + print i; + + if options.list: + kernelset = my_modtool.get_current_kernels(); + for i in kernelset: + print i; + + if options.kernels: + dest = my_cfg.cfg.get(my_cfg.config_name, 'destination'); + name = my_cfg.cfg.get(my_cfg.config_name, 'name'); + base = os.path.join(dest, 'volk_' + name); + kernelset = my_modtool.get_current_kernels(base); + for i in kernelset: + print i; diff --git a/volk/python/volk_modtool/volk_modtool_generate.py b/volk/python/volk_modtool/volk_modtool_generate.py new file mode 100644 index 0000000000..80c2aed598 --- /dev/null +++ b/volk/python/volk_modtool/volk_modtool_generate.py @@ -0,0 +1,310 @@ +# +# Copyright 2013 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. 
+# + +import os +import sys +import re +import glob +import shutil +import exceptions +from sets import Set + +class volk_modtool: + def __init__(self, cfg): + self.volk = re.compile('volk'); + self.remove_after_underscore = re.compile("_.*"); + self.volk_run_tests = re.compile('^\s*VOLK_RUN_TESTS.*\n', re.MULTILINE); + self.volk_profile = re.compile('^\s*(VOLK_PROFILE|VOLK_PUPPET_PROFILE).*\n', re.MULTILINE); + self.my_dict = cfg; + self.lastline = re.compile('\s*char path\[1024\];.*'); + self.badassert = re.compile('^\s*assert\(toked\[0\] == "volk_.*\n', re.MULTILINE); + self.goodassert = ' assert(toked[0] == "volk");\n' + self.baderase = re.compile('^\s*toked.erase\(toked.begin\(\)\);.*\n', re.MULTILINE); + self.gooderase = ' toked.erase(toked.begin());\n toked.erase(toked.begin());\n'; + + def get_basename(self, base=None): + if not base: + base = self.my_dict['base'] + candidate = base.split('/')[-1]; + if len(candidate.split('_')) == 1: + return ''; + else: + return candidate.split('_')[-1]; + + def get_current_kernels(self, base=None): + if not base: + base = self.my_dict['base'] + name = self.get_basename(); + else: + name = self.get_basename(base); + if name == '': + hdr_files = glob.glob(os.path.join(base, "kernels/volk/*.h")); + begins = re.compile("(?<=volk_).*") + else: + hdr_files = glob.glob(os.path.join(base, "kernels/volk_" + name + "/*.h")); + begins = re.compile("(?<=volk_" + name + "_).*") + + datatypes = []; + functions = []; + + + for line in hdr_files: + + subline = re.search(".*\.h.*", os.path.basename(line)) + if subline: + subsubline = begins.search(subline.group(0)); + if subsubline: + dtype = self.remove_after_underscore.sub("", subsubline.group(0)); + subdtype = re.search("[0-9]+[A-z]+", dtype); + if subdtype: + datatypes.append(subdtype.group(0)); + + + datatypes = set(datatypes); + + for line in hdr_files: + for dt in datatypes: + if dt in line: + #subline = re.search("(?<=volk_)" + dt + ".*(?=\.h)", line); + subline = 
re.search(begins.pattern[:-2] + dt + ".*(?=\.h)", line); + if subline: + functions.append(subline.group(0)); + + return set(functions); + + def make_module_skeleton(self): + + dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name']) + if os.path.exists(dest): + raise exceptions.IOError("Destination %s already exits!"%(dest)); + + if not os.path.exists(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'kernels/volk_' + self.my_dict['name'])): + os.makedirs(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'kernels/volk_' + self.my_dict['name'])) + + current_kernel_names = self.get_current_kernels(); + + for root, dirnames, filenames in os.walk(self.my_dict['base']): + for name in filenames: + t_table = map(lambda a: re.search(a, name), current_kernel_names); + t_table = set(t_table); + if t_table == set([None]): + infile = os.path.join(root, name); + instring = open(infile, 'r').read(); + outstring = re.sub(self.volk, 'volk_' + self.my_dict['name'], instring); + newname = re.sub(self.volk, 'volk_' + self.my_dict['name'], name); + relpath = os.path.relpath(infile, self.my_dict['base']); + newrelpath = re.sub(self.volk, 'volk_' + self.my_dict['name'], relpath); + dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname); + + if not os.path.exists(os.path.dirname(dest)): + os.makedirs(os.path.dirname(dest)) + open(dest, 'w+').write(outstring); + + + infile = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/testqa.cc'); + instring = open(infile, 'r').read(); + outstring = re.sub(self.volk_run_tests, '', instring); + open(infile, 'w+').write(outstring); + + infile = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'apps/volk_' + self.my_dict['name'] + '_profile.cc'); + instring = open(infile, 'r').read(); + outstring = re.sub(self.volk_profile, '', instring); + open(infile, 
'w+').write(outstring); + + infile = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/qa_utils.cc'); + instring = open(infile, 'r').read(); + outstring = re.sub(self.badassert, self.goodassert, instring); + outstring = re.sub(self.baderase, self.gooderase, outstring); + open(infile, 'w+').write(outstring); + + def write_default_cfg(self, cfg): + outfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'volk_modtool.cfg'), 'wb'); + cfg.write(outfile); + outfile.close(); + + + def convert_kernel(self, oldvolk, name, base, inpath, top): + infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + name + '.h'); + instring = open(infile, 'r').read(); + outstring = re.sub(oldvolk, 'volk_' + self.my_dict['name'], instring); + newname = 'volk_' + self.my_dict['name'] + '_' + name + '.h'; + relpath = os.path.relpath(infile, base); + newrelpath = re.sub(oldvolk, 'volk_' + self.my_dict['name'], relpath); + dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname); + + + + if not os.path.exists(os.path.dirname(dest)): + os.makedirs(os.path.dirname(dest)) + open(dest, 'w+').write(outstring); + + def remove_kernel(self, name): + basename = self.my_dict['name']; + if len(basename) > 0: + top = 'volk_' + basename + '_'; + else: + top = 'volk_' + base = os.path.join(self.my_dict['destination'], top[:-1]) ; + + if not name in self.get_current_kernels(): + + raise exceptions.IOError("Requested kernel %s is not in module %s"%(name,base)); + + + + inpath = os.path.abspath(base); + + + kernel = re.compile(name) + search_kernels = Set([kernel]) + profile = re.compile('^\s*VOLK_PROFILE') + puppet = re.compile('^\s*VOLK_PUPPET') + src_dest = os.path.join(inpath, 'apps/', top[:-1] + '_profile.cc'); + infile = open(src_dest); + otherlines = infile.readlines(); + open(src_dest, 'w+').write(''); + + for otherline in otherlines: + write_okay = True; + if 
kernel.search(otherline): + write_okay = False; + if puppet.match(otherline): + args = re.search("(?<=VOLK_PUPPET_PROFILE).*", otherline) + m_func = args.group(0).split(',')[0]; + func = re.search('(?<=' + top + ').*', m_func); + search_kernels.add(re.compile(func.group(0))); + if write_okay: + open(src_dest, 'a').write(otherline); + + + src_dest = os.path.join(inpath, 'lib/testqa.cc') + infile = open(src_dest); + otherlines = infile.readlines(); + open(src_dest, 'w+').write(''); + + for otherline in otherlines: + write_okay = True; + + for kernel in search_kernels: + if kernel.search(otherline): + write_okay = False; + + if write_okay: + open(src_dest, 'a').write(otherline); + + for kernel in search_kernels: + infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + kernel.pattern + '.h'); + print "Removing kernel %s"%(kernel.pattern) + if os.path.exists(infile): + os.remove(infile); + + def import_kernel(self, name, base): + if not (base): + base = self.my_dict['base']; + basename = self.getbasename(); + else: + basename = self.get_basename(base); + if not name in self.get_current_kernels(base): + raise exceptions.IOError("Requested kernel %s is not in module %s"%(name,base)); + + inpath = os.path.abspath(base); + if len(basename) > 0: + top = 'volk_' + basename + '_'; + else: + top = 'volk_' + oldvolk = re.compile(top[:-1]); + + self.convert_kernel(oldvolk, name, base, inpath, top); + + kernel = re.compile(name) + search_kernels = Set([kernel]) + + profile = re.compile('^\s*VOLK_PROFILE') + puppet = re.compile('^\s*VOLK_PUPPET') + infile = open(os.path.join(inpath, 'apps/', oldvolk.pattern + '_profile.cc')); + otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'apps/volk_' + self.my_dict['name'] + '_profile.cc')); + dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'apps/volk_' + self.my_dict['name'] + '_profile.cc'); + lines = infile.readlines(); + otherlines = 
otherinfile.readlines(); + open(dest, 'w+').write(''); + insert = False; + inserted = False + for otherline in otherlines: + + if self.lastline.match(otherline): + insert = True; + if insert and not inserted: + inserted = True; + for line in lines: + if kernel.search(line): + if profile.match(line): + outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line); + open(dest, 'a').write(outline); + elif puppet.match(line): + outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line); + open(dest, 'a').write(outline); + args = re.search("(?<=VOLK_PUPPET_PROFILE).*", line) + m_func = args.group(0).split(',')[0]; + func = re.search('(?<=' + top + ').*', m_func); + search_kernels.add(re.compile(func.group(0))); + self.convert_kernel(oldvolk, func.group(0), base, inpath, top); + write_okay = True; + for kernel in search_kernels: + if kernel.search(otherline): + write_okay = False + if write_okay: + open(dest, 'a').write(otherline); + + for kernel in search_kernels: + print "Adding kernel %s from module %s"%(kernel.pattern,base) + + infile = open(os.path.join(inpath, 'lib/testqa.cc')); + otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/testqa.cc')); + dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/testqa.cc'); + lines = infile.readlines(); + otherlines = otherinfile.readlines(); + open(dest, 'w+').write(''); + inserted = False; + insert = False + for otherline in otherlines: + + if (re.match('\s*', otherline) == None or re.match('\s*#.*', otherline) == None): + + insert = True; + if insert and not inserted: + inserted = True; + for line in lines: + for kernel in search_kernels: + if kernel.search(line): + if self.volk_run_tests.match(line): + outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line); + open(dest, 'a').write(outline); + write_okay = True; + for kernel in search_kernels: + if kernel.search(otherline): + write_okay = False + if write_okay: + open(dest, 
'a').write(otherline); + + + + + diff --git a/volk/tmpl/volk.tmpl.c b/volk/tmpl/volk.tmpl.c index c3a1544ff8..f915f157f6 100644 --- a/volk/tmpl/volk.tmpl.c +++ b/volk/tmpl/volk.tmpl.c @@ -27,6 +27,10 @@ #include <volk/volk.h> #include <stdio.h> #include <string.h> +#include <assert.h> + +static size_t __alignment = 0; +static intptr_t __alignment_mask = 0; struct volk_machine *get_machine(void) { extern struct volk_machine *volk_machines[]; @@ -46,45 +50,118 @@ struct volk_machine *get_machine(void) { } } printf("Using Volk machine: %s\n", machine->name); + __alignment = machine->alignment; + __alignment_mask = (intptr_t)(__alignment-1); return machine; } } -unsigned int volk_get_alignment(void) { - return get_machine()->alignment; +size_t volk_get_alignment(void) +{ + get_machine(); //ensures alignment is set + return __alignment; +} + +bool volk_is_aligned(const void *ptr) +{ + return ((intptr_t)(ptr) & __alignment_mask) == 0; } +#define LV_HAVE_GENERIC +#define LV_HAVE_DISPATCHER + #for $kern in $kernels -void get_$(kern.name)($kern.arglist_namedefs) { - $kern.name = get_machine()->$(kern.name)_archs[volk_rank_archs( - get_machine()->$(kern.name)_indices, - get_machine()->$(kern.name)_arch_defs, - get_machine()->$(kern.name)_n_archs, - get_machine()->$(kern.name)_name, - volk_get_lvarch() - )]; +#if $kern.has_dispatcher +#include <volk/$(kern.name).h> //pulls in the dispatcher +#end if + +static inline void __$(kern.name)_d($kern.arglist_full) +{ + #if $kern.has_dispatcher + $(kern.name)_dispatcher($kern.arglist_names); + return; + #end if + + if (volk_is_aligned( + #set $num_open_parens = 0 + #for $arg_type, $arg_name in $kern.args + #if '*' in $arg_type + VOLK_OR_PTR($arg_name, + #set $num_open_parens += 1 + #end if + #end for + 0$(')'*$num_open_parens) + )){ + $(kern.name)_a($kern.arglist_names); + } + else{ + $(kern.name)_u($kern.arglist_names); + } +} + +static inline void __init_$(kern.name)(void) +{ + const char *name = get_machine()->$(kern.name)_name; + 
const char **impl_names = get_machine()->$(kern.name)_impl_names; + const int *impl_deps = get_machine()->$(kern.name)_impl_deps; + const bool *alignment = get_machine()->$(kern.name)_impl_alignment; + const size_t n_impls = get_machine()->$(kern.name)_n_impls; + const size_t index_a = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/); + const size_t index_u = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/); + $(kern.name)_a = get_machine()->$(kern.name)_impls[index_a]; + $(kern.name)_u = get_machine()->$(kern.name)_impls[index_u]; + + assert($(kern.name)_a); + assert($(kern.name)_u); + + $(kern.name) = &__$(kern.name)_d; +} + +static inline void __$(kern.name)_a($kern.arglist_full) +{ + __init_$(kern.name)(); + $(kern.name)_a($kern.arglist_names); +} + +static inline void __$(kern.name)_u($kern.arglist_full) +{ + __init_$(kern.name)(); + $(kern.name)_u($kern.arglist_names); +} + +static inline void __$(kern.name)($kern.arglist_full) +{ + __init_$(kern.name)(); $(kern.name)($kern.arglist_names); } -$kern.pname $kern.name = &get_$(kern.name); +$kern.pname $(kern.name)_a = &__$(kern.name)_a; +$kern.pname $(kern.name)_u = &__$(kern.name)_u; +$kern.pname $(kern.name) = &__$(kern.name); -void $(kern.name)_manual($kern.arglist_namedefs, const char* arch) { - const size_t index = get_index( - get_machine()->$(kern.name)_indices, - get_machine()->$(kern.name)_n_archs, - arch +void $(kern.name)_manual($kern.arglist_full, const char* impl_name) +{ + const int index = volk_get_index( + get_machine()->$(kern.name)_impl_names, + get_machine()->$(kern.name)_n_impls, + impl_name ); - get_machine()->$(kern.name)_archs[index]( + get_machine()->$(kern.name)_impls[index]( $kern.arglist_names ); } -struct volk_func_desc $(kern.name)_get_func_desc(void) { - struct volk_func_desc desc = { - get_machine()->$(kern.name)_indices, - get_machine()->$(kern.name)_arch_defs, - get_machine()->$(kern.name)_n_archs 
+volk_func_desc_t $(kern.name)_get_func_desc(void) { + const char **impl_names = get_machine()->$(kern.name)_impl_names; + const int *impl_deps = get_machine()->$(kern.name)_impl_deps; + const bool *alignment = get_machine()->$(kern.name)_impl_alignment; + const size_t n_impls = get_machine()->$(kern.name)_n_impls; + volk_func_desc_t desc = { + impl_names, + impl_deps, + alignment, + n_impls }; return desc; } diff --git a/volk/tmpl/volk.tmpl.h b/volk/tmpl/volk.tmpl.h index 161579e46d..464b65598a 100644 --- a/volk/tmpl/volk.tmpl.h +++ b/volk/tmpl/volk.tmpl.h @@ -27,20 +27,59 @@ #include <volk/volk_common.h> #include <volk/volk_complex.h> +#include <stdlib.h> +#include <stdbool.h> + __VOLK_DECL_BEGIN -struct volk_func_desc { - const char **indices; - const int *arch_defs; - const int n_archs; -}; +typedef struct volk_func_desc +{ + const char **impl_names; + const int *impl_deps; + const bool *impl_alignment; + const size_t n_impls; +} volk_func_desc_t; + +//! Get the machine alignment in bytes +VOLK_API size_t volk_get_alignment(void); + +/*! + * The VOLK_OR_PTR macro is a convenience macro + * for checking the alignment of a set of pointers. + * Example usage: + * volk_is_aligned(VOLK_OR_PTR(VOLK_OR_PTR(p0, p1), p2)) + */ +#define VOLK_OR_PTR(ptr0, ptr1) \ + (const void *)(((intptr_t)(ptr0)) | ((intptr_t)(ptr1))) -VOLK_API unsigned int volk_get_alignment(void); +/*! + * Is the pointer on a machine alignment boundary? + * + * Note: for performance reasons, this function + * is not usable until another volk API call is made + * which will perform certain initialization tasks. + * + * \param ptr the pointer to some memory buffer + * \return 1 for alignment boundary, else 0 + */ +VOLK_API bool volk_is_aligned(const void *ptr); #for $kern in $kernels + +//! 
A function pointer to the dispatcher implementation extern VOLK_API $kern.pname $kern.name; -extern VOLK_API void $(kern.name)_manual($kern.arglist_namedefs, const char* arch); -extern VOLK_API struct volk_func_desc $(kern.name)_get_func_desc(void); + +//! A function pointer to the fastest aligned implementation +extern VOLK_API $kern.pname $(kern.name)_a; + +//! A function pointer to the fastest unaligned implementation +extern VOLK_API $kern.pname $(kern.name)_u; + +//! Call into a specific implementation given by name +extern VOLK_API void $(kern.name)_manual($kern.arglist_full, const char* impl_name); + +//! Get description parameters for this kernel +extern VOLK_API volk_func_desc_t $(kern.name)_get_func_desc(void); #end for __VOLK_DECL_END diff --git a/volk/tmpl/volk_machine_xxx.tmpl.c b/volk/tmpl/volk_machine_xxx.tmpl.c index e405bd6938..68d7f3eba2 100644 --- a/volk/tmpl/volk_machine_xxx.tmpl.c +++ b/volk/tmpl/volk_machine_xxx.tmpl.c @@ -44,18 +44,23 @@ $(' | '.join(['(1 << LV_%s)'%a.name.upper() for a in $archs]))#slurp #end def ######################################################################## -#def make_tag_str_list($tags) -{$(', '.join(['"%s"'%a for a in $tags]))}#slurp +#def make_impl_name_list($impls) +{$(', '.join(['"%s"'%i.name for i in $impls]))}#slurp #end def ######################################################################## -#def make_tag_have_list($deps) -{$(', '.join([' | '.join(['(1 << LV_%s)'%a.upper() for a in d]) for d in $deps]))}#slurp +#def make_impl_align_list($impls) +{$(', '.join(['true' if i.is_aligned else 'false' for i in $impls]))}#slurp #end def ######################################################################## -#def make_tag_kern_list($name, $tags) -{$(', '.join(['%s_%s'%($name, a) for a in $tags]))}#slurp +#def make_impl_deps_list($impls) +{$(', '.join([' | '.join(['(1 << LV_%s)'%d.upper() for d in i.deps]) for i in $impls]))}#slurp +#end def + 
+######################################################################## +#def make_impl_fcn_list($name, $impls) +{$(', '.join(['%s_%s'%($name, i.name) for i in $impls]))}#slurp #end def struct volk_machine volk_machine_$(this_machine.name) = { @@ -63,11 +68,12 @@ struct volk_machine volk_machine_$(this_machine.name) = { "$this_machine.name", $this_machine.alignment, #for $kern in $kernels - #set $taglist, $tagdeps = $kern.get_tags($arch_names) - "$kern.name", - $make_tag_str_list($taglist), - $make_tag_have_list($tagdeps), - $make_tag_kern_list($kern.name, $taglist), - $(len($taglist)), + #set $impls = $kern.get_impls($arch_names) + "$kern.name", ##//kernel name + $make_impl_name_list($impls), ##//list of kernel implementations by name + $make_impl_deps_list($impls), ##//list of arch dependencies per implementation + $make_impl_align_list($impls), ##//alignment required? for each implementation + $make_impl_fcn_list($kern.name, $impls), ##//pointer to each implementation + $(len($impls)), ##//number of implementations listed here #end for }; diff --git a/volk/tmpl/volk_machines.tmpl.h b/volk/tmpl/volk_machines.tmpl.h index b30e600ed8..7e11b10795 100644 --- a/volk/tmpl/volk_machines.tmpl.h +++ b/volk/tmpl/volk_machines.tmpl.h @@ -25,18 +25,22 @@ #include <volk/volk_common.h> #include <volk/volk_typedefs.h> +#include <stdbool.h> +#include <stdlib.h> + __VOLK_DECL_BEGIN struct volk_machine { const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format) const char *name; - const unsigned int alignment; //the maximum byte alignment required for functions in this library + const size_t alignment; //the maximum byte alignment required for functions in this library #for $kern in $kernels const char *$(kern.name)_name; - const char *$(kern.name)_indices[$(len($archs))]; - const int $(kern.name)_arch_defs[$(len($archs))]; - const $(kern.pname) $(kern.name)_archs[$(len($archs))]; - const int $(kern.name)_n_archs; + const 
char *$(kern.name)_impl_names[$(len($archs))]; + const int $(kern.name)_impl_deps[$(len($archs))]; + const bool $(kern.name)_impl_alignment[$(len($archs))]; + const $(kern.pname) $(kern.name)_impls[$(len($archs))]; + const size_t $(kern.name)_n_impls; #end for }; diff --git a/volk/tmpl/volk_typedefs.tmpl.h b/volk/tmpl/volk_typedefs.tmpl.h index 52a87242fe..6f5426965f 100644 --- a/volk/tmpl/volk_typedefs.tmpl.h +++ b/volk/tmpl/volk_typedefs.tmpl.h @@ -26,7 +26,7 @@ #include <volk/volk_complex.h> #for $kern in $kernels -typedef $kern.rettype (*$(kern.pname))($kern.arglist_defs); +typedef void (*$(kern.pname))($kern.arglist_types); #end for #endif /*INCLUDED_VOLK_TYPEDEFS*/ |