diff options
-rw-r--r-- | volk/CMakeLists.txt | 7 | ||||
-rw-r--r-- | volk/apps/volk_profile.cc | 232 | ||||
-rw-r--r-- | volk/cmake/msvc/stdbool.h | 45 | ||||
-rw-r--r-- | volk/gen/archs.xml | 1 | ||||
-rw-r--r-- | volk/gen/volk_arch_defs.py | 7 | ||||
-rw-r--r-- | volk/gen/volk_kernel_defs.py | 343 | ||||
-rw-r--r-- | volk/gen/volk_machine_defs.py | 4 | ||||
-rw-r--r-- | volk/include/volk/volk_16i_convert_8i_a.h | 69 | ||||
-rw-r--r-- | volk/include/volk/volk_16i_s32f_convert_32f_a.h | 119 | ||||
-rw-r--r-- | volk/include/volk/volk_16u_byteswap_u.h | 63 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_convert_64f_a.h | 70 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_convert_64f_u.h | 70 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_s32f_convert_16i_a.h | 150 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_s32f_convert_32i_u.h | 142 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_s32f_convert_8i_a.h | 155 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_s32f_multiply_32f_u.h | 102 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_add_32f_u.h | 66 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 290 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 290 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_multiply_32f_u.h | 106 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_conjugate_32fc_u.h | 64 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h | 78 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_deinterleave_64f_x2_u.h | 78 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_magnitude_32f_u.h | 118 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_magnitude_squared_32f_a.h | 114 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_magnitude_squared_32f_u.h | 114 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 87 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 149 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h | 116 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_multiply_32fc_u.h | 77 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h | 81 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h | 81 | ||||
-rw-r--r-- | volk/include/volk/volk_32i_s32f_convert_32f_a.h | 73 | ||||
-rw-r--r-- | volk/include/volk/volk_32i_s32f_convert_32f_u.h | 75 | ||||
-rw-r--r-- | volk/include/volk/volk_32u_byteswap_u.h | 77 | ||||
-rw-r--r-- | volk/include/volk/volk_64f_convert_32f_a.h | 67 | ||||
-rw-r--r-- | volk/include/volk/volk_64f_convert_32f_u.h | 67 | ||||
-rw-r--r-- | volk/include/volk/volk_64u_byteswap_u.h | 88 | ||||
-rw-r--r-- | volk/include/volk/volk_8i_convert_16i_u.h | 73 | ||||
-rw-r--r-- | volk/include/volk/volk_8i_s32f_convert_32f_u.h | 94 | ||||
-rw-r--r-- | volk/include/volk/volk_prefs.h | 15 | ||||
-rw-r--r-- | volk/kernels/README.txt | 67 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h (renamed from volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_branch_4_state_8.h (renamed from volk/include/volk/volk_16i_branch_4_state_8_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_convert_8i.h (renamed from volk/include/volk/volk_16i_convert_8i_u.h) | 71 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_max_star_16i.h (renamed from volk/include/volk/volk_16i_max_star_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_max_star_horizontal_16i.h (renamed from volk/include/volk/volk_16i_max_star_horizontal_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_permute_and_scalar_add.h (renamed from volk/include/volk/volk_16i_permute_and_scalar_add_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_s32f_convert_32f.h (renamed from volk/include/volk/volk_16i_s32f_convert_32f_u.h) | 121 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h (renamed from volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h (renamed from volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h (renamed from volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_deinterleave_real_16i.h (renamed from volk/include/volk/volk_16ic_deinterleave_real_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_deinterleave_real_8i.h (renamed from volk/include/volk/volk_16ic_deinterleave_real_8i_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_magnitude_16i.h (renamed from volk/include/volk/volk_16ic_magnitude_16i_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h (renamed from volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h (renamed from volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h (renamed from volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_16u_byteswap.h (renamed from volk/include/volk/volk_16u_byteswap_a.h) | 65 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_accumulator_s32f.h (renamed from volk/include/volk/volk_32f_accumulator_s32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_convert_64f.h | 140 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_index_max_16u.h (renamed from volk/include/volk/volk_32f_index_max_16u_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h (renamed from volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h (renamed from volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_convert_16i.h (renamed from volk/include/volk/volk_32f_s32f_convert_16i_u.h) | 152 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_convert_32i.h (renamed from volk/include/volk/volk_32f_s32f_convert_32i_a.h) | 142 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_convert_8i.h (renamed from volk/include/volk/volk_32f_s32f_convert_8i_u.h) | 157 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_multiply_32f.h (renamed from volk/include/volk/volk_32f_s32f_multiply_32f_a.h) | 104 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_normalize.h (renamed from volk/include/volk/volk_32f_s32f_normalize_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_power_32f.h (renamed from volk/include/volk/volk_32f_s32f_power_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_stddev_32f.h (renamed from volk/include/volk/volk_32f_s32f_stddev_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_sqrt_32f.h (renamed from volk/include/volk/volk_32f_sqrt_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h (renamed from volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_add_32f.h (renamed from volk/include/volk/volk_32f_x2_add_32f_a.h) | 68 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_divide_32f.h (renamed from volk/include/volk/volk_32f_x2_divide_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_dot_prod_16i.h (renamed from volk/include/volk/volk_32f_x2_dot_prod_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_dot_prod_32f.h | 580 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_interleave_32fc.h (renamed from volk/include/volk/volk_32f_x2_interleave_32fc_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_max_32f.h (renamed from volk/include/volk/volk_32f_x2_max_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_min_32f.h (renamed from volk/include/volk/volk_32f_x2_min_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_multiply_32f.h (renamed from volk/include/volk/volk_32f_x2_multiply_32f_a.h) | 108 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h (renamed from volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x2_subtract_32f.h (renamed from volk/include/volk/volk_32f_x2_subtract_32f_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h (renamed from volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h (renamed from volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_32f_multiply_32fc.h (renamed from volk/include/volk/volk_32fc_32f_multiply_32fc_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_conjugate_32fc.h (renamed from volk/include/volk/volk_32fc_conjugate_32fc_a.h) | 64 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h (renamed from volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h | 156 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h (renamed from volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_real_32f.h (renamed from volk/include/volk/volk_32fc_deinterleave_real_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_real_64f.h (renamed from volk/include/volk/volk_32fc_deinterleave_real_64f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_index_max_16u.h (renamed from volk/include/volk/volk_32fc_index_max_16u_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_magnitude_32f.h (renamed from volk/include/volk/volk_32fc_magnitude_32f_a.h) | 120 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_magnitude_squared_32f.h | 228 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_atan2_32f.h (renamed from volk/include/volk/volk_32fc_s32f_atan2_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h (renamed from volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h (renamed from volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_power_32fc.h (renamed from volk/include/volk/volk_32fc_s32f_power_32fc_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h (renamed from volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h (renamed from volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h (renamed from volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h) | 87 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h (renamed from volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h) | 10 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h (renamed from volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h) | 6 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h (renamed from volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h) | 149 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h (renamed from volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h) | 116 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_multiply_32fc.h (renamed from volk/include/volk/volk_32fc_x2_multiply_32fc_a.h) | 79 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h | 162 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h (renamed from volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_square_dist_32f.h (renamed from volk/include/volk/volk_32fc_x2_square_dist_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32i_s32f_convert_32f.h | 148 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32i_x2_and_32i.h (renamed from volk/include/volk/volk_32i_x2_and_32i_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32i_x2_or_32i.h (renamed from volk/include/volk/volk_32i_x2_or_32i_a.h) | 4 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32u_byteswap.h (renamed from volk/include/volk/volk_32u_byteswap_a.h) | 77 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32u_popcnt.h (renamed from volk/include/volk/volk_32u_popcnt_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64f_convert_32f.h | 134 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64f_x2_max_64f.h (renamed from volk/include/volk/volk_64f_x2_max_64f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64f_x2_min_64f.h (renamed from volk/include/volk/volk_64f_x2_min_64f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64u_byteswap.h (renamed from volk/include/volk/volk_64u_byteswap_a.h) | 88 | ||||
-rw-r--r-- | volk/kernels/volk/volk_64u_popcnt.h (renamed from volk/include/volk/volk_64u_popcnt_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8i_convert_16i.h (renamed from volk/include/volk/volk_8i_convert_16i_a.h) | 75 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8i_s32f_convert_32f.h (renamed from volk/include/volk/volk_8i_s32f_convert_32f_a.h) | 96 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h (renamed from volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_deinterleave_real_16i.h (renamed from volk/include/volk/volk_8ic_deinterleave_real_16i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_deinterleave_real_8i.h (renamed from volk/include/volk/volk_8ic_deinterleave_real_8i_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h (renamed from volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h (renamed from volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h (renamed from volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h) | 2 | ||||
-rw-r--r-- | volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h (renamed from volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h) | 2 | ||||
-rw-r--r-- | volk/lib/CMakeLists.txt | 3 | ||||
-rw-r--r-- | volk/lib/qa_utils.cc | 36 | ||||
-rw-r--r-- | volk/lib/qa_utils.h | 2 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 192 | ||||
-rw-r--r-- | volk/lib/volk_prefs.c | 39 | ||||
-rw-r--r-- | volk/lib/volk_rank_archs.c | 104 | ||||
-rw-r--r-- | volk/lib/volk_rank_archs.h | 40 | ||||
-rw-r--r-- | volk/tmpl/volk.tmpl.c | 121 | ||||
-rw-r--r-- | volk/tmpl/volk.tmpl.h | 55 | ||||
-rw-r--r-- | volk/tmpl/volk_machine_xxx.tmpl.c | 30 | ||||
-rw-r--r-- | volk/tmpl/volk_machines.tmpl.h | 14 | ||||
-rw-r--r-- | volk/tmpl/volk_typedefs.tmpl.h | 2 |
141 files changed, 4376 insertions, 4097 deletions
diff --git a/volk/CMakeLists.txt b/volk/CMakeLists.txt index 68385f9740..00544c5466 100644 --- a/volk/CMakeLists.txt +++ b/volk/CMakeLists.txt @@ -75,7 +75,7 @@ set(Boost_ADDITIONAL_VERSIONS "1.60.0" "1.60" "1.61.0" "1.61" "1.62.0" "1.62" "1.63.0" "1.63" "1.64.0" "1.64" "1.65.0" "1.65" "1.66.0" "1.66" "1.67.0" "1.67" "1.68.0" "1.68" "1.69.0" "1.69" ) -find_package(Boost COMPONENTS unit_test_framework) +find_package(Boost COMPONENTS unit_test_framework filesystem system) find_package(ORC) @@ -103,12 +103,15 @@ install( # Install all headers in the include directories ######################################################################## install( - DIRECTORY ${CMAKE_SOURCE_DIR}/include/volk + DIRECTORY ${CMAKE_SOURCE_DIR}/kernels/volk DESTINATION include COMPONENT "volk_devel" FILES_MATCHING PATTERN "*.h" ) install(FILES + ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h + ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h + ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h ${CMAKE_BINARY_DIR}/include/volk/volk.h ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index d35d90deea..3b1ab370d3 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -5,144 +5,124 @@ extern "C" { } #include <vector> #include <boost/foreach.hpp> +#include <boost/filesystem.hpp> #include <iostream> #include <fstream> #include <sys/stat.h> #include <sys/types.h> +namespace fs = boost::filesystem; + int main(int argc, char *argv[]) { std::vector<std::string> results; - //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000, &results); - //VOLK_PROFILE(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000, &results); - VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc_a, volk_32fc_s32fc_x2_rotator_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results); - VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 204600, 10000, &results); - 
VOLK_PROFILE(volk_16ic_deinterleave_real_8i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16ic_deinterleave_16i_x2_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 204600, 1000, &results); - VOLK_PROFILE(volk_16ic_deinterleave_real_16i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16ic_magnitude_16i_a, 1, 0, 204600, 100, &results); - VOLK_PROFILE(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 204600, 1000, &results); - VOLK_PROFILE(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 204600, 10000, &results); - VOLK_PROFILE(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 204600, 10000, &results); - VOLK_PROFILE(volk_16i_convert_8i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16i_convert_8i_u, 0, 0, 204600, 10000, &results); - //VOLK_PROFILE(volk_16i_max_star_16i_a, 0, 0, 204600, 10000, &results); - //VOLK_PROFILE(volk_16i_max_star_horizontal_16i_a, 0, 0, 204600, 10000, &results); - //VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results); - //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results); - VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16u_byteswap_u, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_x2_add_32f_u, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 204600, 50, &results); - VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 204600, 100, &results); - //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000, &results); - 
VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_64f_x2_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_imag_32f_a, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_index_max_16u_a, 3, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100, &results); - VOLK_PROFILE(volk_32fc_magnitude_32f_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_magnitude_32f_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_magnitude_squared_32f_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_magnitude_squared_32f_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32f_s32f_convert_16i_a, 1, 32768, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_convert_16i_u, 1, 32768, 204600, 10000, &results); - 
VOLK_PROFILE(volk_32f_s32f_convert_32i_a, 1, 2<<31, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_convert_32i_u, 1, 2<<31, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_convert_64f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_convert_64f_u, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_convert_8i_a, 1, 128, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_convert_8i_u, 1, 128, 204600, 10000, &results); - //VOLK_PROFILE(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000, &results); - VOLK_PROFILE(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 20460, 100, &results); - VOLK_PROFILE(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_x2_divide_32f_a, 1e-4, 0, 204600, 2000, &results); - VOLK_PROFILE(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 5000, &results); - //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000, &results); - VOLK_PROFILE(volk_32f_index_max_16u_a, 3, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 204600, 3000, &results); - VOLK_PROFILE(volk_32f_x2_interleave_32fc_a, 0, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32f_x2_max_32f_a, 1e-4, 0, 204600, 2000, &results); - VOLK_PROFILE(volk_32f_x2_min_32f_a, 1e-4, 0, 204600, 2000, &results); - VOLK_PROFILE(volk_32f_x2_multiply_32f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_x2_multiply_32f_u, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_normalize_a, 1e-4, 100, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_power_32f_a, 1e-4, 4, 204600, 100, &results); - VOLK_PROFILE(volk_32f_sqrt_32f_a, 1e-4, 0, 204600, 100, &results); - 
VOLK_PROFILE(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 204600, 3000, &results); - VOLK_PROFILE(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 204600, 3000, &results); - VOLK_PROFILE(volk_32f_x2_subtract_32f_a, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 204600, 5000, &results); - VOLK_PROFILE(volk_32i_x2_and_32i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32i_s32f_convert_32f_a, 1e-4, 100, 204600, 10000, &results); - VOLK_PROFILE(volk_32i_s32f_convert_32f_u, 1e-4, 100, 204600, 10000, &results); - VOLK_PROFILE(volk_32i_x2_or_32i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32u_byteswap_a, 0, 0, 204600, 2000, &results); - //VOLK_PROFILE(volk_32u_popcnt_a, 0, 0, 2046, 10000, &results); - VOLK_PROFILE(volk_64f_convert_32f_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_64f_convert_32f_u, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_64f_x2_max_64f_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_64f_x2_min_64f_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_64u_byteswap_a, 0, 0, 204600, 1000, &results); - //VOLK_PROFILE(volk_64u_popcnt_a, 0, 0, 2046, 10000, &results); - VOLK_PROFILE(volk_8ic_deinterleave_16i_x2_a, 0, 0, 204600, 3000, &results); - VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 204600, 3000, &results); - VOLK_PROFILE(volk_8ic_deinterleave_real_16i_a, 0, 256, 204600, 3000, &results); - VOLK_PROFILE(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 204600, 3000, &results); - VOLK_PROFILE(volk_8ic_deinterleave_real_8i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 204600, 400, &results); - VOLK_PROFILE(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 204600, 400, &results); - VOLK_PROFILE(volk_8i_convert_16i_a, 0, 0, 204600, 20000, &results); - VOLK_PROFILE(volk_8i_convert_16i_u, 0, 0, 204600, 2000, &results); - VOLK_PROFILE(volk_8i_s32f_convert_32f_a, 1e-4, 100, 204600, 2000, &results); - 
VOLK_PROFILE(volk_8i_s32f_convert_32f_u, 1e-4, 100, 204600, 2000, &results); - //VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_a, 1e-4, lv_32fc_t(1.0, 0.5), 204600, 1000, &results); - VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32f_s32f_multiply_32f_a, 1e-4, 1.0, 204600, 10000, &results); - VOLK_PROFILE(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 204600, 1000, &results); + //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results); + //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results); + VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results); + VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 204600, 10000, &results); + VOLK_PROFILE(volk_16ic_deinterleave_real_8i, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16ic_deinterleave_16i_x2, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 204600, 1000, &results); + VOLK_PROFILE(volk_16ic_deinterleave_real_16i, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16ic_magnitude_16i, 1, 0, 204600, 100, &results); + VOLK_PROFILE(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 204600, 1000, &results); + VOLK_PROFILE(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 204600, 10000, &results); + VOLK_PROFILE(volk_16i_convert_8i, 0, 0, 204600, 10000, &results); + //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204600, 10000, &results); + //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204600, 10000, &results); + //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results); + //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results); + VOLK_PROFILE(volk_16u_byteswap, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204600, 10000, &results); + 
VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204600, 50, &results); + VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204600, 100, &results); + //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results); + VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_real_32f, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_real_64f, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_index_max_16u, 3, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_s32f_magnitude_16i, 1, 32768, 204600, 100, &results); + VOLK_PROFILE(volk_32fc_magnitude_32f, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_magnitude_squared_32f, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_x2_multiply_32fc, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_conjugate_32fc, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32f_s32f_convert_16i, 1, 32768, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_s32f_convert_32i, 1, 2<<31, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_convert_64f, 1e-4, 0, 204600, 10000, &results); + 
VOLK_PROFILE(volk_32f_s32f_convert_8i, 1, 128, 204600, 10000, &results); + //VOLK_PROFILE(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000, &results); + VOLK_PROFILE(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 20460, 100, &results); + VOLK_PROFILE(volk_32fc_x2_square_dist_32f, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_x2_divide_32f, 1e-4, 0, 204600, 2000, &results); + VOLK_PROFILE(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 5000, &results); + //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000, &results); + VOLK_PROFILE(volk_32f_index_max_16u, 3, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic, 1, 32768, 204600, 3000, &results); + VOLK_PROFILE(volk_32f_x2_interleave_32fc, 0, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32f_x2_max_32f, 1e-4, 0, 204600, 2000, &results); + VOLK_PROFILE(volk_32f_x2_min_32f, 1e-4, 0, 204600, 2000, &results); + VOLK_PROFILE(volk_32f_x2_multiply_32f, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_s32f_normalize, 1e-4, 100, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_s32f_power_32f, 1e-4, 4, 204600, 100, &results); + VOLK_PROFILE(volk_32f_sqrt_32f, 1e-4, 0, 204600, 100, &results); + VOLK_PROFILE(volk_32f_s32f_stddev_32f, 1e-4, 100, 204600, 3000, &results); + VOLK_PROFILE(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 204600, 3000, &results); + VOLK_PROFILE(volk_32f_x2_subtract_32f, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32i_x2_and_32i, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32i_s32f_convert_32f, 1e-4, 100, 204600, 10000, &results); + VOLK_PROFILE(volk_32i_x2_or_32i, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32u_byteswap, 0, 0, 204600, 2000, &results); + 
//VOLK_PROFILE(volk_32u_popcnt, 0, 0, 2046, 10000, &results); + VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204600, 1000, &results); + //VOLK_PROFILE(volk_64u_popcnt, 0, 0, 2046, 10000, &results); + VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204600, 3000, &results); + VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204600, 3000, &results); + VOLK_PROFILE(volk_8ic_deinterleave_real_16i, 0, 256, 204600, 3000, &results); + VOLK_PROFILE(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 204600, 3000, &results); + VOLK_PROFILE(volk_8ic_deinterleave_real_8i, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 204600, 400, &results); + VOLK_PROFILE(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 204600, 400, &results); + VOLK_PROFILE(volk_8i_convert_16i, 0, 0, 204600, 20000, &results); + VOLK_PROFILE(volk_8i_convert_16i, 0, 0, 204600, 2000, &results); + VOLK_PROFILE(volk_8i_s32f_convert_32f, 1e-4, 100, 204600, 2000, &results); + //VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, lv_32fc_t(1.0, 0.5), 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 1.0, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 0, 204600, 1000, &results); + + char path[1024]; + volk_get_config_path(path); + const fs::path config_path(path); + + if (not fs::exists(config_path.branch_path())) + { + std::cout << "Creating " << config_path.branch_path() << "..." 
<< std::endl; + fs::create_directories(config_path.branch_path()); + } - char path[256]; - get_config_path(path); - std::string config_path(path); - std::ofstream config; - std::cout << "filename: " << config_path << std::endl; - config.open(config_path.c_str()); + std::cout << "Writing " << config_path << "..." << std::endl; + std::ofstream config(config_path.string().c_str()); if(!config.is_open()) { //either we don't have write access or we don't have the dir yet - std::string dir(getenv("HOME")); - dir += "/.volk"; - if(mkdir(dir.c_str(), 0777) == -1) { - std::cout << "Error creating directory " << dir << std::endl; - return -1; - } - config.open(config_path.c_str()); - if(!config.is_open()) { - std::cout << "Error opening file " << config_path << std::endl; - return -1; - } + std::cout << "Error opening file " << config_path << std::endl; } config << "\ diff --git a/volk/cmake/msvc/stdbool.h b/volk/cmake/msvc/stdbool.h new file mode 100644 index 0000000000..fc8ee28f40 --- /dev/null +++ b/volk/cmake/msvc/stdbool.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2005, 2006 Apple Computer, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ * + */ + +#ifndef STDBOOL_WIN32_H +#define STDBOOL_WIN32_H + +#if !COMPILER(MSVC) +#error "This stdbool.h file should only be compiled with MSVC" +#endif + +#ifndef __cplusplus + +typedef unsigned char bool; + +#define true 1 +#define false 0 + +#ifndef CASSERT +#define CASSERT(exp, name) typedef int dummy##name [(exp) ? 1 : -1]; +#endif + +CASSERT(sizeof(bool) == 1, bool_is_one_byte) +CASSERT(true, true_is_true) +CASSERT(!false, false_is_false) + +#endif + +#endif diff --git a/volk/gen/archs.xml b/volk/gen/archs.xml index a18455801d..2c9ab41a55 100644 --- a/volk/gen/archs.xml +++ b/volk/gen/archs.xml @@ -2,7 +2,6 @@ <grammar> <arch name="generic"> <!-- name is required--> - <alignment>1</alignment> </arch> <arch name="altivec"> diff --git a/volk/gen/volk_arch_defs.py b/volk/gen/volk_arch_defs.py index 41154d5a7a..3c75e1374e 100644 --- a/volk/gen/volk_arch_defs.py +++ b/volk/gen/volk_arch_defs.py @@ -18,9 +18,6 @@ archs = list() arch_dict = dict() -#TODO enable this when we are ready -create_unaligned_archs = False - class arch_class: def __init__(self, flags, checks, **kwargs): for key, cast, failval in ( @@ -49,10 +46,6 @@ def register_arch(**kwargs): arch = arch_class(**kwargs) archs.append(arch) arch_dict[arch.name] = arch - if arch.alignment > 1 and create_unaligned_archs: - kwargs['name'] += '_u' - kwargs['alignment'] = 1 - register_arch(**kwargs) ######################################################################## # register the arches diff --git a/volk/gen/volk_kernel_defs.py b/volk/gen/volk_kernel_defs.py index 52cdb684c2..f246db0f96 100644 --- a/volk/gen/volk_kernel_defs.py +++ b/volk/gen/volk_kernel_defs.py @@ -24,201 +24,186 @@ import re import sys import glob -from volk_arch_defs import archs - -remove_after_underscore = re.compile("_.*"); -space_remove = re.compile(" "); -leading_space_remove = re.compile("^ *"); -replace_arch = re.compile(", const char\* arch"); -replace_bracket = re.compile(" {"); -replace_volk = re.compile("volk"); - -def 
strip_trailing(tostrip, stripstr): - lindex = tostrip.rfind(stripstr) - tostrip = tostrip[0:lindex] + tostrip[lindex:len(tostrip)].replace(stripstr, ""); - return tostrip +######################################################################## +# Strip comments from a c/cpp file. +# Input is code string, output is code string without comments. +# http://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments +######################################################################## +def comment_remover(text): + def replacer(match): + s = match.group(0) + if s.startswith('/'): + return "" + else: + return s + pattern = re.compile( + r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', + re.DOTALL | re.MULTILINE + ) + return re.sub(pattern, replacer, text) + +######################################################################## +# Split code into nested sections according to ifdef preprocessor macros +######################################################################## +def split_into_nested_ifdef_sections(code): + sections = list() + section = '' + header = 'text' + in_section_depth = 0 + for i, line in enumerate(code.splitlines()): + m = re.match('^(\s*)#(\s*)(\w+)(.*)$', line) + line_is = 'normal' + if m: + p0, p1, fcn, stuff = m.groups() + if fcn in ('if', 'ifndef', 'ifdef'): line_is = 'if' + if fcn in ('else', 'elif'): line_is = 'else' + if fcn in ('endif',): line_is = 'end' + + if line_is == 'if': in_section_depth += 1 + if line_is == 'end': in_section_depth -= 1 + + if in_section_depth == 1 and line_is == 'if': + sections.append((header, section)) + section = '' + header = line + continue -srcdir = os.path.dirname(os.path.dirname(__file__)) -hdr_files = glob.glob(os.path.join(srcdir, "include/volk/*.h")) - -datatypes = []; -functions = []; - -for line in hdr_files: - subline = re.search(".*_(a|u)\.h.*", os.path.basename(line)) - if subline: - subsubline = re.search("(?<=volk_).*", subline.group(0)); - if subsubline: - dtype = 
remove_after_underscore.sub("", subsubline.group(0)); - subdtype = re.search("[0-9]+[A-z]+", dtype); - if subdtype: - datatypes.append(subdtype.group(0)); - - -datatypes = set(datatypes); - -for line in hdr_files: - for dt in datatypes: - if dt in line: - subline = re.search("(volk_" + dt +"_.*(a|u).*\.h)", line); - if subline: - - subsubline = re.search(".+(?=\.h)", subline.group(0)); - functions.append(subsubline.group(0)); - -archs_or = "(" -for arch in archs: - archs_or = archs_or + arch.name.upper() + "|"; -archs_or = archs_or[0:len(archs_or)-1]; -archs_or = archs_or + ")"; - -taglist = []; -fcountlist = []; -arched_arglist = []; -retlist = []; -my_arglist = []; -my_argtypelist = []; -for func in functions: - tags = []; - fcount = []; - infile_source = open(os.path.join(srcdir, 'include', 'volk', func + ".h")) - begun_name = 0; - begun_paren = 0; - sourcefile = infile_source.readlines(); - infile_source.close(); - for line in sourcefile: -#FIXME: make it work for multiple #if define()s - archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line); - if archline: - arch = archline.group(0); - archline = re.findall(archs_or + "(?=( |\n|&))", line); - if archline: - archsublist = []; - for tup in archline: - archsublist.append(tup[0]); - fcount.append(archsublist); - testline = re.search("static inline.*?" 
+ func, line); - if (not testline): + if in_section_depth == 1 and line_is == 'else': + sections.append((header, section)) + section = '' + header = line continue - tagline = re.search(func + "_.+", line); - if tagline: - tag = re.search("(?<=" + func + "_)\w+(?= *\()",line); - if tag: - tag = re.search("\w+", tag.group(0)); - if tag: - tags.append(tag.group(0)); + if in_section_depth == 0 and line_is == 'end': + sections.append((header, section)) + section = '' + header = 'text' + continue - if begun_name == 0: - retline = re.search(".+(?=" + func + ")", line); - if retline: - ret = retline.group(0); + section += line + '\n' + sections.append((header, section)) #and pack remainder into sections + sections = [sec for sec in sections if sec[1].strip()] #filter empty sections + #recurse into non-text sections to fill subsections + for i, (header, section) in enumerate(sections): + if header == 'text': continue + sections[i] = (header, split_into_nested_ifdef_sections(section)) + return sections - subline = re.search(func + ".*", line); - if subline: - subsubline = re.search("\(.*?\)", subline.group(0)); - if subsubline: - args = subsubline.group(0); +######################################################################## +# Recursive print of sections to test code above +######################################################################## +def print_sections(sections, indent = ' '): + for header, body in sections: + if header == 'text': + print indent, ('\n'+indent).join(body.splitlines()) + continue + print indent.replace(' ', '-') + '>', header + print_sections(body, indent + ' ') + +######################################################################## +# Flatten a section to just body text +######################################################################## +def flatten_section_text(sections): + output = '' + for hdr, bdy in sections: + if hdr != 'text': output += flatten_section_text(bdy) + else: output += bdy + return output + 
+######################################################################## +# Extract kernel info from section, represent as an implementation +######################################################################## +class impl_class: + def __init__(self, kern_name, header, body): + #extract LV_HAVE_* + self.deps = set(map(str.lower, re.findall('LV_HAVE_(\w+)', header))) + #extract function suffix and args + body = flatten_section_text(body) + try: + fcn_matcher = re.compile('^.*(%s\\w*)\\s*\\((.*)$'%kern_name, re.DOTALL | re.MULTILINE) + body = body.split('{')[0].rsplit(')', 1)[0] #get the part before the open ){ bracket + m = fcn_matcher.match(body) + impl_name, the_rest = m.groups() + self.name = impl_name.replace(kern_name+'_', '') + self.args = list() + fcn_args = the_rest.split(',') + for fcn_arg in fcn_args: + arg_matcher = re.compile('^\s*(.*\\W)\s*(\w+)\s*$', re.DOTALL | re.MULTILINE) + m = arg_matcher.match(fcn_arg) + arg_type, arg_name = m.groups() + self.args.append((arg_type, arg_name)) + except Exception as ex: + raise Exception, 'I cant parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex) + + assert self.name + self.is_aligned = self.name.startswith('a_') - else: - begun_name = 1; - subsubline = re.search("\(.*", subline.group(0)); - if subsubline: - args = subsubline.group(0); - begun_paren = 1; - else: - if begun_paren == 1: - subline = re.search(".*?\)", line); - if subline: - args = args + subline.group(0); - begun_name = 0; - begun_paren = 0; - else: - subline = re.search(".*", line); - args = args + subline.group(0); - else: - subline = re.search("\(.*?\)", line); - if subline: - args = subline.group(0); - begun_name = 0; - else: - subline = re.search("\(.*", line); - if subline: - args = subline.group(0); - begun_paren = 1; - - replace = re.compile("static "); - ret = replace.sub("", ret); - replace = re.compile("inline "); - ret = replace.sub("", ret); - arched_args = args[args.find('(')+1:args.find(')')] - - remove = 
re.compile('\)|\(|{'); - rargs = remove.sub("", args); - sargs = rargs.split(','); - - - - margs = []; - atypes = []; - for arg in sargs: - temp = arg.split(" "); - margs.append(temp[-1]); - replace = re.compile(" " + temp[-1]); - atypes.append(replace.sub("", arg)); - - - my_args = "" - arg_types = "" - for arg in range(0, len(margs) - 1): - this_arg = leading_space_remove.sub("", margs[arg]); - my_args = my_args + this_arg + ", "; - this_type = leading_space_remove.sub("", atypes[arg]); - arg_types = arg_types + this_type + ", "; - - this_arg = leading_space_remove.sub("", margs[-1]); - my_args = my_args + this_arg; - this_type = leading_space_remove.sub("", atypes[-1]); - arg_types = arg_types + this_type; - my_argtypelist.append(arg_types); - - if(ret[-1] != ' '): - ret = ret + ' '; - - arched_arglist.append(arched_args); #!!!!!!!!!!! - my_arglist.append(my_args) #!!!!!!!!!!!!!!!!! - retlist.append(ret); - fcountlist.append(fcount); - taglist.append(tags); + def __repr__(self): + return self.name +######################################################################## +# Get sets of LV_HAVE_* from the code +######################################################################## +def extract_lv_haves(code): + haves = list() + for line in code.splitlines(): + if not line.strip().startswith('#'): continue + have_set = set(map(str.lower, re.findall('LV_HAVE_(\w+)', line))) + if have_set: haves.append(have_set) + return haves + +######################################################################## +# Represent a processing kernel, parse from file +######################################################################## class kernel_class: - def __init__(self, index): - self.name = functions[index] + def __init__(self, kernel_file): + self.name = os.path.splitext(os.path.basename(kernel_file))[0] self.pname = self.name.replace('volk_', 'p_') - self.rettype = retlist[index] - self.arglist_defs = my_argtypelist[index] - self.arglist_namedefs = 
arched_arglist[index] - self.arglist_names = my_arglist[index] - self._tagdeps = fcountlist[index] - self._taglist = taglist[index] - - def get_tags(self, archs): - def is_in(x): return x.lower() in archs - taglist = list() - tagdeps = list() - for i in range(len(self._tagdeps)): - if all(map(is_in, self._tagdeps[i])): - taglist.append(self._taglist[i]) - tagdeps.append(self._tagdeps[i]) - return taglist, tagdeps + code = open(kernel_file, 'r').read() + code = comment_remover(code) + sections = split_into_nested_ifdef_sections(code) + self._impls = list() + for header, section in sections: + if 'ifndef' not in header.lower(): continue + for sub_hdr, body in section: + if 'if' not in sub_hdr.lower(): continue + if 'LV_HAVE_' not in sub_hdr: continue + self._impls.append(impl_class( + kern_name=self.name, header=sub_hdr, body=body, + )) + assert(self._impls) + self.has_dispatcher = False + for impl in self._impls: + if impl.name == 'dispatcher': + self._impls.remove(impl) + self.has_dispatcher = True + break + self.args = self._impls[0].args + self.arglist_types = ', '.join([a[0] for a in self.args]) + self.arglist_full = ', '.join(['%s %s'%a for a in self.args]) + self.arglist_names = ', '.join([a[1] for a in self.args]) + + def get_impls(self, archs): + archs = set(archs) + impls = list() + for impl in self._impls: + if impl.deps.intersection(archs) == impl.deps: + impls.append(impl) + return impls def __repr__(self): return self.name -kernels = map(kernel_class, range(len(retlist))) +######################################################################## +# Extract information from the VOLK kernels +######################################################################## +__file__ = os.path.abspath(__file__) +srcdir = os.path.dirname(os.path.dirname(__file__)) +kernel_files = glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h")) +kernels = map(kernel_class, kernel_files) if __name__ == '__main__': print kernels diff --git a/volk/gen/volk_machine_defs.py 
b/volk/gen/volk_machine_defs.py index d1a8569818..7293d47462 100644 --- a/volk/gen/volk_machine_defs.py +++ b/volk/gen/volk_machine_defs.py @@ -30,10 +30,6 @@ class machine_class: arch = arch_dict[arch_name] self.archs.append(arch) self.arch_names.append(arch_name) - arch_name += '_u' - if arch.alignment > 1 and arch_dict.has_key(arch_name): - arch = arch_dict[arch_name] - self.archs.append(arch) self.alignment = max(map(lambda a: a.alignment, self.archs)) def __repr__(self): return self.name diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h deleted file mode 100644 index 84548c8c50..0000000000 --- a/volk/include/volk/volk_16i_convert_8i_a.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef INCLUDED_volk_16i_convert_8i_a_H -#define INCLUDED_volk_16i_convert_8i_a_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; 
number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16i_convert_8i_a_H */ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h deleted file mode 100644 index 7108ff6590..0000000000 --- a/volk/include/volk/volk_16i_s32f_convert_32f_a.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H -#define INCLUDED_volk_16i_s32f_convert_32f_a_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE4_1 -#include <smmintrin.h> - - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h deleted file mode 100644 index 8ef627a628..0000000000 --- a/volk/include/volk/volk_16u_byteswap_u.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef INCLUDED_volk_16u_byteswap_u_H -#define INCLUDED_volk_16u_byteswap_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - -/*! - \brief Byteswaps (in-place) an unaligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - uint16_t* inputPtr = intsToSwap; - __m128i input, left, right, output; - - const unsigned int eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ - // Load the 16t values, increment inputPtr later since we're doing it in-place. 
- input = _mm_loadu_si128((__m128i*)inputPtr); - // Do the two shifts - left = _mm_slli_epi16(input, 8); - right = _mm_srli_epi16(input, 8); - // Or the left and right halves together - output = _mm_or_si128(left, right); - // Store the results - _mm_storeu_si128((__m128i*)inputPtr, output); - inputPtr += 8; - } - - // Byteswap any remaining points: - number = eighthPoints*8; - for(; number < num_points; number++){ - uint16_t outputVal = *inputPtr; - outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an unaligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int point; - uint16_t* inputPtr = intsToSwap; - for(point = 0; point < num_points; point++){ - uint16_t output = *inputPtr; - output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_16u_byteswap_u_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h deleted file mode 100644 index 2c469ac421..0000000000 --- a/volk/include/volk/volk_32f_convert_64f_a.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_volk_32f_convert_64f_a_H -#define INCLUDED_volk_32f_convert_64f_a_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_convert_64f_a_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h deleted file mode 100644 index 10d8a4f6c0..0000000000 --- a/volk/include/volk/volk_32f_convert_64f_u.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_volk_32f_convert_64f_u_H -#define INCLUDED_volk_32f_convert_64f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_u_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_convert_64f_u_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h deleted file mode 100644 index 9df4946f24..0000000000 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ /dev/null @@ -1,150 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H -#define INCLUDED_volk_32f_s32f_convert_16i_a_H - -#include <volk/volk_common.h> -#include <inttypes.h> -#include <stdio.h> -#include <math.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = -32768; - float max_val = 32767; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - __m128 ret1, ret2; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - // Scale and clip - ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = -32768; - float max_val = 32767; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - // Scale and clip - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = -32768; - float max_val = 32767; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r < min_val) - r = min_val; - else if(r > max_val) - r = max_val; - *outputVectorPtr++ = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h deleted file mode 100644 index ee15edb464..0000000000 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ /dev/null @@ -1,142 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H -#define INCLUDED_volk_32f_s32f_convert_32i_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - - float min_val = -2147483647; - float max_val = 2147483647; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - intInputVal1 = _mm_cvtps_epi32(inputVal1); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int32_t)(r); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - - float min_val = -2147483647; - float max_val = 2147483647; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int32_t)(r); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = -2147483647; - float max_val = 2147483647; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - *outputVectorPtr++ = (int32_t)(r); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h deleted file mode 100644 index 800017d5da..0000000000 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H -#define INCLUDED_volk_32f_s32f_convert_8i_a_H - -#include <volk/volk_common.h> -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - - float min_val = -128; - float max_val = 127; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); - inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(inputVal1); - intInputVal2 = _mm_cvtps_epi32(inputVal2); - intInputVal3 = _mm_cvtps_epi32(inputVal3); - intInputVal4 = _mm_cvtps_epi32(inputVal4); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - 
intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int8_t)(r); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - - float min_val = -128; - float max_val = 127; - float r; - - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r 
= max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int8_t)(r); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = -128; - float max_val = 127; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - *outputVectorPtr++ = (int8_t)(r); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h deleted file mode 100644 index b3fae9b053..0000000000 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H -#define INCLUDED_volk_32f_s32f_multiply_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> -/*! 
- \brief Scalar float multiply - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param scalar the scalar value - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m128 aVal, bVal, cVal; - bVal = _mm_set_ps1(scalar); - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_AVX -#include <immintrin.h> -/*! 
- \brief Scalar float multiply - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param scalar the scalar value - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m256 aVal, bVal, cVal; - bVal = _mm256_set1_ps(scalar); - for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * scalar; - } -} -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Scalar float multiply - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param scalar the scalar value - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* inputPtr = aVector; - float* outputPtr = cVector; - for(number = 0; number < num_points; number++){ - *outputPtr = (*inputPtr) * scalar; - inputPtr++; - outputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h deleted file mode 100644 index 52e8286bc2..0000000000 --- a/volk/include/volk/volk_32f_x2_add_32f_u.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_add_32f_u_H -#define INCLUDED_volk_32f_x2_add_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> -/*! 
- \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - - _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h deleted file mode 100644 index 067c33ad89..0000000000 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ /dev/null @@ -1,290 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H -#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H - -#include <volk/volk_common.h> -#include<stdio.h> - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#ifdef LV_HAVE_SSE - - -static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, 
b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_SSE3 - -#include <pmmintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 
dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); - dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); - dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); - dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_SSE4_1 - -#include <smmintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number 
< sixteenthPoints; number++){ - - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#ifdef LV_HAVE_AVX - -#include <immintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m256 a0Val, a1Val; - __m256 b0Val, b1Val; - __m256 c0Val, c1Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm256_load_ps(aPtr); - a1Val = _mm256_load_ps(aPtr+8); - b0Val = _mm256_load_ps(bPtr); - b1Val = _mm256_load_ps(bPtr+8); - - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = _mm256_mul_ps(a1Val, b1Val); - - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 
= _mm256_add_ps(c1Val, dotProdVal1); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_AVX*/ - -#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h deleted file mode 100644 index b24e8b1f79..0000000000 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ /dev/null @@ -1,290 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H -#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H - -#include <volk/volk_common.h> -#include<stdio.h> - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#ifdef LV_HAVE_SSE - - -static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, 
a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_loadu_ps(aPtr); - a1Val = _mm_loadu_ps(aPtr+4); - a2Val = _mm_loadu_ps(aPtr+8); - a3Val = _mm_loadu_ps(aPtr+12); - b0Val = _mm_loadu_ps(bPtr); - b1Val = _mm_loadu_ps(bPtr+4); - b2Val = _mm_loadu_ps(bPtr+8); - b3Val = _mm_loadu_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_SSE3 - -#include <pmmintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, 
b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_loadu_ps(aPtr); - a1Val = _mm_loadu_ps(aPtr+4); - a2Val = _mm_loadu_ps(aPtr+8); - a3Val = _mm_loadu_ps(aPtr+12); - b0Val = _mm_loadu_ps(bPtr); - b1Val = _mm_loadu_ps(bPtr+4); - b2Val = _mm_loadu_ps(bPtr+8); - b3Val = _mm_loadu_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); - dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); - dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); - dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_SSE4_1 - -#include <smmintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, 
cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; - - bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#ifdef LV_HAVE_AVX - -#include <immintrin.h> - -static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m256 a0Val, a1Val; - __m256 b0Val, b1Val; - __m256 c0Val, c1Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm256_loadu_ps(aPtr); - a1Val = _mm256_loadu_ps(aPtr+8); - b0Val = _mm256_loadu_ps(bPtr); - b1Val = _mm256_loadu_ps(bPtr+8); - - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = 
_mm256_mul_ps(a1Val, b1Val); - - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_AVX*/ - -#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h deleted file mode 100644 index bfb896d602..0000000000 --- a/volk/include/volk/volk_32f_x2_multiply_32f_u.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H -#define INCLUDED_volk_32f_x2_multiply_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> -/*! 
- \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_AVX -#include <immintrin.h> -/*! 
- \brief Multiplies the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); - bVal = _mm256_loadu_ps(bPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 8; - bPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h deleted file mode 100644 index e0d79ea7bc..0000000000 --- a/volk/include/volk/volk_32fc_conjugate_32fc_u.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H -#define INCLUDED_volk_32fc_conjugate_32fc_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <volk/volk_complex.h> -#include <float.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! - \brief Takes the conjugate of a complex vector. 
- \param cVector The vector where the results will be stored - \param aVector Vector to be conjugated - \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi - - x = _mm_xor_ps(x, conjugator); // conjugate register - - _mm_storeu_ps((float*)c,x); // Store the results back into the C container - - a += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = lv_conj(*a); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Takes the conjugate of a complex vector. - \param cVector The vector where the results will be stored - \param aVector Vector to be conjugated - \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_32fc_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_conj(*aPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h deleted file mode 100644 index 77566e671d..0000000000 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H -#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H - -#include <inttypes.h> -#include <stdio.h> - 
-#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_u.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_u.h deleted file mode 100644 index feed54be8c..0000000000 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_u.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_u_H -#define INCLUDED_volk_32fc_deinterleave_64f_x2_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> -/*! 
- \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for(;number < halfPoints; number++){ - - cplxValue = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_storeu_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); - _mm_storeu_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_x2_u_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_u_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h deleted file mode 100644 index c8b3f0a088..0000000000 --- a/volk/include/volk/volk_32fc_magnitude_32f_u.h +++ /dev/null @@ -1,118 +0,0 @@ -#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H -#define INCLUDED_volk_32fc_magnitude_32f_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <math.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h deleted file mode 100644 index d3ac9717a8..0000000000 --- a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H -#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H - -#include <inttypes.h> -#include <stdio.h> -#include <math.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h deleted file mode 100644 index 53a4e68eb4..0000000000 --- a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H -#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <math.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include <xmmintrin.h> - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_squared_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h deleted file mode 100644 index 5c7d15b02f..0000000000 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h +++ /dev/null @@ -1,87 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H -#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <volk/volk_complex.h> -#include <float.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> -/*! 
- \brief Multiplies the input vector by a scalar and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be multiplied - \param scalar The complex scalar to multiply aVector - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm_set_ps1(lv_creal(scalar)); - yh = _mm_set_ps1(lv_cimag(scalar)); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_storeu_ps((float*)c,z); // Store the results back into the C container - - a += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Multiplies the input vector by a scalar and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be multiplied - \param scalar The complex scalar to multiply aVector - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = num_points; - - // unwrap loop - while (number >= 8){ - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - number -= 8; - } - - // clean up any remaining - while (number-- > 0) - *cPtr++ = *aPtr++ * scalar; -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h deleted file mode 100644 index 5b16b8639a..0000000000 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H -#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H - - -#include<volk/volk_complex.h> - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - const unsigned int num_bytes = num_points*8; - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - 
- - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - unsigned int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ -} - -#endif /*LV_HAVE_GENERIC*/ - -#ifdef LV_HAVE_SSE3 - -#include <xmmintrin.h> -#include <pmmintrin.h> -#include <mmintrin.h> - - -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - unsigned int num_bytes = num_points*8; - - // Variable never used? - //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - union HalfMask { - uint32_t intRep[4]; - __m128 vec; - } halfMask; - - union NegMask { - int intRep[4]; - __m128 vec; - } negMask; - - unsigned int offset = 0; - float Rsum=0, Isum=0; - float Im,Re; - - __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; - __m128 zv = {0,0,0,0}; - - halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; - halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; - - negMask.intRep[0] = negMask.intRep[2] = 0x80000000; - negMask.intRep[1] = negMask.intRep[3] = 0; - - // main loop - while(num_bytes >= 4*sizeof(float)){ - - in1 = _mm_loadu_ps( (float*) (input+offset) ); - in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = _mm_mul_ps(in1, in2); - fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = _mm_mul_ps(in1, fehg); - Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); - Ivm = _mm_xor_ps( negMask.vec, Iv ); - Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); - 
_mm_store_ss( &Im, Is ); - _mm_store_ss( &Re, Rs ); - num_bytes -= 4*sizeof(float); - offset += 2; - Rsum += Re; - Isum += Im; - } - - // handle the last complex case ... - if(num_bytes > 0){ - - if(num_bytes != 4){ - // bad things are happening - } - - in1 = _mm_loadu_ps( (float*) (input+offset) ); - in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec); - fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec); - Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); - Ivm = _mm_xor_ps( negMask.vec, Iv ); - Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); - _mm_store_ss( &Im, Is ); - _mm_store_ss( &Re, Rs ); - Rsum += Re; - Isum += Im; - } - - result[0] = lv_cmake(Rsum,Isum); - return; -} - -#endif /*LV_HAVE_SSE3*/ - - -#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ - - - diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h deleted file mode 100644 index 7c0dba7fd8..0000000000 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H -#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H - -#include <volk/volk_common.h> -#include <volk/volk_complex.h> -#include <stdio.h> -#include <string.h> - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_32fc_x2_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_points/2; - unsigned int isodd = num_points &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - unsigned int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + 
in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[num_points - 1] * taps[num_points - 1]; - - } - -} - -#endif /*LV_HAVE_GENERIC*/ - -#ifdef LV_HAVE_SSE3 - -#include <pmmintrin.h> - -static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); - - unsigned int number = 0; - const unsigned int halfPoints = num_points/2; - - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; - - dotProdVal = _mm_setzero_ps(); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - - a += 2; - b += 2; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; - - _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - - if(num_points % 1 != 0) { - dotProduct += (*a) * (*b); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h deleted file mode 
100644 index a998d6184e..0000000000 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H -#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <volk/volk_complex.h> -#include <float.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_storeu_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * (*b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h deleted file mode 100644 index 2755192e96..0000000000 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H -#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H - -#include <inttypes.h> -#include <stdio.h> -#include <volk/volk_complex.h> -#include <float.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! 
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector First vector to be multiplied - \param bVector Second vector that is conjugated before being multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - y = _mm_xor_ps(y, conjugator); // conjugate y - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * lv_conj(*b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector First vector to be multiplied - \param bVector Second vector that is conjugated before being multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h deleted file mode 100644 index 09dcd635b9..0000000000 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H -#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H - -#include <inttypes.h> -#include <stdio.h> -#include <volk/volk_complex.h> -#include <float.h> - -#ifdef LV_HAVE_SSE3 -#include <pmmintrin.h> - /*! 
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector First vector to be multiplied - \param bVector Second vector that is conjugated before being multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - y = _mm_xor_ps(y, conjugator); // conjugate y - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_storeu_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * lv_conj(*b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector First vector to be multiplied - \param bVector Second vector that is conjugated before being multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h deleted file mode 100644 index 8f4123d719..0000000000 --- a/volk/include/volk/volk_32i_s32f_convert_32f_a.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H -#define INCLUDED_volk_32i_s32f_convert_32f_a_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - - /*! 
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_load_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_store_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h deleted file mode 100644 index b3a8ab2015..0000000000 --- a/volk/include/volk/volk_32i_s32f_convert_32f_u.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H -#define INCLUDED_volk_32i_s32f_convert_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - - /*! 
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32i_s32f_convert_32f_u_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */ diff --git a/volk/include/volk/volk_32u_byteswap_u.h b/volk/include/volk/volk_32u_byteswap_u.h deleted file mode 100644 index e27d1f03dd..0000000000 --- a/volk/include/volk/volk_32u_byteswap_u.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_32u_byteswap_u_H -#define INCLUDED_volk_32u_byteswap_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. 
- \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - - uint32_t* inputPtr = intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_loadu_si128((__m128i*)inputPtr); - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - // Store the results - _mm_storeu_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = quarterPoints*4; - for(; number < num_points; number++){ - uint32_t outputVal = *inputPtr; - outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. 
- \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_u_generic(uint32_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = intsToSwap; - - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output = *inputPtr; - output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32u_byteswap_u_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h deleted file mode 100644 index 11d51702bc..0000000000 --- a/volk/include/volk/volk_64f_convert_32f_a.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_volk_64f_convert_32f_a_H -#define INCLUDED_volk_64f_convert_32f_a_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h deleted file mode 100644 index 31dc5b5fe9..0000000000 --- a/volk/include/volk/volk_64f_convert_32f_u.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_volk_64f_convert_32f_u_H -#define INCLUDED_volk_64f_convert_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - /*! 
- \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_u_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64f_convert_32f_u_H */ diff --git a/volk/include/volk/volk_64u_byteswap_u.h b/volk/include/volk/volk_64u_byteswap_u.h deleted file mode 100644 index 41a4a3130f..0000000000 --- a/volk/include/volk/volk_64u_byteswap_u.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef INCLUDED_volk_64u_byteswap_u_H -#define INCLUDED_volk_64u_byteswap_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE2 -#include <emmintrin.h> - -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - uint64_t number = 0; - const unsigned int halfPoints = num_points / 2; - for(;number < halfPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. 
- input = _mm_loadu_si128((__m128i*)inputPtr); - - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - - // Reorder the two words - output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); - - // Store the results - _mm_storeu_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = halfPoints*2; - for(; number < num_points; number++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. 
- \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_u_generic(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64u_byteswap_u_H */ diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h deleted file mode 100644 index 7d7104f52b..0000000000 --- a/volk/include/volk/volk_8i_convert_16i_u.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_volk_8i_convert_16i_u_H -#define INCLUDED_volk_8i_convert_16i_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE4_1 -#include <smmintrin.h> - - /*! 
- \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8i_convert_16i_u_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h deleted file mode 100644 index 8bb2c0d1a4..0000000000 --- a/volk/include/volk/volk_8i_s32f_convert_32f_u.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H -#define INCLUDED_volk_8i_s32f_convert_32f_u_H - -#include <inttypes.h> -#include <stdio.h> - -#ifdef LV_HAVE_SSE4_1 -#include <smmintrin.h> - - /*! 
- \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1( iScalar ); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] 
= (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8i_s32f_convert_32f_u_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h index 83d9baf89d..690e5f99f6 100644 --- a/volk/include/volk/volk_prefs.h +++ b/volk/include/volk/volk_prefs.h @@ -2,23 +2,26 @@ #define INCLUDED_VOLK_PREFS_H #include <volk/volk_common.h> +#include <stdlib.h> __VOLK_DECL_BEGIN -struct volk_arch_pref { - char name[128]; - char arch[32]; -}; +typedef struct volk_arch_pref +{ + char name[128]; //name of the kernel + char impl_a[128]; //best aligned impl + char impl_u[128]; //best unaligned impl +} volk_arch_pref_t; //////////////////////////////////////////////////////////////////////// // get path to volk_config profiling info //////////////////////////////////////////////////////////////////////// -VOLK_API void get_config_path(char *); +VOLK_API void volk_get_config_path(char *); //////////////////////////////////////////////////////////////////////// // load prefs into global 
prefs struct //////////////////////////////////////////////////////////////////////// -VOLK_API int load_preferences(struct volk_arch_pref **); +VOLK_API size_t volk_load_preferences(volk_arch_pref_t **); __VOLK_DECL_END diff --git a/volk/kernels/README.txt b/volk/kernels/README.txt new file mode 100644 index 0000000000..5dd7434b54 --- /dev/null +++ b/volk/kernels/README.txt @@ -0,0 +1,67 @@ +######################################################################## +# How to create custom kernel dispatchers +######################################################################## +A kernel dispatcher is a kernel implementation that calls other kernel implementations. +By default, a dispatcher is generated by the build system for every kernel such that: + * the best aligned implementation is called when all pointer arguments are aligned, + * and otherwise the best unaligned implementation is called. + +The author of a VOLK kernel may create a custom dispatcher, +to be called in place of the automatically generated one. +A custom dispatcher may be useful to handle head and tail cases, +or to implement different alignment and bounds checking logic. 
+ +######################################################################## +# Code for an example dispatcher w/ tail case +######################################################################## +#include <volk/volk_common.h> + +#ifdef LV_HAVE_DISPATCHER + +static inline void volk_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) +{ + const unsigned int num_points_r = num_points%4; + const unsigned int num_points_x = num_points - num_points_r; + + if (volk_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector)))) + { + volk_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x); + } + else + { + volk_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x); + } + + volk_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r); +} + +#endif //LV_HAVE_DISPATCHER + +######################################################################## +# Code for an example dispatcher w/ tail case and accumulator +######################################################################## +#include <volk/volk_common.h> + +#ifdef LV_HAVE_DISPATCHER + +static inline void volk_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points) +{ + const unsigned int num_points_r = num_points%16; + const unsigned int num_points_x = num_points - num_points_r; + + if (volk_is_aligned(VOLK_OR_PTR(input, taps))) + { + volk_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x); + } + else + { + volk_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x); + } + + float result_tail = 0; + volk_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r); + + *result += result_tail; +} + +#endif //LV_HAVE_DISPATCHER diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h index 1f6554af8b..8bc1569f61 100644 --- 
a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h +++ b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { +static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { static const int N_UNROLL = 4; diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/kernels/volk/volk_16i_branch_4_state_8.h index 6338fbdd17..cdfbc7ba13 100644 --- a/volk/include/volk/volk_16i_branch_4_state_8_a.h +++ b/volk/kernels/volk/volk_16i_branch_4_state_8.h @@ -138,7 +138,7 @@ static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src #endif /*LV_HAVE_SSEs*/ #ifdef LV_HAVE_GENERIC -static inline void volk_16i_branch_4_state_8_a_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { +static inline void volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { int i = 0; int bound = 4; diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/kernels/volk/volk_16i_convert_8i.h index 80608a1412..3789b2e4ab 100644 --- a/volk/include/volk/volk_16i_convert_8i_u.h +++ b/volk/kernels/volk/volk_16i_convert_8i.h @@ -54,7 +54,7 @@ static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_ \param num_points The number of data values to be converted \note Input and output buffers do NOT need to be properly aligned */ -static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ +static inline void volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ int8_t* outputVectorPtr = outputVector; const int16_t* inputVectorPtr = inputVector; unsigned int number = 
0; @@ -69,3 +69,72 @@ static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int #endif /* INCLUDED_volk_16i_convert_8i_u_H */ +#ifndef INCLUDED_volk_16i_convert_8i_a_H +#define INCLUDED_volk_16i_convert_8i_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_convert_8i_a_H */ diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/kernels/volk/volk_16i_max_star_16i.h index ca81cf0d62..c67351c5fa 100644 --- a/volk/include/volk/volk_16i_max_star_16i_a.h +++ b/volk/kernels/volk/volk_16i_max_star_16i.h @@ -87,7 +87,7 @@ static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, un #ifdef LV_HAVE_GENERIC -static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, unsigned int num_points) { +static inline void volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) { const unsigned int num_bytes = num_points*2; diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h index 13c235bc0b..ef88ec094f 100644 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h +++ b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h @@ -112,7 +112,7 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in #ifdef LV_HAVE_GENERIC -static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_points) { +static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) { const unsigned int num_bytes = num_points*2; diff --git 
a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/kernels/volk/volk_16i_permute_and_scalar_add.h index d91b36208a..7a01d172a3 100644 --- a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h +++ b/volk/kernels/volk/volk_16i_permute_and_scalar_add.h @@ -118,7 +118,7 @@ static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short #ifdef LV_HAVE_GENERIC -static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) { +static inline void volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) { const unsigned int num_bytes = num_points*2; diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h index 4ce8e8f35b..a810a601a0 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_u.h +++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h @@ -105,7 +105,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const in \param num_points The number of data values to be converted \note Output buffer does NOT need to be properly aligned */ -static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ float* outputVectorPtr = outputVector; const int16_t* inputVectorPtr = inputVector; unsigned int number = 0; @@ -120,3 +120,122 @@ static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, cons #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */ +#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H +#define INCLUDED_volk_16i_s32f_convert_32f_a_H + +#include 
<inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + + /*! 
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h index 18b2e3d845..56b2cc07ab 100644 --- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h +++ b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h @@ -167,7 +167,7 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s #ifdef LV_HAVE_GENERIC -static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) { +static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) { const unsigned int num_bytes = num_points*2; diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h index 677cb40e9f..9b6d19fd66 100644 --- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h +++ b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h @@ -115,7 +115,7 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta #ifdef LV_HAVE_GENERIC 
-static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) { +static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) { const unsigned int num_bytes = num_points*2; diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h index f8aa30874f..9ce8012640 100644 --- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h +++ b/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h @@ -128,7 +128,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_ \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; int16_t* qBufferPtr = qBuffer; @@ -149,7 +149,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int \param num_points The number of complex data values to be deinterleaved */ extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_16i_x2_a_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int 
num_points){ volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h index bac1f2e4b0..f6eccd77ee 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h +++ b/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h @@ -103,7 +103,7 @@ static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, cons \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (int16_t*)complexVector; int16_t* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h b/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h index cd2fabb521..f3d0c83524 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h +++ b/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h @@ -66,7 +66,7 @@ static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; int16_t* complexVectorPtr = (int16_t*)complexVector; int8_t* iBufferPtr = iBuffer; @@ -85,7 +85,7 @@ static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, con \param num_points The number 
of complex data values to be deinterleaved */ extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16ic_deinterleave_real_8i_a_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/kernels/volk/volk_16ic_magnitude_16i.h index 317075e85e..b33306a123 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a.h +++ b/volk/kernels/volk/volk_16ic_magnitude_16i.h @@ -161,7 +161,7 @@ static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -182,7 +182,7 @@ static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, c \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); -static inline void volk_16ic_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ +static inline void 
volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h index 1300395ff0..55243b4aa8 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h @@ -78,7 +78,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, floa \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; @@ -100,7 +100,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, \param num_points The number of complex data values to be deinterleaved */ extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_deinterleave_32f_x2_a_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ 
volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h index 5e2d82b947..57d078a595 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h +++ b/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h @@ -108,7 +108,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_16ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h index d20eea1a79..27901cb9ac 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h +++ b/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h @@ -149,7 +149,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co \param scalar The data value to be divided against each input data value of the input complex vector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int 
num_points){ const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -171,7 +171,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16ic_s32f_magnitude_32f_a_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/kernels/volk/volk_16u_byteswap.h index fc3eb5fa7a..57f2008991 100644 --- a/volk/include/volk/volk_16u_byteswap_a.h +++ b/volk/kernels/volk/volk_16u_byteswap.h @@ -1,3 +1,66 @@ +#ifndef INCLUDED_volk_16u_byteswap_u_H +#define INCLUDED_volk_16u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_16u_byteswap_u_H */ #ifndef INCLUDED_volk_16u_byteswap_a_H #define INCLUDED_volk_16u_byteswap_a_H @@ -68,7 +131,7 @@ static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned in \param numDataPoints The number of data points */ extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points); -static inline void volk_16u_byteswap_a_orc(uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){ volk_16u_byteswap_a_orc_impl(intsToSwap, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/kernels/volk/volk_32f_accumulator_s32f.h index 78364d0a01..a67d10f9b5 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a.h +++ 
b/volk/kernels/volk/volk_32f_accumulator_s32f.h @@ -50,7 +50,7 @@ static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* i \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_32f_accumulator_s32f_a_generic(float* result, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){ const float* aPtr = inputBuffer; unsigned int number = 0; float returnValue = 0; diff --git a/volk/kernels/volk/volk_32f_convert_64f.h b/volk/kernels/volk/volk_32f_convert_64f.h new file mode 100644 index 0000000000..2f036955dd --- /dev/null +++ b/volk/kernels/volk/volk_32f_convert_64f.h @@ -0,0 +1,140 @@ +#ifndef INCLUDED_volk_32f_convert_64f_u_H +#define INCLUDED_volk_32f_convert_64f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! 
+ \brief Converts the float values into double values + \param outputVector The converted double vector values + \param inputVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_u_H */ +#ifndef INCLUDED_volk_32f_convert_64f_a_H +#define INCLUDED_volk_32f_convert_64f_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_a_H */ diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/kernels/volk/volk_32f_index_max_16u.h index b9ca1dd3e7..dd1aed2459 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a.h +++ b/volk/kernels/volk/volk_32f_index_max_16u.h @@ -124,7 +124,7 @@ static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const floa #endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const float* src0, unsigned int num_points) { +static inline void volk_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) { if(num_points > 0){ float max = src0[0]; unsigned int index = 0; diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h index 43713f8b5a..71881c2d5f 100644 --- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h +++ b/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h @@ -87,7 +87,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. \param num_points The number of real values in the input vector. 
*/ -static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ +static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ if (num_points < 1) { return; } diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h index db61e359d6..bf05a882d5 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h +++ b/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h @@ -128,7 +128,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 \param noiseFloorAmplitude The noise floor of the input spectrum, in dB */ -static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ float sumMean = 0.0; unsigned int number; // find the sum (for mean), etc diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/kernels/volk/volk_32f_s32f_convert_16i.h index 56e42c9bd5..9fd758655f 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/kernels/volk/volk_32f_s32f_convert_16i.h @@ -127,7 +127,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const \param num_points The number of data values to be converted \note Input buffer does NOT need to be properly aligned */ -static inline 
void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; @@ -150,3 +150,153 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co #endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */ +#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H +#define INCLUDED_volk_32f_s32f_convert_16i_a_H + +#include <volk/volk_common.h> +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + // Scale and clip + ret1 = 
_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + 
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; + + for(number = 0; number < num_points; number++){ + r = *inputVectorPtr++ * scalar; + if(r < min_val) + r = min_val; + else if(r > max_val) + r = max_val; + *outputVectorPtr++ = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/kernels/volk/volk_32f_s32f_convert_32i.h index 38e6b2e745..1a46093ee2 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/kernels/volk/volk_32f_s32f_convert_32i.h @@ -1,3 +1,145 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H +#define INCLUDED_volk_32f_s32f_convert_32i_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = -2147483647; + float max_val = 2147483647; + float r; + + for(number = 0; number < num_points; number++){ + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)(r); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */ #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H #define INCLUDED_volk_32f_s32f_convert_32i_a_H diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/kernels/volk/volk_32f_s32f_convert_8i.h index 870e9419bb..b451505221 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/kernels/volk/volk_32f_s32f_convert_8i.h @@ -132,7 +132,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl \param num_points The number of data values to be converted \note Input buffer does NOT need to be properly aligned */ -static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = 
inputVector; unsigned int number = 0; @@ -155,3 +155,158 @@ static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, cons #endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */ +#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H +#define INCLUDED_volk_32f_s32f_convert_8i_a_H + +#include <volk/volk_common.h> +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), 
vmin_val); + inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + + float min_val = -128; + float max_val = 127; + float r; + + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), 
vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = -128; + float max_val = 127; + float r; + + for(number = 0; number < num_points; number++){ + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int8_t)(r); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */ diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h index 99b8e68c5b..2dd86a17c2 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h +++ b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h @@ -1,3 +1,105 @@ +#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H +#define INCLUDED_volk_32f_s32f_multiply_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE 
+#include <xmmintrin.h> +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied by the scalar + \param scalar The scalar value each element of aVector is multiplied by + \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied by the scalar + \param scalar The scalar value each element of aVector is multiplied by + \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + for(number = 0; number < num_points; number++){ + *outputPtr = (*inputPtr) * scalar; + inputPtr++; + outputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */ #ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H #define INCLUDED_volk_32f_s32f_multiply_32f_a_H @@ -108,7 +210,7 @@ static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, const fl \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_s32f_multiply_32f_a_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_normalize_a.h b/volk/kernels/volk/volk_32f_s32f_normalize.h index f5fd0d1dba..a0bd33c7dc 100644 --- a/volk/include/volk/volk_32f_s32f_normalize_a.h +++ b/volk/kernels/volk/volk_32f_s32f_normalize.h @@ -49,7 +49,7 @@ static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float s \param bVector One of the vectors to be normalizeed \param num_points The number of values in 
aVector and bVector to be normalizeed together and stored into cVector */ -static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){ unsigned int number = 0; float* inputPtr = vecBuffer; const float invScalar = 1.0 / scalar; @@ -69,7 +69,7 @@ static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const flo \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector */ extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_s32f_normalize_a_orc(float* vecBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){ float invscalar = 1.0 / scalar; volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); } diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/kernels/volk/volk_32f_s32f_power_32f.h index 633ad14b09..2822444686 100644 --- a/volk/include/volk/volk_32f_s32f_power_32f_a.h +++ b/volk/kernels/volk/volk_32f_s32f_power_32f.h @@ -127,7 +127,7 @@ static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aV \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32f_s32f_power_32f_a_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ +static inline void volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; unsigned int number = 0; diff --git 
a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/kernels/volk/volk_32f_s32f_stddev_32f.h index 98401b2d42..0622b278a6 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h +++ b/volk/kernels/volk/volk_32f_s32f_stddev_32f.h @@ -120,7 +120,7 @@ static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* in \param mean The mean of the input buffer \param num_points The number of values in input buffer to used in the stddev calculation */ -static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ +static inline void volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ float returnValue = 0; if(num_points > 0){ const float* aPtr = inputBuffer; diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/kernels/volk/volk_32f_sqrt_32f.h index d9b16fc0fb..ab9fffd7dc 100644 --- a/volk/include/volk/volk_32f_sqrt_32f_a.h +++ b/volk/kernels/volk/volk_32f_sqrt_32f.h @@ -47,7 +47,7 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, \param aVector One of the vectors to be sqrted \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector */ -static inline void volk_32f_sqrt_32f_a_generic(float* cVector, const float* aVector, unsigned int num_points){ +static inline void volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; unsigned int number = 0; @@ -66,7 +66,7 @@ extern void volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int); \param aVector One of the vectors to be sqrted \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector */ -static inline void volk_32f_sqrt_32f_a_orc(float* cVector, const float* aVector, unsigned int num_points){ +static inline void 
volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points){ volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points); } diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h index 7de32f7b18..9bded6713d 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h +++ b/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h @@ -143,7 +143,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m \param inputBuffer The buffer of points to calculate the std deviation for \param num_points The number of values in input buffer to used in the stddev and mean calculations */ -static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ +static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ float returnValue = 0; float newMean = 0; if(num_points > 0){ diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/kernels/volk/volk_32f_x2_add_32f.h index 51e63e54d2..42278f6068 100644 --- a/volk/include/volk/volk_32f_x2_add_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_add_32f.h @@ -1,3 +1,69 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_u_H +#define INCLUDED_volk_32f_x2_add_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! 
+ \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */ #ifndef INCLUDED_volk_32f_x2_add_32f_a_H #define INCLUDED_volk_32f_x2_add_32f_a_H @@ -72,7 +138,7 @@ static inline void volk_32f_x2_add_32f_a_generic(float* cVector, const float* aV \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_add_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/kernels/volk/volk_32f_x2_divide_32f.h index 7b60fb22ef..d5a7c7d7c0 100644 --- a/volk/include/volk/volk_32f_x2_divide_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_divide_32f.h @@ -51,7 +51,7 @@ static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVe \param bVector The divisor vector \param num_points The number of values in aVector and bVector to be 
divideed together and stored into cVector */ -static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -72,7 +72,7 @@ static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float* \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector */ extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_divide_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h index 961c2418ca..8fcc7deaed 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h +++ b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { float dotProduct = 0; const float* aPtr = input; diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h new file mode 100644 index 0000000000..b91252e36f --- /dev/null +++ b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h @@ -0,0 +1,580 @@ +#ifndef 
INCLUDED_volk_32f_x2_dot_prod_32f_u_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H + +#include <volk/volk_common.h> +#include<stdio.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = 
_mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_SSE3 + +#include <pmmintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + 
__VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_SSE4_1 + +#include <smmintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; + + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; 
+ + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#ifdef LV_HAVE_AVX + +#include <immintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val; + __m256 b0Val, b1Val; + __m256 c0Val, c1Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr+8); + b0Val = _mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr+8); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_AVX*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/ +#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H + +#include <volk/volk_common.h> +#include<stdio.h> + + +#ifdef 
LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into 
the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_SSE3 + +#include <pmmintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = 
dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_SSE4_1 + +#include <smmintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif 
/*LV_HAVE_SSE4_1*/ + +#ifdef LV_HAVE_AVX + +#include <immintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val; + __m256 b0Val, b1Val; + __m256 c0Val, c1Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr+8); + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr+8); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_AVX*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h index 52d80b6bb3..0935cb32bd 100644 --- a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h +++ b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h @@ -56,7 +56,7 @@ static inline void 
volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, c \param complexVector The complex output vector \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_interleave_32fc_a_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ +static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ float* complexVectorPtr = (float*)complexVector; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/kernels/volk/volk_32f_x2_max_32f.h index 79f2d04b56..27633acae8 100644 --- a/volk/include/volk/volk_32f_x2_max_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_max_32f.h @@ -53,7 +53,7 @@ static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVecto \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_max_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -76,7 +76,7 @@ static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aV \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_max_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, const float* bVector, 
unsigned int num_points){ volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/kernels/volk/volk_32f_x2_min_32f.h index 42cac08339..4773d13211 100644 --- a/volk/include/volk/volk_32f_x2_min_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_min_32f.h @@ -53,7 +53,7 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVecto \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_min_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -76,7 +76,7 @@ static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aV \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_min_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/kernels/volk/volk_32f_x2_multiply_32f.h index 340e051657..9fdbec0a2c 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_multiply_32f.h @@ -1,3 +1,109 @@ +#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H +#define INCLUDED_volk_32f_x2_multiply_32f_u_H + 
+#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! + \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! 
+ \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */ #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H #define INCLUDED_volk_32f_x2_multiply_32f_a_H @@ -111,7 +217,7 @@ static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, const floa \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_multiply_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h index 10fc267dcd..ce7b91a318 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h +++ b/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h @@ -137,7 +137,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* 
complexVect \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be interleaved */ -static inline void volk_32f_x2_s32f_interleave_16ic_a_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ +static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ int16_t* complexVectorPtr = (int16_t*)complexVector; const float* iBufferPtr = iBuffer; const float* qBufferPtr = qBuffer; diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/kernels/volk/volk_32f_x2_subtract_32f.h index e2b8be797f..8ea491f988 100644 --- a/volk/include/volk/volk_32f_x2_subtract_32f_a.h +++ b/volk/kernels/volk/volk_32f_x2_subtract_32f.h @@ -51,7 +51,7 @@ static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* a \param bVector The vector to be subtracted \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector */ -static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ float* cPtr = cVector; const float* aPtr = aVector; const float* bPtr= bVector; @@ -72,7 +72,7 @@ static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const floa \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector */ extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_x2_subtract_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +static 
inline void volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h index e33e5a916a..e975f14e92 100644 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h +++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h @@ -101,7 +101,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) { +static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) { const unsigned int num_bytes = num_points*4; diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h index 109b787e8c..e0a8a59ced 100644 --- a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) { +static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) { float res[2]; float *realpt = &res[0], *imagpt = &res[1]; diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h index 28d584bf2c..104e3250e6 100644 --- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h @@ -64,7 +64,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const l \param bVector The 
vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; const float* bPtr= bVector; @@ -85,7 +85,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, con \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector */ extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32fc_32f_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ +static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/kernels/volk/volk_32fc_conjugate_32fc.h index 919280d510..dce897ff57 100644 --- a/volk/include/volk/volk_32fc_conjugate_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h @@ -1,3 +1,67 @@ +#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_conjugate_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Takes the conjugate of a complex vector. 
+ \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_storeu_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */ #ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H #define INCLUDED_volk_32fc_conjugate_32fc_a_H diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h index 4106f38513..0d33ed7e28 100644 --- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h @@ -57,7 +57,7 @@ static inline void 
volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qB \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; diff --git a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h new file mode 100644 index 0000000000..4a4c5509bd --- /dev/null +++ b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h @@ -0,0 +1,156 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_u_H +#define INCLUDED_volk_32fc_deinterleave_64f_x2_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> +/*! 
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_storeu_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_storeu_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_u_H */ +#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H +#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> +/*! 
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h index c88809bebd..b1968296f5 100644 --- a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h @@ -51,7 +51,7 @@ static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const l \param qBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_imag_32f_a_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; float* qBufferPtr = qBuffer; diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h index 0d6c6b7af4..3d57598135 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h @@ -51,7 +51,7 @@ static 
inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const l \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_32f_a_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; float* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h index 1e346bacaf..1fa66e8add 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h @@ -49,7 +49,7 @@ static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_deinterleave_real_64f_a_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const float* complexVectorPtr = (float*)complexVector; double* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/kernels/volk/volk_32fc_index_max_16u.h index 0e2201152c..c8d7212401 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a.h +++ b/volk/kernels/volk/volk_32fc_index_max_16u.h @@ -189,7 +189,7 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_ #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) { +static inline void 
volk_32fc_index_max_16u_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) { const unsigned int num_bytes = num_points*8; diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h index efb84a904b..64e99cc1be 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_a.h +++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h @@ -1,3 +1,121 @@ +#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H +#define INCLUDED_volk_32fc_magnitude_32f_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = 
*complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H #define INCLUDED_volk_32fc_magnitude_32f_a_H @@ -123,7 +241,7 @@ static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, con \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); -static inline void volk_32fc_magnitude_32f_a_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h new file mode 100644 index 0000000000..0af81401a8 --- /dev/null +++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h @@ -0,0 +1,228 @@ +#ifndef 
INCLUDED_volk_32fc_magnitude_squared_32f_u_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ +#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h index d86bd63c1c..b076ab44ef 100644 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -139,7 +139,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv \param normalizeFactor The atan2 results will be divided by this normalization factor. \param num_points The number of complex values in the input vector. 
*/ -static inline void volk_32fc_s32f_atan2_32f_a_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ float* outPtr = outputVector; const float* inPtr = (float*)inputVector; const float invNormalizeFactor = 1.0 / normalizeFactor; diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h index 1c17fb70c6..9e10217a0f 100644 --- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h @@ -63,7 +63,7 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_32fc_s32f_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; int16_t* iBufferPtr = iBuffer; unsigned int number = 0; diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h b/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h index 38fd609d31..09abd967d6 100644 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h @@ -129,7 +129,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void 
volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ const float* complexVectorPtr = (float*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; @@ -150,7 +150,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVect \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ extern void volk_32fc_s32f_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_32fc_s32f_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_32fc_s32f_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ volk_32fc_s32f_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/kernels/volk/volk_32fc_s32f_power_32fc.h index 3106edbefd..d4a1d17469 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_power_32fc.h @@ -94,7 +94,7 @@ static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_ \param power The power value to be applied to each data point \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector */ -static inline void volk_32fc_s32f_power_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ +static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const 
lv_32fc_t* aVector, const float power, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; unsigned int number = 0; diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h index 30a77dbc18..f76d9d35e4 100644 --- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h @@ -96,7 +96,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu \param normalizationFactor This value is divided agains all the input values before the power is calculated \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_power_spectrum_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ +static inline void volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ // Calculate the Power of the complex point const float* inputPtr = (float*)complexFFTInput; float* realFFTDataPointsPtr = logPowerOutput; diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h index 27f755351d..e73eb09f8f 100644 --- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h +++ b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h @@ -103,7 +103,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo \param rbw The resolution bandwith of the fft spectrum \param num_points The number of fft data points */ -static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ +static inline void 
volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ // Calculate the Power of the complex point const float* inputPtr = (float*)complexFFTInput; float* realFFTDataPointsPtr = logPowerOutput; diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h index f206c5e874..668a047609 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h @@ -1,3 +1,90 @@ +#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H +#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> +/*! + \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = 
_mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H diff --git a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h index eee9f0064f..ab6b7fb1df 100644 --- a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h @@ -4,7 +4,7 @@ #include <volk/volk_complex.h> #include <stdio.h> -#include <volk/volk_32fc_s32fc_x2_rotator_32fc_a.h> +#include <volk/volk_32fc_s32fc_x2_rotator_32fc.h> #ifdef 
LV_HAVE_GENERIC @@ -19,9 +19,9 @@ */ -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; - volk_32fc_s32fc_x2_rotator_32fc_a_generic(outVector, inVector, phase_inc, phase, num_points); + volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc, phase, num_points); } #endif /* LV_HAVE_GENERIC */ @@ -32,7 +32,7 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVe static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; - volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, phase, num_points); + volk_32fc_s32fc_x2_rotator_32fc_sse4_1(outVector, inVector, phase_inc, phase, num_points); } @@ -58,7 +58,7 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; - volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc, phase, num_points); + volk_32fc_s32fc_x2_rotator_32fc_avx(outVector, inVector, phase_inc, phase, num_points); } diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h index 51b6041ec0..ffbbdff690 100644 --- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h @@ -20,7 +20,7 @@ */ -static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* 
outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ unsigned int i = 0; int j = 0; for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) { @@ -42,7 +42,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVecto #ifdef LV_HAVE_SSE4_1 #include <smmintrin.h> -static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; @@ -153,7 +153,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector -static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h index 0deb9c2f90..e6ccf5c384 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h @@ -1,3 +1,152 @@ +#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H +#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H + + +#include<volk/volk_complex.h> + + +#ifdef 
LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + unsigned int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE3 + +#include <xmmintrin.h> +#include <pmmintrin.h> +#include <mmintrin.h> + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int num_bytes = num_points*8; + + // Variable never used? 
+ //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + union HalfMask { + uint32_t intRep[4]; + __m128 vec; + } halfMask; + + union NegMask { + int intRep[4]; + __m128 vec; + } negMask; + + unsigned int offset = 0; + float Rsum=0, Isum=0; + float Im,Re; + + __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; + __m128 zv = {0,0,0,0}; + + halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; + halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; + + negMask.intRep[0] = negMask.intRep[2] = 0x80000000; + negMask.intRep[1] = negMask.intRep[3] = 0; + + // main loop + while(num_bytes >= 4*sizeof(float)){ + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = _mm_mul_ps(in1, in2); + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = _mm_mul_ps(in1, fehg); + Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + num_bytes -= 4*sizeof(float); + offset += 2; + Rsum += Re; + Isum += Im; + } + + // handle the last complex case ... 
+ if(num_bytes > 0){ + + if(num_bytes != 4){ + // bad things are happening + } + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec); + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec); + Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + Rsum += Re; + Isum += Im; + } + + result[0] = lv_cmake(Rsum,Isum); + return; +} + +#endif /*LV_HAVE_SSE3*/ + + +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ + + + #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index 10ff4080ed..066bed4439 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -1,3 +1,119 @@ +#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H +#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H + +#include <volk/volk_common.h> +#include <volk/volk_complex.h> +#include <stdio.h> +#include <string.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_points/2; + unsigned int isodd = num_points &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + unsigned int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; 
+ + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[num_points - 1] * taps[num_points - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE3 + +#include <pmmintrin.h> + +static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_points/2; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + + _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if(num_points % 1 != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h 
b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h index f79ddb59bf..7db68c1bd8 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h +++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h @@ -1,3 +1,80 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + 
+#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H @@ -81,7 +158,7 @@ static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, cons \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); -static inline void volk_32fc_x2_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h new file mode 100644 index 0000000000..cfd6c007f1 --- /dev/null +++ 
b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h @@ -0,0 +1,162 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + y = _mm_xor_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} 
+#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! 
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + y = _mm_xor_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h index d985fcd7f5..cb2e945015 100644 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h +++ b/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h @@ -107,7 +107,7 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) { +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) { const unsigned int num_bytes = num_points*8; diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h index a10b6702bb..27a081b7cf 100644 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h +++ 
b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h @@ -93,7 +93,7 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { +static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { const unsigned int num_bytes = num_points*8; diff --git a/volk/kernels/volk/volk_32i_s32f_convert_32f.h b/volk/kernels/volk/volk_32i_s32f_convert_32f.h new file mode 100644 index 0000000000..7a09883453 --- /dev/null +++ b/volk/kernels/volk/volk_32i_s32f_convert_32f.h @@ -0,0 +1,148 @@ +#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H +#define INCLUDED_volk_32i_s32f_convert_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + 
ret = _mm_mul_ps(ret, invScalar); + + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */ +#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H +#define INCLUDED_volk_32i_s32f_convert_32f_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + + /*! 
+ \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_store_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/kernels/volk/volk_32i_x2_and_32i.h index e5330847b3..54ecb79812 100644 --- a/volk/include/volk/volk_32i_x2_and_32i_a.h +++ b/volk/kernels/volk/volk_32i_x2_and_32i.h @@ -51,7 +51,7 @@ static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aV \param bVector One of the vectors \param num_points The number of values in aVector and bVector to be anded together and stored into cVector */ -static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ int32_t* cPtr = cVector; const int32_t* aPtr = aVector; const int32_t* bPtr= bVector; @@ -72,7 +72,7 @@ static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t \param num_points The number of values in aVector and bVector to be anded together and stored into cVector */ extern void 
volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_and_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/kernels/volk/volk_32i_x2_or_32i.h index 24045894c6..acadd5a57f 100644 --- a/volk/include/volk/volk_32i_x2_or_32i_a.h +++ b/volk/kernels/volk/volk_32i_x2_or_32i.h @@ -51,7 +51,7 @@ static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVe \param bVector One of the vectors to be ored \param num_points The number of values in aVector and bVector to be ored together and stored into cVector */ -static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ int32_t* cPtr = cVector; const int32_t* aPtr = aVector; const int32_t* bPtr= bVector; @@ -72,7 +72,7 @@ static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t* \param num_points The number of values in aVector and bVector to be ored together and stored into cVector */ extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32i_x2_or_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ +static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ volk_32i_x2_or_32i_a_orc_impl(cVector, 
aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/kernels/volk/volk_32u_byteswap.h index 71ae027d37..8f6e3ad7b5 100644 --- a/volk/include/volk/volk_32u_byteswap_a.h +++ b/volk/kernels/volk/volk_32u_byteswap.h @@ -1,3 +1,80 @@ +#ifndef INCLUDED_volk_32u_byteswap_u_H +#define INCLUDED_volk_32u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints*4; + for(; number < num_points; number++){ + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32u_byteswap_u_H */ #ifndef INCLUDED_volk_32u_byteswap_a_H #define INCLUDED_volk_32u_byteswap_a_H diff --git a/volk/include/volk/volk_32u_popcnt_a.h b/volk/kernels/volk/volk_32u_popcnt.h index b72d605c67..9783569729 100644 --- a/volk/include/volk/volk_32u_popcnt_a.h +++ b/volk/kernels/volk/volk_32u_popcnt.h @@ -7,7 +7,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32u_popcnt_a_generic(uint32_t* ret, const uint32_t value) { 
+static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) { // This is faster than a lookup table uint32_t retVal = value; diff --git a/volk/kernels/volk/volk_64f_convert_32f.h b/volk/kernels/volk/volk_64f_convert_32f.h new file mode 100644 index 0000000000..c27526ffaf --- /dev/null +++ b/volk/kernels/volk/volk_64f_convert_32f.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_volk_64f_convert_32f_u_H +#define INCLUDED_volk_64f_convert_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_u_H */ +#ifndef INCLUDED_volk_64f_convert_32f_a_H +#define INCLUDED_volk_64f_convert_32f_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC 
+/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_a_H */ diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/kernels/volk/volk_64f_x2_max_64f.h index 33aae6d102..f9a04c2c40 100644 --- a/volk/include/volk/volk_64f_x2_max_64f_a.h +++ b/volk/kernels/volk/volk_64f_x2_max_64f.h @@ -53,7 +53,7 @@ static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVe \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline void volk_64f_x2_max_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ double* cPtr = cVector; const double* aPtr = aVector; const double* bPtr= bVector; diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/kernels/volk/volk_64f_x2_min_64f.h index 25d8b4c982..c77ca87fbd 100644 --- a/volk/include/volk/volk_64f_x2_min_64f_a.h +++ b/volk/kernels/volk/volk_64f_x2_min_64f.h @@ -53,7 +53,7 @@ static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVe \param bVector The vector to be checked \param num_points The number of values in aVector and bVector to be checked and stored into cVector */ -static inline 
void volk_64f_x2_min_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ +static inline void volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ double* cPtr = cVector; const double* aPtr = aVector; const double* bPtr= bVector; diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/kernels/volk/volk_64u_byteswap.h index 3d1d87623e..e05daf6d5c 100644 --- a/volk/include/volk/volk_64u_byteswap_a.h +++ b/volk/kernels/volk/volk_64u_byteswap.h @@ -1,3 +1,91 @@ +#ifndef INCLUDED_volk_64u_byteswap_u_H +#define INCLUDED_volk_64u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + uint64_t number = 0; + const unsigned int halfPoints = num_points / 2; + for(;number < halfPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_loadu_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = halfPoints*2; + for(; number < num_points; number++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. 
+ \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64u_byteswap_u_H */ #ifndef INCLUDED_volk_64u_byteswap_a_H #define INCLUDED_volk_64u_byteswap_a_H diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/kernels/volk/volk_64u_popcnt.h index 5e68ed2083..466cfa5dad 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/kernels/volk/volk_64u_popcnt.h @@ -8,7 +8,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) { +static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) { //const uint32_t* valueVector = (const uint32_t*)&value; diff --git a/volk/include/volk/volk_8i_convert_16i_a.h b/volk/kernels/volk/volk_8i_convert_16i.h index 9104f90cb0..3e5c92723f 100644 --- a/volk/include/volk/volk_8i_convert_16i_a.h +++ b/volk/kernels/volk/volk_8i_convert_16i.h @@ -1,3 +1,76 @@ +#ifndef INCLUDED_volk_8i_convert_16i_u_H +#define INCLUDED_volk_8i_convert_16i_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> + + /*! 
+ \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ #ifndef INCLUDED_volk_8i_convert_16i_a_H #define INCLUDED_volk_8i_convert_16i_a_H @@ -73,7 +146,7 @@ static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const in \param num_points The number of data values to be converted */ extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); -static inline void volk_8i_convert_16i_a_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ +static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/kernels/volk/volk_8i_s32f_convert_32f.h index 02a7f356e0..bd7ff82d9a 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a.h +++ b/volk/kernels/volk/volk_8i_s32f_convert_32f.h @@ -1,3 +1,97 @@ +#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H +#define INCLUDED_volk_8i_s32f_convert_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE4_1 +#include <smmintrin.h> + + /*! 
+ \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1( iScalar ); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] 
= (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H #define INCLUDED_volk_8i_s32f_convert_32f_a_H @@ -95,7 +189,7 @@ static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, const \param num_points The number of data values to be converted */ extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); -static inline void volk_8i_s32f_convert_32f_a_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ float invscalar = 1.0 / scalar; volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points); } diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h 
b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h index 8f13da32ff..b59d22d186 100644 --- a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h +++ b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h @@ -59,7 +59,7 @@ static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16 \param qBuffer The Q buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ const int8_t* complexVectorPtr = (const int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; int16_t* qBufferPtr = qBuffer; diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h index d26b3d0d0d..82cedb2bb7 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h +++ b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h @@ -49,7 +49,7 @@ static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, con \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (const int8_t*)complexVector; int16_t* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h index 21efed83e7..c8ff18e67b 100644 --- a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h +++ b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h @@ -50,7 +50,7 @@ 
static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const \param iBuffer The I buffer output data \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (int8_t*)complexVector; int8_t* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h index d82da59fb1..9e244c8fc2 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h @@ -146,7 +146,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float \param scalar The scaling value being multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ const int8_t* complexVectorPtr = (const int8_t*)complexVector; float* iBufferPtr = iBuffer; float* qBufferPtr = qBuffer; diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h index b2c15d3a30..56a1adcbb5 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h +++ b/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h @@ -116,7 +116,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con \param scalar The scaling value being 
multiplied against each data point \param num_points The number of complex data values to be deinterleaved */ -static inline void volk_8ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const int8_t* complexVectorPtr = (const int8_t*)complexVector; float* iBufferPtr = iBuffer; diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h index f85fdb9995..685a21ddcd 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h +++ b/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h @@ -75,7 +75,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect \param bVector The complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ unsigned int number = 0; int16_t* c16Ptr = (int16_t*)cVector; int8_t* a8Ptr = (int8_t*)aVector; diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h index 4b16171cec..edb52ff509 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h +++ b/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h @@ -95,7 +95,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* \param bVector The 
complex vector which will be converted to complex conjugate and multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ unsigned int number = 0; float* cPtr = (float*)cVector; const float invScalar = 1.0 / scalar; diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt index d9aeb797c3..fcb0edb68f 100644 --- a/volk/lib/CMakeLists.txt +++ b/volk/lib/CMakeLists.txt @@ -202,7 +202,7 @@ message(STATUS "Available machines: ${available_machines}") #dependencies are all python, xml, and header implementation files file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml) file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py) -file(GLOB h_files ${CMAKE_SOURCE_DIR}/include/volk/*.h) +file(GLOB h_files ${CMAKE_SOURCE_DIR}/kernels/volk/*.h) macro(gen_template tmpl output) list(APPEND volk_gen_sources ${output}) @@ -253,6 +253,7 @@ endforeach(machine_name) include_directories( ${CMAKE_BINARY_DIR}/include ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/kernels ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 4e361aece2..e526eb2d01 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -63,12 +63,12 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) { } } -static std::vector<std::string> get_arch_list(struct volk_func_desc desc) { +static std::vector<std::string> get_arch_list(volk_func_desc_t desc) { std::vector<std::string> archlist; - for(int i = 0; i < desc.n_archs; i++) { + for(size_t i = 0; i < desc.n_impls; i++) { //if(!(archs[i+1] & 
volk_get_lvarch())) continue; //this arch isn't available on this pc - archlist.push_back(std::string(desc.indices[i])); + archlist.push_back(std::string(desc.impl_names[i])); } return archlist; @@ -256,7 +256,7 @@ public: private: std::list<std::vector<char> > _mems; }; -bool run_volk_tests(struct volk_func_desc desc, +bool run_volk_tests(volk_func_desc_t desc, void (*manual_func)(), std::string name, float tol, @@ -442,22 +442,32 @@ bool run_volk_tests(struct volk_func_desc desc, arch_results.push_back(!fail); } - double best_time = std::numeric_limits<double>::max(); - std::string best_arch = "generic"; - for(size_t i=0; i < arch_list.size(); i++) { - if((profile_times[i] < best_time) && arch_results[i]) { - best_time = profile_times[i]; - best_arch = arch_list[i]; + double best_time_a = std::numeric_limits<double>::max(); + double best_time_u = std::numeric_limits<double>::max(); + std::string best_arch_a = "generic"; + std::string best_arch_u = "generic"; + for(size_t i=0; i < arch_list.size(); i++) + { + if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) + { + best_time_u = profile_times[i]; + best_arch_u = arch_list[i]; + } + if((profile_times[i] < best_time_a) && arch_results[i]) + { + best_time_a = profile_times[i]; + best_arch_a = arch_list[i]; } } - std::cout << "Best arch: " << best_arch << std::endl; + std::cout << "Best aligned arch: " << best_arch_a << std::endl; + std::cout << "Best unaligned arch: " << best_arch_u << std::endl; if(best_arch_vector) { if(puppet_master_name == "NULL") { - best_arch_vector->push_back(name + std::string(" ") + best_arch); + best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u); } else { - best_arch_vector->push_back(puppet_master_name + std::string(" ") + best_arch); + best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u); } } diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h index 1e639ac3c6..0f17cdaa34 100644 --- 
a/volk/lib/qa_utils.h +++ b/volk/lib/qa_utils.h @@ -21,7 +21,7 @@ volk_type_t volk_type_from_string(std::string); float uniform(void); void random_floats(float *buf, unsigned n); -bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string); +bool run_volk_tests(volk_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string); #define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); } diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 9b7d9da5e3..f133897cba 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -2,109 +2,89 @@ #include <volk/volk.h> #include <boost/test/unit_test.hpp> -//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000); -//VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a, 1, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 20460, 1); -VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 20460, 1); -VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 20460, 1); -VOLK_RUN_TESTS(volk_16i_convert_8i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1); -//VOLK_RUN_TESTS(volk_16i_max_star_16i_a, 0, 0, 20460, 10000); -//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a, 0, 0, 20460, 10000); -//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 
1000); -//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000); -VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_16u_byteswap_u, 0, 0, 20460, 1); -//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1); -VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 20460, 1); -//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); -VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1); -VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 20460, 1); 
-VOLK_RUN_TESTS(volk_32f_convert_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a, 1, 128, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 20460, 1); -//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 2046, 1); -VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1); -//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32767, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 1); -VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32i_x2_and_32i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_32i_x2_or_32i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32u_byteswap_a, 0, 0, 20460, 1); -//VOLK_RUN_TESTS(volk_32u_popcnt_a, 0, 0, 
2046, 10000); -VOLK_RUN_TESTS(volk_64f_convert_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_64f_x2_max_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_64f_x2_min_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_64u_byteswap_a, 0, 0, 20460, 1); -//VOLK_RUN_TESTS(volk_64u_popcnt_a, 0, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a, 0, 256, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 1); -VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_conjugate_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_conjugate_32fc_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc_a, 1e-2, 
(lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1); +//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000); +//VOLK_RUN_TESTS(volk_16i_branch_4_state_8, 1e-4, 2046, 10000); +VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_magnitude_16i, 1, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16i_convert_8i, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_16i_max_star_16i, 0, 0, 20460, 10000); +//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i, 0, 0, 20460, 10000); +//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 1000); +//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 1000); +VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20460, 1); +//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f, 
1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 2046000, 1); +VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32fc_index_max_16u, 3, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_magnitude_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_16i, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_32i, 1, 2<<31, 20460, 1); +VOLK_RUN_TESTS(volk_32f_convert_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_8i, 1, 128, 20460, 1); +//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000); +VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 2046, 1); +VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_divide_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 1); +//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000); +VOLK_RUN_TESTS(volk_32f_index_max_16u, 3, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic, 1, 32767, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_max_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_min_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_normalize, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_power_32f, 1e-4, 4, 20460, 1); +VOLK_RUN_TESTS(volk_32f_sqrt_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_subtract_32f, 1e-4, 0, 20460, 1); 
+VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000); +VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8i_convert_16i, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8i_s32f_convert_32f, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_conjugate_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_multiply_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1); diff --git a/volk/lib/volk_prefs.c b/volk/lib/volk_prefs.c index 5e5c9dfff7..f787b5e2aa 100644 --- a/volk/lib/volk_prefs.c +++ b/volk/lib/volk_prefs.c @@ -7,7 +7,8 @@ //#include <Windows.h> //#endif -void get_config_path(char *path) { +void 
volk_get_config_path(char *path) +{ const char *suffix = "/.volk/volk_config"; char *home = NULL; if (home == NULL) home = getenv("HOME"); @@ -20,38 +21,30 @@ void get_config_path(char *path) { strcat(path, suffix); } -//passing by reference in C can (***********) -int load_preferences(struct volk_arch_pref **prefs) { +size_t volk_load_preferences(volk_arch_pref_t **prefs_res) +{ FILE *config_file; - char path[512], line[512], function[128], arch[32]; - int n_arch_prefs = 0; - struct volk_arch_pref *t_pref; + char path[512], line[512]; + size_t n_arch_prefs = 0; + volk_arch_pref_t *prefs = NULL; //get the config path - get_config_path(path); + volk_get_config_path(path); if (path == NULL) return n_arch_prefs; //no prefs found config_file = fopen(path, "r"); if(!config_file) return n_arch_prefs; //no prefs found - while(fgets(line, 512, config_file) != NULL) { - if(sscanf(line, "%s %s", function, arch) == 2 && !strncmp(function, "volk_", 5)) { - n_arch_prefs++; - } - } - - //now allocate the memory required for volk_arch_prefs - (*prefs) = (struct volk_arch_pref *) malloc(n_arch_prefs * sizeof(struct volk_arch_pref)); - t_pref = (*prefs); - //reset the file pointer and write the prefs into volk_arch_prefs - rewind(config_file); - while(fgets(line, 512, config_file) != NULL) { - if(sscanf(line, "%s %s", function, arch) == 2 && !strncmp(function, "volk_", 5)) { - strncpy(t_pref->name, function, 128); - strncpy(t_pref->arch, arch, 32); - t_pref++; + while(fgets(line, sizeof(line), config_file) != NULL) + { + prefs = (volk_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs)); + volk_arch_pref_t *p = prefs + n_arch_prefs; + if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5)) + { + n_arch_prefs++; } } fclose(config_file); + *prefs_res = prefs; return n_arch_prefs; } diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c index 865d60955c..dfe44e2924 100644 --- a/volk/lib/volk_rank_archs.c +++ 
b/volk/lib/volk_rank_archs.c @@ -1,43 +1,105 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + #include <volk_rank_archs.h> #include <volk/volk_prefs.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name) { +#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4 + #define __popcnt __builtin_popcount +#elif HAVE_INTRIN_H + #include <intrin.h> //defines __popcnt +#else + #error no popcnt for your compiler, add one here +#endif + +int volk_get_index( + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find +){ unsigned int i; - for(i=0; i<n_archs; i++) { - if(!strncmp(indices[i], arch_name, 20)) { + for (i = 0; i < n_impls; i++) { + if(!strncmp(impl_names[i], impl_name, 20)) { return i; } } + //TODO return -1; //something terrible should happen here printf("Volk warning: no arch found, returning generic impl\n"); - return get_index(indices, n_archs, "generic"); //but we'll fake it for now + return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now } 
-unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsigned int n_archs, const char* name, unsigned int arch) { - unsigned int i; - unsigned int best_val = 0; - static struct volk_arch_pref *volk_arch_prefs; - static unsigned int n_arch_prefs = 0; +int volk_rank_archs( + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int* impl_deps, //requirement mask per implementation + const bool* alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations +){ + size_t i; + static volk_arch_pref_t *volk_arch_prefs; + static size_t n_arch_prefs = 0; static int prefs_loaded = 0; if(!prefs_loaded) { - n_arch_prefs = load_preferences(&volk_arch_prefs); + n_arch_prefs = volk_load_preferences(&volk_arch_prefs); prefs_loaded = 1; } - //now look for the function name in the prefs list - for(i=0; i < n_arch_prefs; i++) { - if(!strncmp(name, volk_arch_prefs[i].name, 128)) { //found it - return get_index(indices, n_archs, volk_arch_prefs[i].arch); - } - } + //now look for the function name in the prefs list + for(i = 0; i < n_arch_prefs; i++) + { + if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it + { + const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u; + return volk_get_index(impl_names, n_impls, impl_name); + } + } - for(i=1; i < n_archs; ++i) { - if((arch_defs[i]&(!arch)) == 0) { - best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? 
i-1 : best_val; + //return the best index with the largest deps + size_t best_index_a = 0; + size_t best_index_u = 0; + int best_value_a = -1; + int best_value_u = -1; + for(i = 0; i < n_impls; i++) + { + const signed val = __popcnt(impl_deps[i]); + if (alignment[i] && val > best_value_a) + { + best_index_a = i; + best_value_a = val; + } + if (!alignment[i] && val > best_value_u) + { + best_index_u = i; + best_value_u = val; + } } - } - return best_val; + + //when align and we found a best aligned, use it + if (align && best_value_a != -1) return best_index_a; + + //otherwise return the best unaligned + return best_index_u; } diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h index 546240d2c6..b3bf8ff17c 100644 --- a/volk/lib/volk_rank_archs.h +++ b/volk/lib/volk_rank_archs.h @@ -1,12 +1,48 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. 
+ */ + #ifndef INCLUDED_VOLK_RANK_ARCHS_H #define INCLUDED_VOLK_RANK_ARCHS_H +#include <stdlib.h> +#include <stdbool.h> + #ifdef __cplusplus extern "C" { #endif -unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name); -unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsigned int n_archs, const char *name, unsigned int arch); +int volk_get_index( + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find +); + +int volk_rank_archs( + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int* impl_deps, //requirement mask per implementation + const bool* alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations +); #ifdef __cplusplus } diff --git a/volk/tmpl/volk.tmpl.c b/volk/tmpl/volk.tmpl.c index c3a1544ff8..f915f157f6 100644 --- a/volk/tmpl/volk.tmpl.c +++ b/volk/tmpl/volk.tmpl.c @@ -27,6 +27,10 @@ #include <volk/volk.h> #include <stdio.h> #include <string.h> +#include <assert.h> + +static size_t __alignment = 0; +static intptr_t __alignment_mask = 0; struct volk_machine *get_machine(void) { extern struct volk_machine *volk_machines[]; @@ -46,45 +50,118 @@ struct volk_machine *get_machine(void) { } } printf("Using Volk machine: %s\n", machine->name); + __alignment = machine->alignment; + __alignment_mask = (intptr_t)(__alignment-1); return machine; } } -unsigned int volk_get_alignment(void) { - return get_machine()->alignment; +size_t volk_get_alignment(void) +{ + get_machine(); //ensures alignment is set + return __alignment; +} + +bool volk_is_aligned(const void *ptr) +{ + return ((intptr_t)(ptr) & __alignment_mask) == 0; } +#define LV_HAVE_GENERIC +#define LV_HAVE_DISPATCHER + #for $kern in $kernels 
-void get_$(kern.name)($kern.arglist_namedefs) { - $kern.name = get_machine()->$(kern.name)_archs[volk_rank_archs( - get_machine()->$(kern.name)_indices, - get_machine()->$(kern.name)_arch_defs, - get_machine()->$(kern.name)_n_archs, - get_machine()->$(kern.name)_name, - volk_get_lvarch() - )]; +#if $kern.has_dispatcher +#include <volk/$(kern.name).h> //pulls in the dispatcher +#end if + +static inline void __$(kern.name)_d($kern.arglist_full) +{ + #if $kern.has_dispatcher + $(kern.name)_dispatcher($kern.arglist_names); + return; + #end if + + if (volk_is_aligned( + #set $num_open_parens = 0 + #for $arg_type, $arg_name in $kern.args + #if '*' in $arg_type + VOLK_OR_PTR($arg_name, + #set $num_open_parens += 1 + #end if + #end for + 0$(')'*$num_open_parens) + )){ + $(kern.name)_a($kern.arglist_names); + } + else{ + $(kern.name)_u($kern.arglist_names); + } +} + +static inline void __init_$(kern.name)(void) +{ + const char *name = get_machine()->$(kern.name)_name; + const char **impl_names = get_machine()->$(kern.name)_impl_names; + const int *impl_deps = get_machine()->$(kern.name)_impl_deps; + const bool *alignment = get_machine()->$(kern.name)_impl_alignment; + const size_t n_impls = get_machine()->$(kern.name)_n_impls; + const size_t index_a = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/); + const size_t index_u = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/); + $(kern.name)_a = get_machine()->$(kern.name)_impls[index_a]; + $(kern.name)_u = get_machine()->$(kern.name)_impls[index_u]; + + assert($(kern.name)_a); + assert($(kern.name)_u); + + $(kern.name) = &__$(kern.name)_d; +} + +static inline void __$(kern.name)_a($kern.arglist_full) +{ + __init_$(kern.name)(); + $(kern.name)_a($kern.arglist_names); +} + +static inline void __$(kern.name)_u($kern.arglist_full) +{ + __init_$(kern.name)(); + $(kern.name)_u($kern.arglist_names); +} + +static inline void __$(kern.name)($kern.arglist_full) +{ + 
__init_$(kern.name)(); $(kern.name)($kern.arglist_names); } -$kern.pname $kern.name = &get_$(kern.name); +$kern.pname $(kern.name)_a = &__$(kern.name)_a; +$kern.pname $(kern.name)_u = &__$(kern.name)_u; +$kern.pname $(kern.name) = &__$(kern.name); -void $(kern.name)_manual($kern.arglist_namedefs, const char* arch) { - const size_t index = get_index( - get_machine()->$(kern.name)_indices, - get_machine()->$(kern.name)_n_archs, - arch +void $(kern.name)_manual($kern.arglist_full, const char* impl_name) +{ + const int index = volk_get_index( + get_machine()->$(kern.name)_impl_names, + get_machine()->$(kern.name)_n_impls, + impl_name ); - get_machine()->$(kern.name)_archs[index]( + get_machine()->$(kern.name)_impls[index]( $kern.arglist_names ); } -struct volk_func_desc $(kern.name)_get_func_desc(void) { - struct volk_func_desc desc = { - get_machine()->$(kern.name)_indices, - get_machine()->$(kern.name)_arch_defs, - get_machine()->$(kern.name)_n_archs +volk_func_desc_t $(kern.name)_get_func_desc(void) { + const char **impl_names = get_machine()->$(kern.name)_impl_names; + const int *impl_deps = get_machine()->$(kern.name)_impl_deps; + const bool *alignment = get_machine()->$(kern.name)_impl_alignment; + const size_t n_impls = get_machine()->$(kern.name)_n_impls; + volk_func_desc_t desc = { + impl_names, + impl_deps, + alignment, + n_impls }; return desc; } diff --git a/volk/tmpl/volk.tmpl.h b/volk/tmpl/volk.tmpl.h index 161579e46d..464b65598a 100644 --- a/volk/tmpl/volk.tmpl.h +++ b/volk/tmpl/volk.tmpl.h @@ -27,20 +27,59 @@ #include <volk/volk_common.h> #include <volk/volk_complex.h> +#include <stdlib.h> +#include <stdbool.h> + __VOLK_DECL_BEGIN -struct volk_func_desc { - const char **indices; - const int *arch_defs; - const int n_archs; -}; +typedef struct volk_func_desc +{ + const char **impl_names; + const int *impl_deps; + const bool *impl_alignment; + const size_t n_impls; +} volk_func_desc_t; + +//! 
Get the machine alignment in bytes +VOLK_API size_t volk_get_alignment(void); + +/*! + * The VOLK_OR_PTR macro is a convenience macro + * for checking the alignment of a set of pointers. + * Example usage: + * volk_is_aligned(VOLK_OR_PTR(VOLK_OR_PTR(p0, p1), p2)) + */ +#define VOLK_OR_PTR(ptr0, ptr1) \ + (const void *)(((intptr_t)(ptr0)) | ((intptr_t)(ptr1))) -VOLK_API unsigned int volk_get_alignment(void); +/*! + * Is the pointer on a machine alignment boundary? + * + * Note: for performance reasons, this function + * is not usable until another volk API call is made + * which will perform certain initialization tasks. + * + * \param ptr the pointer to some memory buffer + * \return 1 for alignment boundary, else 0 + */ +VOLK_API bool volk_is_aligned(const void *ptr); #for $kern in $kernels + +//! A function pointer to the dispatcher implementation extern VOLK_API $kern.pname $kern.name; -extern VOLK_API void $(kern.name)_manual($kern.arglist_namedefs, const char* arch); -extern VOLK_API struct volk_func_desc $(kern.name)_get_func_desc(void); + +//! A function pointer to the fastest aligned implementation +extern VOLK_API $kern.pname $(kern.name)_a; + +//! A function pointer to the fastest unaligned implementation +extern VOLK_API $kern.pname $(kern.name)_u; + +//! Call into a specific implementation given by name +extern VOLK_API void $(kern.name)_manual($kern.arglist_full, const char* impl_name); + +//! 
Get description parameters for this kernel +extern VOLK_API volk_func_desc_t $(kern.name)_get_func_desc(void); #end for __VOLK_DECL_END diff --git a/volk/tmpl/volk_machine_xxx.tmpl.c b/volk/tmpl/volk_machine_xxx.tmpl.c index e405bd6938..68d7f3eba2 100644 --- a/volk/tmpl/volk_machine_xxx.tmpl.c +++ b/volk/tmpl/volk_machine_xxx.tmpl.c @@ -44,18 +44,23 @@ $(' | '.join(['(1 << LV_%s)'%a.name.upper() for a in $archs]))#slurp #end def ######################################################################## -#def make_tag_str_list($tags) -{$(', '.join(['"%s"'%a for a in $tags]))}#slurp +#def make_impl_name_list($impls) +{$(', '.join(['"%s"'%i.name for i in $impls]))}#slurp #end def ######################################################################## -#def make_tag_have_list($deps) -{$(', '.join([' | '.join(['(1 << LV_%s)'%a.upper() for a in d]) for d in $deps]))}#slurp +#def make_impl_align_list($impls) +{$(', '.join(['true' if i.is_aligned else 'false' for i in $impls]))}#slurp #end def ######################################################################## -#def make_tag_kern_list($name, $tags) -{$(', '.join(['%s_%s'%($name, a) for a in $tags]))}#slurp +#def make_impl_deps_list($impls) +{$(', '.join([' | '.join(['(1 << LV_%s)'%d.upper() for d in i.deps]) for i in $impls]))}#slurp +#end def + +######################################################################## +#def make_impl_fcn_list($name, $impls) +{$(', '.join(['%s_%s'%($name, i.name) for i in $impls]))}#slurp #end def struct volk_machine volk_machine_$(this_machine.name) = { @@ -63,11 +68,12 @@ struct volk_machine volk_machine_$(this_machine.name) = { "$this_machine.name", $this_machine.alignment, #for $kern in $kernels - #set $taglist, $tagdeps = $kern.get_tags($arch_names) - "$kern.name", - $make_tag_str_list($taglist), - $make_tag_have_list($tagdeps), - $make_tag_kern_list($kern.name, $taglist), - $(len($taglist)), + #set $impls = $kern.get_impls($arch_names) + "$kern.name", ##//kernel name + 
$make_impl_name_list($impls), ##//list of kernel implementations by name + $make_impl_deps_list($impls), ##//list of arch dependencies per implementation + $make_impl_align_list($impls), ##//alignment required? for each implementation + $make_impl_fcn_list($kern.name, $impls), ##//pointer to each implementation + $(len($impls)), ##//number of implementations listed here #end for }; diff --git a/volk/tmpl/volk_machines.tmpl.h b/volk/tmpl/volk_machines.tmpl.h index b30e600ed8..7e11b10795 100644 --- a/volk/tmpl/volk_machines.tmpl.h +++ b/volk/tmpl/volk_machines.tmpl.h @@ -25,18 +25,22 @@ #include <volk/volk_common.h> #include <volk/volk_typedefs.h> +#include <stdbool.h> +#include <stdlib.h> + __VOLK_DECL_BEGIN struct volk_machine { const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format) const char *name; - const unsigned int alignment; //the maximum byte alignment required for functions in this library + const size_t alignment; //the maximum byte alignment required for functions in this library #for $kern in $kernels const char *$(kern.name)_name; - const char *$(kern.name)_indices[$(len($archs))]; - const int $(kern.name)_arch_defs[$(len($archs))]; - const $(kern.pname) $(kern.name)_archs[$(len($archs))]; - const int $(kern.name)_n_archs; + const char *$(kern.name)_impl_names[$(len($archs))]; + const int $(kern.name)_impl_deps[$(len($archs))]; + const bool $(kern.name)_impl_alignment[$(len($archs))]; + const $(kern.pname) $(kern.name)_impls[$(len($archs))]; + const size_t $(kern.name)_n_impls; #end for }; diff --git a/volk/tmpl/volk_typedefs.tmpl.h b/volk/tmpl/volk_typedefs.tmpl.h index 52a87242fe..6f5426965f 100644 --- a/volk/tmpl/volk_typedefs.tmpl.h +++ b/volk/tmpl/volk_typedefs.tmpl.h @@ -26,7 +26,7 @@ #include <volk/volk_complex.h> #for $kern in $kernels -typedef $kern.rettype (*$(kern.pname))($kern.arglist_defs); +typedef void (*$(kern.pname))($kern.arglist_types); #end for #endif 
/*INCLUDED_VOLK_TYPEDEFS*/ |