summaryrefslogtreecommitdiff
path: root/volk
diff options
context:
space:
mode:
Diffstat (limited to 'volk')
-rw-r--r--volk/CMakeLists.txt6
-rw-r--r--volk/apps/CMakeLists.txt4
-rw-r--r--volk/apps/volk_profile.cc201
-rw-r--r--volk/cmake/CMakeParseArgumentsCopy.cmake138
-rw-r--r--volk/cmake/msvc/stdbool.h45
-rw-r--r--volk/gen/archs.xml1
-rw-r--r--volk/gen/volk_arch_defs.py7
-rw-r--r--volk/gen/volk_kernel_defs.py343
-rw-r--r--volk/gen/volk_machine_defs.py4
-rw-r--r--volk/include/volk/volk_16i_convert_8i_a.h69
-rw-r--r--volk/include/volk/volk_16i_s32f_convert_32f_a.h119
-rw-r--r--volk/include/volk/volk_16u_byteswap_u.h63
-rw-r--r--volk/include/volk/volk_32f_convert_64f_a.h70
-rw-r--r--volk/include/volk/volk_32f_convert_64f_u.h70
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_16i_a.h150
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_32i_u.h142
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_8i_a.h155
-rw-r--r--volk/include/volk/volk_32f_s32f_multiply_32f_u.h102
-rw-r--r--volk/include/volk/volk_32f_x2_add_32f_u.h66
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_a.h290
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_u.h290
-rw-r--r--volk/include/volk/volk_32f_x2_multiply_32f_u.h106
-rw-r--r--volk/include/volk/volk_32fc_conjugate_32fc_u.h64
-rw-r--r--volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h78
-rw-r--r--volk/include/volk/volk_32fc_magnitude_32f_u.h118
-rw-r--r--volk/include/volk/volk_32fc_magnitude_squared_32f_a.h114
-rw-r--r--volk/include/volk/volk_32fc_magnitude_squared_32f_u.h114
-rw-r--r--volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h87
-rw-r--r--volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h145
-rw-r--r--volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h116
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_32fc_u.h77
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h81
-rw-r--r--volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h81
-rw-r--r--volk/include/volk/volk_32i_s32f_convert_32f_a.h73
-rw-r--r--volk/include/volk/volk_32i_s32f_convert_32f_u.h75
-rw-r--r--volk/include/volk/volk_32u_byteswap_u.h77
-rw-r--r--volk/include/volk/volk_64f_convert_32f_a.h67
-rw-r--r--volk/include/volk/volk_64f_convert_32f_u.h67
-rw-r--r--volk/include/volk/volk_64u_byteswap_u.h88
-rw-r--r--volk/include/volk/volk_8i_convert_16i_u.h73
-rw-r--r--volk/include/volk/volk_8i_s32f_convert_32f_u.h94
-rw-r--r--volk/include/volk/volk_prefs.h15
-rw-r--r--volk/kernels/README.txt67
-rw-r--r--volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h (renamed from volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h)2
-rw-r--r--volk/kernels/volk/volk_16i_branch_4_state_8.h (renamed from volk/include/volk/volk_16i_branch_4_state_8_a.h)2
-rw-r--r--volk/kernels/volk/volk_16i_convert_8i.h (renamed from volk/include/volk/volk_16i_convert_8i_u.h)71
-rw-r--r--volk/kernels/volk/volk_16i_max_star_16i.h (renamed from volk/include/volk/volk_16i_max_star_16i_a.h)8
-rw-r--r--volk/kernels/volk/volk_16i_max_star_horizontal_16i.h (renamed from volk/include/volk/volk_16i_max_star_horizontal_16i_a.h)8
-rw-r--r--volk/kernels/volk/volk_16i_permute_and_scalar_add.h (renamed from volk/include/volk/volk_16i_permute_and_scalar_add_a.h)7
-rw-r--r--volk/kernels/volk/volk_16i_s32f_convert_32f.h (renamed from volk/include/volk/volk_16i_s32f_convert_32f_u.h)121
-rw-r--r--volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h (renamed from volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h)9
-rw-r--r--volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h (renamed from volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h)8
-rw-r--r--volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h (renamed from volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h)4
-rw-r--r--volk/kernels/volk/volk_16ic_deinterleave_real_16i.h (renamed from volk/include/volk/volk_16ic_deinterleave_real_16i_a.h)2
-rw-r--r--volk/kernels/volk/volk_16ic_deinterleave_real_8i.h (renamed from volk/include/volk/volk_16ic_deinterleave_real_8i_a.h)4
-rw-r--r--volk/kernels/volk/volk_16ic_magnitude_16i.h (renamed from volk/include/volk/volk_16ic_magnitude_16i_a.h)4
-rw-r--r--volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h (renamed from volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h)4
-rw-r--r--volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h (renamed from volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h (renamed from volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h)4
-rw-r--r--volk/kernels/volk/volk_16u_byteswap.h (renamed from volk/include/volk/volk_16u_byteswap_a.h)65
-rw-r--r--volk/kernels/volk/volk_32f_accumulator_s32f.h (renamed from volk/include/volk/volk_32f_accumulator_s32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32f_convert_64f.h140
-rw-r--r--volk/kernels/volk/volk_32f_index_max_16u.h (renamed from volk/include/volk/volk_32f_index_max_16u_a.h)2
-rw-r--r--volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h (renamed from volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h (renamed from volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32f_s32f_convert_16i.h (renamed from volk/include/volk/volk_32f_s32f_convert_16i_u.h)152
-rw-r--r--volk/kernels/volk/volk_32f_s32f_convert_32i.h (renamed from volk/include/volk/volk_32f_s32f_convert_32i_a.h)142
-rw-r--r--volk/kernels/volk/volk_32f_s32f_convert_8i.h (renamed from volk/include/volk/volk_32f_s32f_convert_8i_u.h)157
-rw-r--r--volk/kernels/volk/volk_32f_s32f_multiply_32f.h (renamed from volk/include/volk/volk_32f_s32f_multiply_32f_a.h)104
-rw-r--r--volk/kernels/volk/volk_32f_s32f_normalize.h (renamed from volk/include/volk/volk_32f_s32f_normalize_a.h)4
-rw-r--r--volk/kernels/volk/volk_32f_s32f_power_32f.h (renamed from volk/include/volk/volk_32f_s32f_power_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32f_s32f_stddev_32f.h (renamed from volk/include/volk/volk_32f_s32f_stddev_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32f_sqrt_32f.h (renamed from volk/include/volk/volk_32f_sqrt_32f_a.h)4
-rw-r--r--volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h (renamed from volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h)2
-rw-r--r--volk/kernels/volk/volk_32f_x2_add_32f.h (renamed from volk/include/volk/volk_32f_x2_add_32f_a.h)68
-rw-r--r--volk/kernels/volk/volk_32f_x2_divide_32f.h (renamed from volk/include/volk/volk_32f_x2_divide_32f_a.h)4
-rw-r--r--volk/kernels/volk/volk_32f_x2_dot_prod_16i.h (renamed from volk/include/volk/volk_32f_x2_dot_prod_16i_a.h)2
-rw-r--r--volk/kernels/volk/volk_32f_x2_dot_prod_32f.h580
-rw-r--r--volk/kernels/volk/volk_32f_x2_interleave_32fc.h (renamed from volk/include/volk/volk_32f_x2_interleave_32fc_a.h)2
-rw-r--r--volk/kernels/volk/volk_32f_x2_max_32f.h (renamed from volk/include/volk/volk_32f_x2_max_32f_a.h)4
-rw-r--r--volk/kernels/volk/volk_32f_x2_min_32f.h (renamed from volk/include/volk/volk_32f_x2_min_32f_a.h)4
-rw-r--r--volk/kernels/volk/volk_32f_x2_multiply_32f.h (renamed from volk/include/volk/volk_32f_x2_multiply_32f_a.h)108
-rw-r--r--volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h (renamed from volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h)2
-rw-r--r--volk/kernels/volk/volk_32f_x2_subtract_32f.h (renamed from volk/include/volk/volk_32f_x2_subtract_32f_a.h)4
-rw-r--r--volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h (renamed from volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h)7
-rw-r--r--volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h (renamed from volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h)2
-rw-r--r--volk/kernels/volk/volk_32fc_32f_multiply_32fc.h (renamed from volk/include/volk/volk_32fc_32f_multiply_32fc_a.h)4
-rw-r--r--volk/kernels/volk/volk_32fc_conjugate_32fc.h (renamed from volk/include/volk/volk_32fc_conjugate_32fc_a.h)64
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h (renamed from volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h)2
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h156
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h (renamed from volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_real_32f.h (renamed from volk/include/volk/volk_32fc_deinterleave_real_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32fc_deinterleave_real_64f.h (renamed from volk/include/volk/volk_32fc_deinterleave_real_64f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32fc_index_max_16u.h (renamed from volk/include/volk/volk_32fc_index_max_16u_a.h)9
-rw-r--r--volk/kernels/volk/volk_32fc_magnitude_32f.h (renamed from volk/include/volk/volk_32fc_magnitude_32f_a.h)120
-rw-r--r--volk/kernels/volk/volk_32fc_magnitude_squared_32f.h228
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_atan2_32f.h (renamed from volk/include/volk/volk_32fc_s32f_atan2_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h (renamed from volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h)2
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h (renamed from volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h)4
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_power_32fc.h (renamed from volk/include/volk/volk_32fc_s32f_power_32fc_a.h)2
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h (renamed from volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h (renamed from volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h (renamed from volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h)87
-rw-r--r--volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h (renamed from volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h)10
-rw-r--r--volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h (renamed from volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h)6
-rw-r--r--volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h (renamed from volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h)161
-rw-r--r--volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h (renamed from volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h)136
-rw-r--r--volk/kernels/volk/volk_32fc_x2_multiply_32fc.h (renamed from volk/include/volk/volk_32fc_x2_multiply_32fc_a.h)79
-rw-r--r--volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h162
-rw-r--r--volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h (renamed from volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h)8
-rw-r--r--volk/kernels/volk/volk_32fc_x2_square_dist_32f.h (renamed from volk/include/volk/volk_32fc_x2_square_dist_32f_a.h)8
-rw-r--r--volk/kernels/volk/volk_32i_s32f_convert_32f.h148
-rw-r--r--volk/kernels/volk/volk_32i_x2_and_32i.h (renamed from volk/include/volk/volk_32i_x2_and_32i_a.h)4
-rw-r--r--volk/kernels/volk/volk_32i_x2_or_32i.h (renamed from volk/include/volk/volk_32i_x2_or_32i_a.h)4
-rw-r--r--volk/kernels/volk/volk_32u_byteswap.h (renamed from volk/include/volk/volk_32u_byteswap_a.h)77
-rw-r--r--volk/kernels/volk/volk_32u_popcnt.h (renamed from volk/include/volk/volk_32u_popcnt_a.h)2
-rw-r--r--volk/kernels/volk/volk_64f_convert_32f.h134
-rw-r--r--volk/kernels/volk/volk_64f_x2_max_64f.h (renamed from volk/include/volk/volk_64f_x2_max_64f_a.h)2
-rw-r--r--volk/kernels/volk/volk_64f_x2_min_64f.h (renamed from volk/include/volk/volk_64f_x2_min_64f_a.h)2
-rw-r--r--volk/kernels/volk/volk_64u_byteswap.h (renamed from volk/include/volk/volk_64u_byteswap_a.h)88
-rw-r--r--volk/kernels/volk/volk_64u_popcnt.h (renamed from volk/include/volk/volk_64u_popcnt_a.h)2
-rw-r--r--volk/kernels/volk/volk_8i_convert_16i.h (renamed from volk/include/volk/volk_8i_convert_16i_a.h)75
-rw-r--r--volk/kernels/volk/volk_8i_s32f_convert_32f.h (renamed from volk/include/volk/volk_8i_s32f_convert_32f_a.h)96
-rw-r--r--volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h (renamed from volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h)2
-rw-r--r--volk/kernels/volk/volk_8ic_deinterleave_real_16i.h (renamed from volk/include/volk/volk_8ic_deinterleave_real_16i_a.h)2
-rw-r--r--volk/kernels/volk/volk_8ic_deinterleave_real_8i.h (renamed from volk/include/volk/volk_8ic_deinterleave_real_8i_a.h)2
-rw-r--r--volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h (renamed from volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h)2
-rw-r--r--volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h (renamed from volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h)2
-rw-r--r--volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h (renamed from volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h)2
-rw-r--r--volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h (renamed from volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h)2
-rw-r--r--volk/lib/CMakeLists.txt3
-rw-r--r--volk/lib/qa_utils.cc36
-rw-r--r--volk/lib/qa_utils.h2
-rw-r--r--volk/lib/testqa.cc190
-rw-r--r--volk/lib/volk_prefs.c39
-rw-r--r--volk/lib/volk_rank_archs.c111
-rw-r--r--volk/lib/volk_rank_archs.h40
-rw-r--r--volk/python/volk_modtool/CMakeLists.txt39
-rw-r--r--volk/python/volk_modtool/README114
-rw-r--r--volk/python/volk_modtool/__init__.py24
-rw-r--r--volk/python/volk_modtool/cfg.py104
-rwxr-xr-xvolk/python/volk_modtool/volk_modtool128
-rw-r--r--volk/python/volk_modtool/volk_modtool_generate.py310
-rw-r--r--volk/tmpl/volk.tmpl.c121
-rw-r--r--volk/tmpl/volk.tmpl.h55
-rw-r--r--volk/tmpl/volk_machine_xxx.tmpl.c30
-rw-r--r--volk/tmpl/volk_machines.tmpl.h14
-rw-r--r--volk/tmpl/volk_typedefs.tmpl.h2
148 files changed, 5290 insertions, 4021 deletions
diff --git a/volk/CMakeLists.txt b/volk/CMakeLists.txt
index 99f7052560..a04c2adefd 100644
--- a/volk/CMakeLists.txt
+++ b/volk/CMakeLists.txt
@@ -115,12 +115,15 @@ install(
# Install all headers in the include directories
########################################################################
install(
- DIRECTORY ${CMAKE_SOURCE_DIR}/include/volk
+ DIRECTORY ${CMAKE_SOURCE_DIR}/kernels/volk
DESTINATION include COMPONENT "volk_devel"
FILES_MATCHING PATTERN "*.h"
)
install(FILES
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h
${CMAKE_BINARY_DIR}/include/volk/volk.h
${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h
${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h
@@ -138,6 +141,7 @@ add_subdirectory(lib)
# And the utility apps
########################################################################
add_subdirectory(apps)
+add_subdirectory(python/volk_modtool)
########################################################################
# Print summary
diff --git a/volk/apps/CMakeLists.txt b/volk/apps/CMakeLists.txt
index 03ad92b792..577b7ef137 100644
--- a/volk/apps/CMakeLists.txt
+++ b/volk/apps/CMakeLists.txt
@@ -25,11 +25,11 @@ if(MSVC)
endif(MSVC)
include_directories(
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${CMAKE_CURRENT_BINARY_DIR}
${CMAKE_SOURCE_DIR}/include
${CMAKE_BINARY_DIR}/include
${CMAKE_SOURCE_DIR}/lib
- ${CMAKE_CURRENT_SOURCE_DIR}
- ${CMAKE_CURRENT_BINARY_DIR}
${Boost_INCLUDE_DIRS}
)
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index e0919a278a..edc32183a2 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -18,117 +18,100 @@ int main(int argc, char *argv[]) {
std::vector<std::string> results;
- //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000, &results);
- //VOLK_PROFILE(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000, &results);
- VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc_a, volk_32fc_s32fc_x2_rotator_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results);
- VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16ic_deinterleave_real_8i_a, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16ic_deinterleave_16i_x2_a, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 204600, 1000, &results);
- VOLK_PROFILE(volk_16ic_deinterleave_real_16i_a, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16ic_magnitude_16i_a, 1, 0, 204600, 100, &results);
- VOLK_PROFILE(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 204600, 1000, &results);
- VOLK_PROFILE(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16i_convert_8i_a, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16i_convert_8i_u, 0, 0, 204600, 10000, &results);
- //VOLK_PROFILE(volk_16i_max_star_16i_a, 0, 0, 204600, 10000, &results);
- //VOLK_PROFILE(volk_16i_max_star_horizontal_16i_a, 0, 0, 204600, 10000, &results);
- //VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results);
- //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results);
- VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16u_byteswap_u, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_x2_add_32f_u, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 204600, 50, &results);
- VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 204600, 100, &results);
- //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000, &results);
- VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 204600, 10000, &results);
- VOLK_PROFILE(volk_32fc_deinterleave_imag_32f_a, 1e-4, 0, 204600, 5000, &results);
- VOLK_PROFILE(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000, &results);
- VOLK_PROFILE(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32fc_index_max_16u_a, 3, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100, &results);
- VOLK_PROFILE(volk_32fc_magnitude_32f_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_magnitude_32f_u, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_magnitude_squared_32f_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_magnitude_squared_32f_u, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32f_s32f_convert_16i_a, 1, 32768, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_s32f_convert_16i_u, 1, 32768, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_s32f_convert_32i_a, 1, 2<<31, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_s32f_convert_32i_u, 1, 2<<31, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_convert_64f_a, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_convert_64f_u, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_s32f_convert_8i_a, 1, 128, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_s32f_convert_8i_u, 1, 128, 204600, 10000, &results);
- //VOLK_PROFILE(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000, &results);
- VOLK_PROFILE(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 20460, 100, &results);
- VOLK_PROFILE(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_x2_divide_32f_a, 1e-4, 0, 204600, 2000, &results);
- VOLK_PROFILE(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 5000, &results);
- VOLK_PROFILE(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 5000, &results);
- VOLK_PROFILE(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 5000, &results);
- //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000, &results);
- VOLK_PROFILE(volk_32f_index_max_16u_a, 3, 0, 204600, 5000, &results);
- VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 204600, 3000, &results);
- VOLK_PROFILE(volk_32f_x2_interleave_32fc_a, 0, 0, 204600, 5000, &results);
- VOLK_PROFILE(volk_32f_x2_max_32f_a, 1e-4, 0, 204600, 2000, &results);
- VOLK_PROFILE(volk_32f_x2_min_32f_a, 1e-4, 0, 204600, 2000, &results);
- VOLK_PROFILE(volk_32f_x2_multiply_32f_a, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_x2_multiply_32f_u, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_s32f_normalize_a, 1e-4, 100, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_s32f_power_32f_a, 1e-4, 4, 204600, 100, &results);
- VOLK_PROFILE(volk_32f_sqrt_32f_a, 1e-4, 0, 204600, 100, &results);
- VOLK_PROFILE(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 204600, 3000, &results);
- VOLK_PROFILE(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 204600, 3000, &results);
- VOLK_PROFILE(volk_32f_x2_subtract_32f_a, 1e-4, 0, 204600, 5000, &results);
- VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 204600, 5000, &results);
- VOLK_PROFILE(volk_32i_x2_and_32i_a, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32i_s32f_convert_32f_a, 1e-4, 100, 204600, 10000, &results);
- VOLK_PROFILE(volk_32i_s32f_convert_32f_u, 1e-4, 100, 204600, 10000, &results);
- VOLK_PROFILE(volk_32i_x2_or_32i_a, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32u_byteswap_a, 0, 0, 204600, 2000, &results);
- //VOLK_PROFILE(volk_32u_popcnt_a, 0, 0, 2046, 10000, &results);
- VOLK_PROFILE(volk_64f_convert_32f_a, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_64f_convert_32f_u, 1e-4, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_64f_x2_max_64f_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_64f_x2_min_64f_a, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_64u_byteswap_a, 0, 0, 204600, 1000, &results);
- //VOLK_PROFILE(volk_64u_popcnt_a, 0, 0, 2046, 10000, &results);
- VOLK_PROFILE(volk_8ic_deinterleave_16i_x2_a, 0, 0, 204600, 3000, &results);
- VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 204600, 3000, &results);
- VOLK_PROFILE(volk_8ic_deinterleave_real_16i_a, 0, 256, 204600, 3000, &results);
- VOLK_PROFILE(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 204600, 3000, &results);
- VOLK_PROFILE(volk_8ic_deinterleave_real_8i_a, 0, 0, 204600, 10000, &results);
- VOLK_PROFILE(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 204600, 400, &results);
- VOLK_PROFILE(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 204600, 400, &results);
- VOLK_PROFILE(volk_8i_convert_16i_a, 0, 0, 204600, 20000, &results);
- VOLK_PROFILE(volk_8i_convert_16i_u, 0, 0, 204600, 2000, &results);
- VOLK_PROFILE(volk_8i_s32f_convert_32f_a, 1e-4, 100, 204600, 2000, &results);
- VOLK_PROFILE(volk_8i_s32f_convert_32f_u, 1e-4, 100, 204600, 2000, &results);
- //VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_a, 1e-4, lv_32fc_t(1.0, 0.5), 204600, 1000, &results);
- VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results);
- VOLK_PROFILE(volk_32f_s32f_multiply_32f_a, 1e-4, 1.0, 204600, 10000, &results);
- VOLK_PROFILE(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 204600, 1000, &results);
+ //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results);
+ //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results);
+ VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results);
+ VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16ic_deinterleave_real_8i, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16ic_deinterleave_16i_x2, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_16ic_deinterleave_real_16i, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16ic_magnitude_16i, 1, 0, 204600, 100, &results);
+ VOLK_PROFILE(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16i_convert_8i, 0, 0, 204600, 10000, &results);
+ //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204600, 10000, &results);
+ //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204600, 10000, &results);
+ //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results);
+ //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results);
+ VOLK_PROFILE(volk_16u_byteswap, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204600, 50, &results);
+ VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204600, 100, &results);
+ //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results);
+ VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32fc_deinterleave_real_32f, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32fc_deinterleave_real_64f, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_index_max_16u, 3, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_s32f_magnitude_16i, 1, 32768, 204600, 100, &results);
+ VOLK_PROFILE(volk_32fc_magnitude_32f, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_magnitude_squared_32f, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_x2_multiply_32fc, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_conjugate_32fc, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32f_s32f_convert_16i, 1, 32768, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_s32f_convert_32i, 1, 2<<31, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_convert_64f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_s32f_convert_8i, 1, 128, 204600, 10000, &results);
+ //VOLK_PROFILE(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000, &results);
+ VOLK_PROFILE(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 20460, 100, &results);
+ VOLK_PROFILE(volk_32fc_x2_square_dist_32f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_x2_divide_32f, 1e-4, 0, 204600, 2000, &results);
+ VOLK_PROFILE(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 5000, &results);
+ //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000, &results);
+ VOLK_PROFILE(volk_32f_index_max_16u, 3, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic, 1, 32768, 204600, 3000, &results);
+ VOLK_PROFILE(volk_32f_x2_interleave_32fc, 0, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32f_x2_max_32f, 1e-4, 0, 204600, 2000, &results);
+ VOLK_PROFILE(volk_32f_x2_min_32f, 1e-4, 0, 204600, 2000, &results);
+ VOLK_PROFILE(volk_32f_x2_multiply_32f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_s32f_normalize, 1e-4, 100, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_s32f_power_32f, 1e-4, 4, 204600, 100, &results);
+ VOLK_PROFILE(volk_32f_sqrt_32f, 1e-4, 0, 204600, 100, &results);
+ VOLK_PROFILE(volk_32f_s32f_stddev_32f, 1e-4, 100, 204600, 3000, &results);
+ VOLK_PROFILE(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 204600, 3000, &results);
+ VOLK_PROFILE(volk_32f_x2_subtract_32f, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 204600, 5000, &results);
+ VOLK_PROFILE(volk_32i_x2_and_32i, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32i_s32f_convert_32f, 1e-4, 100, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32i_x2_or_32i, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32u_byteswap, 0, 0, 204600, 2000, &results);
+ //VOLK_PROFILE(volk_32u_popcnt, 0, 0, 2046, 10000, &results);
+ VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204600, 1000, &results);
+ //VOLK_PROFILE(volk_64u_popcnt, 0, 0, 2046, 10000, &results);
+ VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204600, 3000, &results);
+ VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204600, 3000, &results);
+ VOLK_PROFILE(volk_8ic_deinterleave_real_16i, 0, 256, 204600, 3000, &results);
+ VOLK_PROFILE(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 204600, 3000, &results);
+ VOLK_PROFILE(volk_8ic_deinterleave_real_8i, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 204600, 400, &results);
+ VOLK_PROFILE(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 204600, 400, &results);
+ VOLK_PROFILE(volk_8i_convert_16i, 0, 0, 204600, 20000, &results);
+ VOLK_PROFILE(volk_8i_convert_16i, 0, 0, 204600, 2000, &results);
+ VOLK_PROFILE(volk_8i_s32f_convert_32f, 1e-4, 100, 204600, 2000, &results);
+ //VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, lv_32fc_t(1.0, 0.5), 204600, 1000, &results);
+ VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 204600, 1000, &results);
+ VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 1.0, 204600, 10000, &results);
+
char path[1024];
- get_config_path(path);
+ volk_get_config_path(path);
+
const fs::path config_path(path);
if (not fs::exists(config_path.branch_path()))
diff --git a/volk/cmake/CMakeParseArgumentsCopy.cmake b/volk/cmake/CMakeParseArgumentsCopy.cmake
new file mode 100644
index 0000000000..7ce4c49ae5
--- /dev/null
+++ b/volk/cmake/CMakeParseArgumentsCopy.cmake
@@ -0,0 +1,138 @@
+# CMAKE_PARSE_ARGUMENTS(<prefix> <options> <one_value_keywords> <multi_value_keywords> args...)
+#
+# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions for
+# parsing the arguments given to that macro or function.
+# It processes the arguments and defines a set of variables which hold the
+# values of the respective options.
+#
+# The <options> argument contains all options for the respective macro,
+# i.e. keywords which can be used when calling the macro without any value
+# following, like e.g. the OPTIONAL keyword of the install() command.
+#
+# The <one_value_keywords> argument contains all keywords for this macro
+# which are followed by one value, like e.g. DESTINATION keyword of the
+# install() command.
+#
+# The <multi_value_keywords> argument contains all keywords for this macro
+# which can be followed by more than one value, like e.g. the TARGETS or
+# FILES keywords of the install() command.
+#
+# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the
+# keywords listed in <options>, <one_value_keywords> and
+# <multi_value_keywords> a variable composed of the given <prefix>
+# followed by "_" and the name of the respective keyword.
+# These variables will then hold the respective value from the argument list.
+# For the <options> keywords this will be TRUE or FALSE.
+#
+# All remaining arguments are collected in a variable
+# <prefix>_UNPARSED_ARGUMENTS, this can be checked afterwards to see whether
+# your macro was called with unrecognized parameters.
+#
+# As an example, here is a my_install() macro, which takes arguments similar to
+# those of the real install() command:
+#
+# function(MY_INSTALL)
+# set(options OPTIONAL FAST)
+# set(oneValueArgs DESTINATION RENAME)
+# set(multiValueArgs TARGETS CONFIGURATIONS)
+# cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )
+# ...
+#
+# Assume my_install() has been called like this:
+# my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub)
+#
+# After the cmake_parse_arguments() call the macro will have set the following
+# variables:
+# MY_INSTALL_OPTIONAL = TRUE
+# MY_INSTALL_FAST = FALSE (this option was not used when calling my_install())
+# MY_INSTALL_DESTINATION = "bin"
+# MY_INSTALL_RENAME = "" (was not used)
+# MY_INSTALL_TARGETS = "foo;bar"
+# MY_INSTALL_CONFIGURATIONS = "" (was not used)
+# MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value expected after "OPTIONAL")
+#
+# You can then continue and process these variables.
+#
+# Keywords terminate lists of values, e.g. if directly after a one_value_keyword
+# another recognized keyword follows, this is interpreted as the beginning of
+# the new option.
+# E.g. my_install(TARGETS foo DESTINATION OPTIONAL) would not result in
+# MY_INSTALL_DESTINATION set to "OPTIONAL"; instead MY_INSTALL_DESTINATION would
+# be empty and MY_INSTALL_OPTIONAL would therefore be set to TRUE.
+
+#=============================================================================
+# Copyright 2010 Alexander Neundorf <neundorf@kde.org>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+
+if(__CMAKE_PARSE_ARGUMENTS_INCLUDED)
+ return()
+endif()
+set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE)
+
+
+function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames)
+ # Start from a clean slate: clear every value-keyword result and set every option to FALSE.
+ foreach(arg_name ${_singleArgNames} ${_multiArgNames})
+ set(${prefix}_${arg_name})
+ endforeach(arg_name)
+
+ foreach(option ${_optionNames})
+ set(${prefix}_${option} FALSE)
+ endforeach(option)
+
+ set(${prefix}_UNPARSED_ARGUMENTS)
+
+ set(insideValues FALSE)
+ set(currentArgName)
+
+ # Walk the arguments in order; insideValues records whether the last keyword still takes values (SINGLE or MULTI).
+ foreach(currentArg ${ARGN})
+ list(FIND _optionNames "${currentArg}" optionIndex) # position of currentArg among the option keywords, -1 if it is not one
+ list(FIND _singleArgNames "${currentArg}" singleArgIndex) # position among the one-value keywords, -1 if it is not one
+ list(FIND _multiArgNames "${currentArg}" multiArgIndex) # position among the multi-value keywords, -1 if it is not one
+
+ if(${optionIndex} EQUAL -1 AND ${singleArgIndex} EQUAL -1 AND ${multiArgIndex} EQUAL -1)
+ if(insideValues)
+ if("${insideValues}" STREQUAL "SINGLE")
+ set(${prefix}_${currentArgName} ${currentArg})
+ set(insideValues FALSE)
+ elseif("${insideValues}" STREQUAL "MULTI")
+ list(APPEND ${prefix}_${currentArgName} ${currentArg})
+ endif()
+ else(insideValues)
+ list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${currentArg})
+ endif(insideValues)
+ else()
+ if(NOT ${optionIndex} EQUAL -1)
+ set(${prefix}_${currentArg} TRUE)
+ set(insideValues FALSE)
+ elseif(NOT ${singleArgIndex} EQUAL -1)
+ set(currentArgName ${currentArg})
+ set(${prefix}_${currentArgName})
+ set(insideValues "SINGLE")
+ elseif(NOT ${multiArgIndex} EQUAL -1)
+ set(currentArgName ${currentArg})
+ set(${prefix}_${currentArgName})
+ set(insideValues "MULTI")
+ endif()
+ endif()
+
+ endforeach(currentArg)
+
+ # Export every result variable, including the unparsed leftovers, into the caller's scope.
+ foreach(arg_name ${_singleArgNames} ${_multiArgNames} ${_optionNames})
+ set(${prefix}_${arg_name} ${${prefix}_${arg_name}} PARENT_SCOPE)
+ endforeach(arg_name)
+ set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE)
+
+endfunction(CMAKE_PARSE_ARGUMENTS _options _singleArgs _multiArgs)
diff --git a/volk/cmake/msvc/stdbool.h b/volk/cmake/msvc/stdbool.h
new file mode 100644
index 0000000000..ca4581d37a
--- /dev/null
+++ b/volk/cmake/msvc/stdbool.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2005, 2006 Apple Computer, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef STDBOOL_WIN32_H
+#define STDBOOL_WIN32_H
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef __cplusplus
+
+typedef unsigned char bool;
+
+#define true 1
+#define false 0
+
+#ifndef CASSERT
+#define CASSERT(exp, name) typedef int dummy##name [(exp) ? 1 : -1];
+#endif
+
+CASSERT(sizeof(bool) == 1, bool_is_one_byte)
+CASSERT(true, true_is_true)
+CASSERT(!false, false_is_false)
+
+#endif
+
+#endif
diff --git a/volk/gen/archs.xml b/volk/gen/archs.xml
index a18455801d..2c9ab41a55 100644
--- a/volk/gen/archs.xml
+++ b/volk/gen/archs.xml
@@ -2,7 +2,6 @@
<grammar>
<arch name="generic"> <!-- name is required-->
- <alignment>1</alignment>
</arch>
<arch name="altivec">
diff --git a/volk/gen/volk_arch_defs.py b/volk/gen/volk_arch_defs.py
index 41154d5a7a..3c75e1374e 100644
--- a/volk/gen/volk_arch_defs.py
+++ b/volk/gen/volk_arch_defs.py
@@ -18,9 +18,6 @@
archs = list()
arch_dict = dict()
-#TODO enable this when we are ready
-create_unaligned_archs = False
-
class arch_class:
def __init__(self, flags, checks, **kwargs):
for key, cast, failval in (
@@ -49,10 +46,6 @@ def register_arch(**kwargs):
arch = arch_class(**kwargs)
archs.append(arch)
arch_dict[arch.name] = arch
- if arch.alignment > 1 and create_unaligned_archs:
- kwargs['name'] += '_u'
- kwargs['alignment'] = 1
- register_arch(**kwargs)
########################################################################
# register the arches
diff --git a/volk/gen/volk_kernel_defs.py b/volk/gen/volk_kernel_defs.py
index 52cdb684c2..f246db0f96 100644
--- a/volk/gen/volk_kernel_defs.py
+++ b/volk/gen/volk_kernel_defs.py
@@ -24,201 +24,186 @@ import re
import sys
import glob
-from volk_arch_defs import archs
-
-remove_after_underscore = re.compile("_.*");
-space_remove = re.compile(" ");
-leading_space_remove = re.compile("^ *");
-replace_arch = re.compile(", const char\* arch");
-replace_bracket = re.compile(" {");
-replace_volk = re.compile("volk");
-
-def strip_trailing(tostrip, stripstr):
- lindex = tostrip.rfind(stripstr)
- tostrip = tostrip[0:lindex] + tostrip[lindex:len(tostrip)].replace(stripstr, "");
- return tostrip
+########################################################################
+# Strip comments from a c/cpp file.
+# Input is code string, output is code string without comments.
+# http://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments
+########################################################################
+def comment_remover(text):
+ def replacer(match):
+ s = match.group(0)
+ if s.startswith('/'):
+ return ""
+ else:
+ return s
+ pattern = re.compile(
+ r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+ re.DOTALL | re.MULTILINE
+ )
+ return re.sub(pattern, replacer, text)
+
+########################################################################
+# Split code into nested sections according to ifdef preprocessor macros
+########################################################################
+def split_into_nested_ifdef_sections(code):
+ sections = list()
+ section = ''
+ header = 'text'
+ in_section_depth = 0
+ for i, line in enumerate(code.splitlines()):
+ m = re.match('^(\s*)#(\s*)(\w+)(.*)$', line)
+ line_is = 'normal'
+ if m:
+ p0, p1, fcn, stuff = m.groups()
+ if fcn in ('if', 'ifndef', 'ifdef'): line_is = 'if'
+ if fcn in ('else', 'elif'): line_is = 'else'
+ if fcn in ('endif',): line_is = 'end'
+
+ if line_is == 'if': in_section_depth += 1
+ if line_is == 'end': in_section_depth -= 1
+
+ if in_section_depth == 1 and line_is == 'if':
+ sections.append((header, section))
+ section = ''
+ header = line
+ continue
-srcdir = os.path.dirname(os.path.dirname(__file__))
-hdr_files = glob.glob(os.path.join(srcdir, "include/volk/*.h"))
-
-datatypes = [];
-functions = [];
-
-for line in hdr_files:
- subline = re.search(".*_(a|u)\.h.*", os.path.basename(line))
- if subline:
- subsubline = re.search("(?<=volk_).*", subline.group(0));
- if subsubline:
- dtype = remove_after_underscore.sub("", subsubline.group(0));
- subdtype = re.search("[0-9]+[A-z]+", dtype);
- if subdtype:
- datatypes.append(subdtype.group(0));
-
-
-datatypes = set(datatypes);
-
-for line in hdr_files:
- for dt in datatypes:
- if dt in line:
- subline = re.search("(volk_" + dt +"_.*(a|u).*\.h)", line);
- if subline:
-
- subsubline = re.search(".+(?=\.h)", subline.group(0));
- functions.append(subsubline.group(0));
-
-archs_or = "("
-for arch in archs:
- archs_or = archs_or + arch.name.upper() + "|";
-archs_or = archs_or[0:len(archs_or)-1];
-archs_or = archs_or + ")";
-
-taglist = [];
-fcountlist = [];
-arched_arglist = [];
-retlist = [];
-my_arglist = [];
-my_argtypelist = [];
-for func in functions:
- tags = [];
- fcount = [];
- infile_source = open(os.path.join(srcdir, 'include', 'volk', func + ".h"))
- begun_name = 0;
- begun_paren = 0;
- sourcefile = infile_source.readlines();
- infile_source.close();
- for line in sourcefile:
-#FIXME: make it work for multiple #if define()s
- archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line);
- if archline:
- arch = archline.group(0);
- archline = re.findall(archs_or + "(?=( |\n|&))", line);
- if archline:
- archsublist = [];
- for tup in archline:
- archsublist.append(tup[0]);
- fcount.append(archsublist);
- testline = re.search("static inline.*?" + func, line);
- if (not testline):
+ if in_section_depth == 1 and line_is == 'else':
+ sections.append((header, section))
+ section = ''
+ header = line
continue
- tagline = re.search(func + "_.+", line);
- if tagline:
- tag = re.search("(?<=" + func + "_)\w+(?= *\()",line);
- if tag:
- tag = re.search("\w+", tag.group(0));
- if tag:
- tags.append(tag.group(0));
+ if in_section_depth == 0 and line_is == 'end':
+ sections.append((header, section))
+ section = ''
+ header = 'text'
+ continue
- if begun_name == 0:
- retline = re.search(".+(?=" + func + ")", line);
- if retline:
- ret = retline.group(0);
+ section += line + '\n'
+ sections.append((header, section)) #and pack remainder into sections
+ sections = [sec for sec in sections if sec[1].strip()] #filter empty sections
+ #recurse into non-text sections to fill subsections
+ for i, (header, section) in enumerate(sections):
+ if header == 'text': continue
+ sections[i] = (header, split_into_nested_ifdef_sections(section))
+ return sections
- subline = re.search(func + ".*", line);
- if subline:
- subsubline = re.search("\(.*?\)", subline.group(0));
- if subsubline:
- args = subsubline.group(0);
+########################################################################
+# Recursive print of sections to test code above
+########################################################################
+def print_sections(sections, indent = ' '):
+ for header, body in sections:
+ if header == 'text':
+ print indent, ('\n'+indent).join(body.splitlines())
+ continue
+ print indent.replace(' ', '-') + '>', header
+ print_sections(body, indent + ' ')
+
+########################################################################
+# Flatten a section to just body text
+########################################################################
+def flatten_section_text(sections):
+ output = ''
+ for hdr, bdy in sections:
+ if hdr != 'text': output += flatten_section_text(bdy)
+ else: output += bdy
+ return output
+
+########################################################################
+# Extract kernel info from section, represent as an implementation
+########################################################################
+class impl_class:
+ def __init__(self, kern_name, header, body):
+ #extract LV_HAVE_*
+ self.deps = set(map(str.lower, re.findall('LV_HAVE_(\w+)', header)))
+ #extract function suffix and args
+ body = flatten_section_text(body)
+ try:
+ fcn_matcher = re.compile('^.*(%s\\w*)\\s*\\((.*)$'%kern_name, re.DOTALL | re.MULTILINE)
+ body = body.split('{')[0].rsplit(')', 1)[0] #get the part before the open ){ bracket
+ m = fcn_matcher.match(body)
+ impl_name, the_rest = m.groups()
+ self.name = impl_name.replace(kern_name+'_', '')
+ self.args = list()
+ fcn_args = the_rest.split(',')
+ for fcn_arg in fcn_args:
+ arg_matcher = re.compile('^\s*(.*\\W)\s*(\w+)\s*$', re.DOTALL | re.MULTILINE)
+ m = arg_matcher.match(fcn_arg)
+ arg_type, arg_name = m.groups()
+ self.args.append((arg_type, arg_name))
+ except Exception as ex:
+ raise Exception, 'I cant parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex)
+
+ assert self.name
+ self.is_aligned = self.name.startswith('a_')
- else:
- begun_name = 1;
- subsubline = re.search("\(.*", subline.group(0));
- if subsubline:
- args = subsubline.group(0);
- begun_paren = 1;
- else:
- if begun_paren == 1:
- subline = re.search(".*?\)", line);
- if subline:
- args = args + subline.group(0);
- begun_name = 0;
- begun_paren = 0;
- else:
- subline = re.search(".*", line);
- args = args + subline.group(0);
- else:
- subline = re.search("\(.*?\)", line);
- if subline:
- args = subline.group(0);
- begun_name = 0;
- else:
- subline = re.search("\(.*", line);
- if subline:
- args = subline.group(0);
- begun_paren = 1;
-
- replace = re.compile("static ");
- ret = replace.sub("", ret);
- replace = re.compile("inline ");
- ret = replace.sub("", ret);
- arched_args = args[args.find('(')+1:args.find(')')]
-
- remove = re.compile('\)|\(|{');
- rargs = remove.sub("", args);
- sargs = rargs.split(',');
-
-
-
- margs = [];
- atypes = [];
- for arg in sargs:
- temp = arg.split(" ");
- margs.append(temp[-1]);
- replace = re.compile(" " + temp[-1]);
- atypes.append(replace.sub("", arg));
-
-
- my_args = ""
- arg_types = ""
- for arg in range(0, len(margs) - 1):
- this_arg = leading_space_remove.sub("", margs[arg]);
- my_args = my_args + this_arg + ", ";
- this_type = leading_space_remove.sub("", atypes[arg]);
- arg_types = arg_types + this_type + ", ";
-
- this_arg = leading_space_remove.sub("", margs[-1]);
- my_args = my_args + this_arg;
- this_type = leading_space_remove.sub("", atypes[-1]);
- arg_types = arg_types + this_type;
- my_argtypelist.append(arg_types);
-
- if(ret[-1] != ' '):
- ret = ret + ' ';
-
- arched_arglist.append(arched_args); #!!!!!!!!!!!
- my_arglist.append(my_args) #!!!!!!!!!!!!!!!!!
- retlist.append(ret);
- fcountlist.append(fcount);
- taglist.append(tags);
+ def __repr__(self):
+ return self.name
+########################################################################
+# Get sets of LV_HAVE_* from the code
+########################################################################
+def extract_lv_haves(code):
+ haves = list()
+ for line in code.splitlines():
+ if not line.strip().startswith('#'): continue
+ have_set = set(map(str.lower, re.findall('LV_HAVE_(\w+)', line)))
+ if have_set: haves.append(have_set)
+ return haves
+
+########################################################################
+# Represent a processing kernel, parse from file
+########################################################################
class kernel_class:
- def __init__(self, index):
- self.name = functions[index]
+ def __init__(self, kernel_file):
+ self.name = os.path.splitext(os.path.basename(kernel_file))[0]
self.pname = self.name.replace('volk_', 'p_')
- self.rettype = retlist[index]
- self.arglist_defs = my_argtypelist[index]
- self.arglist_namedefs = arched_arglist[index]
- self.arglist_names = my_arglist[index]
- self._tagdeps = fcountlist[index]
- self._taglist = taglist[index]
-
- def get_tags(self, archs):
- def is_in(x): return x.lower() in archs
- taglist = list()
- tagdeps = list()
- for i in range(len(self._tagdeps)):
- if all(map(is_in, self._tagdeps[i])):
- taglist.append(self._taglist[i])
- tagdeps.append(self._tagdeps[i])
- return taglist, tagdeps
+ code = open(kernel_file, 'r').read()
+ code = comment_remover(code)
+ sections = split_into_nested_ifdef_sections(code)
+ self._impls = list()
+ for header, section in sections:
+ if 'ifndef' not in header.lower(): continue
+ for sub_hdr, body in section:
+ if 'if' not in sub_hdr.lower(): continue
+ if 'LV_HAVE_' not in sub_hdr: continue
+ self._impls.append(impl_class(
+ kern_name=self.name, header=sub_hdr, body=body,
+ ))
+ assert(self._impls)
+ self.has_dispatcher = False
+ for impl in self._impls:
+ if impl.name == 'dispatcher':
+ self._impls.remove(impl)
+ self.has_dispatcher = True
+ break
+ self.args = self._impls[0].args
+ self.arglist_types = ', '.join([a[0] for a in self.args])
+ self.arglist_full = ', '.join(['%s %s'%a for a in self.args])
+ self.arglist_names = ', '.join([a[1] for a in self.args])
+
+ def get_impls(self, archs):
+ archs = set(archs)
+ impls = list()
+ for impl in self._impls:
+ if impl.deps.intersection(archs) == impl.deps:
+ impls.append(impl)
+ return impls
def __repr__(self):
return self.name
-kernels = map(kernel_class, range(len(retlist)))
+########################################################################
+# Extract information from the VOLK kernels
+########################################################################
+__file__ = os.path.abspath(__file__)
+srcdir = os.path.dirname(os.path.dirname(__file__))
+kernel_files = glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h"))
+kernels = map(kernel_class, kernel_files)
if __name__ == '__main__':
print kernels
diff --git a/volk/gen/volk_machine_defs.py b/volk/gen/volk_machine_defs.py
index d1a8569818..7293d47462 100644
--- a/volk/gen/volk_machine_defs.py
+++ b/volk/gen/volk_machine_defs.py
@@ -30,10 +30,6 @@ class machine_class:
arch = arch_dict[arch_name]
self.archs.append(arch)
self.arch_names.append(arch_name)
- arch_name += '_u'
- if arch.alignment > 1 and arch_dict.has_key(arch_name):
- arch = arch_dict[arch_name]
- self.archs.append(arch)
self.alignment = max(map(lambda a: a.alignment, self.archs))
def __repr__(self): return self.name
diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h
deleted file mode 100644
index 84548c8c50..0000000000
--- a/volk/include/volk/volk_16i_convert_8i_a.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef INCLUDED_volk_16i_convert_8i_a_H
-#define INCLUDED_volk_16i_convert_8i_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
- \brief Converts the input 16 bit integer data into 8 bit integer data
- \param inputVector The 16 bit input data buffer
- \param outputVector The 8 bit output data buffer
- \param num_points The number of data values to be converted
-*/
-static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- int8_t* outputVectorPtr = outputVector;
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal1;
- __m128i inputVal2;
- __m128i ret;
-
- for(;number < sixteenthPoints; number++){
-
- // Load the 16 values
- inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
- inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-
- inputVal1 = _mm_srai_epi16(inputVal1, 8);
- inputVal2 = _mm_srai_epi16(inputVal2, 8);
-
- ret = _mm_packs_epi16(inputVal1, inputVal2);
-
- _mm_store_si128((__m128i*)outputVectorPtr, ret);
-
- outputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] =(int8_t)(inputVector[number] >> 8);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Converts the input 16 bit integer data into 8 bit integer data
- \param inputVector The 16 bit input data buffer
- \param outputVector The 8 bit output data buffer
- \param num_points The number of data values to be converted
-*/
-static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
- int8_t* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16i_convert_8i_a_H */
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h
deleted file mode 100644
index 7108ff6590..0000000000
--- a/volk/include/volk/volk_16i_s32f_convert_32f_a.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
-#define INCLUDED_volk_16i_s32f_convert_32f_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
- /*!
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 16 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128i inputVal;
- __m128i inputVal2;
- __m128 ret;
-
- for(;number < eighthPoints; number++){
-
- // Load the 8 values
- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
- // Shift the input data to the right by 64 bits ( 8 bytes )
- inputVal2 = _mm_srli_si128(inputVal, 8);
-
- // Convert the lower 4 values into 32 bit words
- inputVal = _mm_cvtepi16_epi32(inputVal);
- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- ret = _mm_cvtepi32_ps(inputVal2);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
-
- outputVectorPtr += 4;
-
- inputPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) / scalar;
- }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-
- /*!
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 16 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* outputVectorPtr = outputVector;
- __m128 invScalar = _mm_set_ps1(1.0/scalar);
- int16_t* inputPtr = (int16_t*)inputVector;
- __m128 ret;
-
- for(;number < quarterPoints; number++){
- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
-
- inputPtr += 4;
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]) / scalar;
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 16 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
- float* outputVectorPtr = outputVector;
- const int16_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h
deleted file mode 100644
index 8ef627a628..0000000000
--- a/volk/include/volk/volk_16u_byteswap_u.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef INCLUDED_volk_16u_byteswap_u_H
-#define INCLUDED_volk_16u_byteswap_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-
-/*!
- \brief Byteswaps (in-place) an unaligned vector of int16_t's.
- \param intsToSwap The vector of data to byte swap
- \param numDataPoints The number of data points
-*/
-static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int number = 0;
- uint16_t* inputPtr = intsToSwap;
- __m128i input, left, right, output;
-
- const unsigned int eighthPoints = num_points / 8;
- for(;number < eighthPoints; number++){
- // Load the 16t values, increment inputPtr later since we're doing it in-place.
- input = _mm_loadu_si128((__m128i*)inputPtr);
- // Do the two shifts
- left = _mm_slli_epi16(input, 8);
- right = _mm_srli_epi16(input, 8);
- // Or the left and right halves together
- output = _mm_or_si128(left, right);
- // Store the results
- _mm_storeu_si128((__m128i*)inputPtr, output);
- inputPtr += 8;
- }
-
- // Byteswap any remaining points:
- number = eighthPoints*8;
- for(; number < num_points; number++){
- uint16_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
- *inputPtr = outputVal;
- inputPtr++;
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Byteswaps (in-place) an unaligned vector of int16_t's.
- \param intsToSwap The vector of data to byte swap
- \param numDataPoints The number of data points
-*/
-static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){
- unsigned int point;
- uint16_t* inputPtr = intsToSwap;
- for(point = 0; point < num_points; point++){
- uint16_t output = *inputPtr;
- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
- *inputPtr = output;
- inputPtr++;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#endif /* INCLUDED_volk_16u_byteswap_u_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h
deleted file mode 100644
index 2c469ac421..0000000000
--- a/volk/include/volk/volk_32f_convert_64f_a.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INCLUDED_volk_32f_convert_64f_a_H
-#define INCLUDED_volk_32f_convert_64f_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
- /*!
- \brief Converts the float values into double values
- \param dVector The converted double vector values
- \param fVector The float vector values to be converted
- \param num_points The number of points in the two vectors to be converted
- */
-static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- double* outputVectorPtr = outputVector;
- __m128d ret;
- __m128 inputVal;
-
- for(;number < quarterPoints; number++){
- inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
- ret = _mm_cvtps_pd(inputVal);
-
- _mm_store_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
-
- inputVal = _mm_movehl_ps(inputVal, inputVal);
-
- ret = _mm_cvtps_pd(inputVal);
-
- _mm_store_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (double)(inputVector[number]);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Converts the float values into double values
- \param dVector The converted double vector values
- \param fVector The float vector values to be converted
- \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){
- double* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((double)(*inputVectorPtr++));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_convert_64f_a_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h
deleted file mode 100644
index 10d8a4f6c0..0000000000
--- a/volk/include/volk/volk_32f_convert_64f_u.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INCLUDED_volk_32f_convert_64f_u_H
-#define INCLUDED_volk_32f_convert_64f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
- /*!
- \brief Converts the float values into double values
- \param dVector The converted double vector values
- \param fVector The float vector values to be converted
- \param num_points The number of points in the two vectors to be converted
- */
-static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- double* outputVectorPtr = outputVector;
- __m128d ret;
- __m128 inputVal;
-
- for(;number < quarterPoints; number++){
- inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
- ret = _mm_cvtps_pd(inputVal);
-
- _mm_storeu_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
-
- inputVal = _mm_movehl_ps(inputVal, inputVal);
-
- ret = _mm_cvtps_pd(inputVal);
-
- _mm_storeu_pd(outputVectorPtr, ret);
- outputVectorPtr += 2;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (double)(inputVector[number]);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Converts the float values into double values
- \param dVector The converted double vector values
- \param fVector The float vector values to be converted
- \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_32f_convert_64f_u_generic(double* outputVector, const float* inputVector, unsigned int num_points){
- double* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((double)(*inputVectorPtr++));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_convert_64f_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
deleted file mode 100644
index 9df4946f24..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h
+++ /dev/null
@@ -1,150 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
-#define INCLUDED_volk_32f_s32f_convert_16i_a_H
-
-#include <volk/volk_common.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 16 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int eighthPoints = num_points / 8;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
-
- float min_val = -32768;
- float max_val = 32767;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2;
- __m128i intInputVal1, intInputVal2;
- __m128 ret1, ret2;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- for(;number < eighthPoints; number++){
- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
- // Scale and clip
- ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-
- intInputVal1 = _mm_cvtps_epi32(ret1);
- intInputVal2 = _mm_cvtps_epi32(ret2);
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int16_t)rintf(r);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 16 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int16_t* outputVectorPtr = outputVector;
-
- float min_val = -32768;
- float max_val = 32767;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
- for(;number < quarterPoints; number++){
- ret = _mm_load_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- // Scale and clip
- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int16_t)rintf(r);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 16 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- int16_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float min_val = -32768;
- float max_val = 32767;
- float r;
-
- for(number = 0; number < num_points; number++){
- r = *inputVectorPtr++ * scalar;
- if(r < min_val)
- r = min_val;
- else if(r > max_val)
- r = max_val;
- *outputVectorPtr++ = (int16_t)rintf(r);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
deleted file mode 100644
index ee15edb464..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h
+++ /dev/null
@@ -1,142 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
-#define INCLUDED_volk_32f_s32f_convert_32i_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 32 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
-
- float min_val = -2147483647;
- float max_val = 2147483647;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1;
- __m128i intInputVal1;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- for(;number < quarterPoints; number++){
- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- intInputVal1 = _mm_cvtps_epi32(inputVal1);
-
- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int32_t)(r);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 32 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int32_t* outputVectorPtr = outputVector;
-
- float min_val = -2147483647;
- float max_val = 2147483647;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
- for(;number < quarterPoints; number++){
- ret = _mm_loadu_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int32_t)(r);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 32 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- \note Input buffer does NOT need to be properly aligned
- */
-static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- int32_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float min_val = -2147483647;
- float max_val = 2147483647;
- float r;
-
- for(number = 0; number < num_points; number++){
- r = *inputVectorPtr++ * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- *outputVectorPtr++ = (int32_t)(r);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h
deleted file mode 100644
index 800017d5da..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
-#define INCLUDED_volk_32f_s32f_convert_8i_a_H
-
-#include <volk/volk_common.h>
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 8 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int sixteenthPoints = num_points / 16;
-
- const float* inputVectorPtr = (const float*)inputVector;
- int8_t* outputVectorPtr = outputVector;
-
- float min_val = -128;
- float max_val = 127;
- float r;
-
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2, inputVal3, inputVal4;
- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- for(;number < sixteenthPoints; number++){
- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
- inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
- inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
- inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
-
- intInputVal1 = _mm_cvtps_epi32(inputVal1);
- intInputVal2 = _mm_cvtps_epi32(inputVal2);
- intInputVal3 = _mm_cvtps_epi32(inputVal3);
- intInputVal4 = _mm_cvtps_epi32(inputVal4);
-
- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
- outputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int8_t)(r);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 8 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const float* inputVectorPtr = (const float*)inputVector;
-
- float min_val = -128;
- float max_val = 127;
- float r;
-
- int8_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
-
- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
- for(;number < quarterPoints; number++){
- ret = _mm_load_ps(inputVectorPtr);
- inputVectorPtr += 4;
-
- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
- _mm_store_ps(outputFloatBuffer, ret);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
- *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- r = inputVector[number] * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- outputVector[number] = (int8_t)(r);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
- \param inputVector The floating point input data buffer
- \param outputVector The 8 bit output data buffer
- \param scalar The value multiplied against each point in the input buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
- int8_t* outputVectorPtr = outputVector;
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float min_val = -128;
- float max_val = 127;
- float r;
-
- for(number = 0; number < num_points; number++){
- r = *inputVectorPtr++ * scalar;
- if(r > max_val)
- r = max_val;
- else if(r < min_val)
- r = min_val;
- *outputVectorPtr++ = (int8_t)(r);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
deleted file mode 100644
index b3fae9b053..0000000000
--- a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
-#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
- \brief Scalar float multiply
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be multiplied
- \param scalar the scalar value
- \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
-
- __m128 aVal, bVal, cVal;
- bVal = _mm_set_ps1(scalar);
- for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
-
- aPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * scalar;
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-/*!
- \brief Scalar float multiply
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be multiplied
- \param scalar the scalar value
- \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
-
- __m256 aVal, bVal, cVal;
- bVal = _mm256_set1_ps(scalar);
- for(;number < eighthPoints; number++){
-
- aVal = _mm256_loadu_ps(aPtr);
-
- cVal = _mm256_mul_ps(aVal, bVal);
-
- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
-
- aPtr += 8;
- cPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * scalar;
- }
-}
-#endif /* LV_HAVE_AVX */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Scalar float multiply
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be multiplied
- \param scalar the scalar value
- \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const float* inputPtr = aVector;
- float* outputPtr = cVector;
- for(number = 0; number < num_points; number++){
- *outputPtr = (*inputPtr) * scalar;
- inputPtr++;
- outputPtr++;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h
deleted file mode 100644
index 52e8286bc2..0000000000
--- a/volk/include/volk/volk_32f_x2_add_32f_u.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
-#define INCLUDED_volk_32f_x2_add_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
- \brief Adds the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be added
- \param bVector One of the vectors to be added
- \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
-
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
- bVal = _mm_loadu_ps(bPtr);
-
- cVal = _mm_add_ps(aVal, bVal);
-
- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
-
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Adds the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be added
- \param bVector One of the vectors to be added
- \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) + (*bPtr++);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
deleted file mode 100644
index 067c33ad89..0000000000
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ /dev/null
@@ -1,290 +0,0 @@
-#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
-#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
-
-#include <volk/volk_common.h>
-#include<stdio.h>
-
-
-#ifdef LV_HAVE_GENERIC
-
-
-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr= taps;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#ifdef LV_HAVE_SSE
-
-
-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
- aPtr += 16;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-
-}
-
-#endif /*LV_HAVE_SSE*/
-
-#ifdef LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
- dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
- dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
- dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
-
- aPtr += 16;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#ifdef LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 aVal1, bVal1, cVal1;
- __m128 aVal2, bVal2, cVal2;
- __m128 aVal3, bVal3, cVal3;
- __m128 aVal4, bVal4, cVal4;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-
- bVal1 = _mm_load_ps(bPtr); bPtr += 4;
- bVal2 = _mm_load_ps(bPtr); bPtr += 4;
- bVal3 = _mm_load_ps(bPtr); bPtr += 4;
- bVal4 = _mm_load_ps(bPtr); bPtr += 4;
-
- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
- cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-
- cVal1 = _mm_or_ps(cVal1, cVal2);
- cVal3 = _mm_or_ps(cVal3, cVal4);
- cVal1 = _mm_or_ps(cVal1, cVal3);
-
- dotProdVal = _mm_add_ps(dotProdVal, cVal1);
- }
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#ifdef LV_HAVE_AVX
-
-#include <immintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m256 a0Val, a1Val;
- __m256 b0Val, b1Val;
- __m256 c0Val, c1Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm256_load_ps(aPtr);
- a1Val = _mm256_load_ps(aPtr+8);
- b0Val = _mm256_load_ps(bPtr);
- b1Val = _mm256_load_ps(bPtr+8);
-
- c0Val = _mm256_mul_ps(a0Val, b0Val);
- c1Val = _mm256_mul_ps(a1Val, b1Val);
-
- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
-
- aPtr += 16;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
- dotProduct += dotProductVector[4];
- dotProduct += dotProductVector[5];
- dotProduct += dotProductVector[6];
- dotProduct += dotProductVector[7];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-
-}
-
-#endif /*LV_HAVE_AVX*/
-
-#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
deleted file mode 100644
index b24e8b1f79..0000000000
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ /dev/null
@@ -1,290 +0,0 @@
-#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
-#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
-
-#include <volk/volk_common.h>
-#include<stdio.h>
-
-
-#ifdef LV_HAVE_GENERIC
-
-
-static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr= taps;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#ifdef LV_HAVE_SSE
-
-
-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_loadu_ps(aPtr);
- a1Val = _mm_loadu_ps(aPtr+4);
- a2Val = _mm_loadu_ps(aPtr+8);
- a3Val = _mm_loadu_ps(aPtr+12);
- b0Val = _mm_loadu_ps(bPtr);
- b1Val = _mm_loadu_ps(bPtr+4);
- b2Val = _mm_loadu_ps(bPtr+8);
- b3Val = _mm_loadu_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
- aPtr += 16;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-
-}
-
-#endif /*LV_HAVE_SSE*/
-
-#ifdef LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 a0Val, a1Val, a2Val, a3Val;
- __m128 b0Val, b1Val, b2Val, b3Val;
- __m128 c0Val, c1Val, c2Val, c3Val;
-
- __m128 dotProdVal0 = _mm_setzero_ps();
- __m128 dotProdVal1 = _mm_setzero_ps();
- __m128 dotProdVal2 = _mm_setzero_ps();
- __m128 dotProdVal3 = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm_loadu_ps(aPtr);
- a1Val = _mm_loadu_ps(aPtr+4);
- a2Val = _mm_loadu_ps(aPtr+8);
- a3Val = _mm_loadu_ps(aPtr+12);
- b0Val = _mm_loadu_ps(bPtr);
- b1Val = _mm_loadu_ps(bPtr+4);
- b2Val = _mm_loadu_ps(bPtr+8);
- b3Val = _mm_loadu_ps(bPtr+12);
-
- c0Val = _mm_mul_ps(a0Val, b0Val);
- c1Val = _mm_mul_ps(a1Val, b1Val);
- c2Val = _mm_mul_ps(a2Val, b2Val);
- c3Val = _mm_mul_ps(a3Val, b3Val);
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
- dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
- dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
- dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
-
- aPtr += 16;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#ifdef LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m128 aVal1, bVal1, cVal1;
- __m128 aVal2, bVal2, cVal2;
- __m128 aVal3, bVal3, cVal3;
- __m128 aVal4, bVal4, cVal4;
-
- __m128 dotProdVal = _mm_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
-
- bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
-
- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
- cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-
- cVal1 = _mm_or_ps(cVal1, cVal2);
- cVal3 = _mm_or_ps(cVal3, cVal4);
- cVal1 = _mm_or_ps(cVal1, cVal3);
-
- dotProdVal = _mm_add_ps(dotProdVal, cVal1);
- }
-
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
-
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#ifdef LV_HAVE_AVX
-
-#include <immintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
-
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float dotProduct = 0;
- const float* aPtr = input;
- const float* bPtr = taps;
-
- __m256 a0Val, a1Val;
- __m256 b0Val, b1Val;
- __m256 c0Val, c1Val;
-
- __m256 dotProdVal0 = _mm256_setzero_ps();
- __m256 dotProdVal1 = _mm256_setzero_ps();
-
- for(;number < sixteenthPoints; number++){
-
- a0Val = _mm256_loadu_ps(aPtr);
- a1Val = _mm256_loadu_ps(aPtr+8);
- b0Val = _mm256_loadu_ps(bPtr);
- b1Val = _mm256_loadu_ps(bPtr+8);
-
- c0Val = _mm256_mul_ps(a0Val, b0Val);
- c1Val = _mm256_mul_ps(a1Val, b1Val);
-
- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
-
- aPtr += 16;
- bPtr += 16;
- }
-
- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-
- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
- _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
- dotProduct = dotProductVector[0];
- dotProduct += dotProductVector[1];
- dotProduct += dotProductVector[2];
- dotProduct += dotProductVector[3];
- dotProduct += dotProductVector[4];
- dotProduct += dotProductVector[5];
- dotProduct += dotProductVector[6];
- dotProduct += dotProductVector[7];
-
- number = sixteenthPoints*16;
- for(;number < num_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- }
-
- *result = dotProduct;
-
-}
-
-#endif /*LV_HAVE_AVX*/
-
-#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h
deleted file mode 100644
index bfb896d602..0000000000
--- a/volk/include/volk/volk_32f_x2_multiply_32f_u.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
-#define INCLUDED_volk_32f_x2_multiply_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
- \brief Multiplys the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be multiplied
- \param bVector One of the vectors to be multiplied
- \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
-
- __m128 aVal, bVal, cVal;
- for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
- bVal = _mm_loadu_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
-
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-/*!
- \brief Multiplies the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be multiplied
- \param bVector One of the vectors to be multiplied
- \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
-
- __m256 aVal, bVal, cVal;
- for(;number < eighthPoints; number++){
-
- aVal = _mm256_loadu_ps(aPtr);
- bVal = _mm256_loadu_ps(bPtr);
-
- cVal = _mm256_mul_ps(aVal, bVal);
-
- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
-
- aPtr += 8;
- bPtr += 8;
- cPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
-}
-#endif /* LV_HAVE_AVX */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Multiplys the two input vectors and store their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be multiplied
- \param bVector One of the vectors to be multiplied
- \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
- float* cPtr = cVector;
- const float* aPtr = aVector;
- const float* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h
deleted file mode 100644
index e0d79ea7bc..0000000000
--- a/volk/include/volk/volk_32fc_conjugate_32fc_u.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
-#define INCLUDED_volk_32fc_conjugate_32fc_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
- /*!
- \brief Takes the conjugate of a complex vector.
- \param cVector The vector where the results will be stored
- \param aVector Vector to be conjugated
- \param num_points The number of complex values in aVector to be conjugated and stored into cVector
- */
-static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
-
- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
-
- for(;number < halfPoints; number++){
-
- x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
-
- x = _mm_xor_ps(x, conjugator); // conjugate register
-
- _mm_storeu_ps((float*)c,x); // Store the results back into the C container
-
- a += 2;
- c += 2;
- }
-
- if((num_points % 2) != 0) {
- *c = lv_conj(*a);
- }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Takes the conjugate of a complex vector.
- \param cVector The vector where the results will be stored
- \param aVector Vector to be conjugated
- \param num_points The number of complex values in aVector to be conjugated and stored into cVector
- */
-static inline void volk_32fc_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = lv_conj(*aPtr++);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h
deleted file mode 100644
index 77566e671d..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
-#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
- \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param qBuffer The Q buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
-
- const float* complexVectorPtr = (float*)complexVector;
- double* iBufferPtr = iBuffer;
- double* qBufferPtr = qBuffer;
-
- const unsigned int halfPoints = num_points / 2;
- __m128 cplxValue, fVal;
- __m128d dVal;
-
- for(;number < halfPoints; number++){
-
- cplxValue = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- // Arrange in i1i2i1i2 format
- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
- dVal = _mm_cvtps_pd(fVal);
- _mm_store_pd(iBufferPtr, dVal);
-
- // Arrange in q1q2q1q2 format
- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
- dVal = _mm_cvtps_pd(fVal);
- _mm_store_pd(qBufferPtr, dVal);
-
- iBufferPtr += 2;
- qBufferPtr += 2;
- }
-
- number = halfPoints * 2;
- for(; number < num_points; number++){
- *iBufferPtr++ = *complexVectorPtr++;
- *qBufferPtr++ = *complexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
- \param complexVector The complex input vector
- \param iBuffer The I buffer output data
- \param qBuffer The Q buffer output data
- \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const float* complexVectorPtr = (float*)complexVector;
- double* iBufferPtr = iBuffer;
- double* qBufferPtr = qBuffer;
-
- for(number = 0; number < num_points; number++){
- *iBufferPtr++ = (double)*complexVectorPtr++;
- *qBufferPtr++ = (double)*complexVectorPtr++;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h
deleted file mode 100644
index c8b3f0a088..0000000000
--- a/volk/include/volk/volk_32fc_magnitude_32f_u.h
+++ /dev/null
@@ -1,118 +0,0 @@
-#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
-#define INCLUDED_volk_32fc_magnitude_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
- /*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
- */
-static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, result;
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
- result = _mm_sqrt_ps(result);
-
- _mm_storeu_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
- /*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
- */
-static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, iValue, qValue, result;
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
- iValue = _mm_mul_ps(iValue, iValue); // Square the I values
- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
- result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
- result = _mm_sqrt_ps(result);
-
- _mm_storeu_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
- */
-static inline void volk_32fc_magnitude_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
deleted file mode 100644
index d3ac9717a8..0000000000
--- a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
-#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
- /*!
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
- */
-static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, result;
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
- _mm_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
- /*!
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
- */
-static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, iValue, qValue, result;
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_load_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
- iValue = _mm_mul_ps(iValue, iValue); // Square the I values
- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
- result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
- _mm_store_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
- */
-static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (real*real) + (imag*imag);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
deleted file mode 100644
index 53a4e68eb4..0000000000
--- a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
-#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
- /*!
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
- */
-static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, result;
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
- _mm_storeu_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
- /*!
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
- */
-static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
-
- __m128 cplxValue1, cplxValue2, iValue, qValue, result;
- for(;number < quarterPoints; number++){
- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
- complexVectorPtr += 4;
-
- // Arrange in i1i2i3i4 format
- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
- // Arrange in q1q2q3q4 format
- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
- iValue = _mm_mul_ps(iValue, iValue); // Square the I values
- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
- result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
- _mm_storeu_ps(magnitudeVectorPtr, result);
- magnitudeVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- float val1Real = *complexVectorPtr++;
- float val1Imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
- \param complexVector The vector containing the complex input values
- \param magnitudeVector The vector containing the real output values
- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
- */
-static inline void volk_32fc_magnitude_squared_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
- const float* complexVectorPtr = (float*)complexVector;
- float* magnitudeVectorPtr = magnitudeVector;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- const float real = *complexVectorPtr++;
- const float imag = *complexVectorPtr++;
- *magnitudeVectorPtr++ = (real*real) + (imag*imag);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
deleted file mode 100644
index 5c7d15b02f..0000000000
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
-#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
- \brief Multiplies the input vector by a scalar and stores the results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector The vector to be multiplied
- \param scalar The complex scalar to multiply aVector
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, yl, yh, z, tmp1, tmp2;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
-
- // Set up constant scalar vector
- yl = _mm_set_ps1(lv_creal(scalar));
- yh = _mm_set_ps1(lv_cimag(scalar));
-
- for(;number < halfPoints; number++){
-
- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-
- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- _mm_storeu_ps((float*)c,z); // Store the results back into the C container
-
- a += 2;
- c += 2;
- }
-
- if((num_points % 2) != 0) {
- *c = (*a) * scalar;
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Multiplies the input vector by a scalar and stores the results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector The vector to be multiplied
- \param scalar The complex scalar to multiply aVector
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- unsigned int number = num_points;
-
- // unwrap loop
- while (number >= 8){
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- number -= 8;
- }
-
- // clean up any remaining
- while (number-- > 0)
- *cPtr++ = *aPtr++ * scalar;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
deleted file mode 100644
index e7493413f7..0000000000
--- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
+++ /dev/null
@@ -1,145 +0,0 @@
-#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
-#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
-
-
-#include<volk/volk_complex.h>
-
-
-#ifdef LV_HAVE_GENERIC
-
-
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-
- float * res = (float*) result;
- float * in = (float*) input;
- float * tp = (float*) taps;
- unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
- unsigned int isodd = (num_bytes >> 3) &1;
-
-
-
- float sum0[2] = {0,0};
- float sum1[2] = {0,0};
- unsigned int i = 0;
-
-
- for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-
- sum0[0] += in[0] * tp[0] + in[1] * tp[1];
- sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
- sum1[0] += in[2] * tp[2] + in[3] * tp[3];
- sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
-
-
- in += 4;
- tp += 4;
-
- }
-
-
- res[0] = sum0[0] + sum1[0];
- res[1] = sum0[1] + sum1[1];
-
-
-
- for(i = 0; i < isodd; ++i) {
-
-
- *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
-
- }
- /*
- for(i = 0; i < num_bytes >> 3; ++i) {
- *result += input[i] * conjf(taps[i]);
- }
- */
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-#ifdef LV_HAVE_SSE3
-
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-#include <mmintrin.h>
-
-
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-
- // Variable never used?
- //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-
- union HalfMask {
- uint32_t intRep[4];
- __m128 vec;
- } halfMask;
-
- union NegMask {
- int intRep[4];
- __m128 vec;
- } negMask;
-
- unsigned int offset = 0;
- float Rsum=0, Isum=0;
- float Im,Re;
-
- __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
- __m128 zv = {0,0,0,0};
-
- halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
- halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
-
- negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
- negMask.intRep[1] = negMask.intRep[3] = 0;
-
- // main loop
- while(num_bytes >= 4*sizeof(float)){
-
- in1 = _mm_loadu_ps( (float*) (input+offset) );
- in2 = _mm_loadu_ps( (float*) (taps+offset) );
- Rv = _mm_mul_ps(in1, in2);
- fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
- Iv = _mm_mul_ps(in1, fehg);
- Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
- Ivm = _mm_xor_ps( negMask.vec, Iv );
- Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
- _mm_store_ss( &Im, Is );
- _mm_store_ss( &Re, Rs );
- num_bytes -= 4*sizeof(float);
- offset += 2;
- Rsum += Re;
- Isum += Im;
- }
-
- // handle the last complex case ...
- if(num_bytes > 0){
-
- if(num_bytes != 4){
- // bad things are happening
- }
-
- in1 = _mm_loadu_ps( (float*) (input+offset) );
- in2 = _mm_loadu_ps( (float*) (taps+offset) );
- Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
- fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
- Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
- Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
- Ivm = _mm_xor_ps( negMask.vec, Iv );
- Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
- _mm_store_ss( &Im, Is );
- _mm_store_ss( &Re, Rs );
- Rsum += Re;
- Isum += Im;
- }
-
- result[0] = lv_cmake(Rsum,Isum);
- return;
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-
-#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
-
-
-
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h
deleted file mode 100644
index 7c0dba7fd8..0000000000
--- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h
+++ /dev/null
@@ -1,116 +0,0 @@
-#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
-#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
-
-#include <volk/volk_common.h>
-#include <volk/volk_complex.h>
-#include <stdio.h>
-#include <string.h>
-
-
-#ifdef LV_HAVE_GENERIC
-
-
-static inline void volk_32fc_x2_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
- float * res = (float*) result;
- float * in = (float*) input;
- float * tp = (float*) taps;
- unsigned int n_2_ccomplex_blocks = num_points/2;
- unsigned int isodd = num_points &1;
-
-
-
- float sum0[2] = {0,0};
- float sum1[2] = {0,0};
- unsigned int i = 0;
-
-
- for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-
-
- sum0[0] += in[0] * tp[0] - in[1] * tp[1];
- sum0[1] += in[0] * tp[1] + in[1] * tp[0];
- sum1[0] += in[2] * tp[2] - in[3] * tp[3];
- sum1[1] += in[2] * tp[3] + in[3] * tp[2];
-
-
- in += 4;
- tp += 4;
-
- }
-
-
- res[0] = sum0[0] + sum1[0];
- res[1] = sum0[1] + sum1[1];
-
-
-
- for(i = 0; i < isodd; ++i) {
-
-
- *result += input[num_points - 1] * taps[num_points - 1];
-
- }
-
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-#ifdef LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
-
- lv_32fc_t dotProduct;
- memset(&dotProduct, 0x0, 2*sizeof(float));
-
- unsigned int number = 0;
- const unsigned int halfPoints = num_points/2;
-
- __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
-
- const lv_32fc_t* a = input;
- const lv_32fc_t* b = taps;
-
- dotProdVal = _mm_setzero_ps();
-
- for(;number < halfPoints; number++){
-
- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
-
- a += 2;
- b += 2;
- }
-
- __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
-
- _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
- dotProduct += ( dotProductVector[0] + dotProductVector[1] );
-
- if(num_points % 1 != 0) {
- dotProduct += (*a) * (*b);
- }
-
- *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
deleted file mode 100644
index a998d6184e..0000000000
--- a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
-#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
- /*!
- \brief Multiplies the two input complex vectors and stores their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be multiplied
- \param bVector One of the vectors to be multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
- */
-static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, y, yl, yh, z, tmp1, tmp2;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- for(;number < halfPoints; number++){
-
- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- _mm_storeu_ps((float*)c,z); // Store the results back into the C container
-
- a += 2;
- b += 2;
- c += 2;
- }
-
- if((num_points % 2) != 0) {
- *c = (*a) * (*b);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies the two input complex vectors and stores their results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector One of the vectors to be multiplied
- \param bVector One of the vectors to be multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
- */
-static inline void volk_32fc_x2_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * (*bPtr++);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
deleted file mode 100644
index 2755192e96..0000000000
--- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
-#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
- /*!
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector First vector to be multiplied
- \param bVector Second vector that is conjugated before being multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
- */
-static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, y, yl, yh, z, tmp1, tmp2;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
-
- for(;number < halfPoints; number++){
-
- x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
- y = _mm_xor_ps(y, conjugator); // conjugate y
-
- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- _mm_store_ps((float*)c,z); // Store the results back into the C container
-
- a += 2;
- b += 2;
- c += 2;
- }
-
- if((num_points % 2) != 0) {
- *c = (*a) * lv_conj(*b);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector First vector to be multiplied
- \param bVector Second vector that is conjugated before being multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
- */
-static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
deleted file mode 100644
index 09dcd635b9..0000000000
--- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
-#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
- /*!
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector First vector to be multiplied
- \param bVector Second vector that is conjugated before being multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
- */
-static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, y, yl, yh, z, tmp1, tmp2;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
-
- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
-
- for(;number < halfPoints; number++){
-
- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
- y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
- y = _mm_xor_ps(y, conjugator); // conjugate y
-
- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- _mm_storeu_ps((float*)c,z); // Store the results back into the C container
-
- a += 2;
- b += 2;
- c += 2;
- }
-
- if((num_points % 2) != 0) {
- *c = (*a) * lv_conj(*b);
- }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
- \param cVector The vector where the results will be stored
- \param aVector First vector to be multiplied
- \param bVector Second vector that is conjugated before being multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
- */
-static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr= bVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h
deleted file mode 100644
index 8f4123d719..0000000000
--- a/volk/include/volk/volk_32i_s32f_convert_32f_a.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
-#define INCLUDED_volk_32i_s32f_convert_32f_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-
- /*!
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 32 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- int32_t* inputPtr = (int32_t*)inputVector;
- __m128i inputVal;
- __m128 ret;
-
- for(;number < quarterPoints; number++){
-
- // Load the 4 values
- inputVal = _mm_load_si128((__m128i*)inputPtr);
-
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
-
- _mm_store_ps(outputVectorPtr, ret);
-
- outputVectorPtr += 4;
- inputPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) * iScalar;
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 32 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- */
-static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
- float* outputVectorPtr = outputVector;
- const int32_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
- const float iScalar = 1.0 / scalar;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h
deleted file mode 100644
index b3a8ab2015..0000000000
--- a/volk/include/volk/volk_32i_s32f_convert_32f_u.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
-#define INCLUDED_volk_32i_s32f_convert_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-
- /*!
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 32 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- \note Output buffer does NOT need to be properly aligned
- */
-static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1(iScalar);
- int32_t* inputPtr = (int32_t*)inputVector;
- __m128i inputVal;
- __m128 ret;
-
- for(;number < quarterPoints; number++){
-
- // Load the 4 values
- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
- ret = _mm_cvtepi32_ps(inputVal);
- ret = _mm_mul_ps(ret, invScalar);
-
- _mm_storeu_ps(outputVectorPtr, ret);
-
- outputVectorPtr += 4;
- inputPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] =((float)(inputVector[number])) * iScalar;
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 32 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- \note Output buffer does NOT need to be properly aligned
- */
-static inline void volk_32i_s32f_convert_32f_u_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
- float* outputVectorPtr = outputVector;
- const int32_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
- const float iScalar = 1.0 / scalar;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_32u_byteswap_u.h b/volk/include/volk/volk_32u_byteswap_u.h
deleted file mode 100644
index e27d1f03dd..0000000000
--- a/volk/include/volk/volk_32u_byteswap_u.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_volk_32u_byteswap_u_H
-#define INCLUDED_volk_32u_byteswap_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-
-/*!
- \brief Byteswaps (in-place) an aligned vector of int32_t's.
- \param intsToSwap The vector of data to byte swap
- \param numDataPoints The number of data points
-*/
-static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
- unsigned int number = 0;
-
- uint32_t* inputPtr = intsToSwap;
- __m128i input, byte1, byte2, byte3, byte4, output;
- __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
- __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-
- const uint64_t quarterPoints = num_points / 4;
- for(;number < quarterPoints; number++){
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- input = _mm_loadu_si128((__m128i*)inputPtr);
- // Do the four shifts
- byte1 = _mm_slli_epi32(input, 24);
- byte2 = _mm_slli_epi32(input, 8);
- byte3 = _mm_srli_epi32(input, 8);
- byte4 = _mm_srli_epi32(input, 24);
- // Or bytes together
- output = _mm_or_si128(byte1, byte4);
- byte2 = _mm_and_si128(byte2, byte2mask);
- output = _mm_or_si128(output, byte2);
- byte3 = _mm_and_si128(byte3, byte3mask);
- output = _mm_or_si128(output, byte3);
- // Store the results
- _mm_storeu_si128((__m128i*)inputPtr, output);
- inputPtr += 4;
- }
-
- // Byteswap any remaining points:
- number = quarterPoints*4;
- for(; number < num_points; number++){
- uint32_t outputVal = *inputPtr;
- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
- *inputPtr = outputVal;
- inputPtr++;
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Byteswaps (in-place) an aligned vector of int32_t's.
- \param intsToSwap The vector of data to byte swap
- \param numDataPoints The number of data points
-*/
-static inline void volk_32u_byteswap_u_generic(uint32_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = intsToSwap;
-
- unsigned int point;
- for(point = 0; point < num_points; point++){
- uint32_t output = *inputPtr;
- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
-
- *inputPtr = output;
- inputPtr++;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32u_byteswap_u_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h
deleted file mode 100644
index 11d51702bc..0000000000
--- a/volk/include/volk/volk_64f_convert_32f_a.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_volk_64f_convert_32f_a_H
-#define INCLUDED_volk_64f_convert_32f_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
- /*!
- \brief Converts the double values into float values
- \param dVector The converted float vector values
- \param fVector The double vector values to be converted
- \param num_points The number of points in the two vectors to be converted
- */
-static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const double* inputVectorPtr = (const double*)inputVector;
- float* outputVectorPtr = outputVector;
- __m128 ret, ret2;
- __m128d inputVal1, inputVal2;
-
- for(;number < quarterPoints; number++){
- inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
- inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
-
- ret = _mm_cvtpd_ps(inputVal1);
- ret2 = _mm_cvtpd_ps(inputVal2);
-
- ret = _mm_movelh_ps(ret, ret2);
-
- _mm_store_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Converts the double values into float values
- \param dVector The converted float vector values
- \param fVector The double vector values to be converted
- \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){
- float* outputVectorPtr = outputVector;
- const double* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_64f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h
deleted file mode 100644
index 31dc5b5fe9..0000000000
--- a/volk/include/volk/volk_64f_convert_32f_u.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_volk_64f_convert_32f_u_H
-#define INCLUDED_volk_64f_convert_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
- /*!
- \brief Converts the double values into float values
- \param dVector The converted float vector values
- \param fVector The double vector values to be converted
- \param num_points The number of points in the two vectors to be converted
- */
-static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
- unsigned int number = 0;
-
- const unsigned int quarterPoints = num_points / 4;
-
- const double* inputVectorPtr = (const double*)inputVector;
- float* outputVectorPtr = outputVector;
- __m128 ret, ret2;
- __m128d inputVal1, inputVal2;
-
- for(;number < quarterPoints; number++){
- inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
- inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
-
- ret = _mm_cvtpd_ps(inputVal1);
- ret2 = _mm_cvtpd_ps(inputVal2);
-
- ret = _mm_movelh_ps(ret, ret2);
-
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]);
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Converts the double values into float values
- \param dVector The converted float vector values
- \param fVector The double vector values to be converted
- \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_64f_convert_32f_u_generic(float* outputVector, const double* inputVector, unsigned int num_points){
- float* outputVectorPtr = outputVector;
- const double* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++));
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_64f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_64u_byteswap_u.h b/volk/include/volk/volk_64u_byteswap_u.h
deleted file mode 100644
index 41a4a3130f..0000000000
--- a/volk/include/volk/volk_64u_byteswap_u.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef INCLUDED_volk_64u_byteswap_u_H
-#define INCLUDED_volk_64u_byteswap_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-
-/*!
- \brief Byteswaps (in-place) an aligned vector of int64_t's.
- \param intsToSwap The vector of data to byte swap
- \param numDataPoints The number of data points
-*/
-static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
- __m128i input, byte1, byte2, byte3, byte4, output;
- __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
- __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
- uint64_t number = 0;
- const unsigned int halfPoints = num_points / 2;
- for(;number < halfPoints; number++){
- // Load the 32t values, increment inputPtr later since we're doing it in-place.
- input = _mm_loadu_si128((__m128i*)inputPtr);
-
- // Do the four shifts
- byte1 = _mm_slli_epi32(input, 24);
- byte2 = _mm_slli_epi32(input, 8);
- byte3 = _mm_srli_epi32(input, 8);
- byte4 = _mm_srli_epi32(input, 24);
- // Or bytes together
- output = _mm_or_si128(byte1, byte4);
- byte2 = _mm_and_si128(byte2, byte2mask);
- output = _mm_or_si128(output, byte2);
- byte3 = _mm_and_si128(byte3, byte3mask);
- output = _mm_or_si128(output, byte3);
-
- // Reorder the two words
- output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
-
- // Store the results
- _mm_storeu_si128((__m128i*)inputPtr, output);
- inputPtr += 4;
- }
-
- // Byteswap any remaining points:
- number = halfPoints*2;
- for(; number < num_points; number++){
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
-
- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
-
- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
-
- *inputPtr++ = output2;
- *inputPtr++ = output1;
- }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Byteswaps (in-place) an aligned vector of int64_t's.
- \param intsToSwap The vector of data to byte swap
- \param numDataPoints The number of data points
-*/
-static inline void volk_64u_byteswap_u_generic(uint64_t* intsToSwap, unsigned int num_points){
- uint32_t* inputPtr = (uint32_t*)intsToSwap;
- unsigned int point;
- for(point = 0; point < num_points; point++){
- uint32_t output1 = *inputPtr;
- uint32_t output2 = inputPtr[1];
-
- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
-
- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
-
- *inputPtr++ = output2;
- *inputPtr++ = output1;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_64u_byteswap_u_H */
diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h
deleted file mode 100644
index 7d7104f52b..0000000000
--- a/volk/include/volk/volk_8i_convert_16i_u.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef INCLUDED_volk_8i_convert_16i_u_H
-#define INCLUDED_volk_8i_convert_16i_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
- /*!
- \brief Converts the input 8 bit integer data into 16 bit integer data
- \param inputVector The 8 bit input data buffer
- \param outputVector The 16 bit output data buffer
- \param num_points The number of data values to be converted
- \note Input and output buffers do NOT need to be properly aligned
- */
-static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- const __m128i* inputVectorPtr = (const __m128i*)inputVector;
- __m128i* outputVectorPtr = (__m128i*)outputVector;
- __m128i inputVal;
- __m128i ret;
-
- for(;number < sixteenthPoints; number++){
- inputVal = _mm_loadu_si128(inputVectorPtr);
- ret = _mm_cvtepi8_epi16(inputVal);
- ret = _mm_slli_epi16(ret, 8); // Multiply by 256
- _mm_storeu_si128(outputVectorPtr, ret);
-
- outputVectorPtr++;
-
- inputVal = _mm_srli_si128(inputVal, 8);
- ret = _mm_cvtepi8_epi16(inputVal);
- ret = _mm_slli_epi16(ret, 8); // Multiply by 256
- _mm_storeu_si128(outputVectorPtr, ret);
-
- outputVectorPtr++;
-
- inputVectorPtr++;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (int16_t)(inputVector[number])*256;
- }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Converts the input 8 bit integer data into 16 bit integer data
- \param inputVector The 8 bit input data buffer
- \param outputVector The 16 bit output data buffer
- \param num_points The number of data values to be converted
- \note Input and output buffers do NOT need to be properly aligned
- */
-static inline void volk_8i_convert_16i_u_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
- int16_t* outputVectorPtr = outputVector;
- const int8_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h
deleted file mode 100644
index 8bb2c0d1a4..0000000000
--- a/volk/include/volk/volk_8i_s32f_convert_32f_u.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
-#define INCLUDED_volk_8i_s32f_convert_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
- /*!
- \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 8 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- \note Output buffer does NOT need to be properly aligned
- */
-static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
- unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
-
- float* outputVectorPtr = outputVector;
- const float iScalar = 1.0 / scalar;
- __m128 invScalar = _mm_set_ps1( iScalar );
- const int8_t* inputVectorPtr = inputVector;
- __m128 ret;
- __m128i inputVal;
- __m128i interimVal;
-
- for(;number < sixteenthPoints; number++){
- inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
-
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVal = _mm_srli_si128(inputVal, 4);
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVal = _mm_srli_si128(inputVal, 4);
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVal = _mm_srli_si128(inputVal, 4);
- interimVal = _mm_cvtepi8_epi32(inputVal);
- ret = _mm_cvtepi32_ps(interimVal);
- ret = _mm_mul_ps(ret, invScalar);
- _mm_storeu_ps(outputVectorPtr, ret);
- outputVectorPtr += 4;
-
- inputVectorPtr += 16;
- }
-
- number = sixteenthPoints * 16;
- for(; number < num_points; number++){
- outputVector[number] = (float)(inputVector[number]) * iScalar;
- }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#ifdef LV_HAVE_GENERIC
- /*!
- \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
- \param inputVector The 8 bit input data buffer
- \param outputVector The floating point output data buffer
- \param scalar The value divided against each point in the output buffer
- \param num_points The number of data values to be converted
- \note Output buffer does NOT need to be properly aligned
- */
-static inline void volk_8i_s32f_convert_32f_u_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
- float* outputVectorPtr = outputVector;
- const int8_t* inputVectorPtr = inputVector;
- unsigned int number = 0;
- const float iScalar = 1.0 / scalar;
-
- for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h
index 83d9baf89d..690e5f99f6 100644
--- a/volk/include/volk/volk_prefs.h
+++ b/volk/include/volk/volk_prefs.h
@@ -2,23 +2,26 @@
#define INCLUDED_VOLK_PREFS_H
#include <volk/volk_common.h>
+#include <stdlib.h>
__VOLK_DECL_BEGIN
-struct volk_arch_pref {
- char name[128];
- char arch[32];
-};
+typedef struct volk_arch_pref
+{
+ char name[128]; //name of the kernel
+ char impl_a[128]; //best aligned impl
+ char impl_u[128]; //best unaligned impl
+} volk_arch_pref_t;
////////////////////////////////////////////////////////////////////////
// get path to volk_config profiling info
////////////////////////////////////////////////////////////////////////
-VOLK_API void get_config_path(char *);
+VOLK_API void volk_get_config_path(char *);
////////////////////////////////////////////////////////////////////////
// load prefs into global prefs struct
////////////////////////////////////////////////////////////////////////
-VOLK_API int load_preferences(struct volk_arch_pref **);
+VOLK_API size_t volk_load_preferences(volk_arch_pref_t **);
__VOLK_DECL_END
diff --git a/volk/kernels/README.txt b/volk/kernels/README.txt
new file mode 100644
index 0000000000..5dd7434b54
--- /dev/null
+++ b/volk/kernels/README.txt
@@ -0,0 +1,67 @@
+########################################################################
+# How to create custom kernel dispatchers
+########################################################################
+A kernel dispatcher is a kernel implementation that calls other kernel implementations.
+By default, a dispatcher is generated by the build system for every kernel such that:
+ * the best aligned implementation is called when all pointer arguments are aligned,
+ * and otherwise the best unaligned implementation is called.
+
+The author of a VOLK kernel may create a custom dispatcher,
+to be called in place of the automatically generated one.
+A custom dispatcher may be useful to handle head and tail cases,
+or to implement different alignment and bounds checking logic.
+
+########################################################################
+# Code for an example dispatcher w/ tail case
+########################################################################
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_DISPATCHER
+
+static inline void volk_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) // example: split the work into a bulk part and a tail part
+{
+ const unsigned int num_points_r = num_points%4; // tail: points left over after the largest multiple of 4
+ const unsigned int num_points_x = num_points - num_points_r; // bulk: a multiple of 4 points
+
+ if (volk_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector)))) // NOTE(review): presumably one alignment test covering all three buffers - confirm against volk_common.h
+ {
+ volk_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x); // aligned variant handles the bulk
+ }
+ else
+ {
+ volk_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x); // unaligned variant handles the bulk
+ }
+
+ volk_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r); // tail: remaining num_points_r points go through the _g variant (presumably the generic implementation)
+}
+
+#endif //LV_HAVE_DISPATCHER
+
+########################################################################
+# Code for an example dispatcher w/ tail case and accumulator
+########################################################################
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_DISPATCHER
+
+static inline void volk_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points) // example: bulk + tail split for a kernel that produces a single accumulated value
+{
+ const unsigned int num_points_r = num_points%16; // tail: points left over after the largest multiple of 16
+ const unsigned int num_points_x = num_points - num_points_r; // bulk: a multiple of 16 points
+
+ if (volk_is_aligned(VOLK_OR_PTR(input, taps))) // only the two input buffers are tested; result is not part of the check
+ {
+ volk_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x); // aligned variant computes the bulk into *result
+ }
+ else
+ {
+ volk_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x); // unaligned variant computes the bulk into *result
+ }
+
+ float result_tail = 0; // separate accumulator so the tail call does not overwrite the bulk result
+ volk_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r); // tail: remaining num_points_r points via the _g variant (presumably the generic implementation)
+
+ *result += result_tail; // combine the bulk and tail contributions
+}
+
+#endif //LV_HAVE_DISPATCHER
diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
index 1f6554af8b..8bc1569f61 100644
--- a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
+++ b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
@@ -8,7 +8,7 @@
#ifdef LV_HAVE_GENERIC
-static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
static const int N_UNROLL = 4;
diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/kernels/volk/volk_16i_branch_4_state_8.h
index 6338fbdd17..cdfbc7ba13 100644
--- a/volk/include/volk/volk_16i_branch_4_state_8_a.h
+++ b/volk/kernels/volk/volk_16i_branch_4_state_8.h
@@ -138,7 +138,7 @@ static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src
#endif /*LV_HAVE_SSEs*/
#ifdef LV_HAVE_GENERIC
-static inline void volk_16i_branch_4_state_8_a_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+static inline void volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
int i = 0;
int bound = 4;
diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/kernels/volk/volk_16i_convert_8i.h
index 80608a1412..3789b2e4ab 100644
--- a/volk/include/volk/volk_16i_convert_8i_u.h
+++ b/volk/kernels/volk/volk_16i_convert_8i.h
@@ -54,7 +54,7 @@ static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_
\param num_points The number of data values to be converted
\note Input and output buffers do NOT need to be properly aligned
*/
-static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+static inline void volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
int8_t* outputVectorPtr = outputVector;
const int16_t* inputVectorPtr = inputVector;
unsigned int number = 0;
@@ -69,3 +69,72 @@ static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int
#endif /* INCLUDED_volk_16i_convert_8i_u_H */
+#ifndef INCLUDED_volk_16i_convert_8i_a_H
+#define INCLUDED_volk_16i_convert_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data by shifting each value right by 8 bits (the upper byte is kept)
+ \param inputVector The 16 bit input data buffer; read with aligned 128 bit loads (_mm_load_si128)
+ \param outputVector The 8 bit output data buffer; written with aligned 128 bit stores (_mm_store_si128)
+ \param num_points The number of data values to be converted; need not be a multiple of 16
+*/
+static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16; // number of full 16-value SIMD iterations
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
+
+ for(;number < sixteenthPoints; number++){
+
+ // Load the 16 values (two aligned 128 bit loads of 8 values each)
+ inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+ inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+
+ inputVal1 = _mm_srai_epi16(inputVal1, 8); // arithmetic shift: each 16 bit lane now holds its former upper byte, sign kept
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+ ret = _mm_packs_epi16(inputVal1, inputVal2); // pack 2 x 8 lanes into 16 bytes; after the shift every lane already fits in 8 bits
+
+ _mm_store_si128((__m128i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16; // scalar tail: the num_points % 16 values the SIMD loop did not cover
+ for(; number < num_points; number++){
+ outputVector[number] =(int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the input 16 bit integer data into 8 bit integer data by shifting each value right by 8 bits (plain C loop, no SIMD)
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); // shift right by 8, then narrow to 8 bits; both pointers advance by one value
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_a_H */
diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/kernels/volk/volk_16i_max_star_16i.h
index edfff8a82b..c67351c5fa 100644
--- a/volk/include/volk/volk_16i_max_star_16i_a.h
+++ b/volk/kernels/volk/volk_16i_max_star_16i.h
@@ -12,9 +12,9 @@
#include<emmintrin.h>
#include<tmmintrin.h>
-static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_bytes) {
-
+static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points) {
+ const unsigned int num_bytes = num_points*2;
short candidate = src0[0];
short cands[8];
@@ -87,7 +87,9 @@ static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, un
#ifdef LV_HAVE_GENERIC
-static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, unsigned int num_bytes) {
+static inline void volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*2;
int i = 0;
diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h
index c1c9084256..ef88ec094f 100644
--- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h
+++ b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h
@@ -13,7 +13,9 @@
#include<emmintrin.h>
#include<tmmintrin.h>
-static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*2;
const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
@@ -110,7 +112,9 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in
#ifdef LV_HAVE_GENERIC
-static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*2;
int i = 0;
diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/kernels/volk/volk_16i_permute_and_scalar_add.h
index 47e3cbf9cb..7a01d172a3 100644
--- a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h
+++ b/volk/kernels/volk/volk_16i_permute_and_scalar_add.h
@@ -13,8 +13,9 @@
#include<xmmintrin.h>
#include<emmintrin.h>
-static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) {
+ const unsigned int num_bytes = num_points*2;
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -117,7 +118,9 @@ static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short
#ifdef LV_HAVE_GENERIC
-static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+static inline void volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*2;
int i = 0;
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
index 4ce8e8f35b..a810a601a0 100644
--- a/volk/include/volk/volk_16i_s32f_convert_32f_u.h
+++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -105,7 +105,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const in
\param num_points The number of data values to be converted
\note Output buffer does NOT need to be properly aligned
*/
-static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
float* outputVectorPtr = outputVector;
const int16_t* inputVectorPtr = inputVector;
unsigned int number = 0;
@@ -120,3 +120,122 @@ static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, cons
#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
+#define INCLUDED_volk_16i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer (read with the unaligned load _mm_loadu_si128)
+ \param outputVector The floating point output data buffer (written with the unaligned store _mm_storeu_ps)
+ \param scalar The value divided against each point in the output buffer; the SIMD loop multiplies by 1.0/scalar, the tail loop divides by scalar
+ \param num_points The number of data values to be converted; need not be a multiple of 8
+ */
+static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8; // number of full 8-value SIMD iterations
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar); // reciprocal of scalar broadcast to all four lanes
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Sign-extend the lower 4 16 bit values of each register into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal); // values 0..3 of this group as floats
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2); // values 4..7 of this group as floats
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8; // scalar tail: the num_points % 8 values the SIMD loop did not cover
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer (read element by element, no vector load)
+ \param outputVector The floating point output data buffer (written with the unaligned store _mm_storeu_ps)
+ \param scalar The value divided against each point in the output buffer; the SIMD loop multiplies by 1.0/scalar, the tail loop divides by scalar
+ \param num_points The number of data values to be converted; need not be a multiple of 4
+ */
+static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // number of full 4-value SIMD iterations
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar); // reciprocal of scalar broadcast to all four lanes
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); // _mm_set_ps takes the highest lane first, so inputPtr[0] lands in lane 0
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4; // scalar tail: the num_points % 4 values the SIMD loop did not cover
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (plain C loop, no SIMD)
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; // true division per point (no precomputed reciprocal); both pointers advance by one value
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h
index 0d84985530..56b2cc07ab 100644
--- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h
+++ b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h
@@ -13,10 +13,9 @@
#include<emmintrin.h>
-static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-
-
+static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
+ const unsigned int num_bytes = num_points*2;
int i = 0;
@@ -168,7 +167,9 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s
#ifdef LV_HAVE_GENERIC
-static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*2;
int i = 0;
diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
index 5560b92d92..9b6d19fd66 100644
--- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h
+++ b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
@@ -13,7 +13,9 @@
#include<xmmintrin.h>
#include<emmintrin.h>
-static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*2;
__m128i xmm0, xmm1, xmm2, xmm3, xmm4;
__m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
@@ -113,7 +115,9 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta
#ifdef LV_HAVE_GENERIC
-static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*2;
int i = 0;
diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h
index f8aa30874f..9ce8012640 100644
--- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h
+++ b/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h
@@ -128,7 +128,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_
\param qBuffer The Q buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
const int16_t* complexVectorPtr = (const int16_t*)complexVector;
int16_t* iBufferPtr = iBuffer;
int16_t* qBufferPtr = qBuffer;
@@ -149,7 +149,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int
\param num_points The number of complex data values to be deinterleaved
*/
extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void volk_16ic_deinterleave_16i_x2_a_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h
index bac1f2e4b0..f6eccd77ee 100644
--- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h
+++ b/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h
@@ -103,7 +103,7 @@ static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, cons
\param iBuffer The I buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_16ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const int16_t* complexVectorPtr = (int16_t*)complexVector;
int16_t* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h b/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h
index cd2fabb521..f3d0c83524 100644
--- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h
+++ b/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h
@@ -66,7 +66,7 @@ static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const
\param iBuffer The I buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
int16_t* complexVectorPtr = (int16_t*)complexVector;
int8_t* iBufferPtr = iBuffer;
@@ -85,7 +85,7 @@ static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, con
\param num_points The number of complex data values to be deinterleaved
*/
extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void volk_16ic_deinterleave_real_8i_a_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/kernels/volk/volk_16ic_magnitude_16i.h
index 317075e85e..b33306a123 100644
--- a/volk/include/volk/volk_16ic_magnitude_16i_a.h
+++ b/volk/kernels/volk/volk_16ic_magnitude_16i.h
@@ -161,7 +161,7 @@ static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const
\param magnitudeVector The vector containing the real output values
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/
-static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
const int16_t* complexVectorPtr = (const int16_t*)complexVector;
int16_t* magnitudeVectorPtr = magnitudeVector;
unsigned int number = 0;
@@ -182,7 +182,7 @@ static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, c
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/
extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
-static inline void volk_16ic_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
index 1300395ff0..55243b4aa8 100644
--- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h
+++ b/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
@@ -78,7 +78,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, floa
\param scalar The data value to be divided against each input data value of the input complex vector
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
const int16_t* complexVectorPtr = (const int16_t*)complexVector;
float* iBufferPtr = iBuffer;
float* qBufferPtr = qBuffer;
@@ -100,7 +100,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer,
\param num_points The number of complex data values to be deinterleaved
*/
extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16ic_s32f_deinterleave_32f_x2_a_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
index 5e2d82b947..57d078a595 100644
--- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h
+++ b/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
@@ -108,7 +108,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co
\param scalar The scaling value being multiplied against each data point
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_16ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const int16_t* complexVectorPtr = (const int16_t*)complexVector;
float* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h
index d20eea1a79..27901cb9ac 100644
--- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h
+++ b/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h
@@ -149,7 +149,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co
\param scalar The data value to be divided against each input data value of the input complex vector
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/
-static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
const int16_t* complexVectorPtr = (const int16_t*)complexVector;
float* magnitudeVectorPtr = magnitudeVector;
unsigned int number = 0;
@@ -171,7 +171,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/
extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16ic_s32f_magnitude_32f_a_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/kernels/volk/volk_16u_byteswap.h
index fc3eb5fa7a..57f2008991 100644
--- a/volk/include/volk/volk_16u_byteswap_a.h
+++ b/volk/kernels/volk/volk_16u_byteswap.h
@@ -1,3 +1,66 @@
+#ifndef INCLUDED_volk_16u_byteswap_u_H
+#define INCLUDED_volk_16u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+ \brief Byteswaps (in-place) a vector of uint16_t values using SSE2 unaligned loads and stores.
+ \param intsToSwap The vector of data to byte swap (overwritten with the swapped values)
+ \param num_points The number of 16-bit values in intsToSwap
+*/
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8; // number of full 8-value (128-bit) groups
+ for(;number < eighthPoints; number++){
+ // Load eight 16-bit values; inputPtr is advanced after the store since the swap is in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Shift every 16-bit lane by one byte in each direction
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results over the values that were just read
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+ // Byteswap any remaining points (fewer than 8) one at a time:
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Byteswaps (in-place) a vector of uint16_t values in plain C, one element at a time.
+ \param intsToSwap The vector of data to byte swap (overwritten with the swapped values)
+ \param num_points The number of 16-bit values in intsToSwap
+*/
+static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int point;
+ uint16_t* inputPtr = intsToSwap;
+ for(point = 0; point < num_points; point++){
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); // exchange the high and low bytes
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16u_byteswap_u_H */
#ifndef INCLUDED_volk_16u_byteswap_a_H
#define INCLUDED_volk_16u_byteswap_a_H
@@ -68,7 +131,7 @@ static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned in
\param numDataPoints The number of data points
*/
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
-static inline void volk_16u_byteswap_a_orc(uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){
volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/kernels/volk/volk_32f_accumulator_s32f.h
index 78364d0a01..a67d10f9b5 100644
--- a/volk/include/volk/volk_32f_accumulator_s32f_a.h
+++ b/volk/kernels/volk/volk_32f_accumulator_s32f.h
@@ -50,7 +50,7 @@ static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* i
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
-static inline void volk_32f_accumulator_s32f_a_generic(float* result, const float* inputBuffer, unsigned int num_points){
+static inline void volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){
const float* aPtr = inputBuffer;
unsigned int number = 0;
float returnValue = 0;
diff --git a/volk/kernels/volk/volk_32f_convert_64f.h b/volk/kernels/volk/volk_32f_convert_64f.h
new file mode 100644
index 0000000000..2f036955dd
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_convert_64f.h
@@ -0,0 +1,140 @@
+#ifndef INCLUDED_volk_32f_convert_64f_u_H
+#define INCLUDED_volk_32f_convert_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Converts the float values into double values (SSE2, unaligned loads and stores)
+ \param outputVector The converted double vector values
+ \param inputVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4; // number of full 4-float groups
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal); // widen the two low floats to doubles
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
+ inputVal = _mm_movehl_ps(inputVal, inputVal); // bring the two high floats down to the low half
+
+ ret = _mm_cvtps_pd(inputVal); // widen what were the two high floats
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4; // convert the leftover (fewer than 4) points by index
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the float values into double values in plain C, one element at a time
+ \param outputVector The converted double vector values
+ \param inputVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++)); // widen one value and advance both pointers
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_u_H */
+#ifndef INCLUDED_volk_32f_convert_64f_a_H
+#define INCLUDED_volk_32f_convert_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Converts the float values into double values (SSE2, aligned loads and stores; both buffers must be 16-byte aligned)
+ \param outputVector The converted double vector values
+ \param inputVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4; // number of full 4-float groups
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal); // widen the two low floats to doubles
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
+ inputVal = _mm_movehl_ps(inputVal, inputVal); // bring the two high floats down to the low half
+
+ ret = _mm_cvtps_pd(inputVal); // widen what were the two high floats
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4; // convert the leftover (fewer than 4) points by index
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the float values into double values in plain C, one element at a time
+ \param outputVector The converted double vector values
+ \param inputVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++)); // widen one value and advance both pointers
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_a_H */
diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/kernels/volk/volk_32f_index_max_16u.h
index b9ca1dd3e7..dd1aed2459 100644
--- a/volk/include/volk/volk_32f_index_max_16u_a.h
+++ b/volk/kernels/volk/volk_32f_index_max_16u.h
@@ -124,7 +124,7 @@ static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const floa
#endif /*LV_HAVE_SSE*/
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const float* src0, unsigned int num_points) {
+static inline void volk_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) {
if(num_points > 0){
float max = src0[0];
unsigned int index = 0;
diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
index 43713f8b5a..71881c2d5f 100644
--- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
@@ -87,7 +87,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co
\param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
\param num_points The number of real values in the input vector.
*/
-static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
if (num_points < 1) {
return;
}
diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
index db61e359d6..bf05a882d5 100644
--- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
@@ -128,7 +128,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois
\param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
\param noiseFloorAmplitude The noise floor of the input spectrum, in dB
*/
-static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
float sumMean = 0.0;
unsigned int number;
// find the sum (for mean), etc
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/kernels/volk/volk_32f_s32f_convert_16i.h
index 56e42c9bd5..9fd758655f 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h
+++ b/volk/kernels/volk/volk_32f_s32f_convert_16i.h
@@ -127,7 +127,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
\param num_points The number of data values to be converted
\note Input buffer does NOT need to be properly aligned
*/
-static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
int16_t* outputVectorPtr = outputVector;
const float* inputVectorPtr = inputVector;
unsigned int number = 0;
@@ -150,3 +150,153 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co
#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
+#define INCLUDED_volk_32f_s32f_convert_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, clips to [-32768, 32767], then converts the result into a 16 bit integer value (SSE2, aligned loads and stores)
+ \param inputVector The floating point input data buffer (read with aligned 16-byte loads)
+ \param outputVector The 16 bit output data buffer (written with aligned 16-byte stores)
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8; // number of full 8-value groups
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for(;number < eighthPoints; number++){
+ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ // Scale, then clip to the int16_t range
+ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1); // float -> int32 using the current SSE rounding mode
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // narrow 2x4 int32 to 8 int16 with signed saturation
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8; // handle the leftover (fewer than 8) points by index
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r); // rintf rounds using the current rounding mode
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, clips to [-32768, 32767], then converts the result into a 16 bit integer value (SSE scale/clip, scalar rounding)
+ \param inputVector The floating point input data buffer (read with aligned 16-byte loads)
+ \param outputVector The 16 bit output data buffer (written one element at a time)
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4; // number of full 4-value groups
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; // scratch buffer that receives each clipped vector
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale, then clip to the int16_t range
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret); // spill to memory so each lane can be rounded with rintf
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4; // handle the leftover (fewer than 4) points by index
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, clips to [-32768, 32767], then rounds the result into a 16 bit integer value (plain C)
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ for(number = 0; number < num_points; number++){
+ r = *inputVectorPtr++ * scalar;
+ if(r < min_val)
+ r = min_val;
+ else if(r > max_val)
+ r = max_val;
+ *outputVectorPtr++ = (int16_t)rintf(r); // rintf rounds using the current rounding mode
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/kernels/volk/volk_32f_s32f_convert_32i.h
index 38e6b2e745..1a46093ee2 100644
--- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_convert_32i.h
@@ -1,3 +1,145 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
+#define INCLUDED_volk_32f_s32f_convert_32i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, clips to the min_val/max_val limits, then converts the result into a 32 bit integer value (SSE2)
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Neither buffer needs to be aligned: unaligned loads and stores are used
+ */
+static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4; // number of full 4-value groups
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = -2147483647; // NOTE(review): not exactly representable as a 32-bit float; expected to round to -2147483648.0f
+ float max_val = 2147483647; // NOTE(review): expected to round up to 2147483648.0f, which is outside the int32_t range - confirm the upper clamp
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1;
+ __m128i intInputVal1;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for(;number < quarterPoints; number++){
+ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); // scale, then clip
+ intInputVal1 = _mm_cvtps_epi32(inputVal1); // float -> int32 using the current SSE rounding mode
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4; // handle the leftover (fewer than 4) points by index
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)(r); // NOTE(review): the cast truncates toward zero, while the vector loop above rounds - results can differ for fractional values
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, clips to the min_val/max_val limits, then converts the result into a 32 bit integer value (SSE scale/clip, scalar conversion)
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note Input buffer does NOT need to be properly aligned (unaligned loads are used; output is written one element at a time)
+ */
+static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4; // number of full 4-value groups
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = -2147483647; // NOTE(review): not exactly representable as a 32-bit float; expected to round to -2147483648.0f
+ float max_val = 2147483647; // NOTE(review): expected to round up to 2147483648.0f, which is outside the int32_t range - confirm the upper clamp
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; // scratch buffer; NOTE(review): macro is not defined by the headers included in this section - presumably volk/volk_common.h, confirm
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); // scale, then clip
+
+ _mm_store_ps(outputFloatBuffer, ret); // spill to memory so each lane can be converted with a C cast
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4; // handle the leftover (fewer than 4) points by index
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)(r); // the cast truncates toward zero
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, clips to the min_val/max_val limits, then converts the result into a 32 bit integer value (plain C)
+ \param inputVector The floating point input data buffer
+ \param outputVector The 32 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ \note No SIMD loads or stores are used; values are processed one element at a time
+ */
+static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ int32_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = -2147483647; // NOTE(review): not exactly representable as a 32-bit float; expected to round to -2147483648.0f
+ float max_val = 2147483647; // NOTE(review): expected to round up to 2147483648.0f, which is outside the int32_t range - confirm the upper clamp
+ float r;
+
+ for(number = 0; number < num_points; number++){
+ r = *inputVectorPtr++ * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int32_t)(r); // the cast truncates toward zero
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
#ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
#define INCLUDED_volk_32f_s32f_convert_32i_a_H
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/kernels/volk/volk_32f_s32f_convert_8i.h
index 870e9419bb..b451505221 100644
--- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h
+++ b/volk/kernels/volk/volk_32f_s32f_convert_8i.h
@@ -132,7 +132,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl
\param num_points The number of data values to be converted
\note Input buffer does NOT need to be properly aligned
*/
-static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
int8_t* outputVectorPtr = outputVector;
const float* inputVectorPtr = inputVector;
unsigned int number = 0;
@@ -155,3 +155,158 @@ static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, cons
#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
+#define INCLUDED_volk_32f_s32f_convert_8i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, clips to [-128, 127], then converts the result into a 8 bit integer value (SSE2, aligned loads and stores)
+ \param inputVector The floating point input data buffer (read with aligned 16-byte loads)
+ \param outputVector The 8 bit output data buffer (written with aligned 16-byte stores)
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16; // number of full 16-value groups
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = -128;
+ float max_val = 127;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for(;number < sixteenthPoints; number++){
+ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); // scale, then clip
+ inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(inputVal1); // float -> int32 using the current SSE rounding mode
+ intInputVal2 = _mm_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // narrow int32 -> int16 with signed saturation
+ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); // narrow int16 -> int8 with signed saturation
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16; // handle the leftover (fewer than 16) points by index
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int8_t)(r); // NOTE(review): the cast truncates toward zero, while the vector loop above rounds - results can differ for fractional values
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, clips to [-128, 127], then converts the result into a 8 bit integer value (SSE scale/clip, scalar conversion)
+ \param inputVector The floating point input data buffer (read with aligned 16-byte loads)
+ \param outputVector The 8 bit output data buffer (written one element at a time)
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4; // number of full 4-value groups
+
+ const float* inputVectorPtr = (const float*)inputVector;
+
+ float min_val = -128;
+ float max_val = 127;
+ float r;
+
+ int8_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; // scratch buffer that receives each clipped vector
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); // scale, then clip
+
+ _mm_store_ps(outputFloatBuffer, ret); // spill to memory so each lane can be converted with a C cast
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4; // handle the leftover (fewer than 4) points by index
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int8_t)(r); // the cast truncates toward zero
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, clips to [-128, 127], then converts the result into a 8 bit integer value (plain C)
+ \param inputVector The floating point input data buffer
+ \param outputVector The 8 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+ int8_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = -128;
+ float max_val = 127;
+ float r;
+
+ for(number = 0; number < num_points; number++){
+ r = *inputVectorPtr++ * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int8_t)(r); // the cast truncates toward zero
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h
index 99b8e68c5b..2dd86a17c2 100644
--- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h
@@ -1,3 +1,105 @@
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Multiplies every element of aVector by scalar using unaligned SSE loads and stores
+ \param cVector The vector where the results will be stored
+ \param aVector The input vector to be scaled
+ \param scalar The value each element of aVector is multiplied by
+ \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, bVal, cVal;
+ bVal = _mm_set_ps1(scalar); // Broadcast scalar into all four lanes
+ for(;number < quarterPoints; number++){
+ // Each pass handles four floats
+ aVal = _mm_loadu_ps(aPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_storeu_ps(cPtr,cVal); // Write the four products to cVector
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ // Finish the num_points % 4 values the vector loop did not cover
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Multiplies every element of aVector by scalar using unaligned AVX loads and stores
+ \param cVector The vector where the results will be stored
+ \param aVector The input vector to be scaled
+ \param scalar The value each element of aVector is multiplied by
+ \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, bVal, cVal;
+ bVal = _mm256_set1_ps(scalar); // Broadcast scalar into all eight lanes
+ for(;number < eighthPoints; number++){
+ // Each pass handles eight floats
+ aVal = _mm256_loadu_ps(aPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr,cVal); // Write the eight products to cVector
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+ // Finish the num_points % 8 values the vector loop did not cover
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies every element of aVector by scalar, one element at a time
+ \param cVector The vector where the results will be stored
+ \param aVector The input vector to be scaled
+ \param scalar The value each element of aVector is multiplied by
+ \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ for(number = 0; number < num_points; number++){
+ *outputPtr = (*inputPtr) * scalar;
+ inputPtr++;
+ outputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
#define INCLUDED_volk_32f_s32f_multiply_32f_a_H
@@ -108,7 +210,7 @@ static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, const fl
\param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
*/
extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, const float scalar, unsigned int num_points);
-static inline void volk_32f_s32f_multiply_32f_a_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}
#endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32f_s32f_normalize_a.h b/volk/kernels/volk/volk_32f_s32f_normalize.h
index f5fd0d1dba..a0bd33c7dc 100644
--- a/volk/include/volk/volk_32f_s32f_normalize_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_normalize.h
@@ -49,7 +49,7 @@ static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float s
\param bVector One of the vectors to be normalizeed
\param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
*/
-static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const float scalar, unsigned int num_points){
+static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){
unsigned int number = 0;
float* inputPtr = vecBuffer;
const float invScalar = 1.0 / scalar;
@@ -69,7 +69,7 @@ static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const flo
\param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
*/
extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
-static inline void volk_32f_s32f_normalize_a_orc(float* vecBuffer, const float scalar, unsigned int num_points){
+static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){
float invscalar = 1.0 / scalar;
volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
}
diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/kernels/volk/volk_32f_s32f_power_32f.h
index 633ad14b09..2822444686 100644
--- a/volk/include/volk/volk_32f_s32f_power_32f_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_power_32f.h
@@ -127,7 +127,7 @@ static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aV
\param power The power value to be applied to each data point
\param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
*/
-static inline void volk_32f_s32f_power_32f_a_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
+static inline void volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
float* cPtr = cVector;
const float* aPtr = aVector;
unsigned int number = 0;
diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/kernels/volk/volk_32f_s32f_stddev_32f.h
index 98401b2d42..0622b278a6 100644
--- a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_stddev_32f.h
@@ -120,7 +120,7 @@ static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* in
\param mean The mean of the input buffer
\param num_points The number of values in input buffer to used in the stddev calculation
*/
-static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+static inline void volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
float returnValue = 0;
if(num_points > 0){
const float* aPtr = inputBuffer;
diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/kernels/volk/volk_32f_sqrt_32f.h
index d9b16fc0fb..ab9fffd7dc 100644
--- a/volk/include/volk/volk_32f_sqrt_32f_a.h
+++ b/volk/kernels/volk/volk_32f_sqrt_32f.h
@@ -47,7 +47,7 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector,
\param aVector One of the vectors to be sqrted
\param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
*/
-static inline void volk_32f_sqrt_32f_a_generic(float* cVector, const float* aVector, unsigned int num_points){
+static inline void volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points){
float* cPtr = cVector;
const float* aPtr = aVector;
unsigned int number = 0;
@@ -66,7 +66,7 @@ extern void volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int);
\param aVector One of the vectors to be sqrted
\param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
*/
-static inline void volk_32f_sqrt_32f_a_orc(float* cVector, const float* aVector, unsigned int num_points){
+static inline void volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points){
volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
}
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
index 7de32f7b18..9bded6713d 100644
--- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h
+++ b/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
@@ -143,7 +143,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m
\param inputBuffer The buffer of points to calculate the std deviation for
\param num_points The number of values in input buffer to used in the stddev and mean calculations
*/
-static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
float returnValue = 0;
float newMean = 0;
if(num_points > 0){
diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/kernels/volk/volk_32f_x2_add_32f.h
index 51e63e54d2..42278f6068 100644
--- a/volk/include/volk/volk_32f_x2_add_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_add_32f.h
@@ -1,3 +1,69 @@
+#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
+#define INCLUDED_volk_32f_x2_add_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Adds the two input vectors element by element and stores the sums in the third vector (unaligned SSE loads and stores)
+ \param cVector The vector where the results will be stored
+ \param aVector The first vector to be added
+ \param bVector The second vector to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+ // Each pass handles four floats from each input
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_storeu_ps(cPtr,cVal); // Write the four sums to cVector
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+ // Finish the num_points % 4 values the vector loop did not cover
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Adds the two input vectors element by element and stores the sums in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The first vector to be added
+ \param bVector The second vector to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+ // One sum per iteration; all three pointers advance together
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
#ifndef INCLUDED_volk_32f_x2_add_32f_a_H
#define INCLUDED_volk_32f_x2_add_32f_a_H
@@ -72,7 +138,7 @@ static inline void volk_32f_x2_add_32f_a_generic(float* cVector, const float* aV
\param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/
extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_add_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/kernels/volk/volk_32f_x2_divide_32f.h
index 7b60fb22ef..d5a7c7d7c0 100644
--- a/volk/include/volk/volk_32f_x2_divide_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_divide_32f.h
@@ -51,7 +51,7 @@ static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVe
\param bVector The divisor vector
\param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
*/
-static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
float* cPtr = cVector;
const float* aPtr = aVector;
const float* bPtr= bVector;
@@ -72,7 +72,7 @@ static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float*
\param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
*/
extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_divide_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
index 961c2418ca..8fcc7deaed 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
@@ -8,7 +8,7 @@
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
float dotProduct = 0;
const float* aPtr = input;
diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
new file mode 100644
index 0000000000..b91252e36f
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
@@ -0,0 +1,580 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*! \brief Dot product of two float vectors: writes the sum of input[i] * taps[i] for i in [0, num_points) to *result */
+static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+ // The running sum is kept in a single float and starts at zero, so num_points == 0 yields 0
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+/*! \brief Dot product of input and taps written to *result; the main loop uses unaligned SSE loads and consumes 16 floats per pass */
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+ // Four independent accumulators, each holding four partial sums
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from input and 16 from taps
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Fold the four accumulators into dotProdVal0
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Copy the four lane sums to memory so they can be added as scalars
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Add the num_points % 16 products the vector loop did not cover
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+/*! \brief Dot product of input and taps written to *result; the main loop uses unaligned loads and consumes 16 floats per pass */
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ // NOTE(review): only load/mul/add/setzero/store intrinsics appear below; nothing here looks SSE3-specific - confirm
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+ // Four independent accumulators, each holding four partial sums
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from input and 16 from taps
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Fold the four accumulators into dotProdVal0
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,dotProdVal0); // Copy the four lane sums to memory so they can be added as scalars
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Add the num_points % 16 products the vector loop did not cover
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+/*! \brief Dot product of input and taps written to *result; the main loop uses unaligned loads and _mm_dp_ps, 16 floats per pass */
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+ // Single accumulator; each lane collects the dot product of one group of four floats per pass
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load four groups of four floats from each input
+ aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+
+ bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+ // Mask high nibble 0xF multiplies all four lanes; the low nibble routes each sum to a different output lane
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+ // The four results occupy distinct lanes (the other lanes are zero), so OR merges them into one vector
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector, dotProdVal); // Copy the four lane sums to memory so they can be added as scalars
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Add the num_points % 16 products the vector loop did not cover
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+/*! \brief Dot product of input and taps written to *result; the main loop uses unaligned AVX loads and consumes 16 floats per pass */
+static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+ // Two independent accumulators, each holding eight partial sums
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from input and 16 from taps
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr+8);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr+8);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Fold the second accumulator into the first
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+ // NOTE(review): the buffer is declared with __VOLK_ATTR_ALIGNED(32) yet the unaligned store form is used here
+ _mm256_storeu_ps(dotProductVector,dotProdVal0); // Copy the eight lane sums to memory so they can be added as scalars
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+ // Add the num_points % 16 products the vector loop did not cover
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*! \brief Dot product of two float vectors: writes the sum of input[i] * taps[i] for i in [0, num_points) to *result */
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+ // The running sum is kept in a single float and starts at zero, so num_points == 0 yields 0
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr= taps;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+/*! \brief Dot product of input and taps written to *result; the main loop reads both with _mm_load_ps, which needs 16-byte aligned addresses */
+static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+ // Four independent accumulators, each holding four partial sums
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from input and 16 from taps (aligned loads)
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Fold the four accumulators into dotProdVal0
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Copy the four lane sums to memory so they can be added as scalars
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Add the num_points % 16 products the vector loop did not cover
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+/*! \brief Dot product of input and taps written to *result; the main loop reads both with _mm_load_ps, which needs 16-byte aligned addresses */
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ // NOTE(review): only load/mul/add/setzero/store intrinsics appear below; nothing here looks SSE3-specific - confirm
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+ // Four independent accumulators, each holding four partial sums
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from input and 16 from taps (aligned loads)
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Fold the four accumulators into dotProdVal0
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,dotProdVal0); // Copy the four lane sums to memory so they can be added as scalars
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Add the num_points % 16 products the vector loop did not cover
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+/*! \brief Dot product of input and taps written to *result; the main loop uses _mm_load_ps (16-byte aligned addresses) and _mm_dp_ps */
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+ // Single accumulator; each lane collects the dot product of one group of four floats per pass
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load four groups of four floats from each input (aligned loads)
+ aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+
+ bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+ // Mask high nibble 0xF multiplies all four lanes; the low nibble routes each sum to a different output lane
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+ // The four results occupy distinct lanes (the other lanes are zero), so OR merges them into one vector
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector, dotProdVal); // Copy the four lane sums to memory so they can be added as scalars
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ // Add the num_points % 16 products the vector loop did not cover
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+/*! \brief Dot product of input and taps written to *result; the main loop reads both with _mm256_load_ps, which needs 32-byte aligned addresses */
+static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+ // Two independent accumulators, each holding eight partial sums
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 floats from input and 16 from taps (aligned loads)
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr+8);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr+8);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Fold the second accumulator into the first
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+ // The aligned store form below presumably relies on __VOLK_ATTR_ALIGNED(32) giving a 32-byte aligned buffer - confirm
+ _mm256_store_ps(dotProductVector,dotProdVal0); // Copy the eight lane sums to memory so they can be added as scalars
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+ // Add the num_points % 16 products the vector loop did not cover
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h
index 52d80b6bb3..0935cb32bd 100644
--- a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h
+++ b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h
@@ -56,7 +56,7 @@ static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, c
\param complexVector The complex output vector
\param num_points The number of complex data values to be interleaved
*/
-static inline void volk_32f_x2_interleave_32fc_a_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
float* complexVectorPtr = (float*)complexVector;
const float* iBufferPtr = iBuffer;
const float* qBufferPtr = qBuffer;
diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/kernels/volk/volk_32f_x2_max_32f.h
index 79f2d04b56..27633acae8 100644
--- a/volk/include/volk/volk_32f_x2_max_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_max_32f.h
@@ -53,7 +53,7 @@ static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVecto
\param bVector The vector to be checked
\param num_points The number of values in aVector and bVector to be checked and stored into cVector
*/
-static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_max_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
float* cPtr = cVector;
const float* aPtr = aVector;
const float* bPtr= bVector;
@@ -76,7 +76,7 @@ static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aV
\param num_points The number of values in aVector and bVector to be checked and stored into cVector
*/
extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_max_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/kernels/volk/volk_32f_x2_min_32f.h
index 42cac08339..4773d13211 100644
--- a/volk/include/volk/volk_32f_x2_min_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_min_32f.h
@@ -53,7 +53,7 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVecto
\param bVector The vector to be checked
\param num_points The number of values in aVector and bVector to be checked and stored into cVector
*/
-static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_min_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
float* cPtr = cVector;
const float* aPtr = aVector;
const float* bPtr= bVector;
@@ -76,7 +76,7 @@ static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aV
\param num_points The number of values in aVector and bVector to be checked and stored into cVector
*/
extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_min_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
index 340e051657..9fdbec0a2c 100644
--- a/volk/include/volk/volk_32f_x2_multiply_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
@@ -1,3 +1,109 @@
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
+#define INCLUDED_volk_32f_x2_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Multiplies two float vectors element by element and stores the products in a third vector (SSE version using unaligned loads and stores)
+ \param cVector The vector where the results will be stored
+ \param aVector First vector of factors
+ \param bVector Second vector of factors
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // number of full 4-float iterations
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_loadu_ps(aPtr); // unaligned load of 4 floats from aVector
+ bVal = _mm_loadu_ps(bPtr); // unaligned load of 4 floats from bVector
+
+ cVal = _mm_mul_ps(aVal, bVal); // 4 lane-wise products
+
+ _mm_storeu_ps(cPtr,cVal); // Store the 4 products into cVector (unaligned store)
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4; // resume at the first element the vector loop did not cover
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++); // scalar tail for the remaining num_points % 4 values
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Multiplies two float vectors element by element and stores the products in a third vector (AVX version using unaligned loads and stores)
+ \param cVector The vector where the results will be stored
+ \param aVector First vector of factors
+ \param bVector Second vector of factors
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8; // number of full 8-float iterations
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m256 aVal, bVal, cVal;
+ for(;number < eighthPoints; number++){
+
+ aVal = _mm256_loadu_ps(aPtr); // unaligned load of 8 floats from aVector
+ bVal = _mm256_loadu_ps(bPtr); // unaligned load of 8 floats from bVector
+
+ cVal = _mm256_mul_ps(aVal, bVal); // 8 lane-wise products
+
+ _mm256_storeu_ps(cPtr,cVal); // Store the 8 products into cVector (unaligned store)
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8; // resume at the first element the vector loop did not cover
+ for(;number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++); // scalar tail for the remaining num_points % 8 values
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies two float vectors element by element and stores the products in a third vector (plain C loop, no SIMD intrinsics)
+ \param cVector The vector where the results will be stored
+ \param aVector First vector of factors
+ \param bVector Second vector of factors
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++); // one product per iteration; all three pointers advance together
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
#ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
#define INCLUDED_volk_32f_x2_multiply_32f_a_H
@@ -111,7 +217,7 @@ static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, const floa
\param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
*/
extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_multiply_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
index 10fc267dcd..ce7b91a318 100644
--- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h
+++ b/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
@@ -137,7 +137,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect
\param scalar The scaling value being multiplied against each data point
\param num_points The number of complex data values to be interleaved
*/
-static inline void volk_32f_x2_s32f_interleave_16ic_a_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
int16_t* complexVectorPtr = (int16_t*)complexVector;
const float* iBufferPtr = iBuffer;
const float* qBufferPtr = qBuffer;
diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
index e2b8be797f..8ea491f988 100644
--- a/volk/include/volk/volk_32f_x2_subtract_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
@@ -51,7 +51,7 @@ static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* a
\param bVector The vector to be subtracted
\param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
*/
-static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
float* cPtr = cVector;
const float* aPtr = aVector;
const float* bPtr= bVector;
@@ -72,7 +72,7 @@ static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const floa
\param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
*/
extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_subtract_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
index 3c530628c8..e975f14e92 100644
--- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
@@ -13,8 +13,9 @@
#include<xmmintrin.h>
#include<pmmintrin.h>
-static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {
+ const unsigned int num_bytes = num_points*4;
float result = 0.0;
float fst = 0.0;
@@ -100,9 +101,9 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
-
+static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {
+ const unsigned int num_bytes = num_points*4;
float result = 0.0;
float fst = 0.0;
diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
index 109b787e8c..e0a8a59ced 100644
--- a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -8,7 +8,7 @@
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
+static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
float res[2];
float *realpt = &res[0], *imagpt = &res[1];
diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
index 28d584bf2c..104e3250e6 100644
--- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
@@ -64,7 +64,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const l
\param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector
\param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
*/
-static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
lv_32fc_t* cPtr = cVector;
const lv_32fc_t* aPtr = aVector;
const float* bPtr= bVector;
@@ -85,7 +85,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, con
\param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
*/
extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32fc_32f_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
index 919280d510..dce897ff57 100644
--- a/volk/include/volk/volk_32fc_conjugate_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
@@ -1,3 +1,67 @@
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Takes the conjugate of a complex vector, two complex values per iteration, using unaligned loads and stores.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2; // number of full 2-complex iterations
+
+ __m128 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); // -0.f in lanes 1 and 3: XOR mask applied to the two imaginary parts
+
+ for(;number < halfPoints; number++){
+
+ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi (unaligned load)
+
+ x = _mm_xor_ps(x, conjugator); // flip the sign of ai and bi, leave ar and br untouched
+
+ _mm_storeu_ps((float*)c,x); // Store the two conjugated values into cVector (unaligned store)
+
+ a += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0) { // odd count: one complex value is left after the vector loop
+ *c = lv_conj(*a); // lv_conj is declared outside this view (presumably volk/volk_complex.h)
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Takes the conjugate of a complex vector, one value at a time (plain C loop, no SIMD intrinsics).
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = lv_conj(*aPtr++); // lv_conj is declared outside this view (presumably volk/volk_complex.h)
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
#define INCLUDED_volk_32fc_conjugate_32fc_a_H
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
index 4106f38513..0d33ed7e28 100644
--- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
@@ -57,7 +57,7 @@ static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qB
\param qBuffer The Q buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_32fc_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
const float* complexVectorPtr = (float*)complexVector;
float* iBufferPtr = iBuffer;
float* qBufferPtr = qBuffer;
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
new file mode 100644
index 0000000000..4a4c5509bd
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
@@ -0,0 +1,156 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data (SSE2 version using unaligned loads and stores)
+ \param complexVector The complex input vector, read as interleaved float pairs (I, Q)
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector; // walk the input as a flat float array
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int halfPoints = num_points / 2; // number of full 2-complex iterations
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+
+ for(;number < halfPoints; number++){
+
+ cplxValue = _mm_loadu_ps(complexVectorPtr); // unaligned load of 4 floats: i1,q1,i2,q2
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format (only the low pair i1,i2 is used below)
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm_cvtps_pd(fVal); // widen the two low floats to two doubles
+ _mm_storeu_pd(iBufferPtr, dVal); // unaligned store of 2 doubles
+
+ // Arrange in q1q2q1q2 format (only the low pair q1,q2 is used below)
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+ dVal = _mm_cvtps_pd(fVal); // widen the two low floats to two doubles
+ _mm_storeu_pd(qBufferPtr, dVal); // unaligned store of 2 doubles
+
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ number = halfPoints * 2; // resume at the first complex value the vector loop did not cover
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++; // scalar tail: float is implicitly converted to double
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data (plain C loop, no SIMD intrinsics)
+ \param complexVector The complex input vector, read as interleaved float pairs (I, Q)
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector; // walk the input as a flat float array
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (double)*complexVectorPtr++; // first float of the pair goes to the I output
+ *qBufferPtr++ = (double)*complexVectorPtr++; // second float of the pair goes to the Q output
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_u_H */
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data (SSE2 version using aligned loads and stores)
+ \param complexVector The complex input vector, read as interleaved float pairs (I, Q); accessed with _mm_load_ps
+ \param iBuffer The I buffer output data; written with _mm_store_pd
+ \param qBuffer The Q buffer output data; written with _mm_store_pd
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector; // walk the input as a flat float array
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int halfPoints = num_points / 2; // number of full 2-complex iterations
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+
+ for(;number < halfPoints; number++){
+
+ cplxValue = _mm_load_ps(complexVectorPtr); // aligned load of 4 floats: i1,q1,i2,q2 (address must be 16-byte aligned)
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format (only the low pair i1,i2 is used below)
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm_cvtps_pd(fVal); // widen the two low floats to two doubles
+ _mm_store_pd(iBufferPtr, dVal); // aligned store of 2 doubles (address must be 16-byte aligned)
+
+ // Arrange in q1q2q1q2 format (only the low pair q1,q2 is used below)
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+ dVal = _mm_cvtps_pd(fVal); // widen the two low floats to two doubles
+ _mm_store_pd(qBufferPtr, dVal); // aligned store of 2 doubles (address must be 16-byte aligned)
+
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ number = halfPoints * 2; // resume at the first complex value the vector loop did not cover
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++; // scalar tail: float is implicitly converted to double
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data (plain C loop, no SIMD intrinsics)
+ \param complexVector The complex input vector, read as interleaved float pairs (I, Q)
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector; // walk the input as a flat float array
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ for(number = 0; number < num_points; number++){
+ *iBufferPtr++ = (double)*complexVectorPtr++; // first float of the pair goes to the I output
+ *qBufferPtr++ = (double)*complexVectorPtr++; // second float of the pair goes to the Q output
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
index c88809bebd..b1968296f5 100644
--- a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
@@ -51,7 +51,7 @@ static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const l
\param qBuffer The I buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_32fc_deinterleave_imag_32f_a_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const float* complexVectorPtr = (float*)complexVector;
float* qBufferPtr = qBuffer;
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
index 0d6c6b7af4..3d57598135 100644
--- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
@@ -51,7 +51,7 @@ static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const l
\param iBuffer The I buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_32fc_deinterleave_real_32f_a_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const float* complexVectorPtr = (float*)complexVector;
float* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h
index 1e346bacaf..1fa66e8add 100644
--- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h
@@ -49,7 +49,7 @@ static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const
\param iBuffer The I buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_32fc_deinterleave_real_64f_a_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const float* complexVectorPtr = (float*)complexVector;
double* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/kernels/volk/volk_32fc_index_max_16u.h
index 842a6a0420..c8d7212401 100644
--- a/volk/include/volk/volk_32fc_index_max_16u_a.h
+++ b/volk/kernels/volk/volk_32fc_index_max_16u.h
@@ -11,9 +11,9 @@
#include<pmmintrin.h>
-static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
-
+static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) {
+ const unsigned int num_bytes = num_points*8;
union bit128 holderf;
union bit128 holderi;
@@ -189,7 +189,10 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
+static inline void volk_32fc_index_max_16u_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+
float sq_dist = 0.0;
float max = 0.0;
unsigned int index = 0;
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h
index efb84a904b..64e99cc1be 100644
--- a/volk/include/volk/volk_32fc_magnitude_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h
@@ -1,3 +1,121 @@
+#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Calculates the magnitude sqrt(re^2 + im^2) of each value in complexVector and stores the results in magnitudeVector (SSE3 version using unaligned loads and stores)
+ \param complexVector The vector containing the complex input values, read as interleaved float pairs
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // number of full 4-complex iterations
+
+ const float* complexVectorPtr = (float*)complexVector; // walk the input as a flat float array
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr); // unaligned load of complex values 1 and 2
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr); // unaligned load of complex values 3 and 4
+ complexVectorPtr += 4;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add adjacent lanes: I^2 + Q^2 for each of the 4 values
+
+ result = _mm_sqrt_ps(result); // 4 square roots at once
+
+ _mm_storeu_ps(magnitudeVectorPtr, result); // unaligned store of 4 magnitudes
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4; // resume at the first complex value the vector loop did not cover
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); // scalar tail
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Calculates the magnitude sqrt(re^2 + im^2) of each value in complexVector and stores the results in magnitudeVector (SSE version using unaligned loads and stores)
+ \param complexVector The vector containing the complex input values, read as interleaved float pairs
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // number of full 4-complex iterations
+
+ const float* complexVectorPtr = (float*)complexVector; // walk the input as a flat float array
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr); // unaligned load of complex values 1 and 2
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr); // unaligned load of complex values 3 and 4
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format (even lanes of both registers)
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Arrange in q1q2q3q4 format (odd lanes of both registers)
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+ result = _mm_add_ps(iValue, qValue); // Add the I^2 and Q^2 values
+
+ result = _mm_sqrt_ps(result); // 4 square roots at once
+
+ _mm_storeu_ps(magnitudeVectorPtr, result); // unaligned store of 4 magnitudes
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4; // resume at the first complex value the vector loop did not cover
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); // scalar tail
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Calculates the magnitude sqrt(re^2 + im^2) of each value in complexVector and stores the results in magnitudeVector (plain C loop, no SIMD intrinsics)
+ \param complexVector The vector containing the complex input values, read as interleaved float pairs
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector; // walk the input as a flat float array
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++){
+ const float real = *complexVectorPtr++; // first float of the pair
+ const float imag = *complexVectorPtr++; // second float of the pair
+ *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
#define INCLUDED_volk_32fc_magnitude_32f_a_H
@@ -123,7 +241,7 @@ static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, con
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/
extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points);
-static inline void volk_32fc_magnitude_32f_a_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
new file mode 100644
index 0000000000..0af81401a8
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
@@ -0,0 +1,228 @@
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Calculates the magnitude squared of each value in complexVector and stores the results in magnitudeVector (SSE3, unaligned loads and stores)
+ \param magnitudeVector The vector that receives the real output values (one float per complex input)
+ \param complexVector The vector containing the complex input values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ // Read the input as consecutive floats: real, imag, real, imag, ...
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ // Each iteration consumes 4 complex values (8 floats) and produces 4 output floats
+ __m128 cplxValue1, cplxValue2, result;
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square every component: re^2, im^2, re^2, im^2
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square every component: re^2, im^2, re^2, im^2
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Sum adjacent pairs: re^2 + im^2 for each of the 4 values
+
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+ // Process the remaining (num_points % 4) values one at a time
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Calculates the magnitude squared of each value in complexVector and stores the results in magnitudeVector (SSE, unaligned loads and stores)
+ \param magnitudeVector The vector that receives the real output values (one float per complex input)
+ \param complexVector The vector containing the complex input values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ // Read the input as consecutive floats: real, imag, real, imag, ...
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ // Each iteration consumes 4 complex values (8 floats) and produces 4 output floats
+ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Gather the four real parts: i1 i2 i3 i4
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Gather the four imaginary parts: q1 q2 q3 q4
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q values
+
+ result = _mm_add_ps(iValue, qValue); // I^2 + Q^2 for each of the 4 values
+
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+ // Process the remaining (num_points % 4) values one at a time
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Calculates the magnitude squared of each value in complexVector and stores the results in magnitudeVector (plain C implementation)
+ \param magnitudeVector The vector that receives the real output values (one float per complex input)
+ \param complexVector The vector containing the complex input values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector; // read the input as consecutive floats: real, imag, real, imag, ...
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (real*real) + (imag*imag); // |z|^2 = re^2 + im^2 (no square root)
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_u_H */
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Calculates the magnitude squared of each value in complexVector and stores the results in magnitudeVector (SSE3, aligned loads and stores)
+ \param magnitudeVector The vector that receives the real output values (one float per complex input)
+ \param complexVector The vector containing the complex input values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ // _mm_load_ps/_mm_store_ps are used below, so both buffers must be 16-byte aligned
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ // Each iteration consumes 4 complex values (8 floats) and produces 4 output floats
+ __m128 cplxValue1, cplxValue2, result;
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square every component: re^2, im^2, re^2, im^2
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square every component: re^2, im^2, re^2, im^2
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Sum adjacent pairs: re^2 + im^2 for each of the 4 values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+ // Process the remaining (num_points % 4) values one at a time
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+ \brief Calculates the magnitude squared of each value in complexVector and stores the results in magnitudeVector (SSE, aligned loads and stores)
+ \param magnitudeVector The vector that receives the real output values (one float per complex input)
+ \param complexVector The vector containing the complex input values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ // _mm_load_ps/_mm_store_ps are used below, so both buffers must be 16-byte aligned
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ // Each iteration consumes 4 complex values (8 floats) and produces 4 output floats
+ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+ for(;number < quarterPoints; number++){
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Gather the four real parts: i1 i2 i3 i4
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+ // Gather the four imaginary parts: q1 q2 q3 q4
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q values
+
+ result = _mm_add_ps(iValue, qValue); // I^2 + Q^2 for each of the 4 values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+ // Process the remaining (num_points % 4) values one at a time
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Calculates the magnitude squared of each value in complexVector and stores the results in magnitudeVector (plain C implementation)
+ \param magnitudeVector The vector that receives the real output values (one float per complex input)
+ \param complexVector The vector containing the complex input values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector; // read the input as consecutive floats: real, imag, real, imag, ...
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for(number = 0; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (real*real) + (imag*imag); // |z|^2 = re^2 + im^2 (no square root)
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h
index d86bd63c1c..b076ab44ef 100644
--- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h
@@ -139,7 +139,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv
\param normalizeFactor The atan2 results will be divided by this normalization factor.
\param num_points The number of complex values in the input vector.
*/
-static inline void volk_32fc_s32f_atan2_32f_a_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
+static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
float* outPtr = outputVector;
const float* inPtr = (float*)inputVector;
const float invNormalizeFactor = 1.0 / normalizeFactor;
diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
index 1c17fb70c6..9e10217a0f 100644
--- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
@@ -63,7 +63,7 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer,
\param iBuffer The I buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_32fc_s32f_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
const float* complexVectorPtr = (float*)complexVector;
int16_t* iBufferPtr = iBuffer;
unsigned int number = 0;
diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h b/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h
index 38fd609d31..09abd967d6 100644
--- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h
@@ -129,7 +129,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
\param magnitudeVector The vector containing the real output values
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/
-static inline void volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
const float* complexVectorPtr = (float*)complexVector;
int16_t* magnitudeVectorPtr = magnitudeVector;
unsigned int number = 0;
@@ -150,7 +150,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVect
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/
extern void volk_32fc_s32f_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_32fc_s32f_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_32fc_s32f_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
volk_32fc_s32f_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/kernels/volk/volk_32fc_s32f_power_32fc.h
index 3106edbefd..d4a1d17469 100644
--- a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_power_32fc.h
@@ -94,7 +94,7 @@ static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_
\param power The power value to be applied to each data point
\param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
*/
-static inline void volk_32fc_s32f_power_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
lv_32fc_t* cPtr = cVector;
const lv_32fc_t* aPtr = aVector;
unsigned int number = 0;
diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
index 30a77dbc18..f76d9d35e4 100644
--- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
@@ -96,7 +96,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu
\param normalizationFactor This value is divided agains all the input values before the power is calculated
\param num_points The number of fft data points
*/
-static inline void volk_32fc_s32f_power_spectrum_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
+static inline void volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
// Calculate the Power of the complex point
const float* inputPtr = (float*)complexFFTInput;
float* realFFTDataPointsPtr = logPowerOutput;
diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
index 27f755351d..e73eb09f8f 100644
--- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
@@ -103,7 +103,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo
\param rbw The resolution bandwith of the fft spectrum
\param num_points The number of fft data points
*/
-static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
// Calculate the Power of the complex point
const float* inputPtr = (float*)complexFFTInput;
float* realFFTDataPointsPtr = logPowerOutput;
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
index f206c5e874..668a047609 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -1,3 +1,90 @@
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Multiplies every value of the input vector by a complex scalar and stores the results in cVector (SSE3, unaligned loads and stores)
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+ // Notation below: a, b are two consecutive inputs (ar+j*ai, br+j*bi); scalar = sr + j*si
+ __m128 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vectors: yl = sr in every lane, yh = si in every lane
+ yl = _mm_set_ps1(lv_creal(scalar));
+ yh = _mm_set_ps1(lv_cimag(scalar));
+ // Each iteration multiplies 2 complex values by the scalar
+ for(;number < halfPoints; number++){
+
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*si,ar*si,bi*si,br*si
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+
+ _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+ // Odd num_points: one value is left over, multiply it without SIMD
+ if((num_points % 2) != 0) {
+ *c = (*a) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies every value of the input vector by a complex scalar and stores the results in cVector (plain C implementation)
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points; // counts the values still to be processed
+
+ // manually unrolled loop: 8 values per iteration
+ while (number >= 8){
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ number -= 8;
+ }
+
+ // clean up the remaining (fewer than 8) values
+ while (number-- > 0)
+ *cPtr++ = *aPtr++ * scalar;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
diff --git a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
index eee9f0064f..ab6b7fb1df 100644
--- a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
@@ -4,7 +4,7 @@
#include <volk/volk_complex.h>
#include <stdio.h>
-#include <volk/volk_32fc_s32fc_x2_rotator_32fc_a.h>
+#include <volk/volk_32fc_s32fc_x2_rotator_32fc.h>
#ifdef LV_HAVE_GENERIC
@@ -19,9 +19,9 @@
*/
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
- volk_32fc_s32fc_x2_rotator_32fc_a_generic(outVector, inVector, phase_inc, phase, num_points);
+ volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc, phase, num_points);
}
#endif /* LV_HAVE_GENERIC */
@@ -32,7 +32,7 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVe
static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
- volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, phase, num_points);
+ volk_32fc_s32fc_x2_rotator_32fc_sse4_1(outVector, inVector, phase_inc, phase, num_points);
}
@@ -58,7 +58,7 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec
static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
- volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc, phase, num_points);
+ volk_32fc_s32fc_x2_rotator_32fc_avx(outVector, inVector, phase_inc, phase, num_points);
}
diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
index 51b6041ec0..ffbbdff690 100644
--- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
@@ -20,7 +20,7 @@
*/
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
unsigned int i = 0;
int j = 0;
for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
@@ -42,7 +42,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVecto
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
lv_32fc_t* cPtr = outVector;
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = 1;
@@ -153,7 +153,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
lv_32fc_t* cPtr = outVector;
const lv_32fc_t* aPtr = inVector;
lv_32fc_t incr = 1;
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
index e3dedf2fcd..e6ccf5c384 100644
--- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
@@ -1,3 +1,152 @@
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+
+
+#include<volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ // Computes *result = sum over i of input[i] * conj(taps[i]) for num_points complex values.
+ const unsigned int num_bytes = num_points*8;
+ // num_bytes counts 8 bytes per complex value; the float views below read real, imag pairs
+ float * res = (float*) result;
+ float * in = (float*) input;
+ float * tp = (float*) taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+ unsigned int isodd = (num_bytes >> 3) &1;
+ // n_2_ccomplex_blocks = number of pairs of complex values; isodd = 1 when one value is left over
+
+ // Two independent accumulators, one per value of each pair ([0] = real part, [1] = imaginary part)
+ float sum0[2] = {0,0};
+ float sum1[2] = {0,0};
+ unsigned int i = 0;
+
+
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+ // (a + jb) * conj(c + jd) = (ac + bd) + j(bc - ad)
+ sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+
+ in += 4;
+ tp += 4;
+
+ }
+
+ // Combine both partial sums and write them through the float view of result
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+
+
+ for(i = 0; i < isodd; ++i) {
+
+ // Runs at most once: add the final unpaired value
+ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+
+ }
+ /* Straightforward equivalent of the code above, kept for reference only:
+ for(i = 0; i < num_bytes >> 3; ++i) {
+ *result += input[i] * conjf(taps[i]);
+ }
+ */
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#include <mmintrin.h>
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ // Computes *result = sum over i of input[i] * conj(taps[i]) for num_points complex values.
+ unsigned int num_bytes = num_points*8;
+
+ // num_bytes counts the bytes still to be processed (8 bytes per complex value).
+ // Only unaligned loads are used, so input and taps need no particular alignment.
+
+ union HalfMask {
+ uint32_t intRep[4];
+ __m128 vec;
+ } halfMask;
+
+ union NegMask {
+ uint32_t intRep[4];
+ __m128 vec;
+ } negMask;
+
+ unsigned int offset = 0;
+ float Rsum=0, Isum=0;
+ float Im,Re;
+
+ __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
+ __m128 zv = {0,0,0,0};
+ // halfMask keeps lanes 0 and 1 (the first complex value) and clears lanes 2 and 3
+ halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
+ halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
+ // negMask flips the sign bit of lanes 0 and 2 (the a*d terms of the imaginary part)
+ negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
+ negMask.intRep[1] = negMask.intRep[3] = 0;
+
+ // main loop: two complex values (16 bytes) per iteration
+ while(num_bytes >= 4*sizeof(float)){
+
+ in1 = _mm_loadu_ps( (float*) (input+offset) );
+ in2 = _mm_loadu_ps( (float*) (taps+offset) );
+ Rv = _mm_mul_ps(in1, in2);
+ fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
+ Iv = _mm_mul_ps(in1, fehg);
+ Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
+ Ivm = _mm_xor_ps( negMask.vec, Iv );
+ Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
+ _mm_store_ss( &Im, Is );
+ _mm_store_ss( &Re, Rs );
+ num_bytes -= 4*sizeof(float);
+ offset += 2;
+ Rsum += Re;
+ Isum += Im;
+ }
+
+ // handle the last complex value when num_points is odd
+ if(num_bytes > 0){
+ // Exactly one complex value (8 bytes) remains here. Load only those 8 bytes
+ // into the low half of the register (the upper half is taken from zv, i.e. zero)
+ // so that the kernel never reads past the end of input or taps.
+ // halfMask is kept as a guard; the upper lanes are already zero.
+
+ in1 = _mm_loadl_pi( zv, (const __m64*) (input+offset) );
+ in2 = _mm_loadl_pi( zv, (const __m64*) (taps+offset) );
+ Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
+ fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
+ Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
+ Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
+ Ivm = _mm_xor_ps( negMask.vec, Iv );
+ Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
+ _mm_store_ss( &Im, Is );
+ _mm_store_ss( &Re, Rs );
+ Rsum += Re;
+ Isum += Im;
+ }
+
+ result[0] = lv_cmake(Rsum,Isum);
+ return;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
+
+
+
#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
@@ -9,7 +158,9 @@
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
float * res = (float*) result;
float * in = (float*) input;
@@ -63,7 +214,9 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res
#if LV_HAVE_SSE && LV_HAVE_64
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
@@ -204,7 +357,9 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result,
#endif
#if LV_HAVE_SSE && LV_HAVE_32
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index caef3e6f0d..066bed4439 100644
--- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -1,3 +1,119 @@
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ // Computes *result = sum over i of input[i] * taps[i] for num_points complex values.
+ float * res = (float*) result;
+ float * in = (float*) input;
+ float * tp = (float*) taps;
+ unsigned int n_2_ccomplex_blocks = num_points/2;
+ unsigned int isodd = num_points &1;
+ // n_2_ccomplex_blocks = number of pairs of complex values; isodd = 1 when one value is left over
+
+ // Two independent accumulators, one per value of each pair ([0] = real part, [1] = imaginary part)
+ float sum0[2] = {0,0};
+ float sum1[2] = {0,0};
+ unsigned int i = 0;
+
+
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+
+ // (a + jb) * (c + jd) = (ac - bd) + j(ad + bc)
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+
+ in += 4;
+ tp += 4;
+
+ }
+
+ // Combine both partial sums and write them through the float view of result
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+
+
+ for(i = 0; i < isodd; ++i) {
+
+ // Runs at most once: add the final unpaired value
+ *result += input[num_points - 1] * taps[num_points - 1];
+
+ }
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ // Computes *result = sum over i of input[i] * taps[i] for num_points complex values.
+ // Only unaligned loads are used, so input and taps need no particular alignment.
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2*sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points/2;
+
+ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm_setzero_ps();
+ // Each iteration multiplies and accumulates 2 complex values
+ for(;number < halfPoints; number++){
+
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+ a += 2;
+ b += 2;
+ }
+ // The register holds two partial complex sums; spill them and add them together
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+ _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+ // Odd num_points: a and b now point at the last, unpaired value
+ if(num_points % 2 != 0) { // was "% 1", which is always 0 and silently dropped the last value
+ dotProduct += (*a) * (*b);
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
@@ -10,7 +126,9 @@
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
float * res = (float*) result;
float * in = (float*) input;
@@ -46,8 +164,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const
#if LV_HAVE_SSE && LV_HAVE_64
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ const unsigned int num_bytes = num_points*8;
asm
(
@@ -175,11 +294,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const
#if LV_HAVE_SSE && LV_HAVE_32
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-
- volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_bytes);
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
#if 0
+ const unsigned int num_bytes = num_points*8;
asm volatile
(
" #pushl %%ebp\n\t"
@@ -299,8 +418,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const
#include <pmmintrin.h>
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ const unsigned int num_bytes = num_points*8;
lv_32fc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(float));
@@ -356,7 +476,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
#include <smmintrin.h>
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
float *p_input, *p_taps;
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
index f79ddb59bf..7db68c1bd8 100644
--- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -1,3 +1,80 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Multiplies the two input complex vectors element by element and stores the products in the third vector (SSE3 version using unaligned loads and stores)
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2; // each loop iteration consumes two complex values (four floats)
+
+ __m128 x, y, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(;number < halfPoints; number++){
+
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0) { // odd count: one complex value remains after the two-at-a-time loop
+ *c = (*a) * (*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies the two input complex vectors element by element and stores the products in the third vector (portable scalar version)
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++); // one complex product per iteration; all three pointers advance together
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
@@ -81,7 +158,7 @@ static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, cons
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
-static inline void volk_32fc_x2_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
new file mode 100644
index 0000000000..cfd6c007f1
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
@@ -0,0 +1,162 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector (SSE3 version using unaligned loads and stores)
+ \param cVector The vector where the results will be stored
+ \param aVector First vector to be multiplied
+ \param bVector Second vector that is conjugated before being multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2; // each loop iteration consumes two complex values (four floats)
+
+ __m128 x, y, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); // -0.f has only the sign bit set; lanes 1 and 3 are the imaginary parts
+
+ for(;number < halfPoints; number++){
+
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ y = _mm_xor_ps(y, conjugator); // conjugate y: flips the sign of ci and di, giving cr,-ci,dr,-di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with -ci,-ci,-di,-di (negated by the conjugation above)
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = -ai*ci,-ar*ci,-bi*di,-br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr+ai*ci, ai*cr-ar*ci, br*dr+bi*di, bi*dr-br*di
+
+ _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0) { // odd count: one complex value remains after the two-at-a-time loop
+ *c = (*a) * lv_conj(*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector (portable scalar version)
+ \param cVector The vector where the results will be stored
+ \param aVector First vector to be multiplied
+ \param bVector Second vector that is conjugated before being multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); // a[i] * conj(b[i]); all three pointers advance together
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector (SSE3 version using aligned loads and stores)
+ \param cVector The vector where the results will be stored; written with _mm_store_ps, which requires 16-byte alignment
+ \param aVector First vector to be multiplied; read with _mm_load_ps, which requires 16-byte alignment
+ \param bVector Second vector that is conjugated before being multiplied; read with _mm_load_ps, which requires 16-byte alignment
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2; // each loop iteration consumes two complex values (four floats)
+
+ __m128 x, y, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); // -0.f has only the sign bit set; lanes 1 and 3 are the imaginary parts
+
+ for(;number < halfPoints; number++){
+
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ y = _mm_xor_ps(y, conjugator); // conjugate y: flips the sign of ci and di, giving cr,-ci,dr,-di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with -ci,-ci,-di,-di (negated by the conjugation above)
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = -ai*ci,-ar*ci,-bi*di,-br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr+ai*ci, ai*cr-ar*ci, br*dr+bi*di, bi*dr-br*di
+
+ _mm_store_ps((float*)c,z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if((num_points % 2) != 0) { // odd count: one complex value remains after the two-at-a-time loop
+ *c = (*a) * lv_conj(*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector (portable scalar version)
+ \param cVector The vector where the results will be stored
+ \param aVector First vector to be multiplied
+ \param bVector Second vector that is conjugated before being multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); // a[i] * conj(b[i]); all three pointers advance together
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
index 75eb9173d5..cb2e945015 100644
--- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
@@ -10,8 +10,9 @@
#include<xmmintrin.h>
#include<pmmintrin.h>
-static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
+ const unsigned int num_bytes = num_points*8;
__m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
@@ -106,7 +107,10 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+
lv_32fc_t diff;
float sq_dist;
unsigned int i = 0;
diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h
index b819eaffd4..27a081b7cf 100644
--- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h
@@ -9,8 +9,9 @@
#include<xmmintrin.h>
#include<pmmintrin.h>
-static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
+ const unsigned int num_bytes = num_points*8;
__m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -92,7 +93,10 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t*
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
+
+ const unsigned int num_bytes = num_points*8;
+
lv_32fc_t diff;
float sq_dist;
unsigned int i = 0;
diff --git a/volk/kernels/volk/volk_32i_s32f_convert_32f.h b/volk/kernels/volk/volk_32i_s32f_convert_32f.h
new file mode 100644
index 0000000000..7a09883453
--- /dev/null
+++ b/volk/kernels/volk/volk_32i_s32f_convert_32f.h
@@ -0,0 +1,148 @@
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
+#define INCLUDED_volk_32i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+ /*!
+ \brief Converts the input 32 bit integer data into floating point data and scales each output value by 1/scalar (SSE2 version)
+ \param outputVector The floating point output data buffer
+ \param inputVector The 32 bit input data buffer
+ \param scalar The value each converted point is divided by (applied as a multiplication by its reciprocal)
+ \param num_points The number of data values to be converted
+ \note Neither buffer needs to be aligned: unaligned load and store intrinsics are used
+ */
+static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // each loop iteration converts four values
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar; // reciprocal computed once so the loops multiply instead of divide
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector; // const is cast away, but the data is only read through this pointer
+ __m128i inputVal;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+
+ // Load the 4 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ inputPtr += 4;
+ }
+
+ number = quarterPoints * 4; // index of the first value not covered by the four-at-a-time loop
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Converts the input 32 bit integer data into floating point data and scales each output value by 1/scalar (portable scalar version)
+ \param outputVector The floating point output data buffer
+ \param inputVector The 32 bit input data buffer
+ \param scalar The value each converted point is divided by (applied as a multiplication by its reciprocal)
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const int32_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar; // reciprocal computed once so the loop multiplies instead of divides
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
+#define INCLUDED_volk_32i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+ /*!
+ \brief Converts the input 32 bit integer data into floating point data and scales each output value by 1/scalar (SSE2 version using aligned loads and stores)
+ \param outputVector The floating point output data buffer; written with _mm_store_ps, which requires 16-byte alignment
+ \param inputVector The 32 bit input data buffer; read with _mm_load_si128, which requires 16-byte alignment
+ \param scalar The value each converted point is divided by (applied as a multiplication by its reciprocal)
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // each loop iteration converts four values
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar; // reciprocal computed once so the loops multiply instead of divide
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector; // const is cast away, but the data is only read through this pointer
+ __m128i inputVal;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+
+ // Load the 4 values
+ inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+
+ _mm_store_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ inputPtr += 4;
+ }
+
+ number = quarterPoints * 4; // index of the first value not covered by the four-at-a-time loop
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Converts the input 32 bit integer data into floating point data and scales each output value by 1/scalar (portable scalar version)
+ \param outputVector The floating point output data buffer
+ \param inputVector The 32 bit input data buffer
+ \param scalar The value each converted point is divided by (applied as a multiplication by its reciprocal)
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const int32_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar; // reciprocal computed once so the loop multiplies instead of divides
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/kernels/volk/volk_32i_x2_and_32i.h
index e5330847b3..54ecb79812 100644
--- a/volk/include/volk/volk_32i_x2_and_32i_a.h
+++ b/volk/kernels/volk/volk_32i_x2_and_32i.h
@@ -51,7 +51,7 @@ static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aV
\param bVector One of the vectors
\param num_points The number of values in aVector and bVector to be anded together and stored into cVector
*/
-static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+static inline void volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
int32_t* cPtr = cVector;
const int32_t* aPtr = aVector;
const int32_t* bPtr= bVector;
@@ -72,7 +72,7 @@ static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t
\param num_points The number of values in aVector and bVector to be anded together and stored into cVector
*/
extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
-static inline void volk_32i_x2_and_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/kernels/volk/volk_32i_x2_or_32i.h
index 24045894c6..acadd5a57f 100644
--- a/volk/include/volk/volk_32i_x2_or_32i_a.h
+++ b/volk/kernels/volk/volk_32i_x2_or_32i.h
@@ -51,7 +51,7 @@ static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVe
\param bVector One of the vectors to be ored
\param num_points The number of values in aVector and bVector to be ored together and stored into cVector
*/
-static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+static inline void volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
int32_t* cPtr = cVector;
const int32_t* aPtr = aVector;
const int32_t* bPtr= bVector;
@@ -72,7 +72,7 @@ static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t*
\param num_points The number of values in aVector and bVector to be ored together and stored into cVector
*/
extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
-static inline void volk_32i_x2_or_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/kernels/volk/volk_32u_byteswap.h
index 71ae027d37..8f6e3ad7b5 100644
--- a/volk/include/volk/volk_32u_byteswap_a.h
+++ b/volk/kernels/volk/volk_32u_byteswap.h
@@ -1,3 +1,80 @@
+#ifndef INCLUDED_volk_32u_byteswap_u_H
+#define INCLUDED_volk_32u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+ \brief Byteswaps (in-place) a vector of uint32_t's; the buffer does not need to be aligned (unaligned load/store intrinsics are used).
+ \param intsToSwap The vector of data to byte swap
+ \param num_points The number of data points
+*/
+static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+
+ uint32_t* inputPtr = intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000); // keeps bits 16-23 of every 32-bit lane
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00); // keeps bits 8-15 of every 32-bit lane
+
+ const uint64_t quarterPoints = num_points / 4; // each loop iteration swaps four 32-bit values
+ for(;number < quarterPoints; number++){
+ // Load the 32t values, increment inputPtr later since we're doing it in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24); // lowest byte moved to the highest position
+ byte2 = _mm_slli_epi32(input, 8); // bits 8-15 moved to bits 16-23 (masked below)
+ byte3 = _mm_srli_epi32(input, 8); // bits 16-23 moved to bits 8-15 (masked below)
+ byte4 = _mm_srli_epi32(input, 24); // highest byte moved to the lowest position
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = quarterPoints*4;
+ for(; number < num_points; number++){
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Byteswaps (in-place) a vector of uint32_t's (portable scalar version).
+ \param intsToSwap The vector of data to byte swap
+ \param num_points The number of data points
+*/
+static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = intsToSwap;
+
+ unsigned int point;
+ for(point = 0; point < num_points; point++){
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); // reverse the order of the four bytes
+
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32u_byteswap_u_H */
#ifndef INCLUDED_volk_32u_byteswap_a_H
#define INCLUDED_volk_32u_byteswap_a_H
diff --git a/volk/include/volk/volk_32u_popcnt_a.h b/volk/kernels/volk/volk_32u_popcnt.h
index b72d605c67..9783569729 100644
--- a/volk/include/volk/volk_32u_popcnt_a.h
+++ b/volk/kernels/volk/volk_32u_popcnt.h
@@ -7,7 +7,7 @@
#ifdef LV_HAVE_GENERIC
-static inline void volk_32u_popcnt_a_generic(uint32_t* ret, const uint32_t value) {
+static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) {
// This is faster than a lookup table
uint32_t retVal = value;
diff --git a/volk/kernels/volk/volk_64f_convert_32f.h b/volk/kernels/volk/volk_64f_convert_32f.h
new file mode 100644
index 0000000000..c27526ffaf
--- /dev/null
+++ b/volk/kernels/volk/volk_64f_convert_32f.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_64f_convert_32f_u_H
+#define INCLUDED_volk_64f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Converts the double values into float values (SSE2 version using unaligned loads and stores)
+ \param outputVector The converted float vector values
+ \param inputVector The double vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4; // each loop iteration converts four values
+
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret, ret2;
+ __m128d inputVal1, inputVal2;
+
+ for(;number < quarterPoints; number++){
+ inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+ inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+
+ ret = _mm_cvtpd_ps(inputVal1); // two converted floats land in the low half of ret
+ ret2 = _mm_cvtpd_ps(inputVal2); // two converted floats land in the low half of ret2
+
+ ret = _mm_movelh_ps(ret, ret2); // low half of ret2 becomes the high half of ret: four floats in input order
+
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4; // index of the first value not covered by the four-at-a-time loop
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the double values into float values (portable scalar version)
+ \param outputVector The converted float vector values
+ \param inputVector The double vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const double* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)); // narrow one double to float per iteration
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_u_H */
+#ifndef INCLUDED_volk_64f_convert_32f_a_H
+#define INCLUDED_volk_64f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+ \brief Converts the double values into float values (SSE2 version using aligned loads and stores)
+ \param outputVector The converted float vector values; written with _mm_store_ps, which requires 16-byte alignment
+ \param inputVector The double vector values to be converted; read with _mm_load_pd, which requires 16-byte alignment
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4; // each loop iteration converts four values
+
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret, ret2;
+ __m128d inputVal1, inputVal2;
+
+ for(;number < quarterPoints; number++){
+ inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+ inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+
+ ret = _mm_cvtpd_ps(inputVal1); // two converted floats land in the low half of ret
+ ret2 = _mm_cvtpd_ps(inputVal2); // two converted floats land in the low half of ret2
+
+ ret = _mm_movelh_ps(ret, ret2); // low half of ret2 becomes the high half of ret: four floats in input order
+
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4; // index of the first value not covered by the four-at-a-time loop
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts the double values into float values (portable scalar version)
+ \param outputVector The converted float vector values
+ \param inputVector The double vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const double* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)); // narrow one double to float per iteration
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/kernels/volk/volk_64f_x2_max_64f.h
index 33aae6d102..f9a04c2c40 100644
--- a/volk/include/volk/volk_64f_x2_max_64f_a.h
+++ b/volk/kernels/volk/volk_64f_x2_max_64f.h
@@ -53,7 +53,7 @@ static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVe
\param bVector The vector to be checked
\param num_points The number of values in aVector and bVector to be checked and stored into cVector
*/
-static inline void volk_64f_x2_max_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+static inline void volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
double* cPtr = cVector;
const double* aPtr = aVector;
const double* bPtr= bVector;
diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/kernels/volk/volk_64f_x2_min_64f.h
index 25d8b4c982..c77ca87fbd 100644
--- a/volk/include/volk/volk_64f_x2_min_64f_a.h
+++ b/volk/kernels/volk/volk_64f_x2_min_64f.h
@@ -53,7 +53,7 @@ static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVe
\param bVector The vector to be checked
\param num_points The number of values in aVector and bVector to be checked and stored into cVector
*/
-static inline void volk_64f_x2_min_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+static inline void volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
double* cPtr = cVector;
const double* aPtr = aVector;
const double* bPtr= bVector;
diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/kernels/volk/volk_64u_byteswap.h
index 3d1d87623e..e05daf6d5c 100644
--- a/volk/include/volk/volk_64u_byteswap_a.h
+++ b/volk/kernels/volk/volk_64u_byteswap.h
@@ -1,3 +1,91 @@
+#ifndef INCLUDED_volk_64u_byteswap_u_H
+#define INCLUDED_volk_64u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+ \brief Byteswaps (in-place) a vector of uint64_t's using unaligned SSE2 loads and stores, so the buffer does not need to be aligned.
+ \param intsToSwap The vector of data to byte swap
+ \param num_points The number of 64-bit data points
+*/
+static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+ uint64_t number = 0;
+ const unsigned int halfPoints = num_points / 2;
+ for(;number < halfPoints; number++){
+ // Load four 32-bit words (two 64-bit values); inputPtr is advanced only after the store since we're doing it in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+
+ // Swap the two 32-bit words within each 64-bit value
+ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points (at most one, when num_points is odd):
+ number = halfPoints*2;
+ for(; number < num_points; number++){
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Byteswaps (in-place) a vector of uint64_t's, handling each value as two 32-bit words.
+ \param intsToSwap The vector of data to byte swap
+ \param num_points The number of 64-bit data points
+*/
+static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int point;
+ for(point = 0; point < num_points; point++){
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2; // the byte-reversed words are written back in swapped order
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64u_byteswap_u_H */
#ifndef INCLUDED_volk_64u_byteswap_a_H
#define INCLUDED_volk_64u_byteswap_a_H
diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/kernels/volk/volk_64u_popcnt.h
index 5e68ed2083..466cfa5dad 100644
--- a/volk/include/volk/volk_64u_popcnt_a.h
+++ b/volk/kernels/volk/volk_64u_popcnt.h
@@ -8,7 +8,7 @@
#ifdef LV_HAVE_GENERIC
-static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) {
+static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) {
//const uint32_t* valueVector = (const uint32_t*)&value;
diff --git a/volk/include/volk/volk_8i_convert_16i_a.h b/volk/kernels/volk/volk_8i_convert_16i.h
index 9104f90cb0..3e5c92723f 100644
--- a/volk/include/volk/volk_8i_convert_16i_a.h
+++ b/volk/kernels/volk/volk_8i_convert_16i.h
@@ -1,3 +1,76 @@
+#ifndef INCLUDED_volk_8i_convert_16i_u_H
+#define INCLUDED_volk_8i_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+ \brief Converts the input 8 bit integer data into 16 bit integer data, scaling each value by 256 (left shift by 8)
+ \param inputVector The 8 bit input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param num_points The number of data values to be converted
+ \note Input and output buffers do NOT need to be properly aligned
+ */
+static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m128i* outputVectorPtr = (__m128i*)outputVector;
+ __m128i inputVal;
+ __m128i ret;
+
+ for(;number < sixteenthPoints; number++){
+ inputVal = _mm_loadu_si128(inputVectorPtr);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_storeu_si128(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+
+ inputVal = _mm_srli_si128(inputVal, 8); // bring the upper 8 input bytes down into the low half
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_storeu_si128(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+
+ inputVectorPtr++;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){ // scalar tail for the values left over after the 16-wide loop
+ outputVector[number] = (int16_t)(inputVector[number])*256;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Converts the input 8 bit integer data into 16 bit integer data, scaling each value by 256
+ \param inputVector The 8 bit input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param num_points The number of data values to be converted
+ \note Input and output buffers do NOT need to be properly aligned
+ */
+static inline void volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+ int16_t* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; // widen, then scale by 256
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8i_convert_16i_u_H */
#ifndef INCLUDED_volk_8i_convert_16i_a_H
#define INCLUDED_volk_8i_convert_16i_a_H
@@ -73,7 +146,7 @@ static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const in
\param num_points The number of data values to be converted
*/
extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
-static inline void volk_8i_convert_16i_a_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
}
#endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/kernels/volk/volk_8i_s32f_convert_32f.h
index 02a7f356e0..bd7ff82d9a 100644
--- a/volk/include/volk/volk_8i_s32f_convert_32f_a.h
+++ b/volk/kernels/volk/volk_8i_s32f_convert_32f.h
@@ -1,3 +1,97 @@
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
+#define INCLUDED_volk_8i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+ \brief Converts the input 8 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+ \param inputVector The 8 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ \note Input and output buffers do NOT need to be properly aligned (unaligned loads and stores are used)
+ */
+static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar; // the division is applied as a multiply by the reciprocal
+ __m128 invScalar = _mm_set_ps1( iScalar );
+ const int8_t* inputVectorPtr = inputVector;
+ __m128 ret;
+ __m128i inputVal;
+ __m128i interimVal;
+
+ for(;number < sixteenthPoints; number++){
+ inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4); // discard the 4 bytes just converted; the next 4 move into the low lanes
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){ // scalar tail for the values left over after the 16-wide loop
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Converts the input 8 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+ \param inputVector The 8 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar; // the division is applied as a multiply by the reciprocal
+
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
#define INCLUDED_volk_8i_s32f_convert_32f_a_H
@@ -95,7 +189,7 @@ static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, const
\param num_points The number of data values to be converted
*/
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
-static inline void volk_8i_s32f_convert_32f_a_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
float invscalar = 1.0 / scalar;
volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
}
diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
index 8f13da32ff..b59d22d186 100644
--- a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
@@ -59,7 +59,7 @@ static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16
\param qBuffer The Q buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_8ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
const int8_t* complexVectorPtr = (const int8_t*)complexVector;
int16_t* iBufferPtr = iBuffer;
int16_t* qBufferPtr = qBuffer;
diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
index d26b3d0d0d..82cedb2bb7 100644
--- a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
@@ -49,7 +49,7 @@ static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, con
\param iBuffer The I buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_8ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const int8_t* complexVectorPtr = (const int8_t*)complexVector;
int16_t* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
index 21efed83e7..c8ff18e67b 100644
--- a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
@@ -50,7 +50,7 @@ static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const
\param iBuffer The I buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_8ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const int8_t* complexVectorPtr = (int8_t*)complexVector;
int8_t* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
index d82da59fb1..9e244c8fc2 100644
--- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h
+++ b/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
@@ -146,7 +146,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float
\param scalar The scaling value being multiplied against each data point
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_8ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
const int8_t* complexVectorPtr = (const int8_t*)complexVector;
float* iBufferPtr = iBuffer;
float* qBufferPtr = qBuffer;
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
index b2c15d3a30..56a1adcbb5 100644
--- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h
+++ b/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
@@ -116,7 +116,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con
\param scalar The scaling value being multiplied against each data point
\param num_points The number of complex data values to be deinterleaved
*/
-static inline void volk_8ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const int8_t* complexVectorPtr = (const int8_t*)complexVector;
float* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
index f85fdb9995..685a21ddcd 100644
--- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h
+++ b/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
@@ -75,7 +75,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect
\param bVector The complex vector which will be converted to complex conjugate and multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
unsigned int number = 0;
int16_t* c16Ptr = (int16_t*)cVector;
int8_t* a8Ptr = (int8_t*)aVector;
diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
index 4b16171cec..edb52ff509 100644
--- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h
+++ b/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
@@ -95,7 +95,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t*
\param bVector The complex vector which will be converted to complex conjugate and multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
-static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
float* cPtr = (float*)cVector;
const float invScalar = 1.0 / scalar;
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
index 79655f1bd2..68fadc35b0 100644
--- a/volk/lib/CMakeLists.txt
+++ b/volk/lib/CMakeLists.txt
@@ -202,7 +202,7 @@ message(STATUS "Available machines: ${available_machines}")
#dependencies are all python, xml, and header implementation files
file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml)
file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py)
-file(GLOB h_files ${CMAKE_SOURCE_DIR}/include/volk/*.h)
+file(GLOB h_files ${CMAKE_SOURCE_DIR}/kernels/volk/*.h)
macro(gen_template tmpl output)
list(APPEND volk_gen_sources ${output})
@@ -253,6 +253,7 @@ endforeach(machine_name)
include_directories(
${CMAKE_BINARY_DIR}/include
${CMAKE_SOURCE_DIR}/include
+ ${CMAKE_SOURCE_DIR}/kernels
${CMAKE_CURRENT_BINARY_DIR}
${CMAKE_CURRENT_SOURCE_DIR}
)
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index 4e361aece2..e526eb2d01 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -63,12 +63,12 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) {
}
}
-static std::vector<std::string> get_arch_list(struct volk_func_desc desc) {
+static std::vector<std::string> get_arch_list(volk_func_desc_t desc) {
std::vector<std::string> archlist;
- for(int i = 0; i < desc.n_archs; i++) {
+ for(size_t i = 0; i < desc.n_impls; i++) {
//if(!(archs[i+1] & volk_get_lvarch())) continue; //this arch isn't available on this pc
- archlist.push_back(std::string(desc.indices[i]));
+ archlist.push_back(std::string(desc.impl_names[i]));
}
return archlist;
@@ -256,7 +256,7 @@ public:
private: std::list<std::vector<char> > _mems;
};
-bool run_volk_tests(struct volk_func_desc desc,
+bool run_volk_tests(volk_func_desc_t desc,
void (*manual_func)(),
std::string name,
float tol,
@@ -442,22 +442,32 @@ bool run_volk_tests(struct volk_func_desc desc,
arch_results.push_back(!fail);
}
- double best_time = std::numeric_limits<double>::max();
- std::string best_arch = "generic";
- for(size_t i=0; i < arch_list.size(); i++) {
- if((profile_times[i] < best_time) && arch_results[i]) {
- best_time = profile_times[i];
- best_arch = arch_list[i];
+ double best_time_a = std::numeric_limits<double>::max();
+ double best_time_u = std::numeric_limits<double>::max();
+ std::string best_arch_a = "generic";
+ std::string best_arch_u = "generic";
+ for(size_t i=0; i < arch_list.size(); i++)
+ {
+ if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0)
+ {
+ best_time_u = profile_times[i];
+ best_arch_u = arch_list[i];
+ }
+ if((profile_times[i] < best_time_a) && arch_results[i])
+ {
+ best_time_a = profile_times[i];
+ best_arch_a = arch_list[i];
}
}
- std::cout << "Best arch: " << best_arch << std::endl;
+ std::cout << "Best aligned arch: " << best_arch_a << std::endl;
+ std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
if(best_arch_vector) {
if(puppet_master_name == "NULL") {
- best_arch_vector->push_back(name + std::string(" ") + best_arch);
+ best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u);
}
else {
- best_arch_vector->push_back(puppet_master_name + std::string(" ") + best_arch);
+ best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u);
}
}
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
index 1e639ac3c6..0f17cdaa34 100644
--- a/volk/lib/qa_utils.h
+++ b/volk/lib/qa_utils.h
@@ -21,7 +21,7 @@ volk_type_t volk_type_from_string(std::string);
float uniform(void);
void random_floats(float *buf, unsigned n);
-bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string);
+bool run_volk_tests(volk_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string);
#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); }
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 2e41c25daf..f133897cba 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -2,107 +2,89 @@
#include <volk/volk.h>
#include <boost/test/unit_test.hpp>
-//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000);
-//VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000);
-VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a, 1, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 20460, 1);
-VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 20460, 1);
-VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 20460, 1);
-VOLK_RUN_TESTS(volk_16i_convert_8i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1);
-//VOLK_RUN_TESTS(volk_16i_max_star_16i_a, 0, 0, 20460, 10000);
-//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a, 0, 0, 20460, 10000);
-//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000);
-//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000);
-VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16u_byteswap_u, 0, 0, 20460, 1);
-//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 20460, 1);
-//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000);
-VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
-VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_convert_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a, 1, 128, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 20460, 1);
-//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000);
-VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 2046, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1);
-//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000);
-VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32767, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32i_x2_and_32i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_32i_x2_or_32i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32u_byteswap_a, 0, 0, 20460, 1);
-//VOLK_RUN_TESTS(volk_32u_popcnt_a, 0, 0, 2046, 10000);
-VOLK_RUN_TESTS(volk_64f_convert_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_64f_x2_max_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_64f_x2_min_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_64u_byteswap_a, 0, 0, 20460, 1);
-//VOLK_RUN_TESTS(volk_64u_popcnt_a, 0, 0, 2046, 10000);
-VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a, 0, 256, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_conjugate_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_conjugate_32fc_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000);
+//VOLK_RUN_TESTS(volk_16i_branch_4_state_8, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_magnitude_16i, 1, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_convert_8i, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_max_star_16i, 0, 0, 20460, 10000);
+//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i, 0, 0, 20460, 10000);
+//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 1000);
+//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 1000);
+VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20460, 1);
+//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 2046000, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32fc_index_max_16u, 3, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_magnitude_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_16i, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_32i, 1, 2<<31, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_convert_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_8i, 1, 128, 20460, 1);
+//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 2046, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_divide_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 1);
+//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_32f_index_max_16u, 3, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic, 1, 32767, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_max_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_min_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_normalize, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_power_32f, 1e-4, 4, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_sqrt_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_subtract_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_convert_16i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_s32f_convert_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_conjugate_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_multiply_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1);
diff --git a/volk/lib/volk_prefs.c b/volk/lib/volk_prefs.c
index 5e5c9dfff7..f787b5e2aa 100644
--- a/volk/lib/volk_prefs.c
+++ b/volk/lib/volk_prefs.c
@@ -7,7 +7,8 @@
//#include <Windows.h>
//#endif
-void get_config_path(char *path) {
+void volk_get_config_path(char *path)
+{
const char *suffix = "/.volk/volk_config";
char *home = NULL;
if (home == NULL) home = getenv("HOME");
@@ -20,38 +21,30 @@ void get_config_path(char *path) {
strcat(path, suffix);
}
-//passing by reference in C can (***********)
-int load_preferences(struct volk_arch_pref **prefs) {
+size_t volk_load_preferences(volk_arch_pref_t **prefs_res)
+{
FILE *config_file;
- char path[512], line[512], function[128], arch[32];
- int n_arch_prefs = 0;
- struct volk_arch_pref *t_pref;
+ char path[512], line[512];
+ size_t n_arch_prefs = 0;
+ volk_arch_pref_t *prefs = NULL;
//get the config path
- get_config_path(path);
+ volk_get_config_path(path);
if (path == NULL) return n_arch_prefs; //no prefs found
config_file = fopen(path, "r");
if(!config_file) return n_arch_prefs; //no prefs found
- while(fgets(line, 512, config_file) != NULL) {
- if(sscanf(line, "%s %s", function, arch) == 2 && !strncmp(function, "volk_", 5)) {
- n_arch_prefs++;
- }
- }
-
- //now allocate the memory required for volk_arch_prefs
- (*prefs) = (struct volk_arch_pref *) malloc(n_arch_prefs * sizeof(struct volk_arch_pref));
- t_pref = (*prefs);
-
//reset the file pointer and write the prefs into volk_arch_prefs
- rewind(config_file);
- while(fgets(line, 512, config_file) != NULL) {
- if(sscanf(line, "%s %s", function, arch) == 2 && !strncmp(function, "volk_", 5)) {
- strncpy(t_pref->name, function, 128);
- strncpy(t_pref->arch, arch, 32);
- t_pref++;
+ while(fgets(line, sizeof(line), config_file) != NULL)
+ {
+ prefs = (volk_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs));
+ volk_arch_pref_t *p = prefs + n_arch_prefs;
+ if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5))
+ {
+ n_arch_prefs++;
}
}
fclose(config_file);
+ *prefs_res = prefs;
return n_arch_prefs;
}
diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c
index 865d60955c..6ab013f269 100644
--- a/volk/lib/volk_rank_archs.c
+++ b/volk/lib/volk_rank_archs.c
@@ -1,43 +1,112 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
#include <volk_rank_archs.h>
#include <volk/volk_prefs.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name) {
+#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4
+ #define __popcnt __builtin_popcount
+#else
+ inline unsigned __popcnt(unsigned num)
+ {
+ unsigned pop = 0;
+ while(num)
+ {
+ if (num & 0x1) pop++;
+ num >>= 1;
+ }
+ return pop;
+ }
+#endif
+
+int volk_get_index(
+ const char *impl_names[], //list of implementations by name
+ const size_t n_impls, //number of implementations available
+ const char *impl_name //the implementation name to find
+){
unsigned int i;
- for(i=0; i<n_archs; i++) {
- if(!strncmp(indices[i], arch_name, 20)) {
+ for (i = 0; i < n_impls; i++) {
+ if(!strncmp(impl_names[i], impl_name, 20)) {
return i;
}
}
+ //TODO return -1;
//something terrible should happen here
printf("Volk warning: no arch found, returning generic impl\n");
- return get_index(indices, n_archs, "generic"); //but we'll fake it for now
+ return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
}
-unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsigned int n_archs, const char* name, unsigned int arch) {
- unsigned int i;
- unsigned int best_val = 0;
- static struct volk_arch_pref *volk_arch_prefs;
- static unsigned int n_arch_prefs = 0;
+int volk_rank_archs(
+ const char *kern_name, //name of the kernel to rank
+ const char *impl_names[], //list of implementations by name
+ const int* impl_deps, //requirement mask per implementation
+ const bool* alignment, //alignment status of each implementation
+ size_t n_impls, //number of implementations available
+ const bool align //if false, filter aligned implementations
+){
+ size_t i;
+ static volk_arch_pref_t *volk_arch_prefs;
+ static size_t n_arch_prefs = 0;
static int prefs_loaded = 0;
if(!prefs_loaded) {
- n_arch_prefs = load_preferences(&volk_arch_prefs);
+ n_arch_prefs = volk_load_preferences(&volk_arch_prefs);
prefs_loaded = 1;
}
- //now look for the function name in the prefs list
- for(i=0; i < n_arch_prefs; i++) {
- if(!strncmp(name, volk_arch_prefs[i].name, 128)) { //found it
- return get_index(indices, n_archs, volk_arch_prefs[i].arch);
- }
- }
+ //now look for the function name in the prefs list
+ for(i = 0; i < n_arch_prefs; i++)
+ {
+ if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it
+ {
+ const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
+ return volk_get_index(impl_names, n_impls, impl_name);
+ }
+ }
- for(i=1; i < n_archs; ++i) {
- if((arch_defs[i]&(!arch)) == 0) {
- best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? i-1 : best_val;
+ //return the best index with the largest deps
+ size_t best_index_a = 0;
+ size_t best_index_u = 0;
+ int best_value_a = -1;
+ int best_value_u = -1;
+ for(i = 0; i < n_impls; i++)
+ {
+ const signed val = __popcnt(impl_deps[i]);
+ if (alignment[i] && val > best_value_a)
+ {
+ best_index_a = i;
+ best_value_a = val;
+ }
+ if (!alignment[i] && val > best_value_u)
+ {
+ best_index_u = i;
+ best_value_u = val;
+ }
}
- }
- return best_val;
+
+ //when align and we found a best aligned, use it
+ if (align && best_value_a != -1) return best_index_a;
+
+ //otherwise return the best unaligned
+ return best_index_u;
}
diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h
index 546240d2c6..b3bf8ff17c 100644
--- a/volk/lib/volk_rank_archs.h
+++ b/volk/lib/volk_rank_archs.h
@@ -1,12 +1,48 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
#ifndef INCLUDED_VOLK_RANK_ARCHS_H
#define INCLUDED_VOLK_RANK_ARCHS_H
+#include <stdlib.h>
+#include <stdbool.h>
+
#ifdef __cplusplus
extern "C" {
#endif
-unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name);
-unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsigned int n_archs, const char *name, unsigned int arch);
+int volk_get_index(
+ const char *impl_names[], //list of implementations by name
+ const size_t n_impls, //number of implementations available
+ const char *impl_name //the implementation name to find
+);
+
+int volk_rank_archs(
+ const char *kern_name, //name of the kernel to rank
+ const char *impl_names[], //list of implementations by name
+ const int* impl_deps, //requirement mask per implementation
+ const bool* alignment, //alignment status of each implementation
+ size_t n_impls, //number of implementations available
+ const bool align //if false, filter aligned implementations
+);
#ifdef __cplusplus
}
diff --git a/volk/python/volk_modtool/CMakeLists.txt b/volk/python/volk_modtool/CMakeLists.txt
new file mode 100644
index 0000000000..6fb87f2668
--- /dev/null
+++ b/volk/python/volk_modtool/CMakeLists.txt
@@ -0,0 +1,39 @@
+# Copyright 2013 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+########################################################################
+# Install python files and apps
+########################################################################
+include(GrPython)
+
+GR_PYTHON_INSTALL(
+ FILES
+ __init__.py
+ cfg.py
+ volk_modtool_generate.py
+ DESTINATION ${GR_PYTHON_DIR}/volk_modtool
+ COMPONENT "volk"
+)
+
+GR_PYTHON_INSTALL(
+ PROGRAMS
+ volk_modtool
+ DESTINATION ${GR_RUNTIME_DIR}
+ COMPONENT "volk"
+)
diff --git a/volk/python/volk_modtool/README b/volk/python/volk_modtool/README
new file mode 100644
index 0000000000..5413ff1f1a
--- /dev/null
+++ b/volk/python/volk_modtool/README
@@ -0,0 +1,114 @@
+The volk_modtool tool is installed along with VOLK as a way of helping
+to construct, add to, and interrogate the VOLK library or companion
+libraries.
+
+volk_modtool is installed into $prefix/bin.
+
+VOLK modtool enables creating standalone (out-of-tree) VOLK modules
+and provides a few tools for sharing VOLK kernels between VOLK
+modules. If you need to design or work with VOLK kernels away from
+the canonical VOLK library, this is the tool. If you need to tailor
+your own VOLK library for whatever reason, this is the tool.
+
+The canonical VOLK library installs a volk.h and a libvolk.so. Your
+own library will install volk_$name.h and libvolk_$name.so. Ya Gronk?
+Good.
+
+There isn't a substantial difference between the canonical VOLK
+module and any other VOLK module. They're all peers. Any module
+created via VOLK modtool will come complete with a default
+volk_modtool.cfg file associating the module with the base from which
+it came, its distinctive $name and its destination (or path). These
+values (created from user input if VOLK modtool runs without a
+user-supplied config file or a default config file) serve as default
+values for some VOLK modtool actions. It's more or less intended for
+the user to change directories to the top level of a created VOLK
+module and then run volk_modtool to take advantage of the values
+stored in the default volk_modtool.cfg file.
+
+Apart from creating new VOLK modules, VOLK modtool allows you to list
+the names of kernels in other modules, list the names of kernels in
+the current module, add kernels from another module into the current
+module, and remove kernels from the current module. When moving
+kernels between modules, VOLK modtool does its best to keep the qa
+and profiling code for those kernels intact. If the base has a test
+or a profiling call for some kernel, those calls will follow the
+kernel when VOLK modtool adds that kernel. If QA or profiling
+requires a puppet kernel, the puppet kernel will follow the original
+kernel when VOLK modtool adds that original kernel. VOLK modtool
+respects puppets.
+
+======================================================================
+
+Installing a new VOLK Library:
+
+Run the command "volk_modtool -i". This will ask you three questions:
+
+ name: // the name to give your VOLK library: volk_<name>
+ destination: // directory new source tree is built under -- must exist.
+ // It will create <directory>/volk_<name>
+ base: // the directory containing the original VOLK source code
+
+The name provided must be alphanumeric (and cannot start with a
+number). No special characters including dashes and underscores are
+allowed.
+
+This will build a new skeleton directory in the destination provided
+with the name volk_<name>. It will contain the necessary structure to
+build:
+
+ mkdir build
+ cd build
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/volk ../
+ make
+ sudo make install
+
+Right now, the library is empty and contains no kernels. Kernels can
+be added from another VOLK library using the '-a' option. If no other
+library is specified, the kernel will be extracted from the base VOLK
+directory. Using the '-b' option allows us to specify another VOLK library to
+use for this purpose.
+
+ volk_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc
+
+This will put the code for the new kernel into
+<destination>/volk_<name>/kernels/volk_<name>/
+
+Other kernels must be added by hand. See the following webpages for
+more information about creating VOLK kernels:
+ http://gnuradio.org/doc/doxygen/volk_guide.html
+ http://gnuradio.org/redmine/projects/gnuradio/wiki/Volk
+
+
+======================================================================
+
+OPTIONS
+
+Options for Adding and Removing Kernels:
+ -a, --add_kernel
+ Add kernel from existing VOLK module. Uses the base VOLK module
+ unless -b is used. Use -n to specify the kernel name.
+ Requires: -n.
+ Optional: -b
+
+ -A, --add_all_kernels
+ Add all kernels from existing VOLK module. Uses the base VOLK
+ module unless -b is used.
+ Optional: -b
+
+ -x, --remove_kernel
+ Remove kernel from module.
+ Required: -n.
+ Optional: -b
+
+Options for Listing Kernels:
+ -l, --list
+ Lists all kernels available in the base VOLK module.
+
+ -k, --kernels
+ Lists all kernels in this VOLK module.
+
+ -r, --remote_list
+ Lists all kernels in another VOLK module that is specified
+ using the -b option.
+
diff --git a/volk/python/volk_modtool/__init__.py b/volk/python/volk_modtool/__init__.py
new file mode 100644
index 0000000000..6ddf48da05
--- /dev/null
+++ b/volk/python/volk_modtool/__init__.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+#
+# Copyright 2013 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+from cfg import volk_modtool_config
+from volk_modtool_generate import volk_modtool
diff --git a/volk/python/volk_modtool/cfg.py b/volk/python/volk_modtool/cfg.py
new file mode 100644
index 0000000000..c58dc59091
--- /dev/null
+++ b/volk/python/volk_modtool/cfg.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+#
+# Copyright 2013 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+import ConfigParser
+import sys
+import os
+import exceptions
+import re
+
+
+class volk_modtool_config:
+ def key_val_sub(self, num, stuff, section):
+ return re.sub('\$' + 'k' + str(num), stuff[num][0], (re.sub('\$' + str(num), stuff[num][1], section[1][num])));
+
+ def verify(self):
+ for i in self.verification:
+ self.verify_section(i)
+ def remap(self):
+ for i in self.remapification:
+ self.verify_section(i)
+
+ def verify_section(self, section):
+ stuff = self.cfg.items(section[0])
+ for i in range(len(section[1])):
+ eval(self.key_val_sub(i, stuff, section))
+ try:
+ val = eval(self.key_val_sub(i, stuff, section))
+ if val == False:
+ raise exceptions.ValueError
+ except ValueError:
+ raise exceptions.ValueError('Verification function returns False... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
+ except:
+ raise exceptions.IOError('bad configuration... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
+
+
+ def __init__(self, cfg=None):
+ self.config_name = 'config'
+ self.config_defaults = ['name', 'destination', 'base']
+ self.config_defaults_remap = ['1',
+ 'self.cfg.set(self.config_name, \'$k1\', os.path.realpath(os.path.expanduser(\'$1\')))',
+ 'self.cfg.set(self.config_name, \'$k2\', os.path.realpath(os.path.expanduser(\'$2\')))']
+
+ self.config_defaults_verify = ['re.match(\'[a-zA-Z0-9]+$\', \'$0\')',
+ 'os.path.exists(\'$1\')',
+ 'os.path.exists(\'$2\')']
+ self.remapification = [(self.config_name, self.config_defaults_remap)]
+ self.verification = [(self.config_name, self.config_defaults_verify)]
+ default = os.path.join(os.getcwd(), 'volk_modtool.cfg')
+ icfg = ConfigParser.RawConfigParser()
+ if cfg:
+ icfg.read(cfg)
+ elif os.path.exists(default):
+ icfg.read(default)
+ else:
+ print "Initializing config file..."
+ icfg.add_section(self.config_name)
+ for kn in self.config_defaults:
+ rv = raw_input("%s: "%(kn))
+ icfg.set(self.config_name, kn, rv)
+ self.cfg = icfg
+ self.remap()
+ self.verify()
+
+
+
+ def read_map(self, name, inp):
+ if self.cfg.has_section(name):
+ self.cfg.remove_section(name)
+ self.cfg.add_section(name)
+ for i in inp:
+ self.cfg.set(name, i, inp[i])
+
+ def get_map(self, name):
+ retval = {}
+ stuff = self.cfg.items(name)
+ for i in stuff:
+ retval[i[0]] = i[1]
+ return retval
+
+
+
+
+
+
+
diff --git a/volk/python/volk_modtool/volk_modtool b/volk/python/volk_modtool/volk_modtool
new file mode 100755
index 0000000000..74b71adde2
--- /dev/null
+++ b/volk/python/volk_modtool/volk_modtool
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+#
+# Copyright 2013 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+from volk_modtool import volk_modtool, volk_modtool_config
+from optparse import OptionParser, OptionGroup
+
+import exceptions
+import os
+import sys
+
+if __name__ == '__main__':
+ parser = OptionParser();
+ actions = OptionGroup(parser, 'Actions');
+ actions.add_option('-i', '--install', action='store_true',
+ help='Create a new volk module.')
+ parser.add_option('-b', '--base_path', action='store', default=None,
+ help='Base path for action. By default, volk_modtool.cfg loads this value.')
+ parser.add_option('-n', '--kernel_name', action='store', default=None,
+ help='Kernel name for action. No default')
+ parser.add_option('-c', '--config', action='store', dest='config_file', default=None,
+ help='Config file for volk_modtool. By default, volk_modtool.cfg in the local directory will be used/created.')
+ actions.add_option('-a', '--add_kernel', action='store_true',
+ help='Add kernel from existing volk module. Requires: -n. Optional: -b')
+ actions.add_option('-A', '--add_all_kernels', action='store_true',
+ help='Add all kernels from existing volk module. Optional: -b')
+ actions.add_option('-x', '--remove_kernel', action='store_true',
+ help='Remove kernel from module. Required: -n. Optional: -b')
+ actions.add_option('-l', '--list', action='store_true',
+ help='List all kernels in the base.')
+ actions.add_option('-k', '--kernels', action='store_true',
+ help='List all kernels in the module.')
+ actions.add_option('-r', '--remote_list', action='store_true',
+ help='List all available kernels in remote volk module. Requires: -b.')
+ actions.add_option('-m', '--moo', action='store_true',
+ help='Have you mooed today?')
+ parser.add_option_group(actions)
+
+ (options, args) = parser.parse_args();
+ if len(sys.argv) < 2:
+ parser.print_help()
+
+ elif options.moo:
+ print " (__) "
+ print " (oo) "
+ print " /------\/ "
+ print " / | || "
+ print " * /\---/\ "
+ print " ~~ ~~ "
+
+ else:
+ my_cfg = volk_modtool_config(options.config_file);
+
+ my_modtool = volk_modtool(my_cfg.get_map(my_cfg.config_name));
+
+
+ if options.install:
+ my_modtool.make_module_skeleton();
+ my_modtool.write_default_cfg(my_cfg.cfg);
+
+
+ if options.add_kernel:
+ if not options.kernel_name:
+ raise exceptions.IOError("This action requires the -n option.");
+ else:
+ name = options.kernel_name;
+ if options.base_path:
+ base = options.base_path;
+ else:
+ base = my_cfg.cfg.get(my_cfg.config_name, 'base');
+ my_modtool.import_kernel(name, base);
+
+ if options.remove_kernel:
+ if not options.kernel_name:
+ raise exceptions.IOError("This action requires the -n option.");
+ else:
+ name = options.kernel_name;
+ my_modtool.remove_kernel(name);
+
+ if options.add_all_kernels:
+
+ if options.base_path:
+ base = options.base_path;
+ else:
+ base = my_cfg.cfg.get(my_cfg.config_name, 'base');
+ kernelset = my_modtool.get_current_kernels(base);
+ for i in kernelset:
+ my_modtool.import_kernel(i, base);
+
+ if options.remote_list:
+ if not options.base_path:
+ raise exceptions.IOError("This action requires the -b option. Try -l or -k for listing kernels in the base or the module.")
+ else:
+ base = options.base_path;
+ kernelset = my_modtool.get_current_kernels(base);
+ for i in kernelset:
+ print i;
+
+ if options.list:
+ kernelset = my_modtool.get_current_kernels();
+ for i in kernelset:
+ print i;
+
+ if options.kernels:
+ dest = my_cfg.cfg.get(my_cfg.config_name, 'destination');
+ name = my_cfg.cfg.get(my_cfg.config_name, 'name');
+ base = os.path.join(dest, 'volk_' + name);
+ kernelset = my_modtool.get_current_kernels(base);
+ for i in kernelset:
+ print i;
diff --git a/volk/python/volk_modtool/volk_modtool_generate.py b/volk/python/volk_modtool/volk_modtool_generate.py
new file mode 100644
index 0000000000..80c2aed598
--- /dev/null
+++ b/volk/python/volk_modtool/volk_modtool_generate.py
@@ -0,0 +1,310 @@
+#
+# Copyright 2013 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+import os
+import sys
+import re
+import glob
+import shutil
+import exceptions
+from sets import Set
+
+class volk_modtool:
+ def __init__(self, cfg):
+ self.volk = re.compile('volk');
+ self.remove_after_underscore = re.compile("_.*");
+ self.volk_run_tests = re.compile('^\s*VOLK_RUN_TESTS.*\n', re.MULTILINE);
+ self.volk_profile = re.compile('^\s*(VOLK_PROFILE|VOLK_PUPPET_PROFILE).*\n', re.MULTILINE);
+ self.my_dict = cfg;
+ self.lastline = re.compile('\s*char path\[1024\];.*');
+ self.badassert = re.compile('^\s*assert\(toked\[0\] == "volk_.*\n', re.MULTILINE);
+ self.goodassert = ' assert(toked[0] == "volk");\n'
+ self.baderase = re.compile('^\s*toked.erase\(toked.begin\(\)\);.*\n', re.MULTILINE);
+ self.gooderase = ' toked.erase(toked.begin());\n toked.erase(toked.begin());\n';
+
+ def get_basename(self, base=None):
+ if not base:
+ base = self.my_dict['base']
+ candidate = base.split('/')[-1];
+ if len(candidate.split('_')) == 1:
+ return '';
+ else:
+ return candidate.split('_')[-1];
+
+ def get_current_kernels(self, base=None):
+ if not base:
+ base = self.my_dict['base']
+ name = self.get_basename();
+ else:
+ name = self.get_basename(base);
+ if name == '':
+ hdr_files = glob.glob(os.path.join(base, "kernels/volk/*.h"));
+ begins = re.compile("(?<=volk_).*")
+ else:
+ hdr_files = glob.glob(os.path.join(base, "kernels/volk_" + name + "/*.h"));
+ begins = re.compile("(?<=volk_" + name + "_).*")
+
+ datatypes = [];
+ functions = [];
+
+
+ for line in hdr_files:
+
+ subline = re.search(".*\.h.*", os.path.basename(line))
+ if subline:
+ subsubline = begins.search(subline.group(0));
+ if subsubline:
+ dtype = self.remove_after_underscore.sub("", subsubline.group(0));
+ subdtype = re.search("[0-9]+[A-z]+", dtype);
+ if subdtype:
+ datatypes.append(subdtype.group(0));
+
+
+ datatypes = set(datatypes);
+
+ for line in hdr_files:
+ for dt in datatypes:
+ if dt in line:
+ #subline = re.search("(?<=volk_)" + dt + ".*(?=\.h)", line);
+ subline = re.search(begins.pattern[:-2] + dt + ".*(?=\.h)", line);
+ if subline:
+ functions.append(subline.group(0));
+
+ return set(functions);
+
+ def make_module_skeleton(self):
+
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'])
+ if os.path.exists(dest):
+ raise exceptions.IOError("Destination %s already exits!"%(dest));
+
+ if not os.path.exists(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'kernels/volk_' + self.my_dict['name'])):
+ os.makedirs(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'kernels/volk_' + self.my_dict['name']))
+
+ current_kernel_names = self.get_current_kernels();
+
+ for root, dirnames, filenames in os.walk(self.my_dict['base']):
+ for name in filenames:
+ t_table = map(lambda a: re.search(a, name), current_kernel_names);
+ t_table = set(t_table);
+ if t_table == set([None]):
+ infile = os.path.join(root, name);
+ instring = open(infile, 'r').read();
+ outstring = re.sub(self.volk, 'volk_' + self.my_dict['name'], instring);
+ newname = re.sub(self.volk, 'volk_' + self.my_dict['name'], name);
+ relpath = os.path.relpath(infile, self.my_dict['base']);
+ newrelpath = re.sub(self.volk, 'volk_' + self.my_dict['name'], relpath);
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname);
+
+ if not os.path.exists(os.path.dirname(dest)):
+ os.makedirs(os.path.dirname(dest))
+ open(dest, 'w+').write(outstring);
+
+
+ infile = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/testqa.cc');
+ instring = open(infile, 'r').read();
+ outstring = re.sub(self.volk_run_tests, '', instring);
+ open(infile, 'w+').write(outstring);
+
+ infile = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'apps/volk_' + self.my_dict['name'] + '_profile.cc');
+ instring = open(infile, 'r').read();
+ outstring = re.sub(self.volk_profile, '', instring);
+ open(infile, 'w+').write(outstring);
+
+ infile = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/qa_utils.cc');
+ instring = open(infile, 'r').read();
+ outstring = re.sub(self.badassert, self.goodassert, instring);
+ outstring = re.sub(self.baderase, self.gooderase, outstring);
+ open(infile, 'w+').write(outstring);
+
+ def write_default_cfg(self, cfg):
+ outfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'volk_modtool.cfg'), 'wb');
+ cfg.write(outfile);
+ outfile.close();
+
+
+ def convert_kernel(self, oldvolk, name, base, inpath, top):
+ infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + name + '.h');
+ instring = open(infile, 'r').read();
+ outstring = re.sub(oldvolk, 'volk_' + self.my_dict['name'], instring);
+ newname = 'volk_' + self.my_dict['name'] + '_' + name + '.h';
+ relpath = os.path.relpath(infile, base);
+ newrelpath = re.sub(oldvolk, 'volk_' + self.my_dict['name'], relpath);
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname);
+
+
+
+ if not os.path.exists(os.path.dirname(dest)):
+ os.makedirs(os.path.dirname(dest))
+ open(dest, 'w+').write(outstring);
+
+ def remove_kernel(self, name):
+ basename = self.my_dict['name'];
+ if len(basename) > 0:
+ top = 'volk_' + basename + '_';
+ else:
+ top = 'volk_'
+ base = os.path.join(self.my_dict['destination'], top[:-1]) ;
+
+ if not name in self.get_current_kernels():
+
+ raise exceptions.IOError("Requested kernel %s is not in module %s"%(name,base));
+
+
+
+ inpath = os.path.abspath(base);
+
+
+ kernel = re.compile(name)
+ search_kernels = Set([kernel])
+ profile = re.compile('^\s*VOLK_PROFILE')
+ puppet = re.compile('^\s*VOLK_PUPPET')
+ src_dest = os.path.join(inpath, 'apps/', top[:-1] + '_profile.cc');
+ infile = open(src_dest);
+ otherlines = infile.readlines();
+ open(src_dest, 'w+').write('');
+
+ for otherline in otherlines:
+ write_okay = True;
+ if kernel.search(otherline):
+ write_okay = False;
+ if puppet.match(otherline):
+ args = re.search("(?<=VOLK_PUPPET_PROFILE).*", otherline)
+ m_func = args.group(0).split(',')[0];
+ func = re.search('(?<=' + top + ').*', m_func);
+ search_kernels.add(re.compile(func.group(0)));
+ if write_okay:
+ open(src_dest, 'a').write(otherline);
+
+
+ src_dest = os.path.join(inpath, 'lib/testqa.cc')
+ infile = open(src_dest);
+ otherlines = infile.readlines();
+ open(src_dest, 'w+').write('');
+
+ for otherline in otherlines:
+ write_okay = True;
+
+ for kernel in search_kernels:
+ if kernel.search(otherline):
+ write_okay = False;
+
+ if write_okay:
+ open(src_dest, 'a').write(otherline);
+
+ for kernel in search_kernels:
+ infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + kernel.pattern + '.h');
+ print "Removing kernel %s"%(kernel.pattern)
+ if os.path.exists(infile):
+ os.remove(infile);
+
+ def import_kernel(self, name, base):
+ if not (base):
+ base = self.my_dict['base'];
+ basename = self.getbasename();
+ else:
+ basename = self.get_basename(base);
+ if not name in self.get_current_kernels(base):
+ raise exceptions.IOError("Requested kernel %s is not in module %s"%(name,base));
+
+ inpath = os.path.abspath(base);
+ if len(basename) > 0:
+ top = 'volk_' + basename + '_';
+ else:
+ top = 'volk_'
+ oldvolk = re.compile(top[:-1]);
+
+ self.convert_kernel(oldvolk, name, base, inpath, top);
+
+ kernel = re.compile(name)
+ search_kernels = Set([kernel])
+
+ profile = re.compile('^\s*VOLK_PROFILE')
+ puppet = re.compile('^\s*VOLK_PUPPET')
+ infile = open(os.path.join(inpath, 'apps/', oldvolk.pattern + '_profile.cc'));
+ otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'apps/volk_' + self.my_dict['name'] + '_profile.cc'));
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'apps/volk_' + self.my_dict['name'] + '_profile.cc');
+ lines = infile.readlines();
+ otherlines = otherinfile.readlines();
+ open(dest, 'w+').write('');
+ insert = False;
+ inserted = False
+ for otherline in otherlines:
+
+ if self.lastline.match(otherline):
+ insert = True;
+ if insert and not inserted:
+ inserted = True;
+ for line in lines:
+ if kernel.search(line):
+ if profile.match(line):
+ outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line);
+ open(dest, 'a').write(outline);
+ elif puppet.match(line):
+ outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line);
+ open(dest, 'a').write(outline);
+ args = re.search("(?<=VOLK_PUPPET_PROFILE).*", line)
+ m_func = args.group(0).split(',')[0];
+ func = re.search('(?<=' + top + ').*', m_func);
+ search_kernels.add(re.compile(func.group(0)));
+ self.convert_kernel(oldvolk, func.group(0), base, inpath, top);
+ write_okay = True;
+ for kernel in search_kernels:
+ if kernel.search(otherline):
+ write_okay = False
+ if write_okay:
+ open(dest, 'a').write(otherline);
+
+ for kernel in search_kernels:
+ print "Adding kernel %s from module %s"%(kernel.pattern,base)
+
+ infile = open(os.path.join(inpath, 'lib/testqa.cc'));
+ otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/testqa.cc'));
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/testqa.cc');
+ lines = infile.readlines();
+ otherlines = otherinfile.readlines();
+ open(dest, 'w+').write('');
+ inserted = False;
+ insert = False
+ for otherline in otherlines:
+
+ if (re.match('\s*', otherline) == None or re.match('\s*#.*', otherline) == None):
+
+ insert = True;
+ if insert and not inserted:
+ inserted = True;
+ for line in lines:
+ for kernel in search_kernels:
+ if kernel.search(line):
+ if self.volk_run_tests.match(line):
+ outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line);
+ open(dest, 'a').write(outline);
+ write_okay = True;
+ for kernel in search_kernels:
+ if kernel.search(otherline):
+ write_okay = False
+ if write_okay:
+ open(dest, 'a').write(otherline);
+
+
+
+
+
diff --git a/volk/tmpl/volk.tmpl.c b/volk/tmpl/volk.tmpl.c
index c3a1544ff8..f915f157f6 100644
--- a/volk/tmpl/volk.tmpl.c
+++ b/volk/tmpl/volk.tmpl.c
@@ -27,6 +27,10 @@
#include <volk/volk.h>
#include <stdio.h>
#include <string.h>
+#include <assert.h>
+
+static size_t __alignment = 0;
+static intptr_t __alignment_mask = 0;
struct volk_machine *get_machine(void) {
extern struct volk_machine *volk_machines[];
@@ -46,45 +50,118 @@ struct volk_machine *get_machine(void) {
}
}
printf("Using Volk machine: %s\n", machine->name);
+ __alignment = machine->alignment;
+ __alignment_mask = (intptr_t)(__alignment-1);
return machine;
}
}
-unsigned int volk_get_alignment(void) {
- return get_machine()->alignment;
+size_t volk_get_alignment(void)
+{
+ get_machine(); //ensures alignment is set
+ return __alignment;
+}
+
+bool volk_is_aligned(const void *ptr)
+{
+ return ((intptr_t)(ptr) & __alignment_mask) == 0;
}
+#define LV_HAVE_GENERIC
+#define LV_HAVE_DISPATCHER
+
#for $kern in $kernels
-void get_$(kern.name)($kern.arglist_namedefs) {
- $kern.name = get_machine()->$(kern.name)_archs[volk_rank_archs(
- get_machine()->$(kern.name)_indices,
- get_machine()->$(kern.name)_arch_defs,
- get_machine()->$(kern.name)_n_archs,
- get_machine()->$(kern.name)_name,
- volk_get_lvarch()
- )];
+#if $kern.has_dispatcher
+#include <volk/$(kern.name).h> //pulls in the dispatcher
+#end if
+
+static inline void __$(kern.name)_d($kern.arglist_full)
+{
+ #if $kern.has_dispatcher
+ $(kern.name)_dispatcher($kern.arglist_names);
+ return;
+ #end if
+
+ if (volk_is_aligned(
+ #set $num_open_parens = 0
+ #for $arg_type, $arg_name in $kern.args
+ #if '*' in $arg_type
+ VOLK_OR_PTR($arg_name,
+ #set $num_open_parens += 1
+ #end if
+ #end for
+ 0$(')'*$num_open_parens)
+ )){
+ $(kern.name)_a($kern.arglist_names);
+ }
+ else{
+ $(kern.name)_u($kern.arglist_names);
+ }
+}
+
+static inline void __init_$(kern.name)(void)
+{
+ const char *name = get_machine()->$(kern.name)_name;
+ const char **impl_names = get_machine()->$(kern.name)_impl_names;
+ const int *impl_deps = get_machine()->$(kern.name)_impl_deps;
+ const bool *alignment = get_machine()->$(kern.name)_impl_alignment;
+ const size_t n_impls = get_machine()->$(kern.name)_n_impls;
+ const size_t index_a = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/);
+ const size_t index_u = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/);
+ $(kern.name)_a = get_machine()->$(kern.name)_impls[index_a];
+ $(kern.name)_u = get_machine()->$(kern.name)_impls[index_u];
+
+ assert($(kern.name)_a);
+ assert($(kern.name)_u);
+
+ $(kern.name) = &__$(kern.name)_d;
+}
+
+static inline void __$(kern.name)_a($kern.arglist_full)
+{
+ __init_$(kern.name)();
+ $(kern.name)_a($kern.arglist_names);
+}
+
+static inline void __$(kern.name)_u($kern.arglist_full)
+{
+ __init_$(kern.name)();
+ $(kern.name)_u($kern.arglist_names);
+}
+
+static inline void __$(kern.name)($kern.arglist_full)
+{
+ __init_$(kern.name)();
$(kern.name)($kern.arglist_names);
}
-$kern.pname $kern.name = &get_$(kern.name);
+$kern.pname $(kern.name)_a = &__$(kern.name)_a;
+$kern.pname $(kern.name)_u = &__$(kern.name)_u;
+$kern.pname $(kern.name) = &__$(kern.name);
-void $(kern.name)_manual($kern.arglist_namedefs, const char* arch) {
- const size_t index = get_index(
- get_machine()->$(kern.name)_indices,
- get_machine()->$(kern.name)_n_archs,
- arch
+void $(kern.name)_manual($kern.arglist_full, const char* impl_name)
+{
+ const int index = volk_get_index(
+ get_machine()->$(kern.name)_impl_names,
+ get_machine()->$(kern.name)_n_impls,
+ impl_name
);
- get_machine()->$(kern.name)_archs[index](
+ get_machine()->$(kern.name)_impls[index](
$kern.arglist_names
);
}
-struct volk_func_desc $(kern.name)_get_func_desc(void) {
- struct volk_func_desc desc = {
- get_machine()->$(kern.name)_indices,
- get_machine()->$(kern.name)_arch_defs,
- get_machine()->$(kern.name)_n_archs
+volk_func_desc_t $(kern.name)_get_func_desc(void) {
+ const char **impl_names = get_machine()->$(kern.name)_impl_names;
+ const int *impl_deps = get_machine()->$(kern.name)_impl_deps;
+ const bool *alignment = get_machine()->$(kern.name)_impl_alignment;
+ const size_t n_impls = get_machine()->$(kern.name)_n_impls;
+ volk_func_desc_t desc = {
+ impl_names,
+ impl_deps,
+ alignment,
+ n_impls
};
return desc;
}
diff --git a/volk/tmpl/volk.tmpl.h b/volk/tmpl/volk.tmpl.h
index 161579e46d..464b65598a 100644
--- a/volk/tmpl/volk.tmpl.h
+++ b/volk/tmpl/volk.tmpl.h
@@ -27,20 +27,59 @@
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
__VOLK_DECL_BEGIN
-struct volk_func_desc {
- const char **indices;
- const int *arch_defs;
- const int n_archs;
-};
+typedef struct volk_func_desc
+{
+ const char **impl_names;
+ const int *impl_deps;
+ const bool *impl_alignment;
+ const size_t n_impls;
+} volk_func_desc_t;
+
+//! Get the machine alignment in bytes
+VOLK_API size_t volk_get_alignment(void);
+
+/*!
+ * The VOLK_OR_PTR macro is a convenience macro
+ * for checking the alignment of a set of pointers.
+ * Example usage:
+ * volk_is_aligned(VOLK_OR_PTR(VOLK_OR_PTR(p0, p1), p2))
+ */
+#define VOLK_OR_PTR(ptr0, ptr1) \
+ (const void *)(((intptr_t)(ptr0)) | ((intptr_t)(ptr1)))
-VOLK_API unsigned int volk_get_alignment(void);
+/*!
+ * Is the pointer on a machine alignment boundary?
+ *
+ * Note: for performance reasons, this function
+ * is not usable until another volk API call is made
+ * which will perform certain initialization tasks.
+ *
+ * \param ptr the pointer to some memory buffer
+ * \return 1 for alignment boundary, else 0
+ */
+VOLK_API bool volk_is_aligned(const void *ptr);
#for $kern in $kernels
+
+//! A function pointer to the dispatcher implementation
extern VOLK_API $kern.pname $kern.name;
-extern VOLK_API void $(kern.name)_manual($kern.arglist_namedefs, const char* arch);
-extern VOLK_API struct volk_func_desc $(kern.name)_get_func_desc(void);
+
+//! A function pointer to the fastest aligned implementation
+extern VOLK_API $kern.pname $(kern.name)_a;
+
+//! A function pointer to the fastest unaligned implementation
+extern VOLK_API $kern.pname $(kern.name)_u;
+
+//! Call into a specific implementation given by name
+extern VOLK_API void $(kern.name)_manual($kern.arglist_full, const char* impl_name);
+
+//! Get description parameters for this kernel
+extern VOLK_API volk_func_desc_t $(kern.name)_get_func_desc(void);
#end for
__VOLK_DECL_END
diff --git a/volk/tmpl/volk_machine_xxx.tmpl.c b/volk/tmpl/volk_machine_xxx.tmpl.c
index e405bd6938..68d7f3eba2 100644
--- a/volk/tmpl/volk_machine_xxx.tmpl.c
+++ b/volk/tmpl/volk_machine_xxx.tmpl.c
@@ -44,18 +44,23 @@ $(' | '.join(['(1 << LV_%s)'%a.name.upper() for a in $archs]))#slurp
#end def
########################################################################
-#def make_tag_str_list($tags)
-{$(', '.join(['"%s"'%a for a in $tags]))}#slurp
+#def make_impl_name_list($impls)
+{$(', '.join(['"%s"'%i.name for i in $impls]))}#slurp
#end def
########################################################################
-#def make_tag_have_list($deps)
-{$(', '.join([' | '.join(['(1 << LV_%s)'%a.upper() for a in d]) for d in $deps]))}#slurp
+#def make_impl_align_list($impls)
+{$(', '.join(['true' if i.is_aligned else 'false' for i in $impls]))}#slurp
#end def
########################################################################
-#def make_tag_kern_list($name, $tags)
-{$(', '.join(['%s_%s'%($name, a) for a in $tags]))}#slurp
+#def make_impl_deps_list($impls)
+{$(', '.join([' | '.join(['(1 << LV_%s)'%d.upper() for d in i.deps]) for i in $impls]))}#slurp
+#end def
+
+########################################################################
+#def make_impl_fcn_list($name, $impls)
+{$(', '.join(['%s_%s'%($name, i.name) for i in $impls]))}#slurp
#end def
struct volk_machine volk_machine_$(this_machine.name) = {
@@ -63,11 +68,12 @@ struct volk_machine volk_machine_$(this_machine.name) = {
"$this_machine.name",
$this_machine.alignment,
#for $kern in $kernels
- #set $taglist, $tagdeps = $kern.get_tags($arch_names)
- "$kern.name",
- $make_tag_str_list($taglist),
- $make_tag_have_list($tagdeps),
- $make_tag_kern_list($kern.name, $taglist),
- $(len($taglist)),
+ #set $impls = $kern.get_impls($arch_names)
+ "$kern.name", ##//kernel name
+ $make_impl_name_list($impls), ##//list of kernel implementations by name
+ $make_impl_deps_list($impls), ##//list of arch dependencies per implementation
+ $make_impl_align_list($impls), ##//alignment required? for each implementation
+ $make_impl_fcn_list($kern.name, $impls), ##//pointer to each implementation
+ $(len($impls)), ##//number of implementations listed here
#end for
};
diff --git a/volk/tmpl/volk_machines.tmpl.h b/volk/tmpl/volk_machines.tmpl.h
index b30e600ed8..7e11b10795 100644
--- a/volk/tmpl/volk_machines.tmpl.h
+++ b/volk/tmpl/volk_machines.tmpl.h
@@ -25,18 +25,22 @@
#include <volk/volk_common.h>
#include <volk/volk_typedefs.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
__VOLK_DECL_BEGIN
struct volk_machine {
const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format)
const char *name;
- const unsigned int alignment; //the maximum byte alignment required for functions in this library
+ const size_t alignment; //the maximum byte alignment required for functions in this library
#for $kern in $kernels
const char *$(kern.name)_name;
- const char *$(kern.name)_indices[$(len($archs))];
- const int $(kern.name)_arch_defs[$(len($archs))];
- const $(kern.pname) $(kern.name)_archs[$(len($archs))];
- const int $(kern.name)_n_archs;
+ const char *$(kern.name)_impl_names[$(len($archs))];
+ const int $(kern.name)_impl_deps[$(len($archs))];
+ const bool $(kern.name)_impl_alignment[$(len($archs))];
+ const $(kern.pname) $(kern.name)_impls[$(len($archs))];
+ const size_t $(kern.name)_n_impls;
#end for
};
diff --git a/volk/tmpl/volk_typedefs.tmpl.h b/volk/tmpl/volk_typedefs.tmpl.h
index 52a87242fe..6f5426965f 100644
--- a/volk/tmpl/volk_typedefs.tmpl.h
+++ b/volk/tmpl/volk_typedefs.tmpl.h
@@ -26,7 +26,7 @@
#include <volk/volk_complex.h>
#for $kern in $kernels
-typedef $kern.rettype (*$(kern.pname))($kern.arglist_defs);
+typedef void (*$(kern.pname))($kern.arglist_types);
#end for
#endif /*INCLUDED_VOLK_TYPEDEFS*/