141 files changed, 4376 insertions, 4097 deletions
diff --git a/volk/CMakeLists.txt b/volk/CMakeLists.txt
index 68385f9740..00544c5466 100644
--- a/volk/CMakeLists.txt
+++ b/volk/CMakeLists.txt
@@ -75,7 +75,7 @@ set(Boost_ADDITIONAL_VERSIONS
     "1.60.0" "1.60" "1.61.0" "1.61" "1.62.0" "1.62" "1.63.0" "1.63" "1.64.0" "1.64"
     "1.65.0" "1.65" "1.66.0" "1.66" "1.67.0" "1.67" "1.68.0" "1.68" "1.69.0" "1.69"
 )
-find_package(Boost COMPONENTS unit_test_framework)
+find_package(Boost COMPONENTS unit_test_framework filesystem system)
 
 find_package(ORC)
 
@@ -103,12 +103,15 @@ install(
 # Install all headers in the include directories
 ########################################################################
 install(
-    DIRECTORY ${CMAKE_SOURCE_DIR}/include/volk
+    DIRECTORY ${CMAKE_SOURCE_DIR}/kernels/volk
     DESTINATION include COMPONENT "volk_devel"
     FILES_MATCHING PATTERN "*.h"
 )
 
 install(FILES
+    ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h
+    ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h
+    ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h
     ${CMAKE_BINARY_DIR}/include/volk/volk.h
     ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h
     ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index d35d90deea..3b1ab370d3 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -5,144 +5,124 @@ extern "C" {
 }
 #include <vector>
 #include <boost/foreach.hpp>
+#include <boost/filesystem.hpp>
 #include <iostream>
 #include <fstream>
 #include <sys/stat.h>
 #include <sys/types.h>
 
+namespace fs = boost::filesystem;
+
 int main(int argc, char *argv[]) {
 
     std::vector<std::string> results;
 
-    //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000, &results);
-    //VOLK_PROFILE(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000, &results);
-    VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc_a, volk_32fc_s32fc_x2_rotator_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results);
-    VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_16ic_deinterleave_real_8i_a, 0, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_16ic_deinterleave_16i_x2_a, 0, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_16ic_deinterleave_real_16i_a, 0, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_16ic_magnitude_16i_a, 1, 0, 204600, 100, &results);
-    VOLK_PROFILE(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_16i_convert_8i_a, 0, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_16i_convert_8i_u, 0, 0, 204600, 10000, &results);
-    //VOLK_PROFILE(volk_16i_max_star_16i_a, 0, 0, 204600, 10000, &results);
-    //VOLK_PROFILE(volk_16i_max_star_horizontal_16i_a, 0, 0, 204600, 10000, &results);
-    //VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results);
-    //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results);
-    VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_16u_byteswap_u, 0, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_x2_add_32f_u, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 204600, 50, &results);
-    VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 204600, 100, &results);
-    //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000, &results);
-    VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_deinterleave_64f_x2_u, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32fc_deinterleave_imag_32f_a, 1e-4, 0, 204600, 5000, &results);
-    VOLK_PROFILE(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000, &results);
-    VOLK_PROFILE(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32fc_index_max_16u_a, 3, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100, &results);
-    VOLK_PROFILE(volk_32fc_magnitude_32f_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_magnitude_32f_u, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_magnitude_squared_32f_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_magnitude_squared_32f_u, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32f_s32f_convert_16i_a, 1, 32768, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_s32f_convert_16i_u, 1, 32768, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_s32f_convert_32i_a, 1, 2<<31, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_s32f_convert_32i_u, 1, 2<<31, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_convert_64f_a, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_convert_64f_u, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_s32f_convert_8i_a, 1, 128, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_s32f_convert_8i_u, 1, 128, 204600, 10000, &results);
-    //VOLK_PROFILE(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000, &results);
-    VOLK_PROFILE(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 20460, 100, &results);
-    VOLK_PROFILE(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_x2_divide_32f_a, 1e-4, 0, 204600, 2000, &results);
-    VOLK_PROFILE(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 5000, &results);
-    VOLK_PROFILE(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 5000, &results);
-    VOLK_PROFILE(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 5000, &results);
-    //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000, &results);
-    VOLK_PROFILE(volk_32f_index_max_16u_a, 3, 0, 204600, 5000, &results);
-    VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 204600, 3000, &results);
-    VOLK_PROFILE(volk_32f_x2_interleave_32fc_a, 0, 0, 204600, 5000, &results);
-    VOLK_PROFILE(volk_32f_x2_max_32f_a, 1e-4, 0, 204600, 2000, &results);
-    VOLK_PROFILE(volk_32f_x2_min_32f_a, 1e-4, 0, 204600, 2000, &results);
-    VOLK_PROFILE(volk_32f_x2_multiply_32f_a, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_x2_multiply_32f_u, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_s32f_normalize_a, 1e-4, 100, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_s32f_power_32f_a, 1e-4, 4, 204600, 100, &results);
-    VOLK_PROFILE(volk_32f_sqrt_32f_a, 1e-4, 0, 204600, 100, &results);
-    VOLK_PROFILE(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 204600, 3000, &results);
-    VOLK_PROFILE(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 204600, 3000, &results);
-    VOLK_PROFILE(volk_32f_x2_subtract_32f_a, 1e-4, 0, 204600, 5000, &results);
-    VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 204600, 5000, &results);
-    VOLK_PROFILE(volk_32i_x2_and_32i_a, 0, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32i_s32f_convert_32f_a, 1e-4, 100, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32i_s32f_convert_32f_u, 1e-4, 100, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32i_x2_or_32i_a, 0, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32u_byteswap_a, 0, 0, 204600, 2000, &results);
-    //VOLK_PROFILE(volk_32u_popcnt_a, 0, 0, 2046, 10000, &results);
-    VOLK_PROFILE(volk_64f_convert_32f_a, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_64f_convert_32f_u, 1e-4, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_64f_x2_max_64f_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_64f_x2_min_64f_a, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_64u_byteswap_a, 0, 0, 204600, 1000, &results);
-    //VOLK_PROFILE(volk_64u_popcnt_a, 0, 0, 2046, 10000, &results);
-    VOLK_PROFILE(volk_8ic_deinterleave_16i_x2_a, 0, 0, 204600, 3000, &results);
-    VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 204600, 3000, &results);
-    VOLK_PROFILE(volk_8ic_deinterleave_real_16i_a, 0, 256, 204600, 3000, &results);
-    VOLK_PROFILE(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 204600, 3000, &results);
-    VOLK_PROFILE(volk_8ic_deinterleave_real_8i_a, 0, 0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 204600, 400, &results);
-    VOLK_PROFILE(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 204600, 400, &results);
-    VOLK_PROFILE(volk_8i_convert_16i_a, 0, 0, 204600, 20000, &results);
-    VOLK_PROFILE(volk_8i_convert_16i_u, 0, 0, 204600, 2000, &results);
-    VOLK_PROFILE(volk_8i_s32f_convert_32f_a, 1e-4, 100, 204600, 2000, &results);
-    VOLK_PROFILE(volk_8i_s32f_convert_32f_u, 1e-4, 100, 204600, 2000, &results);
-    //VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_a, 1e-4, lv_32fc_t(1.0, 0.5), 204600, 1000, &results);
-    VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results);
-    VOLK_PROFILE(volk_32f_s32f_multiply_32f_a, 1e-4, 1.0, 204600, 10000, &results);
-    VOLK_PROFILE(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 204600, 1000, &results);
+    //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results);
+    //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results);
+    VOLK_PUPPET_PROFILE(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, 1e-2, (lv_32fc_t)lv_cmake(.95393, .3), 20460, 10000, &results);
+    VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_16ic_deinterleave_real_8i, 0, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_16ic_deinterleave_16i_x2, 0, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_16ic_deinterleave_real_16i, 0, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_16ic_magnitude_16i, 1, 0, 204600, 100, &results);
+    VOLK_PROFILE(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_16i_convert_8i, 0, 0, 204600, 10000, &results);
+    //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204600, 10000, &results);
+    //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204600, 10000, &results);
+    //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results);
+    //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results);
+    VOLK_PROFILE(volk_16u_byteswap, 0, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204600, 50, &results);
+    VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204600, 100, &results);
+    //VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results);
+    VOLK_PROFILE(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 204600, 5000, &results);
+    VOLK_PROFILE(volk_32fc_deinterleave_real_32f, 1e-4, 0, 204600, 5000, &results);
+    VOLK_PROFILE(volk_32fc_deinterleave_real_64f, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32fc_index_max_16u, 3, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32fc_s32f_magnitude_16i, 1, 32768, 204600, 100, &results);
+    VOLK_PROFILE(volk_32fc_magnitude_32f, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_magnitude_squared_32f, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_x2_multiply_32fc, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_conjugate_32fc, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32f_s32f_convert_16i, 1, 32768, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32f_s32f_convert_32i, 1, 2<<31, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32f_convert_64f, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32f_s32f_convert_8i, 1, 128, 204600, 10000, &results);
+    //VOLK_PROFILE(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000, &results);
+    VOLK_PROFILE(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 20460, 100, &results);
+    VOLK_PROFILE(volk_32fc_x2_square_dist_32f, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32f_x2_divide_32f, 1e-4, 0, 204600, 2000, &results);
+    VOLK_PROFILE(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 5000, &results);
+    VOLK_PROFILE(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 5000, &results);
+    //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000, &results);
+    VOLK_PROFILE(volk_32f_index_max_16u, 3, 0, 204600, 5000, &results);
+    VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic, 1, 32768, 204600, 3000, &results);
+    VOLK_PROFILE(volk_32f_x2_interleave_32fc, 0, 0, 204600, 5000, &results);
+    VOLK_PROFILE(volk_32f_x2_max_32f, 1e-4, 0, 204600, 2000, &results);
+    VOLK_PROFILE(volk_32f_x2_min_32f, 1e-4, 0, 204600, 2000, &results);
+    VOLK_PROFILE(volk_32f_x2_multiply_32f, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32f_s32f_normalize, 1e-4, 100, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32f_s32f_power_32f, 1e-4, 4, 204600, 100, &results);
+    VOLK_PROFILE(volk_32f_sqrt_32f, 1e-4, 0, 204600, 100, &results);
+    VOLK_PROFILE(volk_32f_s32f_stddev_32f, 1e-4, 100, 204600, 3000, &results);
+    VOLK_PROFILE(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 204600, 3000, &results);
+    VOLK_PROFILE(volk_32f_x2_subtract_32f, 1e-4, 0, 204600, 5000, &results);
+    VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 204600, 5000, &results);
+    VOLK_PROFILE(volk_32i_x2_and_32i, 0, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32i_s32f_convert_32f, 1e-4, 100, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32i_x2_or_32i, 0, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32u_byteswap, 0, 0, 204600, 2000, &results);
+    //VOLK_PROFILE(volk_32u_popcnt, 0, 0, 2046, 10000, &results);
+    VOLK_PROFILE(volk_64f_convert_32f, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_64f_x2_max_64f, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_64f_x2_min_64f, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_64u_byteswap, 0, 0, 204600, 1000, &results);
+    //VOLK_PROFILE(volk_64u_popcnt, 0, 0, 2046, 10000, &results);
+    VOLK_PROFILE(volk_8ic_deinterleave_16i_x2, 0, 0, 204600, 3000, &results);
+    VOLK_PROFILE(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 204600, 3000, &results);
+    VOLK_PROFILE(volk_8ic_deinterleave_real_16i, 0, 256, 204600, 3000, &results);
+    VOLK_PROFILE(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 204600, 3000, &results);
+    VOLK_PROFILE(volk_8ic_deinterleave_real_8i, 0, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 204600, 400, &results);
+    VOLK_PROFILE(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 204600, 400, &results);
+    VOLK_PROFILE(volk_8i_convert_16i, 0, 0, 204600, 20000, &results);
+    VOLK_PROFILE(volk_8i_convert_16i, 0, 0, 204600, 2000, &results);
+    VOLK_PROFILE(volk_8i_s32f_convert_32f, 1e-4, 100, 204600, 2000, &results);
+    //VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, lv_32fc_t(1.0, 0.5), 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 1.0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 0, 204600, 1000, &results);
+
 
+    char path[1024];
+    volk_get_config_path(path);
+    const fs::path config_path(path);
+
+    if (not fs::exists(config_path.branch_path()))
+    {
+        std::cout << "Creating " << config_path.branch_path() << "..." << std::endl;
+        fs::create_directories(config_path.branch_path());
+    }
 
-    char path[256];
-    get_config_path(path);
-    std::string config_path(path);
-    std::ofstream config;
-    std::cout << "filename: " << config_path << std::endl;
-    config.open(config_path.c_str());
+    std::cout << "Writing " << config_path << "..." << std::endl;
+    std::ofstream config(config_path.string().c_str());
     if(!config.is_open()) { //either we don't have write access or we don't have the dir yet
-        std::string dir(getenv("HOME"));
-        dir += "/.volk";
-        if(mkdir(dir.c_str(), 0777) == -1) {
-            std::cout << "Error creating directory " << dir << std::endl;
-            return -1;
-        }
-        config.open(config_path.c_str());
-        if(!config.is_open()) {
-            std::cout << "Error opening file " << config_path << std::endl;
-            return -1;
-        }
+        std::cout << "Error opening file " << config_path << std::endl; return -1; //bail out: nothing can be written to a stream that failed to open
     }
 
     config << "\
diff --git a/volk/cmake/msvc/stdbool.h b/volk/cmake/msvc/stdbool.h
new file mode 100644
index 0000000000..fc8ee28f40
--- /dev/null
+++ b/volk/cmake/msvc/stdbool.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2005, 2006 Apple Computer, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef STDBOOL_WIN32_H
+#define STDBOOL_WIN32_H
+
+#ifndef _MSC_VER /* predefined by MSVC; used because the COMPILER() macro is not defined in this header */
+#error "This stdbool.h file should only be compiled with MSVC"
+#endif
+
+#ifndef __cplusplus
+
+typedef unsigned char bool;
+
+#define true 1
+#define false 0
+
+#ifndef CASSERT
+#define CASSERT(exp, name) typedef int dummy##name [(exp) ? 1 : -1];
+#endif
+
+CASSERT(sizeof(bool) == 1, bool_is_one_byte)
+CASSERT(true, true_is_true)
+CASSERT(!false, false_is_false)
+
+#endif
+
+#endif
diff --git a/volk/gen/archs.xml b/volk/gen/archs.xml
index a18455801d..2c9ab41a55 100644
--- a/volk/gen/archs.xml
+++ b/volk/gen/archs.xml
@@ -2,7 +2,6 @@
 <grammar>
 
 <arch name="generic"> <!-- name is required-->
-  <alignment>1</alignment>
 </arch>
 
 <arch name="altivec">
diff --git a/volk/gen/volk_arch_defs.py b/volk/gen/volk_arch_defs.py
index 41154d5a7a..3c75e1374e 100644
--- a/volk/gen/volk_arch_defs.py
+++ b/volk/gen/volk_arch_defs.py
@@ -18,9 +18,6 @@
 archs = list()
 arch_dict = dict()
 
-#TODO enable this when we are ready
-create_unaligned_archs = False
-
 class arch_class:
     def __init__(self, flags, checks, **kwargs):
         for key, cast, failval in (
@@ -49,10 +46,6 @@ def register_arch(**kwargs):
     arch = arch_class(**kwargs)
     archs.append(arch)
     arch_dict[arch.name] = arch
-    if arch.alignment > 1 and create_unaligned_archs:
-        kwargs['name'] += '_u'
-        kwargs['alignment'] = 1
-        register_arch(**kwargs)
 
 ########################################################################
 # register the arches
diff --git a/volk/gen/volk_kernel_defs.py b/volk/gen/volk_kernel_defs.py
index 52cdb684c2..f246db0f96 100644
--- a/volk/gen/volk_kernel_defs.py
+++ b/volk/gen/volk_kernel_defs.py
@@ -24,201 +24,186 @@ import re
 import sys
 import glob
 
-from volk_arch_defs import archs
-
-remove_after_underscore = re.compile("_.*");
-space_remove = re.compile(" ");
-leading_space_remove = re.compile("^ *");
-replace_arch = re.compile(", const char\* arch");
-replace_bracket = re.compile(" {");
-replace_volk = re.compile("volk");
-
-def strip_trailing(tostrip, stripstr):
-    lindex = tostrip.rfind(stripstr)
-    tostrip = tostrip[0:lindex] + tostrip[lindex:len(tostrip)].replace(stripstr, "");
-    return tostrip
+########################################################################
+# Strip comments from a c/cpp file.
+# Input is code string, output is code string without comments.
+# http://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments
+########################################################################
+def comment_remover(text):
+    def replacer(match):
+        s = match.group(0)
+        if s.startswith('/'):
+            return ""
+        else:
+            return s
+    pattern = re.compile(
+        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE
+    )
+    return re.sub(pattern, replacer, text)
+
+########################################################################
+# Split code into nested sections according to ifdef preprocessor macros
+########################################################################
+def split_into_nested_ifdef_sections(code):
+    sections = list()
+    section = ''
+    header = 'text'
+    in_section_depth = 0
+    for i, line in enumerate(code.splitlines()):
+        m = re.match('^(\s*)#(\s*)(\w+)(.*)$', line)
+        line_is = 'normal'
+        if m:
+            p0, p1, fcn, stuff = m.groups()
+            if fcn in ('if', 'ifndef', 'ifdef'): line_is = 'if'
+            if fcn in ('else', 'elif'): line_is = 'else'
+            if fcn in ('endif',): line_is = 'end'
+
+        if line_is == 'if': in_section_depth += 1
+        if line_is == 'end': in_section_depth -= 1
+
+        if in_section_depth == 1 and line_is == 'if':
+            sections.append((header, section))
+            section = ''
+            header = line
+            continue
 
-srcdir = os.path.dirname(os.path.dirname(__file__))
-hdr_files = glob.glob(os.path.join(srcdir, "include/volk/*.h"))
-
-datatypes = [];
-functions = [];
-
-for line in hdr_files:
-    subline = re.search(".*_(a|u)\.h.*", os.path.basename(line))
-    if subline:
-        subsubline = re.search("(?<=volk_).*", subline.group(0));
-        if subsubline:
-            dtype = remove_after_underscore.sub("", subsubline.group(0));
-            subdtype = re.search("[0-9]+[A-z]+", dtype);
-            if subdtype:
-                datatypes.append(subdtype.group(0));
-
-
-datatypes = set(datatypes);
-
-for line in hdr_files:
-    for dt in datatypes:
-        if dt in line:
-            subline = re.search("(volk_" + dt +"_.*(a|u).*\.h)", line);
-            if subline:
-
-                subsubline = re.search(".+(?=\.h)", subline.group(0));
-                functions.append(subsubline.group(0));
-
-archs_or = "("
-for arch in archs:
-    archs_or = archs_or + arch.name.upper() + "|";
-archs_or = archs_or[0:len(archs_or)-1];
-archs_or = archs_or + ")";
-
-taglist = [];
-fcountlist = [];
-arched_arglist = [];
-retlist = [];
-my_arglist = [];
-my_argtypelist = [];
-for func in functions:
-    tags = [];
-    fcount = [];
-    infile_source = open(os.path.join(srcdir, 'include', 'volk', func + ".h"))
-    begun_name = 0;
-    begun_paren = 0;
-    sourcefile = infile_source.readlines();
-    infile_source.close();
-    for line in sourcefile:
-#FIXME: make it work for multiple #if define()s
-        archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line);
-        if archline:
-            arch = archline.group(0);
-            archline = re.findall(archs_or + "(?=( |\n|&))", line);
-            if archline:
-                archsublist = [];
-                for tup in archline:
-                    archsublist.append(tup[0]);
-                fcount.append(archsublist);
-        testline = re.search("static inline.*?" + func, line);
-        if (not testline):
+        if in_section_depth == 1 and line_is == 'else':
+            sections.append((header, section))
+            section = ''
+            header = line
             continue
-        tagline = re.search(func + "_.+", line);
-        if tagline:
-            tag = re.search("(?<=" + func + "_)\w+(?= *\()",line);
-            if tag:
-                tag = re.search("\w+", tag.group(0));
-                if tag:
-                    tags.append(tag.group(0));
 
+        if in_section_depth == 0 and line_is == 'end':
+            sections.append((header, section))
+            section = ''
+            header = 'text'
+            continue
 
-        if begun_name == 0:
-            retline = re.search(".+(?=" + func + ")", line);
-            if retline:
-                ret = retline.group(0);
+        section += line + '\n'
 
+    sections.append((header, section)) #and pack remainder into sections
+    sections = [sec for sec in sections if sec[1].strip()] #filter empty sections
 
+    #recurse into non-text sections to fill subsections
+    for i, (header, section) in enumerate(sections):
+        if header == 'text': continue
+        sections[i] = (header, split_into_nested_ifdef_sections(section))
 
+    return sections
 
-            subline = re.search(func + ".*", line);
-            if subline:
-                subsubline = re.search("\(.*?\)", subline.group(0));
-                if subsubline:
-                    args = subsubline.group(0);
+########################################################################
+# Recursive print of sections to test code above
+########################################################################
+def print_sections(sections, indent = '  '): #debug helper: dump (header, body) sections recursively
+    for header, body in sections:
+        if header == 'text':
+            print('%s %s' % (indent, ('\n'+indent).join(body.splitlines()))) #single-argument print: same output on python 2 and 3
+            continue
+        print('%s %s' % (indent.replace(' ', '-') + '>', header)) #header line gets a ---> style marker
+        print_sections(body, indent + '  ') #non-text bodies are nested section lists; recurse one level deeper
+
+########################################################################
+# Flatten a section to just body text
+########################################################################
+def flatten_section_text(sections):
+    output = ''
+    for hdr, bdy in sections:
+        if hdr != 'text': output += flatten_section_text(bdy)
+        else: output += bdy
+    return output
+
+########################################################################
+# Extract kernel info from section, represent as an implementation
+########################################################################
+class impl_class:
+    def __init__(self, kern_name, header, body):
+        #deps: the lower-cased suffix of every LV_HAVE_* token found in the section header
+        self.deps = set(map(str.lower, re.findall(r'LV_HAVE_(\w+)', header)))
+        #parse the prototype: implementation name (text after kern_name_) and (type, name) args
+        body = flatten_section_text(body)
+        try:
+            fcn_matcher = re.compile(r'^.*(%s\w*)\s*\((.*)$'%kern_name, re.DOTALL | re.MULTILINE)
+            body = body.split('{')[0].rsplit(')', 1)[0] #get the part before the open ){ bracket
+            m = fcn_matcher.match(body)
+            impl_name, the_rest = m.groups()
+            self.name = impl_name.replace(kern_name+'_', '')
+            self.args = list()
+            arg_matcher = re.compile(r'^\s*(.*\W)\s*(\w+)\s*$', re.DOTALL | re.MULTILINE) #loop-invariant, so compiled once
+            fcn_args = the_rest.split(',')
+            for fcn_arg in fcn_args:
+                m = arg_matcher.match(fcn_arg)
+                arg_type, arg_name = m.groups()
+                self.args.append((arg_type, arg_name))
+        except Exception as ex: #any parse failure (e.g. a regex that did not match) is re-raised with context
+            raise Exception('I cant parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex))
+
+        assert self.name
+        self.is_aligned = self.name.startswith('a_')
 
-                else:
-                    begun_name = 1;
-                    subsubline = re.search("\(.*", subline.group(0));
-                    if subsubline:
-                        args = subsubline.group(0);
-                        begun_paren = 1;
-        else:
-            if begun_paren == 1:
-                subline = re.search(".*?\)", line);
-                if subline:
-                    args = args + subline.group(0);
-                    begun_name = 0;
-                    begun_paren = 0;
-                else:
-                    subline = re.search(".*", line);
-                    args = args + subline.group(0);
-            else:
-                subline = re.search("\(.*?\)", line);
-                if subline:
-                    args = subline.group(0);
-                    begun_name = 0;
-                else:
-                    subline = re.search("\(.*", line);
-                    if subline:
-                        args = subline.group(0);
-                        begun_paren = 1;
-
-    replace = re.compile("static ");
-    ret = replace.sub("", ret);
-    replace = re.compile("inline ");
-    ret = replace.sub("", ret);
-    arched_args = args[args.find('(')+1:args.find(')')]
-
-    remove = re.compile('\)|\(|{');
-    rargs = remove.sub("", args);
-    sargs = rargs.split(',');
-
-
-
-    margs = [];
-    atypes = [];
-    for arg in sargs:
-        temp = arg.split(" ");
-        margs.append(temp[-1]);
-        replace = re.compile(" " + temp[-1]);
-        atypes.append(replace.sub("", arg));
-
-
-    my_args = ""
-    arg_types = ""
-    for arg in range(0, len(margs) - 1):
-        this_arg = leading_space_remove.sub("", margs[arg]);
-        my_args = my_args + this_arg + ", ";
-        this_type = leading_space_remove.sub("", atypes[arg]);
-        arg_types = arg_types + this_type + ", ";
-
-    this_arg = leading_space_remove.sub("", margs[-1]);
-    my_args = my_args + this_arg;
-    this_type = leading_space_remove.sub("", atypes[-1]);
-    arg_types = arg_types + this_type;
-    my_argtypelist.append(arg_types);
-
-    if(ret[-1] != ' '):
-        ret = ret + ' ';
-
-    arched_arglist.append(arched_args); #!!!!!!!!!!!
-    my_arglist.append(my_args) #!!!!!!!!!!!!!!!!!
-    retlist.append(ret);
-    fcountlist.append(fcount);
-    taglist.append(tags);
+    def __repr__(self): # the printable form is just the name attribute
+        return self.name
 
+########################################################################
+# Get sets of LV_HAVE_* from the code
+########################################################################
+def extract_lv_haves(code):
+    """Gather the LV_HAVE_* names found on each preprocessor line of code.
+    Returns a list with one set of lower-cased suffixes per matching line,
+    in source order; '#' lines naming no LV_HAVE_* symbol are left out."""
+    directives = [ln for ln in code.splitlines() if ln.strip().startswith('#')]
+    matches = [re.findall('LV_HAVE_(\w+)', ln) for ln in directives]
+    return [set(map(str.lower, names)) for names in matches if names]
+
+########################################################################
+# Represent a processing kernel, parse from file
+########################################################################
 class kernel_class:
-    def __init__(self, index):
-        self.name = functions[index]
+    """A VOLK kernel and the implementations parsed from its header file."""
+    def __init__(self, kernel_file):
+        self.name = os.path.splitext(os.path.basename(kernel_file))[0]
         self.pname = self.name.replace('volk_', 'p_')
-        self.rettype = retlist[index]
-        self.arglist_defs = my_argtypelist[index]
-        self.arglist_namedefs = arched_arglist[index]
-        self.arglist_names = my_arglist[index]
-        self._tagdeps = fcountlist[index]
-        self._taglist = taglist[index]
-
-    def get_tags(self, archs):
-        def is_in(x): return x.lower() in archs
-        taglist = list()
-        tagdeps = list()
-        for i in range(len(self._tagdeps)):
-            if all(map(is_in, self._tagdeps[i])):
-                taglist.append(self._taglist[i])
-                tagdeps.append(self._tagdeps[i])
-        return taglist, tagdeps
+        kernel_fh = open(kernel_file, 'r')
+        try:
+            code = comment_remover(kernel_fh.read())
+        finally:
+            kernel_fh.close() # never leak the handle, even on a failed read
+        self._impls = list()
+        for header, section in split_into_nested_ifdef_sections(code):
+            if 'ifndef' not in header.lower(): continue
+            for sub_hdr, body in section:
+                if 'if' not in sub_hdr.lower(): continue
+                if 'LV_HAVE_' not in sub_hdr: continue
+                self._impls.append(impl_class(
+                    kern_name=self.name, header=sub_hdr, body=body,
+                ))
+        self.has_dispatcher = False
+        for impl in self._impls:
+            if impl.name == 'dispatcher':
+                self._impls.remove(impl)
+                self.has_dispatcher = True
+                break
+        assert self._impls, 'no non-dispatcher implementation in %s'%kernel_file
+        self.args = self._impls[0].args
+        self.arglist_types = ', '.join([a[0] for a in self.args])
+        self.arglist_full = ', '.join(['%s %s'%a for a in self.args])
+        self.arglist_names = ', '.join([a[1] for a in self.args])
+
+    def get_impls(self, archs):
+        # Keep only the implementations whose every dependency is in archs.
+        archs = set(archs)
+        return [impl for impl in self._impls if impl.deps.issubset(archs)]
 
     def __repr__(self):
         return self.name
 
-kernels = map(kernel_class, range(len(retlist)))
+########################################################################
+# Extract information from the VOLK kernels (sorted for a stable order)
+########################################################################
+__file__ = os.path.abspath(__file__)
+srcdir = os.path.dirname(os.path.dirname(__file__))
+kernel_files = sorted(glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h")))
+kernels = map(kernel_class, kernel_files)
 
 if __name__ == '__main__':
     print kernels
diff --git a/volk/gen/volk_machine_defs.py b/volk/gen/volk_machine_defs.py
index d1a8569818..7293d47462 100644
--- a/volk/gen/volk_machine_defs.py
+++ b/volk/gen/volk_machine_defs.py
@@ -30,10 +30,6 @@ class machine_class:
             arch = arch_dict[arch_name]
             self.archs.append(arch)
             self.arch_names.append(arch_name)
-            arch_name += '_u'
-            if arch.alignment > 1 and arch_dict.has_key(arch_name):
-                arch = arch_dict[arch_name]
-                self.archs.append(arch)
         self.alignment = max(map(lambda a: a.alignment, self.archs))
 
     def __repr__(self): return self.name
diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h
deleted file mode 100644
index 84548c8c50..0000000000
--- a/volk/include/volk/volk_16i_convert_8i_a.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef INCLUDED_volk_16i_convert_8i_a_H
-#define INCLUDED_volk_16i_convert_8i_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-*/
-static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-     int8_t* outputVectorPtr = outputVector;
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal1;
-    __m128i inputVal2;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-
-      // Load the 16 values
-      inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-      inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-
-      inputVal1 = _mm_srai_epi16(inputVal1, 8);
-      inputVal2 = _mm_srai_epi16(inputVal2, 8);
-
-      ret = _mm_packs_epi16(inputVal1, inputVal2);
-
-      _mm_store_si128((__m128i*)outputVectorPtr, ret);
-
-      outputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] =(int8_t)(inputVector[number] >> 8);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-*/
-static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16i_convert_8i_a_H */
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h
deleted file mode 100644
index 7108ff6590..0000000000
--- a/volk/include/volk/volk_16i_s32f_convert_32f_a.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
-#define INCLUDED_volk_16i_s32f_convert_32f_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int eighthPoints = num_points / 8;
-
-     float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal;
-    __m128i inputVal2;
-    __m128 ret;
-
-    for(;number < eighthPoints; number++){
-
-      // Load the 8 values
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      // Shift the input data to the right by 64 bits ( 8 bytes )
-      inputVal2 = _mm_srli_si128(inputVal, 8);
-
-      // Convert the lower 4 values into 32 bit words
-      inputVal = _mm_cvtepi16_epi32(inputVal);
-      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      ret = _mm_cvtepi32_ps(inputVal2);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-
-      inputPtr += 8;
-    }
-
-    number = eighthPoints * 8;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      inputPtr += 4;
-      outputVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h
deleted file mode 100644
index 8ef627a628..0000000000
--- a/volk/include/volk/volk_16u_byteswap_u.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef INCLUDED_volk_16u_byteswap_u_H
-#define INCLUDED_volk_16u_byteswap_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-
-/*!
-  \brief Byteswaps (in-place) an unaligned vector of int16_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int number = 0;
-  uint16_t* inputPtr = intsToSwap;
-  __m128i input, left, right, output;
-
-  const unsigned int eighthPoints = num_points / 8;
-  for(;number < eighthPoints; number++){
-    // Load the 16t values, increment inputPtr later since we're doing it in-place.
-    input = _mm_loadu_si128((__m128i*)inputPtr);
-    // Do the two shifts
-    left = _mm_slli_epi16(input, 8);
-    right = _mm_srli_epi16(input, 8);
-    // Or the left and right halves together
-    output = _mm_or_si128(left, right);
-    // Store the results
-    _mm_storeu_si128((__m128i*)inputPtr, output);
-    inputPtr += 8;
-  }
-
-  // Byteswap any remaining points:
-  number = eighthPoints*8;
-  for(; number < num_points; number++){
-    uint16_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Byteswaps (in-place) an unaligned vector of int16_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int point;
-  uint16_t* inputPtr = intsToSwap;
-  for(point = 0; point < num_points; point++){
-    uint16_t output = *inputPtr;
-    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
-    *inputPtr = output;
-    inputPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#endif /* INCLUDED_volk_16u_byteswap_u_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h
deleted file mode 100644
index 2c469ac421..0000000000
--- a/volk/include/volk/volk_32f_convert_64f_a.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INCLUDED_volk_32f_convert_64f_a_H
-#define INCLUDED_volk_32f_convert_64f_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the float values into double values
-    \param dVector The converted double vector values
-    \param fVector The float vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  double* outputVectorPtr = outputVector;
-  __m128d ret;
-  __m128 inputVal;
-
-  for(;number < quarterPoints; number++){
-    inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_store_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-
-    inputVal = _mm_movehl_ps(inputVal, inputVal);
-
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_store_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (double)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the float values into double values
-  \param dVector The converted double vector values
-  \param fVector The float vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){
-  double* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_convert_64f_a_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h
deleted file mode 100644
index 10d8a4f6c0..0000000000
--- a/volk/include/volk/volk_32f_convert_64f_u.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INCLUDED_volk_32f_convert_64f_u_H
-#define INCLUDED_volk_32f_convert_64f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the float values into double values
-    \param dVector The converted double vector values
-    \param fVector The float vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  double* outputVectorPtr = outputVector;
-  __m128d ret;
-  __m128 inputVal;
-
-  for(;number < quarterPoints; number++){
-    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_storeu_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-
-    inputVal = _mm_movehl_ps(inputVal, inputVal);
-
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_storeu_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (double)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the float values into double values
-  \param dVector The converted double vector values
-  \param fVector The float vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_32f_convert_64f_u_generic(double* outputVector, const float* inputVector, unsigned int num_points){
-  double* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_convert_64f_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
deleted file mode 100644
index 9df4946f24..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h
+++ /dev/null
@@ -1,150 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
-#define INCLUDED_volk_32f_s32f_convert_16i_a_H
-
-#include <volk/volk_common.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-
-  float min_val = -32768;
-  float max_val = 32767;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-  __m128 ret1, ret2;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    // Scale and clip
-    ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-
-    intInputVal1 = _mm_cvtps_epi32(ret1);
-    intInputVal2 = _mm_cvtps_epi32(ret2);
-
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int16_t)rintf(r);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-
-  float min_val = -32768;
-  float max_val = 32767;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    // Scale and clip
-    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int16_t)rintf(r);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  float min_val = -32768;
-  float max_val = 32767;
-  float r;
-
-  for(number = 0; number < num_points; number++){
-    r  = *inputVectorPtr++ * scalar;
-    if(r < min_val)
-      r = min_val;
-    else if(r > max_val)
-      r = max_val;
-    *outputVectorPtr++ = (int16_t)rintf(r);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
deleted file mode 100644
index ee15edb464..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h
+++ /dev/null
@@ -1,142 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
-#define INCLUDED_volk_32f_s32f_convert_32i_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-
-  float min_val = -2147483647;
-  float max_val = 2147483647;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1;
-  __m128i intInputVal1;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    intInputVal1 = _mm_cvtps_epi32(inputVal1);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int32_t)(r);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-
-  float min_val = -2147483647;
-  float max_val = 2147483647;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int32_t)(r);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int32_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  float min_val = -2147483647;
-  float max_val = 2147483647;
-  float r;
-
-  for(number = 0; number < num_points; number++){
-    r = *inputVectorPtr++ * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    *outputVectorPtr++ = (int32_t)(r);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h
deleted file mode 100644
index 800017d5da..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
-#define INCLUDED_volk_32f_s32f_convert_8i_a_H
-
-#include <volk/volk_common.h>
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-
-  float min_val = -128;
-  float max_val = 127;
-  float r;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
-    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
-    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
-    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
-
-    intInputVal1 = _mm_cvtps_epi32(inputVal1);
-    intInputVal2 = _mm_cvtps_epi32(inputVal2);
-    intInputVal3 = _mm_cvtps_epi32(inputVal3);
-    intInputVal4 = _mm_cvtps_epi32(inputVal4);
-
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
-    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int8_t)(r);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* inputVectorPtr = (const float*)inputVector;
-
-  float min_val = -128;
-  float max_val = 127;
-  float r;
-
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-  __m128 vmin_val = _mm_set_ps1(min_val);
-  __m128 vmax_val = _mm_set_ps1(max_val);
-
-  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    r = inputVector[number] * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    outputVector[number] = (int8_t)(r);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  float min_val = -128;
-  float max_val = 127;
-  float r;
-
-  for(number = 0; number < num_points; number++){
-    r = *inputVectorPtr++ * scalar;
-    if(r > max_val)
-      r = max_val;
-    else if(r < min_val)
-      r = min_val;
-    *outputVectorPtr++ = (int8_t)(r);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
deleted file mode 100644
index b3fae9b053..0000000000
--- a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
-#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Scalar float multiply
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param scalar the scalar value
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-
-    __m128 aVal, bVal, cVal;
-    bVal = _mm_set_ps1(scalar);
-    for(;number < quarterPoints; number++){
-
-      aVal = _mm_loadu_ps(aPtr);
-
-      cVal = _mm_mul_ps(aVal, bVal);
-
-      _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) * scalar;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-/*!
-  \brief Scalar float multiply
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param scalar the scalar value
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int eighthPoints = num_points / 8;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-
-    __m256 aVal, bVal, cVal;
-    bVal = _mm256_set1_ps(scalar);
-    for(;number < eighthPoints; number++){
-
-      aVal = _mm256_loadu_ps(aPtr);
-
-      cVal = _mm256_mul_ps(aVal, bVal);
-
-      _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 8;
-      cPtr += 8;
-    }
-
-    number = eighthPoints * 8;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) * scalar;
-    }
-}
-#endif /* LV_HAVE_AVX */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Scalar float multiply
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param scalar the scalar value
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const float* inputPtr = aVector;
-  float* outputPtr = cVector;
-  for(number = 0; number < num_points; number++){
-    *outputPtr = (*inputPtr) * scalar;
-    inputPtr++;
-    outputPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h
deleted file mode 100644
index 52e8286bc2..0000000000
--- a/volk/include/volk/volk_32f_x2_add_32f_u.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
-#define INCLUDED_volk_32f_x2_add_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-
-      aVal = _mm_loadu_ps(aPtr);
-      bVal = _mm_loadu_ps(bPtr);
-
-      cVal = _mm_add_ps(aVal, bVal);
-
-      _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) + (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) + (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
deleted file mode 100644
index 067c33ad89..0000000000
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ /dev/null
@@ -1,290 +0,0 @@
-#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
-#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
-
-#include <volk/volk_common.h>
-#include<stdio.h>
-
-
-#ifdef LV_HAVE_GENERIC
-
-
-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#ifdef LV_HAVE_SSE
-
-
-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_load_ps(aPtr);
-    a1Val = _mm_load_ps(aPtr+4);
-    a2Val = _mm_load_ps(aPtr+8);
-    a3Val = _mm_load_ps(aPtr+12);
-    b0Val = _mm_load_ps(bPtr);
-    b1Val = _mm_load_ps(bPtr+4);
-    b2Val = _mm_load_ps(bPtr+8);
-    b3Val = _mm_load_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 16;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-
-}
-
-#endif /*LV_HAVE_SSE*/
-
-#ifdef LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_load_ps(aPtr);
-    a1Val = _mm_load_ps(aPtr+4);
-    a2Val = _mm_load_ps(aPtr+8);
-    a3Val = _mm_load_ps(aPtr+12);
-    b0Val = _mm_load_ps(bPtr);
-    b1Val = _mm_load_ps(bPtr+4);
-    b2Val = _mm_load_ps(bPtr+8);
-    b3Val = _mm_load_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
-    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
-    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
-    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
-
-    aPtr += 16;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#ifdef LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal1, bVal1, cVal1;
-  __m128 aVal2, bVal2, cVal2;
-  __m128 aVal3, bVal3, cVal3;
-  __m128 aVal4, bVal4, cVal4;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-
-    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
-
-    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
-    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
-    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
-    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-
-    cVal1 = _mm_or_ps(cVal1, cVal2);
-    cVal3 = _mm_or_ps(cVal3, cVal4);
-    cVal1 = _mm_or_ps(cVal1, cVal3);
-
-    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
-  }
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#ifdef LV_HAVE_AVX
-
-#include <immintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m256 a0Val, a1Val;
-  __m256 b0Val, b1Val;
-  __m256 c0Val, c1Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm256_load_ps(aPtr);
-    a1Val = _mm256_load_ps(aPtr+8);
-    b0Val = _mm256_load_ps(bPtr);
-    b1Val = _mm256_load_ps(bPtr+8);
-
-    c0Val = _mm256_mul_ps(a0Val, b0Val);
-    c1Val = _mm256_mul_ps(a1Val, b1Val);
-
-    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
-
-    aPtr += 16;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-  dotProduct += dotProductVector[4];
-  dotProduct += dotProductVector[5];
-  dotProduct += dotProductVector[6];
-  dotProduct += dotProductVector[7];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-
-}
-
-#endif /*LV_HAVE_AVX*/
-
-#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
deleted file mode 100644
index b24e8b1f79..0000000000
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ /dev/null
@@ -1,290 +0,0 @@
-#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
-#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
-
-#include <volk/volk_common.h>
-#include<stdio.h>
-
-
-#ifdef LV_HAVE_GENERIC
-
-
-static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#ifdef LV_HAVE_SSE
-
-
-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_loadu_ps(aPtr);
-    a1Val = _mm_loadu_ps(aPtr+4);
-    a2Val = _mm_loadu_ps(aPtr+8);
-    a3Val = _mm_loadu_ps(aPtr+12);
-    b0Val = _mm_loadu_ps(bPtr);
-    b1Val = _mm_loadu_ps(bPtr+4);
-    b2Val = _mm_loadu_ps(bPtr+8);
-    b3Val = _mm_loadu_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
-    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
-    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
-
-    aPtr += 16;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-
-}
-
-#endif /*LV_HAVE_SSE*/
-
-#ifdef LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 a0Val, a1Val, a2Val, a3Val;
-  __m128 b0Val, b1Val, b2Val, b3Val;
-  __m128 c0Val, c1Val, c2Val, c3Val;
-
-  __m128 dotProdVal0 = _mm_setzero_ps();
-  __m128 dotProdVal1 = _mm_setzero_ps();
-  __m128 dotProdVal2 = _mm_setzero_ps();
-  __m128 dotProdVal3 = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm_loadu_ps(aPtr);
-    a1Val = _mm_loadu_ps(aPtr+4);
-    a2Val = _mm_loadu_ps(aPtr+8);
-    a3Val = _mm_loadu_ps(aPtr+12);
-    b0Val = _mm_loadu_ps(bPtr);
-    b1Val = _mm_loadu_ps(bPtr+4);
-    b2Val = _mm_loadu_ps(bPtr+8);
-    b3Val = _mm_loadu_ps(bPtr+12);
-
-    c0Val = _mm_mul_ps(a0Val, b0Val);
-    c1Val = _mm_mul_ps(a1Val, b1Val);
-    c2Val = _mm_mul_ps(a2Val, b2Val);
-    c3Val = _mm_mul_ps(a3Val, b3Val);
-
-    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
-    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
-    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
-    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
-
-    aPtr += 16;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
-  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#ifdef LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal1, bVal1, cVal1;
-  __m128 aVal2, bVal2, cVal2;
-  __m128 aVal3, bVal3, cVal3;
-  __m128 aVal4, bVal4, cVal4;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
-
-    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
-
-    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
-    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
-    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
-    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-
-    cVal1 = _mm_or_ps(cVal1, cVal2);
-    cVal3 = _mm_or_ps(cVal3, cVal4);
-    cVal1 = _mm_or_ps(cVal1, cVal3);
-
-    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
-  }
-
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#ifdef LV_HAVE_AVX
-
-#include <immintrin.h>
-
-static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m256 a0Val, a1Val;
-  __m256 b0Val, b1Val;
-  __m256 c0Val, c1Val;
-
-  __m256 dotProdVal0 = _mm256_setzero_ps();
-  __m256 dotProdVal1 = _mm256_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-
-    a0Val = _mm256_loadu_ps(aPtr);
-    a1Val = _mm256_loadu_ps(aPtr+8);
-    b0Val = _mm256_loadu_ps(bPtr);
-    b1Val = _mm256_loadu_ps(bPtr+8);
-
-    c0Val = _mm256_mul_ps(a0Val, b0Val);
-    c1Val = _mm256_mul_ps(a1Val, b1Val);
-
-    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
-    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
-
-    aPtr += 16;
-    bPtr += 16;
-  }
-
-  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
-
-  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
-
-  _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-  dotProduct += dotProductVector[4];
-  dotProduct += dotProductVector[5];
-  dotProduct += dotProductVector[6];
-  dotProduct += dotProductVector[7];
-
-  number = sixteenthPoints*16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-
-}
-
-#endif /*LV_HAVE_AVX*/
-
-#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h
deleted file mode 100644
index bfb896d602..0000000000
--- a/volk/include/volk/volk_32f_x2_multiply_32f_u.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
-#define INCLUDED_volk_32f_x2_multiply_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Multiplys the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-
-      aVal = _mm_loadu_ps(aPtr);
-      bVal = _mm_loadu_ps(bPtr);
-
-      cVal = _mm_mul_ps(aVal, bVal);
-
-      _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-/*!
-  \brief Multiplies the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int eighthPoints = num_points / 8;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m256 aVal, bVal, cVal;
-    for(;number < eighthPoints; number++){
-
-      aVal = _mm256_loadu_ps(aPtr);
-      bVal = _mm256_loadu_ps(bPtr);
-
-      cVal = _mm256_mul_ps(aVal, bVal);
-
-      _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 8;
-      bPtr += 8;
-      cPtr += 8;
-    }
-
-    number = eighthPoints * 8;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_AVX */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Multiplys the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h
deleted file mode 100644
index e0d79ea7bc..0000000000
--- a/volk/include/volk/volk_32fc_conjugate_32fc_u.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
-#define INCLUDED_volk_32fc_conjugate_32fc_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Takes the conjugate of a complex vector.
-    \param cVector The vector where the results will be stored
-    \param aVector Vector to be conjugated
-    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
-  */
-static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    __m128 x;
-    lv_32fc_t* c = cVector;
-    const lv_32fc_t* a = aVector;
-
-    __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
-
-    for(;number < halfPoints; number++){
-
-      x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
-
-      x = _mm_xor_ps(x, conjugator); // conjugate register
-
-      _mm_storeu_ps((float*)c,x); // Store the results back into the C container
-
-      a += 2;
-      c += 2;
-    }
-
-    if((num_points % 2) != 0) {
-      *c = lv_conj(*a);
-    }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Takes the conjugate of a complex vector.
-    \param cVector The vector where the results will be stored
-    \param aVector Vector to be conjugated
-    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
-  */
-static inline void volk_32fc_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
-    lv_32fc_t* cPtr = cVector;
-    const lv_32fc_t* aPtr = aVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = lv_conj(*aPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h
deleted file mode 100644
index 77566e671d..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
-#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    double* iBufferPtr = iBuffer;
-    double* qBufferPtr = qBuffer;
-
-    const unsigned int halfPoints = num_points / 2;
-    __m128 cplxValue, fVal;
-    __m128d dVal;
-
-    for(;number < halfPoints; number++){
-
-      cplxValue = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      // Arrange in i1i2i1i2 format
-      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
-      dVal = _mm_cvtps_pd(fVal);
-      _mm_store_pd(iBufferPtr, dVal);
-
-      // Arrange in q1q2q1q2 format
-      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
-      dVal = _mm_cvtps_pd(fVal);
-      _mm_store_pd(qBufferPtr, dVal);
-
-      iBufferPtr += 2;
-      qBufferPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(; number < num_points; number++){
-      *iBufferPtr++ = *complexVectorPtr++;
-      *qBufferPtr++ = *complexVectorPtr++;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const float* complexVectorPtr = (float*)complexVector;
-  double* iBufferPtr = iBuffer;
-  double* qBufferPtr = qBuffer;
-
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    *qBufferPtr++ = (double)*complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_u.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_u.h
deleted file mode 100644
index feed54be8c..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_u.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
-#define INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    double* iBufferPtr = iBuffer;
-    double* qBufferPtr = qBuffer;
-
-    const unsigned int halfPoints = num_points / 2;
-    __m128 cplxValue, fVal;
-    __m128d dVal;
-
-    for(;number < halfPoints; number++){
-
-      cplxValue = _mm_loadu_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      // Arrange in i1i2i1i2 format
-      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
-      dVal = _mm_cvtps_pd(fVal);
-      _mm_storeu_pd(iBufferPtr, dVal);
-
-      // Arrange in q1q2q1q2 format
-      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
-      dVal = _mm_cvtps_pd(fVal);
-      _mm_storeu_pd(qBufferPtr, dVal);
-
-      iBufferPtr += 2;
-      qBufferPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(; number < num_points; number++){
-      *iBufferPtr++ = *complexVectorPtr++;
-      *qBufferPtr++ = *complexVectorPtr++;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_x2_u_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const float* complexVectorPtr = (float*)complexVector;
-  double* iBufferPtr = iBuffer;
-  double* qBufferPtr = qBuffer;
-
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    *qBufferPtr++ = (double)*complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_u_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h
deleted file mode 100644
index c8b3f0a088..0000000000
--- a/volk/include/volk/volk_32fc_magnitude_32f_u.h
+++ /dev/null
@@ -1,118 +0,0 @@
-#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
-#define INCLUDED_volk_32fc_magnitude_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    float* magnitudeVectorPtr = magnitudeVector;
-
-    __m128 cplxValue1, cplxValue2, result;
-    for(;number < quarterPoints; number++){
-      cplxValue1 = _mm_loadu_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue2 = _mm_loadu_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-      result = _mm_sqrt_ps(result);
-
-      _mm_storeu_ps(magnitudeVectorPtr, result);
-      magnitudeVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      float val1Real = *complexVectorPtr++;
-      float val1Imag = *complexVectorPtr++;
-      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-    }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    float* magnitudeVectorPtr = magnitudeVector;
-
-    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-    for(;number < quarterPoints; number++){
-      cplxValue1 = _mm_loadu_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue2 = _mm_loadu_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      // Arrange in i1i2i3i4 format
-      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-      // Arrange in q1q2q3q4 format
-      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
-      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
-      result = _mm_sqrt_ps(result);
-
-      _mm_storeu_ps(magnitudeVectorPtr, result);
-      magnitudeVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-       float val1Real = *complexVectorPtr++;
-       float val1Imag = *complexVectorPtr++;
-      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
deleted file mode 100644
index d3ac9717a8..0000000000
--- a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
-#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    float* magnitudeVectorPtr = magnitudeVector;
-
-    __m128 cplxValue1, cplxValue2, result;
-    for(;number < quarterPoints; number++){
-      cplxValue1 = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue2 = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-      _mm_store_ps(magnitudeVectorPtr, result);
-      magnitudeVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      float val1Real = *complexVectorPtr++;
-      float val1Imag = *complexVectorPtr++;
-      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-    }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    float* magnitudeVectorPtr = magnitudeVector;
-
-    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-    for(;number < quarterPoints; number++){
-      cplxValue1 = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue2 = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      // Arrange in i1i2i3i4 format
-      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-      // Arrange in q1q2q3q4 format
-      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
-      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
-      _mm_store_ps(magnitudeVectorPtr, result);
-      magnitudeVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-       float val1Real = *complexVectorPtr++;
-       float val1Imag = *complexVectorPtr++;
-      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (real*real) + (imag*imag);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
deleted file mode 100644
index 53a4e68eb4..0000000000
--- a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
-#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    float* magnitudeVectorPtr = magnitudeVector;
-
-    __m128 cplxValue1, cplxValue2, result;
-    for(;number < quarterPoints; number++){
-      cplxValue1 = _mm_loadu_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue2 = _mm_loadu_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-      _mm_storeu_ps(magnitudeVectorPtr, result);
-      magnitudeVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      float val1Real = *complexVectorPtr++;
-      float val1Imag = *complexVectorPtr++;
-      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-    }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#ifdef LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    float* magnitudeVectorPtr = magnitudeVector;
-
-    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-    for(;number < quarterPoints; number++){
-      cplxValue1 = _mm_loadu_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue2 = _mm_loadu_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      // Arrange in i1i2i3i4 format
-      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-      // Arrange in q1q2q3q4 format
-      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
-      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
-      _mm_storeu_ps(magnitudeVectorPtr, result);
-      magnitudeVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-       float val1Real = *complexVectorPtr++;
-       float val1Imag = *complexVectorPtr++;
-      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_squared_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (real*real) + (imag*imag);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
deleted file mode 100644
index 5c7d15b02f..0000000000
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
-#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
-  \brief Multiplies the input vector by a scalar and stores the results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be multiplied
-  \param scalar The complex scalar to multiply aVector
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
-  unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    __m128 x, yl, yh, z, tmp1, tmp2;
-    lv_32fc_t* c = cVector;
-    const lv_32fc_t* a = aVector;
-
-    // Set up constant scalar vector
-    yl = _mm_set_ps1(lv_creal(scalar));
-    yh = _mm_set_ps1(lv_cimag(scalar));
-
-    for(;number < halfPoints; number++){
-
-      x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-
-      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
-      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
-      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
-      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
-      _mm_storeu_ps((float*)c,z); // Store the results back into the C container
-
-      a += 2;
-      c += 2;
-    }
-
-    if((num_points % 2) != 0) {
-      *c = (*a) * scalar;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Multiplies the input vector by a scalar and stores the results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be multiplied
-  \param scalar The complex scalar to multiply aVector
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
-    lv_32fc_t* cPtr = cVector;
-    const lv_32fc_t* aPtr = aVector;
-    unsigned int number = num_points;
-
-    // unwrap loop
-    while (number >= 8){
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      *cPtr++ = (*aPtr++) * scalar;
-      number -= 8;
-    }
-
-    // clean up any remaining
-    while (number-- > 0)
-      *cPtr++ = *aPtr++ * scalar;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
deleted file mode 100644
index 5b16b8639a..0000000000
--- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h
+++ /dev/null
@@ -1,149 +0,0 @@
-#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
-#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
-
-
-#include<volk/volk_complex.h>
-
-
-#ifdef LV_HAVE_GENERIC
-
-
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
-  const unsigned int num_bytes = num_points*8;
-
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
-  unsigned int isodd = (num_bytes >> 3) &1;
-
-
-
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  unsigned int i = 0;
-
-
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-
-    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
-    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
-    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
-
-
-    in += 4;
-    tp += 4;
-
-  }
-
-
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-
-
-
-  for(i = 0; i < isodd; ++i) {
-
-
-    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
-
-  }
-  /*
-  for(i = 0; i < num_bytes >> 3; ++i) {
-    *result += input[i] * conjf(taps[i]);
-  }
-  */
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-#ifdef LV_HAVE_SSE3
-
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-#include <mmintrin.h>
-
-
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
-  unsigned int num_bytes = num_points*8;
-
-  // Variable never used?
-  //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-
-  union HalfMask {
-    uint32_t intRep[4];
-    __m128 vec;
-    } halfMask;
-
-  union NegMask {
-    int intRep[4];
-    __m128 vec;
-  } negMask;
-
-  unsigned int offset = 0;
-  float Rsum=0, Isum=0;
-  float Im,Re;
-
-  __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
-  __m128 zv = {0,0,0,0};
-
-  halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
-  halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
-
-  negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
-  negMask.intRep[1] = negMask.intRep[3] = 0;
-
-  // main loop
-  while(num_bytes >= 4*sizeof(float)){
-
-    in1 = _mm_loadu_ps( (float*) (input+offset) );
-    in2 = _mm_loadu_ps( (float*) (taps+offset) );
-    Rv = _mm_mul_ps(in1, in2);
-    fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
-    Iv = _mm_mul_ps(in1, fehg);
-    Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
-    Ivm = _mm_xor_ps( negMask.vec, Iv );
-    Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
-    _mm_store_ss( &Im, Is );
-    _mm_store_ss( &Re, Rs );
-    num_bytes -= 4*sizeof(float);
-    offset += 2;
-    Rsum += Re;
-    Isum += Im;
-  }
-
-  // handle the last complex case ...
-  if(num_bytes > 0){
-
-    if(num_bytes != 4){
-      // bad things are happening
-    }
-
-    in1 = _mm_loadu_ps( (float*) (input+offset) );
-    in2 = _mm_loadu_ps( (float*) (taps+offset) );
-    Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
-    fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
-    Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
-    Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
-    Ivm = _mm_xor_ps( negMask.vec, Iv );
-    Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
-    _mm_store_ss( &Im, Is );
-    _mm_store_ss( &Re, Rs );
-    Rsum += Re;
-    Isum += Im;
-  }
-
-  result[0] = lv_cmake(Rsum,Isum);
-  return;
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-
-#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
-
-
-
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h
deleted file mode 100644
index 7c0dba7fd8..0000000000
--- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h
+++ /dev/null
@@ -1,116 +0,0 @@
-#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
-#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
-
-#include <volk/volk_common.h>
-#include <volk/volk_complex.h>
-#include <stdio.h>
-#include <string.h>
-
-
-#ifdef LV_HAVE_GENERIC
-
-
-static inline void volk_32fc_x2_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_points/2;
-  unsigned int isodd = num_points &1;
-
-
-
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  unsigned int i = 0;
-
-
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-
-
-    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
-    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
-    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
-
-
-    in += 4;
-    tp += 4;
-
-  }
-
-
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-
-
-
-  for(i = 0; i < isodd; ++i) {
-
-
-    *result += input[num_points - 1] * taps[num_points - 1];
-
-  }
-
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-#ifdef LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
-
-
-  lv_32fc_t dotProduct;
-  memset(&dotProduct, 0x0, 2*sizeof(float));
-
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_points/2;
-
-  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
-
-  const lv_32fc_t* a = input;
-  const lv_32fc_t* b = taps;
-
-  dotProdVal = _mm_setzero_ps();
-
-  for(;number < halfPoints; number++){
-
-    x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
-    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
-    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
-    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
-    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
-    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
-    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
-
-    a += 2;
-    b += 2;
-  }
-
-  __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
-
-  _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
-
-  if(num_points % 1 != 0) {
-    dotProduct += (*a) * (*b);
-  }
-
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
deleted file mode 100644
index a998d6184e..0000000000
--- a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
-#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    __m128 x, y, yl, yh, z, tmp1, tmp2;
-    lv_32fc_t* c = cVector;
-    const lv_32fc_t* a = aVector;
-    const lv_32fc_t* b = bVector;
-
-    for(;number < halfPoints; number++){
-
-      x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-      y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
-      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
-      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
-      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
-      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
-      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
-      _mm_storeu_ps((float*)c,z); // Store the results back into the C container
-
-      a += 2;
-      b += 2;
-      c += 2;
-    }
-
-    if((num_points % 2) != 0) {
-      *c = (*a) * (*b);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_x2_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-    lv_32fc_t* cPtr = cVector;
-    const lv_32fc_t* aPtr = aVector;
-    const lv_32fc_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
deleted file mode 100644
index 2755192e96..0000000000
--- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
-#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector First vector to be multiplied
-    \param bVector Second vector that is conjugated before being multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    __m128 x, y, yl, yh, z, tmp1, tmp2;
-    lv_32fc_t* c = cVector;
-    const lv_32fc_t* a = aVector;
-    const lv_32fc_t* b = bVector;
-
-    __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
-
-    for(;number < halfPoints; number++){
-
-      x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-      y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
-      y = _mm_xor_ps(y, conjugator); // conjugate y
-
-      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
-      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
-      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
-      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
-      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
-      _mm_store_ps((float*)c,z); // Store the results back into the C container
-
-      a += 2;
-      b += 2;
-      c += 2;
-    }
-
-    if((num_points % 2) != 0) {
-      *c = (*a) * lv_conj(*b);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector First vector to be multiplied
-    \param bVector Second vector that is conjugated before being multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-    lv_32fc_t* cPtr = cVector;
-    const lv_32fc_t* aPtr = aVector;
-    const lv_32fc_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
deleted file mode 100644
index 09dcd635b9..0000000000
--- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
-#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector First vector to be multiplied
-    \param bVector Second vector that is conjugated before being multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    __m128 x, y, yl, yh, z, tmp1, tmp2;
-    lv_32fc_t* c = cVector;
-    const lv_32fc_t* a = aVector;
-    const lv_32fc_t* b = bVector;
-
-    __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
-
-    for(;number < halfPoints; number++){
-
-      x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-      y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-
-      y = _mm_xor_ps(y, conjugator); // conjugate y
-
-      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-
-      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
-      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-
-      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
-      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
-      _mm_storeu_ps((float*)c,z); // Store the results back into the C container
-
-      a += 2;
-      b += 2;
-      c += 2;
-    }
-
-    if((num_points % 2) != 0) {
-      *c = (*a) * lv_conj(*b);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector First vector to be multiplied
-    \param bVector Second vector that is conjugated before being multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-    lv_32fc_t* cPtr = cVector;
-    const lv_32fc_t* aPtr = aVector;
-    const lv_32fc_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h
deleted file mode 100644
index 8f4123d719..0000000000
--- a/volk/include/volk/volk_32i_s32f_convert_32f_a.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
-#define INCLUDED_volk_32i_s32f_convert_32f_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-     float* outputVectorPtr = outputVector;
-     const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    int32_t* inputPtr = (int32_t*)inputVector;
-    __m128i inputVal;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-
-      // Load the 4 values
-      inputVal = _mm_load_si128((__m128i*)inputPtr);
-
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-
-      _mm_store_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-      inputPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int32_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h
deleted file mode 100644
index b3a8ab2015..0000000000
--- a/volk/include/volk/volk_32i_s32f_convert_32f_u.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
-#define INCLUDED_volk_32i_s32f_convert_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-     float* outputVectorPtr = outputVector;
-     const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    int32_t* inputPtr = (int32_t*)inputVector;
-    __m128i inputVal;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-
-      // Load the 4 values
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-      inputPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_32i_s32f_convert_32f_u_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int32_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_32u_byteswap_u.h b/volk/include/volk/volk_32u_byteswap_u.h
deleted file mode 100644
index e27d1f03dd..0000000000
--- a/volk/include/volk/volk_32u_byteswap_u.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_volk_32u_byteswap_u_H
-#define INCLUDED_volk_32u_byteswap_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int32_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
-  unsigned int number = 0;
-
-  uint32_t* inputPtr = intsToSwap;
-  __m128i input, byte1, byte2, byte3, byte4, output;
-  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
-  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    input = _mm_loadu_si128((__m128i*)inputPtr);
-    // Do the four shifts
-    byte1 = _mm_slli_epi32(input, 24);
-    byte2 = _mm_slli_epi32(input, 8);
-    byte3 = _mm_srli_epi32(input, 8);
-    byte4 = _mm_srli_epi32(input, 24);
-    // Or bytes together
-    output = _mm_or_si128(byte1, byte4);
-    byte2 = _mm_and_si128(byte2, byte2mask);
-    output = _mm_or_si128(output, byte2);
-    byte3 = _mm_and_si128(byte3, byte3mask);
-    output = _mm_or_si128(output, byte3);
-    // Store the results
-    _mm_storeu_si128((__m128i*)inputPtr, output);
-    inputPtr += 4;
-  }
-
-  // Byteswap any remaining points:
-  number = quarterPoints*4;
-  for(; number < num_points; number++){
-    uint32_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int32_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_32u_byteswap_u_generic(uint32_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = intsToSwap;
-
-  unsigned int point;
-  for(point = 0; point < num_points; point++){
-    uint32_t output = *inputPtr;
-    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
-
-    *inputPtr = output;
-    inputPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32u_byteswap_u_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h
deleted file mode 100644
index 11d51702bc..0000000000
--- a/volk/include/volk/volk_64f_convert_32f_a.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_volk_64f_convert_32f_a_H
-#define INCLUDED_volk_64f_convert_32f_a_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the double values into float values
-    \param dVector The converted float vector values
-    \param fVector The double vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m128 ret, ret2;
-  __m128d inputVal1, inputVal2;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
-    inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
-
-    ret = _mm_cvtpd_ps(inputVal1);
-    ret2 = _mm_cvtpd_ps(inputVal2);
-
-    ret = _mm_movelh_ps(ret, ret2);
-
-    _mm_store_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the double values into float values
-  \param dVector The converted float vector values
-  \param fVector The double vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const double* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_64f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h
deleted file mode 100644
index 31dc5b5fe9..0000000000
--- a/volk/include/volk/volk_64f_convert_32f_u.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_volk_64f_convert_32f_u_H
-#define INCLUDED_volk_64f_convert_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the double values into float values
-    \param dVector The converted float vector values
-    \param fVector The double vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m128 ret, ret2;
-  __m128d inputVal1, inputVal2;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
-    inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
-
-    ret = _mm_cvtpd_ps(inputVal1);
-    ret2 = _mm_cvtpd_ps(inputVal2);
-
-    ret = _mm_movelh_ps(ret, ret2);
-
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the double values into float values
-  \param dVector The converted float vector values
-  \param fVector The double vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_64f_convert_32f_u_generic(float* outputVector, const double* inputVector, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const double* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_64f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_64u_byteswap_u.h b/volk/include/volk/volk_64u_byteswap_u.h
deleted file mode 100644
index 41a4a3130f..0000000000
--- a/volk/include/volk/volk_64u_byteswap_u.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef INCLUDED_volk_64u_byteswap_u_H
-#define INCLUDED_volk_64u_byteswap_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int64_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
-    uint32_t* inputPtr = (uint32_t*)intsToSwap;
-    __m128i input, byte1, byte2, byte3, byte4, output;
-    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
-    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-    uint64_t number = 0;
-    const unsigned int halfPoints = num_points / 2;
-    for(;number < halfPoints; number++){
-      // Load the 32t values, increment inputPtr later since we're doing it in-place.
-      input = _mm_loadu_si128((__m128i*)inputPtr);
-
-      // Do the four shifts
-      byte1 = _mm_slli_epi32(input, 24);
-      byte2 = _mm_slli_epi32(input, 8);
-      byte3 = _mm_srli_epi32(input, 8);
-      byte4 = _mm_srli_epi32(input, 24);
-      // Or bytes together
-      output = _mm_or_si128(byte1, byte4);
-      byte2 = _mm_and_si128(byte2, byte2mask);
-      output = _mm_or_si128(output, byte2);
-      byte3 = _mm_and_si128(byte3, byte3mask);
-      output = _mm_or_si128(output, byte3);
-
-      // Reorder the two words
-      output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
-
-      // Store the results
-      _mm_storeu_si128((__m128i*)inputPtr, output);
-      inputPtr += 4;
-    }
-
-    // Byteswap any remaining points:
-    number = halfPoints*2;
-    for(; number < num_points; number++){
-      uint32_t output1 = *inputPtr;
-      uint32_t output2 = inputPtr[1];
-
-      output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
-
-      output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
-
-      *inputPtr++ = output2;
-      *inputPtr++ = output1;
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int64_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_64u_byteswap_u_generic(uint64_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
-  unsigned int point;
-  for(point = 0; point < num_points; point++){
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 = inputPtr[1];
-
-    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
-
-    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
-
-    *inputPtr++ = output2;
-    *inputPtr++ = output1;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_64u_byteswap_u_H */
diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h
deleted file mode 100644
index 7d7104f52b..0000000000
--- a/volk/include/volk/volk_8i_convert_16i_u.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef INCLUDED_volk_8i_convert_16i_u_H
-#define INCLUDED_volk_8i_convert_16i_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-    \note Input and output buffers do NOT need to be properly aligned
-  */
-static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-    __m128i* outputVectorPtr = (__m128i*)outputVector;
-    __m128i inputVal;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_loadu_si128(inputVectorPtr);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_storeu_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVal = _mm_srli_si128(inputVal, 8);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_storeu_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVectorPtr++;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (int16_t)(inputVector[number])*256;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-    \note Input and output buffers do NOT need to be properly aligned
-  */
-static inline void volk_8i_convert_16i_u_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h
deleted file mode 100644
index 8bb2c0d1a4..0000000000
--- a/volk/include/volk/volk_8i_s32f_convert_32f_u.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
-#define INCLUDED_volk_8i_s32f_convert_32f_u_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    float* outputVectorPtr = outputVector;
-    const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1( iScalar );
-    const int8_t* inputVectorPtr = inputVector;
-    __m128 ret;
-    __m128i inputVal;
-    __m128i interimVal;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
-
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_8i_s32f_convert_32f_u_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_prefs.h b/volk/include/volk/volk_prefs.h
index 83d9baf89d..690e5f99f6 100644
--- a/volk/include/volk/volk_prefs.h
+++ b/volk/include/volk/volk_prefs.h
@@ -2,23 +2,26 @@
 #define INCLUDED_VOLK_PREFS_H
 
 #include <volk/volk_common.h>
+#include <stdlib.h>
 
 __VOLK_DECL_BEGIN
 
-struct volk_arch_pref {
-    char name[128];
-    char arch[32];
-};
+typedef struct volk_arch_pref
+{
+    char name[128];   // name of the kernel this preference entry applies to
+    char impl_a[128]; // name of the best aligned implementation for that kernel
+    char impl_u[128]; // name of the best unaligned implementation for that kernel
+} volk_arch_pref_t;
 
 ////////////////////////////////////////////////////////////////////////
 // get path to volk_config profiling info
 ////////////////////////////////////////////////////////////////////////
-VOLK_API void get_config_path(char *);
+VOLK_API void volk_get_config_path(char *);
 
 ////////////////////////////////////////////////////////////////////////
 // load prefs into global prefs struct
 ////////////////////////////////////////////////////////////////////////
-VOLK_API int load_preferences(struct volk_arch_pref **);
+VOLK_API size_t volk_load_preferences(volk_arch_pref_t **);
 
 __VOLK_DECL_END
 
diff --git a/volk/kernels/README.txt b/volk/kernels/README.txt
new file mode 100644
index 0000000000..5dd7434b54
--- /dev/null
+++ b/volk/kernels/README.txt
@@ -0,0 +1,67 @@
+########################################################################
+# How to create custom kernel dispatchers
+########################################################################
+A kernel dispatcher is a kernel implementation that calls other kernel implementations.
+By default, a dispatcher is generated by the build system for every kernel such that:
+  * the best aligned implementation is called when all pointer arguments are aligned,
+  * and otherwise the best unaligned implementation is called.
+
+The author of a VOLK kernel may create a custom dispatcher,
+to be called in place of the automatically generated one.
+A custom dispatcher may be useful to handle head and tail cases,
+or to implement different alignment and bounds checking logic.
+
+########################################################################
+# Code for an example dispatcher w/ tail case
+########################################################################
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_DISPATCHER
+
+static inline void volk_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+{
+    const unsigned int num_points_r = num_points%4;
+    const unsigned int num_points_x = num_points - num_points_r;
+
+    if (volk_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector))))
+    {
+        volk_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x);
+    }
+    else
+    {
+        volk_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x);
+    }
+
+    volk_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r);
+}
+
+#endif //LV_HAVE_DISPATCHER
+
+########################################################################
+# Code for an example dispatcher w/ tail case and accumulator
+########################################################################
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_DISPATCHER
+
+static inline void volk_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points)
+{
+    const unsigned int num_points_r = num_points%16;
+    const unsigned int num_points_x = num_points - num_points_r;
+
+    if (volk_is_aligned(VOLK_OR_PTR(input, taps)))
+    {
+        volk_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x);
+    }
+    else
+    {
+        volk_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x);
+    }
+
+    float result_tail = 0;
+    volk_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r);
+
+    *result += result_tail;
+}
+
+#endif //LV_HAVE_DISPATCHER
diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
index 1f6554af8b..8bc1569f61 100644
--- a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
+++ b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
@@ -8,7 +8,7 @@
 #ifdef LV_HAVE_GENERIC
 
 
-static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
 
   static const int N_UNROLL = 4;
 
diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/kernels/volk/volk_16i_branch_4_state_8.h
index 6338fbdd17..cdfbc7ba13 100644
--- a/volk/include/volk/volk_16i_branch_4_state_8_a.h
+++ b/volk/kernels/volk/volk_16i_branch_4_state_8.h
@@ -138,7 +138,7 @@ static inline  void volk_16i_branch_4_state_8_a_ssse3(short* target,  short* src
 #endif /*LV_HAVE_SSEs*/
 
 #ifdef LV_HAVE_GENERIC
-static inline  void volk_16i_branch_4_state_8_a_generic(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+static inline  void volk_16i_branch_4_state_8_generic(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
 	int i = 0;
 
 	int bound = 4;
diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/kernels/volk/volk_16i_convert_8i.h
index 80608a1412..3789b2e4ab 100644
--- a/volk/include/volk/volk_16i_convert_8i_u.h
+++ b/volk/kernels/volk/volk_16i_convert_8i.h
@@ -54,7 +54,7 @@ static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_
   \param num_points The number of data values to be converted
   \note Input and output buffers do NOT need to be properly aligned
 */
-static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+static inline void volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
   int8_t* outputVectorPtr = outputVector;
   const int16_t* inputVectorPtr = inputVector;
   unsigned int number = 0;
@@ -69,3 +69,72 @@ static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int
 
 
 #endif /* INCLUDED_volk_16i_convert_8i_u_H */
+#ifndef INCLUDED_volk_16i_convert_8i_a_H
+#define INCLUDED_volk_16i_convert_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data by shifting each value right by 8 bits (SSE2 version)
+  \param inputVector The 16 bit input data buffer (read with _mm_load_si128, which requires 16-byte alignment)
+  \param outputVector The 8 bit output data buffer (written with _mm_store_si128, which requires 16-byte alignment)
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+     int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal1;
+    __m128i inputVal2;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+
+      // Load 16 input values as two 128-bit registers of 8 values each
+      inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+      inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+
+      inputVal1 = _mm_srai_epi16(inputVal1, 8);
+      inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+      ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+      _mm_store_si128((__m128i*)outputVectorPtr, ret);
+
+      outputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] =(int8_t)(inputVector[number] >> 8);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data by shifting each value right by 8 bits (portable C version)
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_a_H */
diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/kernels/volk/volk_16i_max_star_16i.h
index ca81cf0d62..c67351c5fa 100644
--- a/volk/include/volk/volk_16i_max_star_16i_a.h
+++ b/volk/kernels/volk/volk_16i_max_star_16i.h
@@ -87,7 +87,7 @@ static inline  void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, un
 
 #ifdef LV_HAVE_GENERIC
 
-static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, unsigned int num_points) {
+static inline void volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) {
 
 	const unsigned int num_bytes = num_points*2;
 
diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h
index 13c235bc0b..ef88ec094f 100644
--- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h
+++ b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h
@@ -112,7 +112,7 @@ static inline  void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in
 
 
 #ifdef LV_HAVE_GENERIC
-static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_points) {
+static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) {
 
 	const unsigned int num_bytes = num_points*2;
 
diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/kernels/volk/volk_16i_permute_and_scalar_add.h
index d91b36208a..7a01d172a3 100644
--- a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h
+++ b/volk/kernels/volk/volk_16i_permute_and_scalar_add.h
@@ -118,7 +118,7 @@ static inline  void volk_16i_permute_and_scalar_add_a_sse2(short* target,  short
 
 
 #ifdef LV_HAVE_GENERIC
-static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) {
+static inline void volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) {
 
 	const unsigned int num_bytes = num_points*2;
 
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
index 4ce8e8f35b..a810a601a0 100644
--- a/volk/include/volk/volk_16i_s32f_convert_32f_u.h
+++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -105,7 +105,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const in
     \param num_points The number of data values to be converted
     \note Output buffer does NOT need to be properly aligned
   */
-static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
   float* outputVectorPtr = outputVector;
   const int16_t* inputVectorPtr = inputVector;
   unsigned int number = 0;
@@ -120,3 +120,122 @@ static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, cons
 
 
 #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
+#define INCLUDED_volk_16i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value each output point is divided by (the vector loop multiplies by 1.0/scalar instead; the tail loop divides directly)
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+     float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m128i inputVal2;
+    __m128 ret;
+
+    for(;number < eighthPoints; number++){
+
+      // Load the 8 16-bit input values (unaligned load, despite the _a_ name)
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      // Shift a copy of the input right by 64 bits ( 8 bytes ) so the upper 4 values land in the low half
+      inputVal2 = _mm_srli_si128(inputVal, 8);
+
+      // Sign-extend the lower 4 16-bit values of each register into 32 bit words
+      inputVal = _mm_cvtepi16_epi32(inputVal);
+      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      ret = _mm_cvtepi32_ps(inputVal2);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+
+      inputPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value each output point is divided by (the vector loop multiplies by 1.0/scalar instead; the tail loop divides directly)
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      inputPtr += 4;
+      outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value each converted point is divided by
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h
index 18b2e3d845..56b2cc07ab 100644
--- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h
+++ b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h
@@ -167,7 +167,7 @@ static inline  void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s
 
 
 #ifdef LV_HAVE_GENERIC
-static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
+static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
 
 	const unsigned int num_bytes = num_points*2;
 
diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
index 677cb40e9f..9b6d19fd66 100644
--- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h
+++ b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
@@ -115,7 +115,7 @@ static inline  void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta
 
 #ifdef LV_HAVE_GENERIC
 
-static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
+static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
 
 	const unsigned int num_bytes = num_points*2;
 
diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h
index f8aa30874f..9ce8012640 100644
--- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h
+++ b/volk/kernels/volk/volk_16ic_deinterleave_16i_x2.h
@@ -128,7 +128,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_
   \param qBuffer The Q buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
   const int16_t* complexVectorPtr = (const int16_t*)complexVector;
   int16_t* iBufferPtr = iBuffer;
   int16_t* qBufferPtr = qBuffer;
@@ -149,7 +149,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int
   \param num_points The number of complex data values to be deinterleaved
 */
 extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void volk_16ic_deinterleave_16i_x2_a_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
     volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h
index bac1f2e4b0..f6eccd77ee 100644
--- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h
+++ b/volk/kernels/volk/volk_16ic_deinterleave_real_16i.h
@@ -103,7 +103,7 @@ static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, cons
   \param iBuffer The I buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_16ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
   unsigned int number = 0;
   const int16_t* complexVectorPtr = (int16_t*)complexVector;
   int16_t* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h b/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h
index cd2fabb521..f3d0c83524 100644
--- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a.h
+++ b/volk/kernels/volk/volk_16ic_deinterleave_real_8i.h
@@ -66,7 +66,7 @@ static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const
   \param iBuffer The I buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
   unsigned int number = 0;
   int16_t* complexVectorPtr = (int16_t*)complexVector;
   int8_t* iBufferPtr = iBuffer;
@@ -85,7 +85,7 @@ static inline void volk_16ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, con
   \param num_points The number of complex data values to be deinterleaved
 */
 extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void volk_16ic_deinterleave_real_8i_a_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
     volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/kernels/volk/volk_16ic_magnitude_16i.h
index 317075e85e..b33306a123 100644
--- a/volk/include/volk/volk_16ic_magnitude_16i_a.h
+++ b/volk/kernels/volk/volk_16ic_magnitude_16i.h
@@ -161,7 +161,7 @@ static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const
   \param magnitudeVector The vector containing the real output values
   \param num_points The number of complex values in complexVector to be calculated and stored into cVector
 */
-static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
   const int16_t* complexVectorPtr = (const int16_t*)complexVector;
   int16_t* magnitudeVectorPtr = magnitudeVector;
   unsigned int number = 0;
@@ -182,7 +182,7 @@ static inline void volk_16ic_magnitude_16i_a_generic(int16_t* magnitudeVector, c
   \param num_points The number of complex values in complexVector to be calculated and stored into cVector
 */
 extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
-static inline void volk_16ic_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
     volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
index 1300395ff0..55243b4aa8 100644
--- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h
+++ b/volk/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
@@ -78,7 +78,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, floa
     \param scalar The data value to be divided against each input data value of the input complex vector
     \param num_points The number of complex data values to be deinterleaved
   */
-static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
   const int16_t* complexVectorPtr = (const int16_t*)complexVector;
   float* iBufferPtr = iBuffer;
   float* qBufferPtr = qBuffer;
@@ -100,7 +100,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer,
     \param num_points The number of complex data values to be deinterleaved
   */
 extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16ic_s32f_deinterleave_32f_x2_a_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
     volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
index 5e2d82b947..57d078a595 100644
--- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h
+++ b/volk/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
@@ -108,7 +108,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co
   \param scalar The scaling value being multiplied against each data point
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_16ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
   unsigned int number = 0;
   const int16_t* complexVectorPtr = (const int16_t*)complexVector;
   float* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h
index d20eea1a79..27901cb9ac 100644
--- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h
+++ b/volk/kernels/volk/volk_16ic_s32f_magnitude_32f.h
@@ -149,7 +149,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co
   \param scalar The data value to be divided against each input data value of the input complex vector
   \param num_points The number of complex values in complexVector to be calculated and stored into cVector
 */
-static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
   const int16_t* complexVectorPtr = (const int16_t*)complexVector;
   float* magnitudeVectorPtr = magnitudeVector;
   unsigned int number = 0;
@@ -171,7 +171,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_generic(float* magnitudeVector
   \param num_points The number of complex values in complexVector to be calculated and stored into cVector
 */
 extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16ic_s32f_magnitude_32f_a_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
     volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/kernels/volk/volk_16u_byteswap.h
index fc3eb5fa7a..57f2008991 100644
--- a/volk/include/volk/volk_16u_byteswap_a.h
+++ b/volk/kernels/volk/volk_16u_byteswap.h
@@ -1,3 +1,66 @@
+#ifndef INCLUDED_volk_16u_byteswap_u_H
+#define INCLUDED_volk_16u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an unaligned vector of uint16_t's.
+  \param intsToSwap The vector of data to byte swap; overwritten with the swapped values
+  \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int number = 0;
+  uint16_t* inputPtr = intsToSwap;
+  __m128i input, left, right, output;
+
+  const unsigned int eighthPoints = num_points / 8;
+  for(;number < eighthPoints; number++){
+    // Load 8 uint16_t values (unaligned); inputPtr is incremented later since we're doing it in-place.
+    input = _mm_loadu_si128((__m128i*)inputPtr);
+    // Shift each 16-bit lane left and right by 8 bits to isolate the two bytes
+    left = _mm_slli_epi16(input, 8);
+    right = _mm_srli_epi16(input, 8);
+    // Or the left and right halves together
+    output = _mm_or_si128(left, right);
+    // Store the results back over the input (unaligned store)
+    _mm_storeu_si128((__m128i*)inputPtr, output);
+    inputPtr += 8;
+  }
+
+  // Byteswap any remaining points one at a time:
+  number = eighthPoints*8;
+  for(; number < num_points; number++){
+    uint16_t outputVal = *inputPtr;
+    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+    *inputPtr = outputVal;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint16_t's, one element at a time.
+  \param intsToSwap The vector of data to byte swap; overwritten with the swapped values
+  \param num_points The number of data points
+*/
+static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int point;
+  uint16_t* inputPtr = intsToSwap;
+  for(point = 0; point < num_points; point++){
+    uint16_t output = *inputPtr;
+    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16u_byteswap_u_H */
 #ifndef INCLUDED_volk_16u_byteswap_a_H
 #define INCLUDED_volk_16u_byteswap_a_H
 
@@ -68,7 +131,7 @@ static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned in
   \param numDataPoints The number of data points
 */
 extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
-static inline void volk_16u_byteswap_a_orc(uint16_t* intsToSwap, unsigned int num_points){
+static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){
     volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/kernels/volk/volk_32f_accumulator_s32f.h
index 78364d0a01..a67d10f9b5 100644
--- a/volk/include/volk/volk_32f_accumulator_s32f_a.h
+++ b/volk/kernels/volk/volk_32f_accumulator_s32f.h
@@ -50,7 +50,7 @@ static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* i
   \param inputBuffer The buffer of data to be accumulated
   \param num_points The number of values in inputBuffer to be accumulated
 */
-static inline void volk_32f_accumulator_s32f_a_generic(float* result, const float* inputBuffer, unsigned int num_points){
+static inline void volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){
   const float* aPtr = inputBuffer;
   unsigned int number = 0;
   float returnValue = 0;
diff --git a/volk/kernels/volk/volk_32f_convert_64f.h b/volk/kernels/volk/volk_32f_convert_64f.h
new file mode 100644
index 0000000000..2f036955dd
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_convert_64f.h
@@ -0,0 +1,140 @@
+#ifndef INCLUDED_volk_32f_convert_64f_u_H
+#define INCLUDED_volk_32f_convert_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the float values into double values
+    \param outputVector The converted double vector values
+    \param inputVector The float vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+    \note Unaligned loads and stores are used, so neither buffer needs 16-byte alignment
+  */
+static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+  /* Number of points covered by whole groups of four floats. */
+  const unsigned int vectorizedPoints = (num_points / 4) * 4;
+  unsigned int idx;
+
+  __m128 fourFloats;
+  __m128d lowPair;
+  __m128d highPair;
+
+  /* Main loop: four floats in, four doubles out per iteration. */
+  for(idx = 0; idx < vectorizedPoints; idx += 4){
+    fourFloats = _mm_loadu_ps(inputVector + idx);
+
+    /* _mm_cvtps_pd widens only the two low lanes, so convert those first ... */
+    lowPair = _mm_cvtps_pd(fourFloats);
+
+    /* ... then copy the two high lanes down and widen them as well. */
+    fourFloats = _mm_movehl_ps(fourFloats, fourFloats);
+    highPair = _mm_cvtps_pd(fourFloats);
+
+    /* Four doubles take two 128-bit stores. */
+    _mm_storeu_pd(outputVector + idx, lowPair);
+    _mm_storeu_pd(outputVector + idx + 2, highPair);
+  }
+
+  /* Scalar conversion of the zero to three leftover points. */
+  for(idx = vectorizedPoints; idx < num_points; idx++){
+    outputVector[idx] = (double)(inputVector[idx]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values
+  \param outputVector The converted double vector values
+  \param inputVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  /* Portable version: widen one point per iteration. */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    const float narrow = inputVector[idx];
+    outputVector[idx] = (double)narrow;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_u_H */
+#ifndef INCLUDED_volk_32f_convert_64f_a_H
+#define INCLUDED_volk_32f_convert_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the float values into double values
+    \param outputVector The converted double vector values
+    \param inputVector The float vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+    \note Aligned loads and stores are used, so both buffers must be 16-byte aligned
+  */
+static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+  /* Number of points covered by whole groups of four floats. */
+  const unsigned int vectorizedPoints = (num_points / 4) * 4;
+  unsigned int idx;
+
+  __m128 fourFloats;
+  __m128d lowPair;
+  __m128d highPair;
+
+  /* Main loop: four floats in, four doubles out per iteration. */
+  for(idx = 0; idx < vectorizedPoints; idx += 4){
+    fourFloats = _mm_load_ps(inputVector + idx);
+
+    /* _mm_cvtps_pd widens only the two low lanes, so convert those first ... */
+    lowPair = _mm_cvtps_pd(fourFloats);
+
+    /* ... then copy the two high lanes down and widen them as well. */
+    fourFloats = _mm_movehl_ps(fourFloats, fourFloats);
+    highPair = _mm_cvtps_pd(fourFloats);
+
+    /* Four doubles take two 128-bit stores, 16 bytes apart. */
+    _mm_store_pd(outputVector + idx, lowPair);
+    _mm_store_pd(outputVector + idx + 2, highPair);
+  }
+
+  /* Scalar conversion of the zero to three leftover points. */
+  for(idx = vectorizedPoints; idx < num_points; idx++){
+    outputVector[idx] = (double)(inputVector[idx]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values
+  \param outputVector The converted double vector values
+  \param inputVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  /* Portable version: widen one point per iteration. */
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    outputVector[idx] = (double)(inputVector[idx]);
+    idx++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_a_H */
diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/kernels/volk/volk_32f_index_max_16u.h
index b9ca1dd3e7..dd1aed2459 100644
--- a/volk/include/volk/volk_32f_index_max_16u_a.h
+++ b/volk/kernels/volk/volk_32f_index_max_16u.h
@@ -124,7 +124,7 @@ static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const floa
 #endif /*LV_HAVE_SSE*/
 
 #ifdef LV_HAVE_GENERIC
-static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const float* src0, unsigned int num_points) {
+static inline void volk_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) {
   if(num_points > 0){
     float max = src0[0];
     unsigned int index = 0;
diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
index 43713f8b5a..71881c2d5f 100644
--- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
@@ -87,7 +87,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co
   \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
   \param num_points The number of real values in the input vector.
 */
-static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
   if (num_points < 1) {
     return;
   }
diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
index db61e359d6..bf05a882d5 100644
--- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
@@ -128,7 +128,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois
   \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
   \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
 */
-static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
   float sumMean = 0.0;
   unsigned int number;
   // find the sum (for mean), etc
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/kernels/volk/volk_32f_s32f_convert_16i.h
index 56e42c9bd5..9fd758655f 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h
+++ b/volk/kernels/volk/volk_32f_s32f_convert_16i.h
@@ -127,7 +127,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
     \param num_points The number of data values to be converted
     \note Input buffer does NOT need to be properly aligned
   */
-static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
   int16_t* outputVectorPtr = outputVector;
   const float* inputVectorPtr = inputVector;
   unsigned int number = 0;
@@ -150,3 +150,153 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co
 
 
 #endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
+#define INCLUDED_volk_32f_s32f_convert_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Aligned loads and stores are used, so both buffers must be 16-byte aligned
+  */
+static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Saturation limits of a signed 16 bit integer. */
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+
+  /* Number of points covered by whole groups of eight floats. */
+  const unsigned int vectorizedPoints = (num_points / 8) * 8;
+  unsigned int idx;
+
+  const __m128 scaleFactor = _mm_set_ps1(scalar);
+  const __m128 lowerLimits = _mm_set_ps1(lowerLimit);
+  const __m128 upperLimits = _mm_set_ps1(upperLimit);
+  __m128 scaledLow;
+  __m128 scaledHigh;
+  __m128i intLow, intHigh;
+  __m128i packed;
+
+  /* Main loop: eight points per iteration. */
+  for(idx = 0; idx < vectorizedPoints; idx += 8){
+    /* Scale two groups of four floats ... */
+    scaledLow = _mm_mul_ps(_mm_load_ps(inputVector + idx), scaleFactor);
+    scaledHigh = _mm_mul_ps(_mm_load_ps(inputVector + idx + 4), scaleFactor);
+
+    /* ... and clip them: min() against the upper limit first, then max() against the lower one. */
+    scaledLow = _mm_max_ps(_mm_min_ps(scaledLow, upperLimits), lowerLimits);
+    scaledHigh = _mm_max_ps(_mm_min_ps(scaledHigh, upperLimits), lowerLimits);
+
+    /* Convert to 32 bit integers, then narrow both groups into eight 16 bit values. */
+    intLow = _mm_cvtps_epi32(scaledLow);
+    intHigh = _mm_cvtps_epi32(scaledHigh);
+    packed = _mm_packs_epi32(intLow, intHigh);
+
+    _mm_store_si128((__m128i*)(outputVector + idx), packed);
+  }
+
+  /* Scalar path for the zero to seven leftover points. */
+  for(idx = vectorizedPoints; idx < num_points; idx++){
+    float scaled = inputVector[idx] * scalar;
+    if(scaled > upperLimit)
+      scaled = upperLimit;
+    else if(scaled < lowerLimit)
+      scaled = lowerLimit;
+    outputVector[idx] = (int16_t)rintf(scaled);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Aligned loads are used, so the input buffer must be 16-byte aligned
+  */
+static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Saturation limits of a signed 16 bit integer. */
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+
+  /* Number of points covered by whole groups of four floats. */
+  const unsigned int vectorizedPoints = (num_points / 4) * 4;
+  unsigned int idx;
+  unsigned int lane;
+
+  const __m128 scaleFactor = _mm_set_ps1(scalar);
+  const __m128 lowerLimits = _mm_set_ps1(lowerLimit);
+  const __m128 upperLimits = _mm_set_ps1(upperLimit);
+  __m128 scaled;
+
+  /* Scratch buffer: clipped lanes are spilled here and rounded one at a time. */
+  __VOLK_ATTR_ALIGNED(16) float clippedLanes[4];
+
+  /* Main loop: four points per iteration. */
+  for(idx = 0; idx < vectorizedPoints; idx += 4){
+    scaled = _mm_mul_ps(_mm_load_ps(inputVector + idx), scaleFactor);
+
+    /* Clip: min() against the upper limit first, then max() against the lower one. */
+    scaled = _mm_max_ps(_mm_min_ps(scaled, upperLimits), lowerLimits);
+
+    _mm_store_ps(clippedLanes, scaled);
+
+    for(lane = 0; lane < 4; lane++){
+      outputVector[idx + lane] = (int16_t)rintf(clippedLanes[lane]);
+    }
+  }
+
+  /* Scalar path for the zero to three leftover points. */
+  for(idx = vectorizedPoints; idx < num_points; idx++){
+    float tailValue = inputVector[idx] * scalar;
+    if(tailValue > upperLimit)
+      tailValue = upperLimit;
+    else if(tailValue < lowerLimit)
+      tailValue = lowerLimit;
+    outputVector[idx] = (int16_t)rintf(tailValue);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Saturation limits of a signed 16 bit integer. */
+  const float lowerLimit = -32768;
+  const float upperLimit = 32767;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    float scaled = inputVector[idx] * scalar;
+
+    /* Clip to the 16 bit range first, then round to an integer value. */
+    if(scaled > upperLimit)
+      scaled = upperLimit;
+    else if(scaled < lowerLimit)
+      scaled = lowerLimit;
+    outputVector[idx] = (int16_t)rintf(scaled);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/kernels/volk/volk_32f_s32f_convert_32i.h
index 38e6b2e745..1a46093ee2 100644
--- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_convert_32i.h
@@ -1,3 +1,145 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
+#define INCLUDED_volk_32f_s32f_convert_32i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param outputVector The 32 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  const float* inputVectorPtr = inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  /* Both limits are exact powers of two: a float cannot hold 2147483647, which rounds up to 2^31. */
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483648.0f;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1;
+  __m128i intInputVal1, overflowMask;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+
+    /* A lane clipped to 2^31 converts to 0x80000000; XOR with its all-ones compare mask yields 0x7fffffff. */
+    overflowMask = _mm_castps_si128(_mm_cmpge_ps(inputVal1, vmax_val));
+    intInputVal1 = _mm_xor_si128(_mm_cvtps_epi32(inputVal1), overflowMask);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r < min_val)
+      r = min_val;
+    /* Saturate by hand (casting 2^31 to int32_t is undefined) and round the same way the vector loop does. */
+    outputVector[number] = (r >= max_val) ? 2147483647 : (int32_t)_mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param outputVector The 32 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  unsigned int lane;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = inputVector;
+  int32_t* outputVectorPtr = outputVector;
+
+  /* Both limits are exact powers of two: a float cannot hold 2147483647, which rounds up to 2^31. */
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483648.0f;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    for(lane = 0; lane < 4; lane++){
+      /* A lane clipped to 2^31 does not fit in int32_t; casting it is undefined, so saturate by hand. */
+      r = outputFloatBuffer[lane];
+      *outputVectorPtr++ = (r >= max_val) ? 2147483647 : (int32_t)(r);
+    }
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r < min_val)
+      r = min_val;
+    outputVector[number] = (r >= max_val) ? 2147483647 : (int32_t)(r);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param outputVector The 32 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  /* Both limits are exact powers of two: a float cannot hold 2147483647, which rounds up to 2^31. */
+  const float min_val = -2147483648.0f;
+  const float max_val = 2147483648.0f;
+  float r;
+
+  for(number = 0; number < num_points; number++){
+    r = *inputVectorPtr++ * scalar;
+    if(r < min_val)
+      r = min_val;
+    /* 2^31 does not fit in int32_t and casting it is undefined, so saturate to 2147483647 by hand. */
+    *outputVectorPtr++ = (r >= max_val) ? 2147483647 : (int32_t)(r);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
 #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
 #define INCLUDED_volk_32f_s32f_convert_32i_a_H
 
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/kernels/volk/volk_32f_s32f_convert_8i.h
index 870e9419bb..b451505221 100644
--- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h
+++ b/volk/kernels/volk/volk_32f_s32f_convert_8i.h
@@ -132,7 +132,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl
     \param num_points The number of data values to be converted
     \note Input buffer does NOT need to be properly aligned
   */
-static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
   int8_t* outputVectorPtr = outputVector;
   const float* inputVectorPtr = inputVector;
   unsigned int number = 0;
@@ -155,3 +155,158 @@ static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, cons
 
 
 #endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
+#define INCLUDED_volk_32f_s32f_convert_8i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param outputVector The 8 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Aligned loads and stores are used, so both buffers must be 16-byte aligned
+  */
+static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+  const float* inputVectorPtr = inputVector;
+  int8_t* outputVectorPtr = outputVector;
+
+  float min_val = -128;
+  float max_val = 127;
+  float r;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
+
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    /* Scale and clip to the 8 bit range. */
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+    intInputVal2 = _mm_cvtps_epi32(inputVal2);
+    intInputVal3 = _mm_cvtps_epi32(inputVal3);
+    intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+    /* Narrow 32 -> 16 -> 8 bits so that sixteen results fill one 128-bit register. */
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    /* Use the same rounding conversion as the vector loop; a plain cast would truncate instead. */
+    outputVector[number] = (int8_t)_mm_cvtss_si32(_mm_set_ss(r));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param outputVector The 8 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Aligned loads are used, so the input buffer must be 16-byte aligned
+  */
+static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Saturation limits of a signed 8 bit integer. */
+  const float lowerLimit = -128;
+  const float upperLimit = 127;
+
+  /* Number of points covered by whole groups of four floats. */
+  const unsigned int vectorizedPoints = (num_points / 4) * 4;
+  unsigned int idx;
+  unsigned int lane;
+
+  const __m128 scaleFactor = _mm_set_ps1(scalar);
+  const __m128 lowerLimits = _mm_set_ps1(lowerLimit);
+  const __m128 upperLimits = _mm_set_ps1(upperLimit);
+  __m128 scaled;
+
+  /* Scratch buffer: clipped lanes are spilled here and converted one at a time. */
+  __VOLK_ATTR_ALIGNED(16) float clippedLanes[4];
+
+  /* Main loop: four points per iteration. */
+  for(idx = 0; idx < vectorizedPoints; idx += 4){
+    scaled = _mm_mul_ps(_mm_load_ps(inputVector + idx), scaleFactor);
+
+    /* Clip: min() against the upper limit first, then max() against the lower one. */
+    scaled = _mm_max_ps(_mm_min_ps(scaled, upperLimits), lowerLimits);
+
+    _mm_store_ps(clippedLanes, scaled);
+    for(lane = 0; lane < 4; lane++){
+      outputVector[idx + lane] = (int8_t)(clippedLanes[lane]);
+    }
+  }
+
+  /* Scalar path for the zero to three leftover points. */
+  for(idx = vectorizedPoints; idx < num_points; idx++){
+    float tailValue = inputVector[idx] * scalar;
+    if(tailValue > upperLimit)
+      tailValue = upperLimit;
+    else if(tailValue < lowerLimit)
+      tailValue = lowerLimit;
+    outputVector[idx] = (int8_t)(tailValue);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param outputVector The 8 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Saturation limits of a signed 8 bit integer. */
+  const float lowerLimit = -128;
+  const float upperLimit = 127;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    float scaled = inputVector[idx] * scalar;
+
+    /* Clip to the 8 bit range first; the cast below then drops the fractional part. */
+    if(scaled > upperLimit)
+      scaled = upperLimit;
+    else if(scaled < lowerLimit)
+      scaled = lowerLimit;
+    outputVector[idx] = (int8_t)(scaled);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h
index 99b8e68c5b..2dd86a17c2 100644
--- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h
@@ -1,3 +1,105 @@
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Scalar float multiply
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied by the scalar
+  \param scalar the scalar value
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+  \note Unaligned loads and stores are used, so neither buffer needs 16-byte alignment
+*/
+static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+    /* Number of points covered by whole groups of four floats. */
+    const unsigned int vectorizedPoints = (num_points / 4) * 4;
+    unsigned int idx;
+
+    /* The scalar replicated into all four lanes. */
+    const __m128 factor = _mm_set_ps1(scalar);
+    __m128 chunk;
+    __m128 product;
+
+    /* Main loop: four points per iteration. */
+    for(idx = 0; idx < vectorizedPoints; idx += 4){
+      chunk = _mm_loadu_ps(aVector + idx);
+
+      product = _mm_mul_ps(chunk, factor);
+
+      /* Store the results back into the C container */
+      _mm_storeu_ps(cVector + idx, product);
+    }
+
+    /* Scalar path for the zero to three leftover points. */
+    for(idx = vectorizedPoints; idx < num_points; idx++){
+      cVector[idx] = aVector[idx] * scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Scalar float multiply
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied by the scalar
+  \param scalar the scalar value
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+  \note Unaligned loads and stores are used, so neither buffer needs 32-byte alignment
+*/
+static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+    /* Number of points covered by whole groups of eight floats. */
+    const unsigned int vectorizedPoints = (num_points / 8) * 8;
+    unsigned int idx;
+
+    /* The scalar replicated into all eight lanes. */
+    const __m256 factor = _mm256_set1_ps(scalar);
+    __m256 chunk;
+    __m256 product;
+
+    /* Main loop: eight points per iteration. */
+    for(idx = 0; idx < vectorizedPoints; idx += 8){
+      chunk = _mm256_loadu_ps(aVector + idx);
+
+      product = _mm256_mul_ps(chunk, factor);
+
+      /* Store the results back into the C container */
+      _mm256_storeu_ps(cVector + idx, product);
+    }
+
+    /* Scalar path for the zero to seven leftover points. */
+    for(idx = vectorizedPoints; idx < num_points; idx++){
+      cVector[idx] = aVector[idx] * scalar;
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Scalar float multiply
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied by the scalar
+  \param scalar the scalar value
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  /* Portable version: one multiplication per point. */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    /* Read the input before writing the output. */
+    const float sample = aVector[idx];
+    cVector[idx] = sample * scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
 #define INCLUDED_volk_32f_s32f_multiply_32f_a_H
 
@@ -108,7 +210,7 @@ static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, const fl
   \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
 */
 extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, const float scalar, unsigned int num_points);
-static inline void volk_32f_s32f_multiply_32f_a_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
     volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
 }
 #endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32f_s32f_normalize_a.h b/volk/kernels/volk/volk_32f_s32f_normalize.h
index f5fd0d1dba..a0bd33c7dc 100644
--- a/volk/include/volk/volk_32f_s32f_normalize_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_normalize.h
@@ -49,7 +49,7 @@ static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float s
   \param bVector One of the vectors to be normalizeed
   \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
 */
-static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const float scalar, unsigned int num_points){
+static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){
   unsigned int number = 0;
   float* inputPtr = vecBuffer;
   const float invScalar = 1.0 / scalar;
@@ -69,7 +69,7 @@ static inline void volk_32f_s32f_normalize_a_generic(float* vecBuffer, const flo
   \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
 */
 extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
-static inline void volk_32f_s32f_normalize_a_orc(float* vecBuffer, const float scalar, unsigned int num_points){
+static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){
     float invscalar = 1.0 / scalar;
     volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
 }
diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/kernels/volk/volk_32f_s32f_power_32f.h
index 633ad14b09..2822444686 100644
--- a/volk/include/volk/volk_32f_s32f_power_32f_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_power_32f.h
@@ -127,7 +127,7 @@ static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aV
     \param power The power value to be applied to each data point
     \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
   */
-static inline void volk_32f_s32f_power_32f_a_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
+static inline void volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
   float* cPtr = cVector;
   const float* aPtr = aVector;
   unsigned int number = 0;
diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/kernels/volk/volk_32f_s32f_stddev_32f.h
index 98401b2d42..0622b278a6 100644
--- a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h
+++ b/volk/kernels/volk/volk_32f_s32f_stddev_32f.h
@@ -120,7 +120,7 @@ static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* in
   \param mean The mean of the input buffer
   \param num_points The number of values in input buffer to used in the stddev calculation
 */
-static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+static inline void volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
   float returnValue = 0;
   if(num_points > 0){
     const float* aPtr = inputBuffer;
diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/kernels/volk/volk_32f_sqrt_32f.h
index d9b16fc0fb..ab9fffd7dc 100644
--- a/volk/include/volk/volk_32f_sqrt_32f_a.h
+++ b/volk/kernels/volk/volk_32f_sqrt_32f.h
@@ -47,7 +47,7 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector,
   \param aVector One of the vectors to be sqrted
   \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
 */
-static inline void volk_32f_sqrt_32f_a_generic(float* cVector, const float* aVector, unsigned int num_points){
+static inline void volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points){
     float* cPtr = cVector;
     const float* aPtr = aVector;
     unsigned int number = 0;
@@ -66,7 +66,7 @@ extern void volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int);
   \param aVector One of the vectors to be sqrted
   \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
 */
-static inline void volk_32f_sqrt_32f_a_orc(float* cVector, const float* aVector, unsigned int num_points){
+static inline void volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points){ /* thin wrapper: forwards every argument unchanged to volk_32f_sqrt_32f_a_orc_impl */
     volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
 }
 
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
index 7de32f7b18..9bded6713d 100644
--- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h
+++ b/volk/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
@@ -143,7 +143,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m
   \param inputBuffer The buffer of points to calculate the std deviation for
   \param num_points The number of values in input buffer to used in the stddev and mean calculations
 */
-static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
   float returnValue = 0;
   float newMean = 0;
   if(num_points > 0){
diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/kernels/volk/volk_32f_x2_add_32f.h
index 51e63e54d2..42278f6068 100644
--- a/volk/include/volk/volk_32f_x2_add_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_add_32f.h
@@ -1,3 +1,69 @@
+#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
+#define INCLUDED_volk_32f_x2_add_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Adds the two input vectors and stores their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    /*
+     * Unaligned SSE path. Every vector access below goes through the
+     * loadu/storeu intrinsics. Whole groups of four floats are summed
+     * per pass; the (num_points % 4) leftover values are summed one
+     * element at a time afterwards.
+     */
+
+    const unsigned int vecGroups = num_points / 4;
+    unsigned int group;
+    unsigned int idx;
+
+    /* Vector body: four lanes per pass. */
+    for(group = 0; group < vecGroups; group++){
+      const unsigned int base = group * 4;
+      const __m128 lhs = _mm_loadu_ps(aVector + base);
+      const __m128 rhs = _mm_loadu_ps(bVector + base);
+      const __m128 sum = _mm_add_ps(lhs, rhs);
+
+      _mm_storeu_ps(cVector + base, sum);
+    }
+
+    /* Scalar tail: resume at the first element the vector loop
+       did not cover. */
+    for(idx = vecGroups * 4; idx < num_points; idx++){
+      cVector[idx] = aVector[idx] + bVector[idx];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Adds the two input vectors and stores their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    /* Portable reference path: cVector[i] = aVector[i] + bVector[i]
+       for every i in [0, num_points), one element per iteration. */
+    unsigned int idx;
+    for(idx = 0; idx < num_points; idx++){
+      const float lhs = aVector[idx];
+      const float rhs = bVector[idx];
+      cVector[idx] = lhs + rhs;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
 #ifndef INCLUDED_volk_32f_x2_add_32f_a_H
 #define INCLUDED_volk_32f_x2_add_32f_a_H
 
@@ -72,7 +138,7 @@ static inline void volk_32f_x2_add_32f_a_generic(float* cVector, const float* aV
   \param num_points The number of values in aVector and bVector to be added together and stored into cVector
 */
 extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_add_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ /* thin wrapper: forwards every argument unchanged to volk_32f_x2_add_32f_a_orc_impl */
     volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/kernels/volk/volk_32f_x2_divide_32f.h
index 7b60fb22ef..d5a7c7d7c0 100644
--- a/volk/include/volk/volk_32f_x2_divide_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_divide_32f.h
@@ -51,7 +51,7 @@ static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVe
   \param bVector The divisor vector
   \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
 */
-static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
     float* cPtr = cVector;
     const float* aPtr = aVector;
     const float* bPtr=  bVector;
@@ -72,7 +72,7 @@ static inline void volk_32f_x2_divide_32f_a_generic(float* cVector, const float*
   \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
 */
 extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_divide_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ /* thin wrapper: forwards every argument unchanged to volk_32f_x2_divide_32f_a_orc_impl */
     volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
index 961c2418ca..8fcc7deaed 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
@@ -8,7 +8,7 @@
 #ifdef LV_HAVE_GENERIC
 
 
-static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
 
   float dotProduct = 0;
   const float* aPtr = input;
diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
new file mode 100644
index 0000000000..b91252e36f
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
@@ -0,0 +1,580 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+  /* Portable reference path: writes the sum of input[i] * taps[i]
+     over i in [0, num_points) to *result, accumulating in a single
+     float in index order. With num_points == 0 the result is 0. */
+  float acc = 0;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    acc += input[idx] * taps[idx];
+  }
+
+  *result = acc;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/* Unaligned SSE dot product: *result = sum of input[i] * taps[i] for i in [0, num_points). */
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+  /* The vector loop consumes 16 floats from each operand per pass. */
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+  /* Four independent 4-lane running sums, all starting at zero. */
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+    /* loadu: neither pointer needs 16-byte alignment. */
+    a0Val = _mm_loadu_ps(aPtr);
+    a1Val = _mm_loadu_ps(aPtr+4);
+    a2Val = _mm_loadu_ps(aPtr+8);
+    a3Val = _mm_loadu_ps(aPtr+12);
+    b0Val = _mm_loadu_ps(bPtr);
+    b1Val = _mm_loadu_ps(bPtr+4);
+    b2Val = _mm_loadu_ps(bPtr+8);
+    b3Val = _mm_loadu_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+  /* Collapse the four running sums into dotProdVal0. */
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  /* __VOLK_ATTR_ALIGNED(16) presumably supplies the alignment _mm_store_ps needs - confirm in volk_common.h. */
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+  /* Horizontal sum of the four lanes. */
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  /* Scalar tail: aPtr/bPtr already sit just past the vectorized part. */
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+  /* Unaligned dot product: *result = sum of input[i] * taps[i] for i in [0, num_points). */
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+  /* NOTE(review): only SSE-level intrinsics appear here; nothing in this body is specific to SSE3. */
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+  /* Four independent 4-lane running sums, all starting at zero. */
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+    /* loadu: 16 floats from each operand per pass, no alignment needed. */
+    a0Val = _mm_loadu_ps(aPtr);
+    a1Val = _mm_loadu_ps(aPtr+4);
+    a2Val = _mm_loadu_ps(aPtr+8);
+    a3Val = _mm_loadu_ps(aPtr+12);
+    b0Val = _mm_loadu_ps(bPtr);
+    b1Val = _mm_loadu_ps(bPtr+4);
+    b2Val = _mm_loadu_ps(bPtr+8);
+    b3Val = _mm_loadu_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+  /* Collapse the four running sums into dotProdVal0. */
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+  /* __VOLK_ATTR_ALIGNED(16) presumably supplies the alignment _mm_store_ps needs - confirm in volk_common.h. */
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+  /* Horizontal sum of the four lanes. */
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  /* Scalar tail: aPtr/bPtr already sit just past the vectorized part. */
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+  /* Unaligned dot product: *result = sum of input[i] * taps[i] for i in [0, num_points). */
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal1, bVal1, cVal1;
+  __m128 aVal2, bVal2, cVal2;
+  __m128 aVal3, bVal3, cVal3;
+  __m128 aVal4, bVal4, cVal4;
+  /* Single accumulator: lane k collects the dot products of the k-th group of four floats in each pass. */
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+    /* loadu: no alignment needed; each load is followed by a four-float pointer advance. */
+    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+
+    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+    /* _mm_dp_ps mask: high nibble 0xF multiplies all four lanes; the low nibble picks the one output lane that gets the sum (others are zeroed). */
+    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+    /* The four results occupy distinct lanes, so OR merges them into one vector. */
+    cVal1 = _mm_or_ps(cVal1, cVal2);
+    cVal3 = _mm_or_ps(cVal3, cVal4);
+    cVal1 = _mm_or_ps(cVal1, cVal3);
+
+    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+  }
+  /* __VOLK_ATTR_ALIGNED(16) presumably supplies the alignment _mm_store_ps needs - confirm in volk_common.h. */
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+  /* Horizontal sum of the four lanes. */
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  /* Scalar tail: aPtr/bPtr already sit just past the vectorized part. */
+  number = sixteenthPoints * 16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+  /* Unaligned AVX dot product: *result = sum of input[i] * taps[i] for i in [0, num_points). */
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m256 a0Val, a1Val;
+  __m256 b0Val, b1Val;
+  __m256 c0Val, c1Val;
+  /* Two independent 8-lane running sums, both starting at zero. */
+  __m256 dotProdVal0 = _mm256_setzero_ps();
+  __m256 dotProdVal1 = _mm256_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+    /* loadu: 16 floats from each operand per pass, no alignment needed. */
+    a0Val = _mm256_loadu_ps(aPtr);
+    a1Val = _mm256_loadu_ps(aPtr+8);
+    b0Val = _mm256_loadu_ps(bPtr);
+    b1Val = _mm256_loadu_ps(bPtr+8);
+
+    c0Val = _mm256_mul_ps(a0Val, b0Val);
+    c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+  /* Fold the second running sum into the first. */
+  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+  /* Scratch for the eight lane sums; written with the unaligned store. */
+  _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+  /* Horizontal sum of the eight lanes. */
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  dotProduct += dotProductVector[4];
+  dotProduct += dotProductVector[5];
+  dotProduct += dotProductVector[6];
+  dotProduct += dotProductVector[7];
+  /* Scalar tail: aPtr/bPtr already sit just past the vectorized part. */
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+  /* Scalar reference path: multiplies input and taps element by
+     element, sums the products front to back in one float, and
+     stores the total (0 when num_points is 0) in *result. */
+  float total = 0;
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    total += input[idx] * taps[idx];
+    idx++;
+  }
+  *result = total;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+  /* Aligned SSE dot product: *result = sum of input[i] * taps[i] for i in [0, num_points). */
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+  /* Four independent 4-lane running sums, all starting at zero. */
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+    /* _mm_load_ps is the aligned load: input and taps must be 16-byte aligned. */
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+  /* Collapse the four running sums into dotProdVal0. */
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  /* __VOLK_ATTR_ALIGNED(16) presumably supplies the alignment _mm_store_ps needs - confirm in volk_common.h. */
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+  /* Horizontal sum of the four lanes. */
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  /* Scalar tail: aPtr/bPtr already sit just past the vectorized part. */
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+  /* Aligned dot product: *result = sum of input[i] * taps[i] for i in [0, num_points). */
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+  /* NOTE(review): only SSE-level intrinsics appear here; nothing in this body is specific to SSE3. */
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+  /* Four independent 4-lane running sums, all starting at zero. */
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+    /* _mm_load_ps is the aligned load: input and taps must be 16-byte aligned. */
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+  /* Collapse the four running sums into dotProdVal0. */
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+  /* __VOLK_ATTR_ALIGNED(16) presumably supplies the alignment _mm_store_ps needs - confirm in volk_common.h. */
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+  /* Horizontal sum of the four lanes. */
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  /* Scalar tail: aPtr/bPtr already sit just past the vectorized part. */
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+  /* Aligned dot product: *result = sum of input[i] * taps[i] for i in [0, num_points). */
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal1, bVal1, cVal1;
+  __m128 aVal2, bVal2, cVal2;
+  __m128 aVal3, bVal3, cVal3;
+  __m128 aVal4, bVal4, cVal4;
+  /* Single accumulator: lane k collects the dot products of the k-th group of four floats in each pass. */
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+    /* _mm_load_ps is the aligned load: input and taps must be 16-byte aligned. */
+    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+
+    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+    /* _mm_dp_ps mask: high nibble 0xF multiplies all four lanes; the low nibble picks the one output lane that gets the sum (others are zeroed). */
+    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+    /* The four results occupy distinct lanes, so OR merges them into one vector. */
+    cVal1 = _mm_or_ps(cVal1, cVal2);
+    cVal3 = _mm_or_ps(cVal3, cVal4);
+    cVal1 = _mm_or_ps(cVal1, cVal3);
+
+    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+  }
+  /* __VOLK_ATTR_ALIGNED(16) presumably supplies the alignment _mm_store_ps needs - confirm in volk_common.h. */
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+  /* Horizontal sum of the four lanes. */
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  /* Scalar tail: aPtr/bPtr already sit just past the vectorized part. */
+  number = sixteenthPoints * 16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+  /* Aligned AVX dot product: *result = sum of input[i] * taps[i] for i in [0, num_points). */
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m256 a0Val, a1Val;
+  __m256 b0Val, b1Val;
+  __m256 c0Val, c1Val;
+  /* Two independent 8-lane running sums, both starting at zero. */
+  __m256 dotProdVal0 = _mm256_setzero_ps();
+  __m256 dotProdVal1 = _mm256_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+    /* _mm256_load_ps is the aligned load: input and taps must be 32-byte aligned. */
+    a0Val = _mm256_load_ps(aPtr);
+    a1Val = _mm256_load_ps(aPtr+8);
+    b0Val = _mm256_load_ps(bPtr);
+    b1Val = _mm256_load_ps(bPtr+8);
+
+    c0Val = _mm256_mul_ps(a0Val, b0Val);
+    c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+  /* Fold the second running sum into the first. */
+  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+  /* __VOLK_ATTR_ALIGNED(32) presumably supplies the alignment _mm256_store_ps needs - confirm in volk_common.h. */
+  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+  /* Horizontal sum of the eight lanes. */
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  dotProduct += dotProductVector[4];
+  dotProduct += dotProductVector[5];
+  dotProduct += dotProductVector[6];
+  dotProduct += dotProductVector[7];
+  /* Scalar tail: aPtr/bPtr already sit just past the vectorized part. */
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h
index 52d80b6bb3..0935cb32bd 100644
--- a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h
+++ b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h
@@ -56,7 +56,7 @@ static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, c
   \param complexVector The complex output vector
   \param num_points The number of complex data values to be interleaved
 */
-static inline void volk_32f_x2_interleave_32fc_a_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
   float* complexVectorPtr = (float*)complexVector;
   const float* iBufferPtr = iBuffer;
   const float* qBufferPtr = qBuffer;
diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/kernels/volk/volk_32f_x2_max_32f.h
index 79f2d04b56..27633acae8 100644
--- a/volk/include/volk/volk_32f_x2_max_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_max_32f.h
@@ -53,7 +53,7 @@ static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVecto
   \param bVector The vector to be checked
   \param num_points The number of values in aVector and bVector to be checked and stored into cVector
 */
-static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_max_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
     float* cPtr = cVector;
     const float* aPtr = aVector;
     const float* bPtr=  bVector;
@@ -76,7 +76,7 @@ static inline void volk_32f_x2_max_32f_a_generic(float* cVector, const float* aV
   \param num_points The number of values in aVector and bVector to be checked and stored into cVector
 */
 extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_max_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ /* thin wrapper: forwards every argument unchanged to volk_32f_x2_max_32f_a_orc_impl */
     volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/kernels/volk/volk_32f_x2_min_32f.h
index 42cac08339..4773d13211 100644
--- a/volk/include/volk/volk_32f_x2_min_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_min_32f.h
@@ -53,7 +53,7 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVecto
   \param bVector The vector to be checked
   \param num_points The number of values in aVector and bVector to be checked and stored into cVector
 */
-static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_min_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
     float* cPtr = cVector;
     const float* aPtr = aVector;
     const float* bPtr=  bVector;
@@ -76,7 +76,7 @@ static inline void volk_32f_x2_min_32f_a_generic(float* cVector, const float* aV
   \param num_points The number of values in aVector and bVector to be checked and stored into cVector
 */
 extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_min_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ /* thin wrapper: forwards every argument unchanged to volk_32f_x2_min_32f_a_orc_impl */
     volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
index 340e051657..9fdbec0a2c 100644
--- a/volk/include/volk/volk_32f_x2_multiply_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
@@ -1,3 +1,109 @@
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
+#define INCLUDED_volk_32f_x2_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplies the two input vectors and stores their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    /*
+     * Unaligned SSE path. Every vector access below goes through the
+     * loadu/storeu intrinsics. Whole groups of four floats are
+     * multiplied per pass; the (num_points % 4) leftover values are
+     * multiplied one element at a time afterwards.
+     */
+
+    const unsigned int vecGroups = num_points / 4;
+    unsigned int group;
+    unsigned int idx;
+
+    /* Vector body: four lanes per pass. */
+    for(group = 0; group < vecGroups; group++){
+      const unsigned int base = group * 4;
+      const __m128 lhs = _mm_loadu_ps(aVector + base);
+      const __m128 rhs = _mm_loadu_ps(bVector + base);
+      const __m128 product = _mm_mul_ps(lhs, rhs);
+
+      _mm_storeu_ps(cVector + base, product);
+    }
+
+    /* Scalar tail: resume at the first element the vector loop
+       did not cover. */
+    for(idx = vecGroups * 4; idx < num_points; idx++){
+      cVector[idx] = aVector[idx] * bVector[idx];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies the two input vectors and stores their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    /*
+     * Unaligned AVX path. Every vector access below goes through the
+     * 256-bit loadu/storeu intrinsics. Whole groups of eight floats are
+     * multiplied per pass; the (num_points % 8) leftover values are
+     * multiplied one element at a time afterwards.
+     */
+
+    const unsigned int vecGroups = num_points / 8;
+    unsigned int group;
+    unsigned int idx;
+
+    /* Vector body: eight lanes per pass. */
+    for(group = 0; group < vecGroups; group++){
+      const unsigned int base = group * 8;
+      const __m256 lhs = _mm256_loadu_ps(aVector + base);
+      const __m256 rhs = _mm256_loadu_ps(bVector + base);
+      const __m256 product = _mm256_mul_ps(lhs, rhs);
+
+      _mm256_storeu_ps(cVector + base, product);
+    }
+
+    /* Scalar tail: resume at the first element the vector loop
+       did not cover. */
+    for(idx = vecGroups * 8; idx < num_points; idx++){
+      cVector[idx] = aVector[idx] * bVector[idx];
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies the two input vectors and stores their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    /* Portable reference path: cVector[i] = aVector[i] * bVector[i]
+       for every i in [0, num_points), one element per iteration. */
+    unsigned int idx;
+    for(idx = 0; idx < num_points; idx++){
+      const float lhs = aVector[idx];
+      const float rhs = bVector[idx];
+      cVector[idx] = lhs * rhs;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
 #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
 #define INCLUDED_volk_32f_x2_multiply_32f_a_H
 
@@ -111,7 +217,7 @@ static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, const floa
   \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
 */
 extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_multiply_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
     volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
index 10fc267dcd..ce7b91a318 100644
--- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h
+++ b/volk/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
@@ -137,7 +137,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect
     \param scalar The scaling value being multiplied against each data point
     \param num_points The number of complex data values to be interleaved
   */
-static inline void volk_32f_x2_s32f_interleave_16ic_a_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
   int16_t* complexVectorPtr = (int16_t*)complexVector;
   const float* iBufferPtr = iBuffer;
   const float* qBufferPtr = qBuffer;
diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
index e2b8be797f..8ea491f988 100644
--- a/volk/include/volk/volk_32f_x2_subtract_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
@@ -51,7 +51,7 @@ static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* a
   \param bVector The vector to be subtracted
   \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
 */
-static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
     float* cPtr = cVector;
     const float* aPtr = aVector;
     const float* bPtr=  bVector;
@@ -72,7 +72,7 @@ static inline void volk_32f_x2_subtract_32f_a_generic(float* cVector, const floa
   \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
 */
 extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_x2_subtract_32f_a_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
     volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
index e33e5a916a..e975f14e92 100644
--- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h
+++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
@@ -101,7 +101,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0
 
 #ifdef LV_HAVE_GENERIC
 
-static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {
+static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {
 
   const unsigned int num_bytes = num_points*4;
 
diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
index 109b787e8c..e0a8a59ced 100644
--- a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -8,7 +8,7 @@
 #ifdef LV_HAVE_GENERIC
 
 
-static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
+static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
 
   float res[2];
   float *realpt = &res[0], *imagpt = &res[1];
diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
index 28d584bf2c..104e3250e6 100644
--- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
@@ -64,7 +64,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const l
     \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector
     \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
   */
-static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
   lv_32fc_t* cPtr = cVector;
   const lv_32fc_t* aPtr = aVector;
   const float* bPtr=  bVector;
@@ -85,7 +85,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, con
     \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
   */
 extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32fc_32f_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
     volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
index 919280d510..dce897ff57 100644
--- a/volk/include/volk/volk_32fc_conjugate_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
@@ -1,3 +1,67 @@
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Takes the conjugate of a complex vector.
+    \param cVector The vector where the results will be stored
+    \param aVector Vector to be conjugated
+    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+  */
+static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2; // Two complex values are processed per iteration
+
+    __m128 x;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+
+    __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); // Sign bit set only in the imaginary lanes
+
+    for(;number < halfPoints; number++){
+
+      x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+      x = _mm_xor_ps(x, conjugator); // XOR flips the sign of each imaginary part
+
+      _mm_storeu_ps((float*)c,x); // Store the results back into the C container
+
+      a += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) { // Odd count: conjugate the one remaining value
+      *c = lv_conj(*a);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Takes the conjugate of a complex vector.
+    \param cVector The vector where the results will be stored
+    \param aVector Vector to be conjugated
+    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+  */
+static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = lv_conj(*aPtr++); // One complex value conjugated per iteration
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
 #ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
 #define INCLUDED_volk_32fc_conjugate_32fc_a_H
 
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
index 4106f38513..0d33ed7e28 100644
--- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
@@ -57,7 +57,7 @@ static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qB
   \param qBuffer The Q buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_32fc_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
   const float* complexVectorPtr = (float*)complexVector;
   float* iBufferPtr = iBuffer;
   float* qBufferPtr = qBuffer;
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
new file mode 100644
index 0000000000..4a4c5509bd
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
@@ -0,0 +1,156 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int halfPoints = num_points / 2; // Two complex values are processed per iteration
+    __m128 cplxValue, fVal;
+    __m128d dVal;
+
+    for(;number < halfPoints; number++){
+
+      cplxValue = _mm_loadu_ps(complexVectorPtr); // Unaligned load of two complex values (4 floats)
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i1i2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+      dVal = _mm_cvtps_pd(fVal); // Widen the two low floats to doubles
+      _mm_storeu_pd(iBufferPtr, dVal);
+
+      // Arrange in q1q2q1q2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+      dVal = _mm_cvtps_pd(fVal); // Widen the two low floats to doubles
+      _mm_storeu_pd(qBufferPtr, dVal);
+
+      iBufferPtr += 2;
+      qBufferPtr += 2;
+    }
+
+    number = halfPoints * 2; // Scalar tail: at most one complex value remains
+    for(; number < num_points; number++){
+      *iBufferPtr++ = *complexVectorPtr++;
+      *qBufferPtr++ = *complexVectorPtr++;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector; // Walk the input as interleaved floats
+  double* iBufferPtr = iBuffer;
+  double* qBufferPtr = qBuffer;
+
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (double)*complexVectorPtr++; // First float of the pair goes to iBuffer
+    *qBufferPtr++ = (double)*complexVectorPtr++; // Second float of the pair goes to qBuffer
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_u_H */
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int halfPoints = num_points / 2; // Two complex values are processed per iteration
+    __m128 cplxValue, fVal;
+    __m128d dVal;
+
+    for(;number < halfPoints; number++){
+
+      cplxValue = _mm_load_ps(complexVectorPtr); // Aligned load: input must be 16-byte aligned
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i1i2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+      dVal = _mm_cvtps_pd(fVal); // Widen the two low floats to doubles
+      _mm_store_pd(iBufferPtr, dVal); // Aligned store: iBuffer must be 16-byte aligned
+
+      // Arrange in q1q2q1q2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+      dVal = _mm_cvtps_pd(fVal); // Widen the two low floats to doubles
+      _mm_store_pd(qBufferPtr, dVal); // Aligned store: qBuffer must be 16-byte aligned
+
+      iBufferPtr += 2;
+      qBufferPtr += 2;
+    }
+
+    number = halfPoints * 2; // Scalar tail: at most one complex value remains
+    for(; number < num_points; number++){
+      *iBufferPtr++ = *complexVectorPtr++;
+      *qBufferPtr++ = *complexVectorPtr++;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector; // Walk the input as interleaved floats
+  double* iBufferPtr = iBuffer;
+  double* qBufferPtr = qBuffer;
+
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (double)*complexVectorPtr++; // First float of the pair goes to iBuffer
+    *qBufferPtr++ = (double)*complexVectorPtr++; // Second float of the pair goes to qBuffer
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
index c88809bebd..b1968296f5 100644
--- a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
@@ -51,7 +51,7 @@ static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const l
   \param qBuffer The I buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_32fc_deinterleave_imag_32f_a_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
   unsigned int number = 0;
   const float* complexVectorPtr = (float*)complexVector;
   float* qBufferPtr = qBuffer;
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
index 0d6c6b7af4..3d57598135 100644
--- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_real_32f.h
@@ -51,7 +51,7 @@ static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const l
   \param iBuffer The I buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_32fc_deinterleave_real_32f_a_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
   unsigned int number = 0;
   const float* complexVectorPtr = (float*)complexVector;
   float* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h
index 1e346bacaf..1fa66e8add 100644
--- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_real_64f.h
@@ -49,7 +49,7 @@ static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const
   \param iBuffer The I buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_32fc_deinterleave_real_64f_a_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
   unsigned int number = 0;
   const float* complexVectorPtr = (float*)complexVector;
   double* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/kernels/volk/volk_32fc_index_max_16u.h
index 0e2201152c..c8d7212401 100644
--- a/volk/include/volk/volk_32fc_index_max_16u_a.h
+++ b/volk/kernels/volk/volk_32fc_index_max_16u.h
@@ -189,7 +189,7 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_
 #endif /*LV_HAVE_SSE3*/
 
 #ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) {
+static inline void volk_32fc_index_max_16u_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) {
 
   const unsigned int num_bytes = num_points*8;
 
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h
index efb84a904b..64e99cc1be 100644
--- a/volk/include/volk/volk_32fc_magnitude_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h
@@ -1,3 +1,121 @@
+#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+  */
+static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4; // Four complex values are processed per iteration
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_loadu_ps(complexVectorPtr); // Unaligned load of two complex values
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_loadu_ps(complexVectorPtr); // Unaligned load of the next two complex values
+      complexVectorPtr += 4;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+      result = _mm_sqrt_ps(result); // Square root of each of the four sums
+
+      _mm_storeu_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4; // Scalar tail: handle the remaining (num_points % 4) values
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+  */
+static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4; // Four complex values are processed per iteration
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_loadu_ps(complexVectorPtr); // Unaligned load of two complex values
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_loadu_ps(complexVectorPtr); // Unaligned load of the next two complex values
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+      result = _mm_sqrt_ps(result); // Square root of each of the four sums
+
+      _mm_storeu_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4; // Scalar tail: handle the remaining (num_points % 4) values
+    for(; number < num_points; number++){
+       float val1Real = *complexVectorPtr++;
+       float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+  */
+static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector; // Walk the input as interleaved floats
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); // sqrt(re^2 + im^2)
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
 #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
 #define INCLUDED_volk_32fc_magnitude_32f_a_H
 
@@ -123,7 +241,7 @@ static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, con
     \param num_points The number of complex values in complexVector to be calculated and stored into cVector
   */
 extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points);
-static inline void volk_32fc_magnitude_32f_a_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
     volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
new file mode 100644
index 0000000000..0af81401a8
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
@@ -0,0 +1,228 @@
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4; // Four complex values are processed per iteration
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_loadu_ps(complexVectorPtr); // Unaligned load of two complex values
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_loadu_ps(complexVectorPtr); // Unaligned load of the next two complex values
+      complexVectorPtr += 4;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+      _mm_storeu_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4; // Scalar tail: handle the remaining (num_points % 4) values
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4; // Four complex values are processed per iteration
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_loadu_ps(complexVectorPtr); // Unaligned load of two complex values
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_loadu_ps(complexVectorPtr); // Unaligned load of the next two complex values
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+      _mm_storeu_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4; // Scalar tail: handle the remaining (num_points % 4) values
+    for(; number < num_points; number++){
+       float val1Real = *complexVectorPtr++;
+       float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector; // Walk the input as interleaved floats
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (real*real) + (imag*imag); // re^2 + im^2, no square root taken
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_u_H */
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4; // Four complex values are processed per iteration
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_load_ps(complexVectorPtr); // Aligned load: input must be 16-byte aligned
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_load_ps(complexVectorPtr); // Aligned load of the next two complex values
+      complexVectorPtr += 4;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+      _mm_store_ps(magnitudeVectorPtr, result); // Aligned store: output must be 16-byte aligned
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4; // Scalar tail: handle the remaining (num_points % 4) values
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4; // Four complex values are processed per iteration
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_load_ps(complexVectorPtr); // Aligned load: input must be 16-byte aligned
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_load_ps(complexVectorPtr); // Aligned load of the next two complex values
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+      _mm_store_ps(magnitudeVectorPtr, result); // Aligned store: output must be 16-byte aligned
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4; // Scalar tail: handle the remaining (num_points % 4) values
+    for(; number < num_points; number++){
+       float val1Real = *complexVectorPtr++;
+       float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector; // Walk the input as interleaved floats
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (real*real) + (imag*imag); // re^2 + im^2, no square root taken
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h
index d86bd63c1c..b076ab44ef 100644
--- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_atan2_32f.h
@@ -139,7 +139,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector,  const lv
   \param normalizeFactor The atan2 results will be divided by this normalization factor.
   \param num_points The number of complex values in the input vector.
 */
-static inline void volk_32fc_s32f_atan2_32f_a_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
+static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
   float* outPtr = outputVector;
   const float* inPtr = (float*)inputVector;
   const float invNormalizeFactor = 1.0 / normalizeFactor;
diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
index 1c17fb70c6..9e10217a0f 100644
--- a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
@@ -63,7 +63,7 @@ static inline void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer,
   \param iBuffer The I buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_32fc_s32f_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
   const float* complexVectorPtr = (float*)complexVector;
   int16_t* iBufferPtr = iBuffer;
   unsigned int number = 0;
diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h b/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h
index 38fd609d31..09abd967d6 100644
--- a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_magnitude_16i.h
@@ -129,7 +129,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
   \param magnitudeVector The vector containing the real output values
   \param num_points The number of complex values in complexVector to be calculated and stored into cVector
 */
-static inline void volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
   const float* complexVectorPtr = (float*)complexVector;
   int16_t* magnitudeVectorPtr = magnitudeVector;
   unsigned int number = 0;
@@ -150,7 +150,7 @@ static inline void volk_32fc_s32f_magnitude_16i_a_generic(int16_t* magnitudeVect
   \param num_points The number of complex values in complexVector to be calculated and stored into cVector
 */
 extern void volk_32fc_s32f_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_32fc_s32f_magnitude_16i_a_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_32fc_s32f_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
     volk_32fc_s32f_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/kernels/volk/volk_32fc_s32f_power_32fc.h
index 3106edbefd..d4a1d17469 100644
--- a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_power_32fc.h
@@ -94,7 +94,7 @@ static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_
     \param power The power value to be applied to each data point
     \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
   */
-static inline void volk_32fc_s32f_power_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
   lv_32fc_t* cPtr = cVector;
   const lv_32fc_t* aPtr = aVector;
   unsigned int number = 0;
diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
index 30a77dbc18..f76d9d35e4 100644
--- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
@@ -96,7 +96,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu
   \param normalizationFactor This value is divided agains all the input values before the power is calculated
   \param num_points The number of fft data points
 */
-static inline void volk_32fc_s32f_power_spectrum_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
+static inline void volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
   // Calculate the Power of the complex point
   const float* inputPtr = (float*)complexFFTInput;
   float* realFFTDataPointsPtr = logPowerOutput;
diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
index 27f755351d..e73eb09f8f 100644
--- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
@@ -103,7 +103,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo
   \param rbw The resolution bandwith of the fft spectrum
   \param num_points The number of fft data points
 */
-static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
   // Calculate the Power of the complex point
   const float* inputPtr = (float*)complexFFTInput;
   float* realFFTDataPointsPtr = logPowerOutput;
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
index f206c5e874..668a047609 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -1,3 +1,90 @@
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Scales every element of a complex vector by one complex constant; uses unaligned loads and stores
+  \param cVector Output buffer that receives aVector[i] * scalar
+  \param aVector Input buffer of complex samples
+  \param scalar Complex constant applied to every input sample
+  \param num_points Number of complex samples read from aVector and written to cVector
+*/
+static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+    const unsigned int pairCount = num_points / 2;
+    unsigned int pair;
+
+    const lv_32fc_t* src = aVector;
+    lv_32fc_t* dst = cVector;
+    __m128 in, swapped, realPart, imagPart, out;
+
+    // Broadcast the scalar's real part (sr) and imaginary part (si) to all four lanes
+    const __m128 scalarRe = _mm_set_ps1(lv_creal(scalar));
+    const __m128 scalarIm = _mm_set_ps1(lv_cimag(scalar));
+
+    for(pair = 0; pair < pairCount; ++pair){
+      // Two complex samples per pass, laid out as ar,ai,br,bi
+      in = _mm_loadu_ps((const float*)src);
+      // Swap real and imaginary within each sample: ai,ar,bi,br
+      swapped = _mm_shuffle_ps(in, in, 0xB1);
+      // ar*sr, ai*sr, br*sr, bi*sr
+      realPart = _mm_mul_ps(in, scalarRe);
+      // ai*si, ar*si, bi*si, br*si
+      imagPart = _mm_mul_ps(swapped, scalarIm);
+      // Subtract in even lanes, add in odd lanes: ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+      out = _mm_addsub_ps(realPart, imagPart);
+      // Write both products back to the output buffer
+      _mm_storeu_ps((float*)dst, out);
+
+      src += 2;
+      dst += 2;
+    }
+
+    if(num_points & 1u) {
+      *dst = (*src) * scalar;
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Scales every element of a complex vector by one complex constant
+  \param cVector Output buffer that receives aVector[i] * scalar
+  \param aVector Input buffer of complex samples
+  \param scalar Complex constant applied to every input sample
+  \param num_points Number of complex samples read from aVector and written to cVector
+*/
+static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+    const lv_32fc_t* src = aVector;
+    lv_32fc_t* dst = cVector;
+    unsigned int blocks = num_points / 8;
+    unsigned int leftover = num_points % 8;
+    // Hand-unrolled main loop: eight products per pass
+    for(; blocks > 0; --blocks){
+      dst[0] = src[0] * scalar;
+      dst[1] = src[1] * scalar;
+      dst[2] = src[2] * scalar;
+      dst[3] = src[3] * scalar;
+      dst[4] = src[4] * scalar;
+      dst[5] = src[5] * scalar;
+      dst[6] = src[6] * scalar;
+      dst[7] = src[7] * scalar;
+      src += 8;
+      dst += 8;
+    }
+
+    for(; leftover > 0; --leftover) // the final num_points % 8 samples
+      *dst++ = *src++ * scalar;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
 
diff --git a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
index eee9f0064f..ab6b7fb1df 100644
--- a/volk/include/volk/volk_32fc_s32fc_rotatorpuppet_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
@@ -4,7 +4,7 @@
 
 #include <volk/volk_complex.h>
 #include <stdio.h>
-#include <volk/volk_32fc_s32fc_x2_rotator_32fc_a.h>
+#include <volk/volk_32fc_s32fc_x2_rotator_32fc.h>
 
 
 #ifdef LV_HAVE_GENERIC
@@ -19,9 +19,9 @@
 */
 
 
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){    
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){    
     lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
-    volk_32fc_s32fc_x2_rotator_32fc_a_generic(outVector, inVector, phase_inc, phase, num_points);
+    volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc, phase, num_points);
     
 }
 #endif /* LV_HAVE_GENERIC */
@@ -32,7 +32,7 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_generic(lv_32fc_t* outVe
 
 static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){    
     lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
-    volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc, phase, num_points);
+    volk_32fc_s32fc_x2_rotator_32fc_sse4_1(outVector, inVector, phase_inc, phase, num_points);
     
 }
 
@@ -58,7 +58,7 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec
 
 static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){    
     lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
-    volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc, phase, num_points);
+    volk_32fc_s32fc_x2_rotator_32fc_avx(outVector, inVector, phase_inc, phase, num_points);
     
 }
     
diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
index 51b6041ec0..ffbbdff690 100644
--- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
@@ -20,7 +20,7 @@
 */
 
 
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){    
+static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){    
     unsigned int i = 0; 
     int j = 0;    
     for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
@@ -42,7 +42,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVecto
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
 
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
     lv_32fc_t* cPtr = outVector;
     const lv_32fc_t* aPtr = inVector;
     lv_32fc_t incr = 1;
@@ -153,7 +153,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
 
 
 
-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
+static inline void volk_32fc_s32fc_x2_rotator_32fc_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
     lv_32fc_t* cPtr = outVector;
     const lv_32fc_t* aPtr = inVector;
     lv_32fc_t incr = 1;
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
index 0deb9c2f90..e6ccf5c384 100644
--- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
@@ -1,3 +1,152 @@
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+
+
+#include<volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Computes the dot product of input with the complex conjugate of taps
+  \param result Single complex output: the sum over i of input[i] * conj(taps[i])
+  \param input Complex input samples
+  \param taps Complex taps; each one is conjugated before the multiply
+  \param num_points Number of complex samples in both input and taps
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  float * res = (float*) result;
+  float * in = (float*) input;
+  float * tp = (float*) taps;
+
+  // Derive both counts from num_points itself. Going through a byte count
+  // (num_points*8) wraps in unsigned int for very long vectors and would
+  // silently truncate the sum.
+  unsigned int n_2_ccomplex_blocks = num_points / 2;
+  unsigned int isodd = num_points & 1;
+
+  // Two independent {real, imag} accumulators, one per sample of each pair
+  float sum0[2] = {0,0};
+  float sum1[2] = {0,0};
+  unsigned int i = 0;
+
+
+  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+
+    // (a + jb) * conj(c + jd) = (ac + bd) + j(bc - ad)
+    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+
+    in += 4;
+    tp += 4;
+
+  }
+
+  // res is the float view of *result: slot 0 takes the real sum, slot 1 the imaginary sum
+  res[0] = sum0[0] + sum1[0];
+  res[1] = sum0[1] + sum1[1];
+
+  // Odd length: fold in the final sample that the pairwise loop did not reach
+  if(isodd) {
+    *result += input[num_points - 1] * lv_conj(taps[num_points - 1]);
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#include <mmintrin.h>
+
+
+/*!
+  \brief Computes the dot product of input with the complex conjugate of taps; uses unaligned loads
+  \param result Single complex output: the sum over i of input[i] * conj(taps[i])
+  \param input Complex input samples
+  \param taps Complex taps; each one is conjugated before the multiply
+  \param num_points Number of complex samples in both input and taps
+*/
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  // Count whole pairs from num_points itself. A byte count (num_points*8)
+  // would wrap in unsigned int for very long vectors.
+  const unsigned int halfPoints = num_points / 2;
+  unsigned int number = 0;
+
+  // Sign-bit mask for lanes 0 and 2. XORing it into the cross products negates
+  // the a*d terms, so their horizontal sum comes out as b*c - a*d.
+  union NegMask {
+    uint32_t intRep[4];
+    __m128 vec;
+  } negMask;
+
+  unsigned int offset = 0;
+  float Rsum=0, Isum=0;
+  float Im,Re;
+  lv_32fc_t dotProduct;
+
+  __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
+  const __m128 zv = _mm_setzero_ps();
+
+  // 0x80000000 is the sign bit of an IEEE-754 single-precision float
+  negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
+  negMask.intRep[1] = negMask.intRep[3] = 0;
+
+  // main loop: two complex samples (four floats) from each buffer per pass
+  for(; number < halfPoints; number++){
+
+    in1 = _mm_loadu_ps( (float*) (input+offset) ); // a0,b0,a1,b1
+    in2 = _mm_loadu_ps( (float*) (taps+offset) ); // c0,d0,c1,d1
+
+    Rv = _mm_mul_ps(in1, in2); // a0*c0, b0*d0, a1*c1, b1*d1
+    fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); // d0,c0,d1,c1
+    Iv = _mm_mul_ps(in1, fehg); // a0*d0, b0*c0, a1*d1, b1*c1
+
+    // Two horizontal adds against zero leave the four-lane sum in lane 0
+    Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
+    // Negate lanes 0 and 2 (the a*d products) before summing
+    Ivm = _mm_xor_ps( negMask.vec, Iv );
+    Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
+
+    // Pull lane 0 of each reduction out into a scalar
+    _mm_store_ss( &Im, Is );
+    _mm_store_ss( &Re, Rs );
+
+    // offset counts complex samples, so it advances by two per pass
+    offset += 2;
+    Rsum += Re;
+    Isum += Im;
+  }
+
+  // Package the accumulated real and imaginary sums as one complex value
+  dotProduct = lv_cmake(Rsum,Isum);
+
+  // Odd length: exactly one complex sample is left, at index offset.
+  // Handle it in scalar code: a four-float SIMD load at this position
+  // would read two floats past the end of both input and taps, even if
+  // the extra lanes were masked off afterwards.
+  if((num_points % 2) != 0){
+    dotProduct += input[offset] * lv_conj(taps[offset]);
+  }
+
+  result[0] = dotProduct;
+  return;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
+
+
+
 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
 
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index 10ff4080ed..066bed4439 100644
--- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -1,3 +1,119 @@
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/*!
+  \brief Computes the dot product of two complex vectors
+  \param result Single complex output: the sum over i of input[i] * taps[i]
+  \param input Complex input samples
+  \param taps Complex taps multiplied element-wise with input
+  \param num_points Number of complex samples in both input and taps
+*/
+static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  float * out = (float*) result;
+  float * x = (float*) input;
+  float * t = (float*) taps;
+  const unsigned int pairs = num_points/2;
+  unsigned int k;
+
+  // Two independent partial sums: "even" collects the first sample of each
+  // pair, "odd" the second; each is kept as separate real and imaginary floats.
+  float evenRe = 0, evenIm = 0;
+  float oddRe = 0, oddIm = 0;
+
+
+  for(k = 0; k < pairs; ++k, x += 4, t += 4) {
+
+    // (a + jb)(c + jd) = (ac - bd) + j(ad + bc), on interleaved floats
+    evenRe += x[0] * t[0] - x[1] * t[1];
+    evenIm += x[0] * t[1] + x[1] * t[0];
+    oddRe += x[2] * t[2] - x[3] * t[3];
+    oddIm += x[2] * t[3] + x[3] * t[2];
+
+  }
+
+
+  // Merge the partial sums and write them through the float view of *result
+  out[0] = evenRe + oddRe;
+  out[1] = evenIm + oddIm;
+
+
+  // Odd length: the pairwise loop stops one sample short, so add it here
+  if(num_points & 1) {
+    *result += input[num_points - 1] * taps[num_points - 1];
+  }
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+/*!
+  \brief Computes the dot product of two complex vectors; uses unaligned loads
+  \param result Single complex output: the sum over i of input[i] * taps[i]
+  \param input Complex input samples
+  \param taps Complex taps multiplied element-wise with input
+  \param num_points Number of complex samples in both input and taps
+*/
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+  lv_32fc_t dotProduct;
+  memset(&dotProduct, 0x0, 2*sizeof(float));
+
+  unsigned int number = 0;
+  const unsigned int halfPoints = num_points/2;
+
+  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+  const lv_32fc_t* a = input;
+  const lv_32fc_t* b = taps;
+
+  dotProdVal = _mm_setzero_ps();
+
+  for(;number < halfPoints; number++){
+    x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+    a += 2;
+    b += 2;
+  }
+
+  __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+  _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  dotProduct += ( dotProductVector[0] + dotProductVector[1] ); // Sum the two per-lane complex partial sums
+
+  // Odd length: one sample is still pending at a/b after the paired loop.
+  // The parity test must be modulo 2; a remainder modulo 1 is always zero.
+  if(num_points % 2 != 0) {
+    dotProduct += (*a) * (*b);
+  }
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
 #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
 #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
 
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
index f79ddb59bf..7db68c1bd8 100644
--- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -1,3 +1,80 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Element-wise complex product of two vectors; uses unaligned loads and stores
+    \param cVector Output buffer that receives aVector[i] * bVector[i]
+    \param aVector First complex input vector
+    \param bVector Second complex input vector
+    \param num_points Number of complex samples read from each input and written to cVector
+  */
+static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const unsigned int pairCount = num_points / 2;
+    unsigned int pair;
+
+    const lv_32fc_t* lhs = aVector;
+    const lv_32fc_t* rhs = bVector;
+    lv_32fc_t* dst = cVector;
+    __m128 lhsVec, rhsVec, rhsRe, rhsIm, lhsSwapped, prodRe, prodIm, out;
+
+    // Main loop: two complex products per pass
+    for(pair = 0; pair < pairCount; ++pair){
+      // Two complex samples from each input: ar,ai,br,bi and cr,ci,dr,di
+      lhsVec = _mm_loadu_ps((const float*)lhs);
+      rhsVec = _mm_loadu_ps((const float*)rhs);
+
+      // Split the right-hand operand into duplicated real and imaginary parts
+      rhsRe = _mm_moveldup_ps(rhsVec); // cr,cr,dr,dr
+      rhsIm = _mm_movehdup_ps(rhsVec); // ci,ci,di,di
+
+      prodRe = _mm_mul_ps(lhsVec, rhsRe); // ar*cr, ai*cr, br*dr, bi*dr
+      lhsSwapped = _mm_shuffle_ps(lhsVec, lhsVec, 0xB1); // ai,ar,bi,br
+      prodIm = _mm_mul_ps(lhsSwapped, rhsIm); // ai*ci, ar*ci, bi*di, br*di
+
+      // Subtract in even lanes, add in odd lanes
+      out = _mm_addsub_ps(prodRe, prodIm); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+      _mm_storeu_ps((float*)dst, out);
+
+      lhs += 2;
+      rhs += 2;
+      dst += 2;
+    }
+
+    if(num_points & 1u) {
+      *dst = (*lhs) * (*rhs);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Element-wise complex product of two vectors
+    \param cVector Output buffer that receives aVector[i] * bVector[i]
+    \param aVector First complex input vector
+    \param bVector Second complex input vector
+    \param num_points Number of complex samples read from each input and written to cVector
+  */
+static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const lv_32fc_t* lhs = aVector;
+    const lv_32fc_t* rhs = bVector;
+    unsigned int idx;
+
+    // One complex multiply per element, visited in increasing index order
+    for(idx = 0; idx < num_points; ++idx){
+      cVector[idx] = lhs[idx] * rhs[idx];
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
 #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
 
@@ -81,7 +158,7 @@ static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, cons
     \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
   */
 extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
-static inline void volk_32fc_x2_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
     volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
new file mode 100644
index 0000000000..cfd6c007f1
--- /dev/null
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
@@ -0,0 +1,162 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Element-wise product of vector a with the conjugate of vector b; uses unaligned loads and stores
+    \param cVector Output buffer that receives aVector[i] * conj(bVector[i])
+    \param aVector First complex input vector
+    \param bVector Second complex input vector; each sample is conjugated before the multiply
+    \param num_points Number of complex samples read from each input and written to cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const unsigned int pairCount = num_points / 2;
+    unsigned int pair;
+
+    const lv_32fc_t* lhs = aVector;
+    const lv_32fc_t* rhs = bVector;
+    lv_32fc_t* dst = cVector;
+    __m128 lhsVec, rhsVec, rhsRe, rhsIm, lhsSwapped, prodRe, prodIm, out;
+
+    // -0.f has only the sign bit set, so XOR with this mask negates the imaginary lanes
+    const __m128 signFlipImag = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+    for(pair = 0; pair < pairCount; ++pair){
+      // Two complex samples from each input: ar,ai,br,bi and cr,ci,dr,di
+      lhsVec = _mm_loadu_ps((const float*)lhs);
+      rhsVec = _mm_loadu_ps((const float*)rhs);
+
+      rhsVec = _mm_xor_ps(rhsVec, signFlipImag); // conjugate: cr,-ci,dr,-di
+
+      rhsRe = _mm_moveldup_ps(rhsVec); // cr,cr,dr,dr
+      rhsIm = _mm_movehdup_ps(rhsVec); // -ci,-ci,-di,-di
+
+      prodRe = _mm_mul_ps(lhsVec, rhsRe); // ar*cr, ai*cr, br*dr, bi*dr
+      lhsSwapped = _mm_shuffle_ps(lhsVec, lhsVec, 0xB1); // ai,ar,bi,br
+      prodIm = _mm_mul_ps(lhsSwapped, rhsIm); // -ai*ci, -ar*ci, -bi*di, -br*di
+
+      // Subtract in even lanes, add in odd lanes: ar*cr+ai*ci, ai*cr-ar*ci, br*dr+bi*di, bi*dr-br*di
+      out = _mm_addsub_ps(prodRe, prodIm);
+      _mm_storeu_ps((float*)dst, out);
+
+      lhs += 2;
+      rhs += 2;
+      dst += 2;
+    }
+
+    // Odd length: the last sample is multiplied in scalar code
+    if(num_points & 1u) {
+      *dst = (*lhs) * lv_conj(*rhs);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Element-wise product of vector a with the conjugate of vector b
+    \param cVector Output buffer that receives aVector[i] * conj(bVector[i])
+    \param aVector First complex input vector
+    \param bVector Second complex input vector; each sample is conjugated before the multiply
+    \param num_points Number of complex samples read from each input and written to cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const lv_32fc_t* lhs = aVector;
+    const lv_32fc_t* rhs = bVector;
+    unsigned int idx;
+
+    // One conjugate-multiply per element, visited in increasing index order
+    for(idx = 0; idx < num_points; ++idx){
+      cVector[idx] = lhs[idx] * lv_conj(rhs[idx]);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Element-wise product of vector a with the conjugate of vector b; uses aligned loads and stores
+    \param cVector Output buffer that receives aVector[i] * conj(bVector[i]); written with _mm_store_ps
+    \param aVector First complex input vector; read with _mm_load_ps
+    \param bVector Second complex input vector, conjugated before the multiply; read with _mm_load_ps
+    \param num_points Number of complex samples read from each input and written to cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const unsigned int pairCount = num_points / 2;
+    unsigned int pair;
+
+    const lv_32fc_t* lhs = aVector;
+    const lv_32fc_t* rhs = bVector;
+    lv_32fc_t* dst = cVector;
+    __m128 lhsVec, rhsVec, rhsRe, rhsIm, lhsSwapped, prodRe, prodIm, out;
+
+    // -0.f has only the sign bit set, so XOR with this mask negates the imaginary lanes
+    const __m128 signFlipImag = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+    for(pair = 0; pair < pairCount; ++pair){
+      // Aligned loads of two complex samples each: ar,ai,br,bi and cr,ci,dr,di
+      lhsVec = _mm_load_ps((const float*)lhs);
+      rhsVec = _mm_load_ps((const float*)rhs);
+
+      rhsVec = _mm_xor_ps(rhsVec, signFlipImag); // conjugate: cr,-ci,dr,-di
+
+      rhsRe = _mm_moveldup_ps(rhsVec); // cr,cr,dr,dr
+      rhsIm = _mm_movehdup_ps(rhsVec); // -ci,-ci,-di,-di
+
+      prodRe = _mm_mul_ps(lhsVec, rhsRe); // ar*cr, ai*cr, br*dr, bi*dr
+      lhsSwapped = _mm_shuffle_ps(lhsVec, lhsVec, 0xB1); // ai,ar,bi,br
+      prodIm = _mm_mul_ps(lhsSwapped, rhsIm); // -ai*ci, -ar*ci, -bi*di, -br*di
+
+      // Subtract in even lanes, add in odd lanes: ar*cr+ai*ci, ai*cr-ar*ci, br*dr+bi*di, bi*dr-br*di
+      out = _mm_addsub_ps(prodRe, prodIm);
+      _mm_store_ps((float*)dst, out);
+
+      lhs += 2;
+      rhs += 2;
+      dst += 2;
+    }
+
+    // Odd length: the last sample is multiplied in scalar code
+    if(num_points & 1u) {
+      *dst = (*lhs) * lv_conj(*rhs);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Element-wise product of vector a with the conjugate of vector b
+    \param cVector Output buffer that receives aVector[i] * conj(bVector[i])
+    \param aVector First complex input vector
+    \param bVector Second complex input vector; each sample is conjugated before the multiply
+    \param num_points Number of complex samples read from each input and written to cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const lv_32fc_t* lhs = aVector;
+    const lv_32fc_t* rhs = bVector;
+    unsigned int idx;
+
+    // One conjugate-multiply per element, visited in increasing index order
+    for(idx = 0; idx < num_points; ++idx){
+      cVector[idx] = lhs[idx] * lv_conj(rhs[idx]);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
index d985fcd7f5..cb2e945015 100644
--- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
@@ -107,7 +107,7 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t
 #endif /*LV_HAVE_SSE3*/
 
 #ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
 
   const unsigned int num_bytes = num_points*8;
 
diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h
index a10b6702bb..27a081b7cf 100644
--- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h
+++ b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h
@@ -93,7 +93,7 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t*
 #endif /*LV_HAVE_SSE3*/
 
 #ifdef LV_HAVE_GENERIC
-static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
+static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
 
   const unsigned int num_bytes = num_points*8;
 
diff --git a/volk/kernels/volk/volk_32i_s32f_convert_32f.h b/volk/kernels/volk/volk_32i_s32f_convert_32f.h
new file mode 100644
index 0000000000..7a09883453
--- /dev/null
+++ b/volk/kernels/volk/volk_32i_s32f_convert_32f.h
@@ -0,0 +1,148 @@
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
+#define INCLUDED_volk_32i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer (applied as a multiply by 1.0 / scalar)
+    \param num_points The number of data values to be converted
+    \note Neither buffer needs to be 16-byte aligned: the vector loop uses the unaligned _mm_loadu_si128 / _mm_storeu_ps intrinsics
+  */
+static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4; // number of full 4-value vector iterations
+
+     float* outputVectorPtr = outputVector;
+     const float iScalar = 1.0 / scalar; // reciprocal, so both loops multiply instead of divide
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector; // cast drops const; the data is only read below
+    __m128i inputVal;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+
+      // Load the 4 values (unaligned load)
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      ret = _mm_cvtepi32_ps(inputVal); // int32 -> float for all 4 lanes
+      ret = _mm_mul_ps(ret, invScalar);
+
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+      inputPtr += 4;
+    }
+
+    number = quarterPoints * 4; // scalar tail starts at the first index the vector loop did not cover
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides each floating point output data point by the scalar value (plain scalar loop)
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer (applied as a multiply by 1.0 / scalar)
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned; no SIMD intrinsics are used here
+  */
+static inline void volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int32_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar; // reciprocal computed once, outside the loop
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
+#define INCLUDED_volk_32i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer; read with the aligned _mm_load_si128, so it must be 16-byte aligned
+    \param outputVector The floating point output data buffer; written with the aligned _mm_store_ps, so it must be 16-byte aligned
+    \param scalar The value divided against each point in the output buffer (applied as a multiply by 1.0 / scalar)
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4; // number of full 4-value vector iterations
+
+     float* outputVectorPtr = outputVector;
+     const float iScalar = 1.0 / scalar; // reciprocal, so both loops multiply instead of divide
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector; // cast drops const; the data is only read below
+    __m128i inputVal;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+
+      // Load the 4 values (aligned load)
+      inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+      ret = _mm_cvtepi32_ps(inputVal); // int32 -> float for all 4 lanes
+      ret = _mm_mul_ps(ret, invScalar);
+
+      _mm_store_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+      inputPtr += 4;
+    }
+
+    number = quarterPoints * 4; // scalar tail starts at the first index the vector loop did not cover
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides each floating point output data point by the scalar value (plain scalar loop)
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer (applied as a multiply by 1.0 / scalar)
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int32_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar; // reciprocal computed once, outside the loop
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/kernels/volk/volk_32i_x2_and_32i.h
index e5330847b3..54ecb79812 100644
--- a/volk/include/volk/volk_32i_x2_and_32i_a.h
+++ b/volk/kernels/volk/volk_32i_x2_and_32i.h
@@ -51,7 +51,7 @@ static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aV
   \param bVector One of the vectors
   \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
 */
-static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+static inline void volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
     int32_t* cPtr = cVector;
     const int32_t* aPtr = aVector;
     const int32_t* bPtr=  bVector;
@@ -72,7 +72,7 @@ static inline void volk_32i_x2_and_32i_a_generic(int32_t* cVector, const int32_t
   \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
 */
 extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
-static inline void volk_32i_x2_and_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
     volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/kernels/volk/volk_32i_x2_or_32i.h
index 24045894c6..acadd5a57f 100644
--- a/volk/include/volk/volk_32i_x2_or_32i_a.h
+++ b/volk/kernels/volk/volk_32i_x2_or_32i.h
@@ -51,7 +51,7 @@ static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVe
   \param bVector One of the vectors to be ored
   \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
 */
-static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+static inline void volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
     int32_t* cPtr = cVector;
     const int32_t* aPtr = aVector;
     const int32_t* bPtr=  bVector;
@@ -72,7 +72,7 @@ static inline void volk_32i_x2_or_32i_a_generic(int32_t* cVector, const int32_t*
   \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
 */
 extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
-static inline void volk_32i_x2_or_32i_a_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
     volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/kernels/volk/volk_32u_byteswap.h
index 71ae027d37..8f6e3ad7b5 100644
--- a/volk/include/volk/volk_32u_byteswap_a.h
+++ b/volk/kernels/volk/volk_32u_byteswap.h
@@ -1,3 +1,80 @@
+#ifndef INCLUDED_volk_32u_byteswap_u_H
+#define INCLUDED_volk_32u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) a vector of uint32_t's; the buffer need not be aligned (unaligned load/store intrinsics are used).
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
+  unsigned int number = 0;
+
+  uint32_t* inputPtr = intsToSwap;
+  __m128i input, byte1, byte2, byte3, byte4, output;
+  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load four 32-bit values; inputPtr is advanced only after the store because the swap is done in-place.
+    input = _mm_loadu_si128((__m128i*)inputPtr);
+    // Do the four shifts; byte2 and byte3 still carry neighbouring bytes until they are masked below
+    byte1 = _mm_slli_epi32(input, 24);
+    byte2 = _mm_slli_epi32(input, 8);
+    byte3 = _mm_srli_epi32(input, 8);
+    byte4 = _mm_srli_epi32(input, 24);
+    // Or bytes together
+    output = _mm_or_si128(byte1, byte4);
+    byte2 = _mm_and_si128(byte2, byte2mask);
+    output = _mm_or_si128(output, byte2);
+    byte3 = _mm_and_si128(byte3, byte3mask);
+    output = _mm_or_si128(output, byte3);
+    // Store the results over the values that were just read
+    _mm_storeu_si128((__m128i*)inputPtr, output);
+    inputPtr += 4;
+  }
+
+  // Byteswap any remaining points (fewer than four) one at a time:
+  number = quarterPoints*4;
+  for(; number < num_points; number++){
+    uint32_t outputVal = *inputPtr;
+    outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+    *inputPtr = outputVal;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint32_t's using a plain scalar loop.
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = intsToSwap;
+
+  unsigned int point;
+  for(point = 0; point < num_points; point++){
+    uint32_t output = *inputPtr;
+    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); // reverse the four bytes
+
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32u_byteswap_u_H */
 #ifndef INCLUDED_volk_32u_byteswap_a_H
 #define INCLUDED_volk_32u_byteswap_a_H
 
diff --git a/volk/include/volk/volk_32u_popcnt_a.h b/volk/kernels/volk/volk_32u_popcnt.h
index b72d605c67..9783569729 100644
--- a/volk/include/volk/volk_32u_popcnt_a.h
+++ b/volk/kernels/volk/volk_32u_popcnt.h
@@ -7,7 +7,7 @@
 
 #ifdef LV_HAVE_GENERIC
 
-static inline void volk_32u_popcnt_a_generic(uint32_t* ret, const uint32_t value) {
+static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) {
 
   // This is faster than a lookup table
   uint32_t retVal = value;
diff --git a/volk/kernels/volk/volk_64f_convert_32f.h b/volk/kernels/volk/volk_64f_convert_32f.h
new file mode 100644
index 0000000000..c27526ffaf
--- /dev/null
+++ b/volk/kernels/volk/volk_64f_convert_32f.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_64f_convert_32f_u_H
+#define INCLUDED_volk_64f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the double values into float values; neither buffer needs to be aligned (unaligned load/store intrinsics are used)
+    \param outputVector The converted float vector values
+    \param inputVector The double vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4; // each iteration converts four doubles
+
+  const double* inputVectorPtr = (const double*)inputVector;
+  float* outputVectorPtr = outputVector;
+  __m128 ret, ret2;
+  __m128d inputVal1, inputVal2;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; // doubles 0 and 1
+    inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; // doubles 2 and 3
+
+    ret = _mm_cvtpd_ps(inputVal1); // two floats land in the low half of the register
+    ret2 = _mm_cvtpd_ps(inputVal2);
+
+    ret = _mm_movelh_ps(ret, ret2); // low half of ret2 becomes the high half of ret
+
+    _mm_storeu_ps(outputVectorPtr, ret);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4; // scalar tail starts at the first index the vector loop did not cover
+  for(; number < num_points; number++){
+    outputVector[number] = (float)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the double values into float values using a plain scalar loop
+  \param outputVector The converted float vector values
+  \param inputVector The double vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const double* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)); // narrowing cast, one value per iteration
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_u_H */
+#ifndef INCLUDED_volk_64f_convert_32f_a_H
+#define INCLUDED_volk_64f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the double values into float values; both buffers must be 16-byte aligned (aligned _mm_load_pd / _mm_store_ps are used)
+    \param outputVector The converted float vector values
+    \param inputVector The double vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4; // each iteration converts four doubles
+
+  const double* inputVectorPtr = (const double*)inputVector;
+  float* outputVectorPtr = outputVector;
+  __m128 ret, ret2;
+  __m128d inputVal1, inputVal2;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; // doubles 0 and 1
+    inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; // doubles 2 and 3
+
+    ret = _mm_cvtpd_ps(inputVal1); // two floats land in the low half of the register
+    ret2 = _mm_cvtpd_ps(inputVal2);
+
+    ret = _mm_movelh_ps(ret, ret2); // low half of ret2 becomes the high half of ret
+
+    _mm_store_ps(outputVectorPtr, ret);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4; // scalar tail starts at the first index the vector loop did not cover
+  for(; number < num_points; number++){
+    outputVector[number] = (float)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the double values into float values using a plain scalar loop
+  \param outputVector The converted float vector values
+  \param inputVector The double vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const double* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)); // narrowing cast, one value per iteration
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_a_H */
diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/kernels/volk/volk_64f_x2_max_64f.h
index 33aae6d102..f9a04c2c40 100644
--- a/volk/include/volk/volk_64f_x2_max_64f_a.h
+++ b/volk/kernels/volk/volk_64f_x2_max_64f.h
@@ -53,7 +53,7 @@ static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVe
   \param bVector The vector to be checked
   \param num_points The number of values in aVector and bVector to be checked and stored into cVector
 */
-static inline void volk_64f_x2_max_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+static inline void volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
     double* cPtr = cVector;
     const double* aPtr = aVector;
     const double* bPtr=  bVector;
diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/kernels/volk/volk_64f_x2_min_64f.h
index 25d8b4c982..c77ca87fbd 100644
--- a/volk/include/volk/volk_64f_x2_min_64f_a.h
+++ b/volk/kernels/volk/volk_64f_x2_min_64f.h
@@ -53,7 +53,7 @@ static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVe
   \param bVector The vector to be checked
   \param num_points The number of values in aVector and bVector to be checked and stored into cVector
 */
-static inline void volk_64f_x2_min_64f_a_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+static inline void volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
     double* cPtr = cVector;
     const double* aPtr = aVector;
     const double* bPtr=  bVector;
diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/kernels/volk/volk_64u_byteswap.h
index 3d1d87623e..e05daf6d5c 100644
--- a/volk/include/volk/volk_64u_byteswap_a.h
+++ b/volk/kernels/volk/volk_64u_byteswap.h
@@ -1,3 +1,91 @@
+#ifndef INCLUDED_volk_64u_byteswap_u_H
+#define INCLUDED_volk_64u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) a vector of uint64_t's; the buffer need not be aligned (unaligned load/store intrinsics are used).
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
+    uint32_t* inputPtr = (uint32_t*)intsToSwap; // each 64-bit value is handled as two 32-bit words
+    __m128i input, byte1, byte2, byte3, byte4, output;
+    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+    uint64_t number = 0;
+    const unsigned int halfPoints = num_points / 2; // one 128-bit register holds two 64-bit points
+    for(;number < halfPoints; number++){
+      // Load four 32-bit words; inputPtr is advanced only after the store because the swap is done in-place.
+      input = _mm_loadu_si128((__m128i*)inputPtr);
+
+      // Do the four shifts; byte2 and byte3 still carry neighbouring bytes until they are masked below
+      byte1 = _mm_slli_epi32(input, 24);
+      byte2 = _mm_slli_epi32(input, 8);
+      byte3 = _mm_srli_epi32(input, 8);
+      byte4 = _mm_srli_epi32(input, 24);
+      // Or bytes together
+      output = _mm_or_si128(byte1, byte4);
+      byte2 = _mm_and_si128(byte2, byte2mask);
+      output = _mm_or_si128(output, byte2);
+      byte3 = _mm_and_si128(byte3, byte3mask);
+      output = _mm_or_si128(output, byte3);
+
+      // Reorder the two words: swap the 32-bit halves inside each 64-bit value
+      output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+      // Store the results over the values that were just read
+      _mm_storeu_si128((__m128i*)inputPtr, output);
+      inputPtr += 4;
+    }
+
+    // Byteswap any remaining points (at most one) with scalar code:
+    number = halfPoints*2;
+    for(; number < num_points; number++){
+      uint32_t output1 = *inputPtr;
+      uint32_t output2 = inputPtr[1];
+
+      output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+      output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+      *inputPtr++ = output2; // the two byte-reversed words are written back in swapped order
+      *inputPtr++ = output1;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) a vector of uint64_t's using a plain scalar loop.
+  \param intsToSwap The vector of data to byte swap
+  \param num_points The number of data points
+*/
+static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = (uint32_t*)intsToSwap; // each 64-bit value is handled as two 32-bit words
+  unsigned int point;
+  for(point = 0; point < num_points; point++){
+    uint32_t output1 = *inputPtr;
+    uint32_t output2 = inputPtr[1];
+
+    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+    *inputPtr++ = output2; // the two byte-reversed words are written back in swapped order
+    *inputPtr++ = output1;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64u_byteswap_u_H */
 #ifndef INCLUDED_volk_64u_byteswap_a_H
 #define INCLUDED_volk_64u_byteswap_a_H
 
diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/kernels/volk/volk_64u_popcnt.h
index 5e68ed2083..466cfa5dad 100644
--- a/volk/include/volk/volk_64u_popcnt_a.h
+++ b/volk/kernels/volk/volk_64u_popcnt.h
@@ -8,7 +8,7 @@
 #ifdef LV_HAVE_GENERIC
 
 
-static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) {
+static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) {
 
   //const uint32_t* valueVector = (const uint32_t*)&value;
 
diff --git a/volk/include/volk/volk_8i_convert_16i_a.h b/volk/kernels/volk/volk_8i_convert_16i.h
index 9104f90cb0..3e5c92723f 100644
--- a/volk/include/volk/volk_8i_convert_16i_a.h
+++ b/volk/kernels/volk/volk_8i_convert_16i.h
@@ -1,3 +1,76 @@
+#ifndef INCLUDED_volk_8i_convert_16i_u_H
+#define INCLUDED_volk_8i_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data, scaling each value by 256 (left shift by 8)
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16; // each iteration consumes 16 input bytes
+
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m128i* outputVectorPtr = (__m128i*)outputVector;
+    __m128i inputVal;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_loadu_si128(inputVectorPtr);
+      ret = _mm_cvtepi8_epi16(inputVal); // sign-extend the low 8 bytes to 16 bit
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_storeu_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVal = _mm_srli_si128(inputVal, 8); // bring the upper 8 bytes down to the low half
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_storeu_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVectorPtr++;
+    }
+
+    number = sixteenthPoints * 16; // scalar tail starts at the first index the vector loop did not cover
+    for(; number < num_points; number++){
+      outputVector[number] = (int16_t)(inputVector[number])*256;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data, scaling each value by 256 (plain scalar loop)
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; // widen, then scale by 256
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8i_convert_16i_u_H */
 #ifndef INCLUDED_volk_8i_convert_16i_a_H
 #define INCLUDED_volk_8i_convert_16i_a_H
 
@@ -73,7 +146,7 @@ static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const in
     \param num_points The number of data values to be converted
   */
 extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
-static inline void volk_8i_convert_16i_a_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
     volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/kernels/volk/volk_8i_s32f_convert_32f.h
index 02a7f356e0..bd7ff82d9a 100644
--- a/volk/include/volk/volk_8i_s32f_convert_32f_a.h
+++ b/volk/kernels/volk/volk_8i_s32f_convert_32f.h
@@ -1,3 +1,97 @@
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
+#define INCLUDED_volk_8i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer (applied as a multiply by 1.0 / scalar)
+    \param num_points The number of data values to be converted
+    \note Neither buffer needs to be 16-byte aligned: the vector loop uses the unaligned _mm_loadu_si128 / _mm_storeu_ps intrinsics
+  */
+static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16; // each iteration consumes 16 input bytes
+
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar; // reciprocal, so both loops multiply instead of divide
+    __m128 invScalar = _mm_set_ps1( iScalar );
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+      interimVal = _mm_cvtepi8_epi32(inputVal); // sign-extend the low 4 bytes to 32 bit
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4); // bring the next 4 bytes down to the low positions
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4); // third group of 4 bytes
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4); // fourth group of 4 bytes
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16; // scalar tail starts at the first index the vector loop did not cover
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides each floating point output data point by the scalar value (plain scalar loop)
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer (applied as a multiply by 1.0 / scalar)
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned; no SIMD intrinsics are used here
+  */
+static inline void volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar; // reciprocal computed once, outside the loop
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
 #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
 #define INCLUDED_volk_8i_s32f_convert_32f_a_H
 
@@ -95,7 +189,7 @@ static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, const
     \param num_points The number of data values to be converted
   */
 extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
-static inline void volk_8i_s32f_convert_32f_a_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
     float invscalar = 1.0 / scalar;
     volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
 }
diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
index 8f13da32ff..b59d22d186 100644
--- a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
@@ -59,7 +59,7 @@ static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16
   \param qBuffer The Q buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_8ic_deinterleave_16i_x2_a_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
   const int8_t* complexVectorPtr = (const int8_t*)complexVector;
   int16_t* iBufferPtr = iBuffer;
   int16_t* qBufferPtr = qBuffer;
diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
index d26b3d0d0d..82cedb2bb7 100644
--- a/volk/include/volk/volk_8ic_deinterleave_real_16i_a.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
@@ -49,7 +49,7 @@ static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, con
   \param iBuffer The I buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_8ic_deinterleave_real_16i_a_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
   unsigned int number = 0;
   const int8_t* complexVectorPtr = (const int8_t*)complexVector;
   int16_t* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
index 21efed83e7..c8ff18e67b 100644
--- a/volk/include/volk/volk_8ic_deinterleave_real_8i_a.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
@@ -50,7 +50,7 @@ static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const
   \param iBuffer The I buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_8ic_deinterleave_real_8i_a_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
   unsigned int number = 0;
   const int8_t* complexVectorPtr = (int8_t*)complexVector;
   int8_t* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
index d82da59fb1..9e244c8fc2 100644
--- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h
+++ b/volk/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
@@ -146,7 +146,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float
   \param scalar The scaling value being multiplied against each data point
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_8ic_s32f_deinterleave_32f_x2_a_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
   const int8_t* complexVectorPtr = (const int8_t*)complexVector;
   float* iBufferPtr = iBuffer;
   float* qBufferPtr = qBuffer;
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
index b2c15d3a30..56a1adcbb5 100644
--- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h
+++ b/volk/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
@@ -116,7 +116,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con
   \param scalar The scaling value being multiplied against each data point
   \param num_points The number of complex data values to be deinterleaved
 */
-static inline void volk_8ic_s32f_deinterleave_real_32f_a_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+static inline void volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
   unsigned int number = 0;
   const int8_t* complexVectorPtr = (const int8_t*)complexVector;
   float* iBufferPtr = iBuffer;
diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
index f85fdb9995..685a21ddcd 100644
--- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h
+++ b/volk/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
@@ -75,7 +75,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect
   \param bVector The complex vector which will be converted to complex conjugate and multiplied
   \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
 */
-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
   unsigned int number = 0;
   int16_t* c16Ptr = (int16_t*)cVector;
   int8_t* a8Ptr = (int8_t*)aVector;
diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
index 4b16171cec..edb52ff509 100644
--- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h
+++ b/volk/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
@@ -95,7 +95,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t*
   \param bVector The complex vector which will be converted to complex conjugate and multiplied
   \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
 */
-static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
   unsigned int number = 0;
   float* cPtr = (float*)cVector;
   const float invScalar = 1.0 / scalar;
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
index d9aeb797c3..fcb0edb68f 100644
--- a/volk/lib/CMakeLists.txt
+++ b/volk/lib/CMakeLists.txt
@@ -202,7 +202,7 @@ message(STATUS "Available machines: ${available_machines}")
 #dependencies are all python, xml, and header implementation files
 file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml)
 file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py)
-file(GLOB h_files ${CMAKE_SOURCE_DIR}/include/volk/*.h)
+file(GLOB h_files ${CMAKE_SOURCE_DIR}/kernels/volk/*.h)
 
 macro(gen_template tmpl output)
     list(APPEND volk_gen_sources ${output})
@@ -253,6 +253,7 @@ endforeach(machine_name)
 include_directories(
     ${CMAKE_BINARY_DIR}/include
     ${CMAKE_SOURCE_DIR}/include
+    ${CMAKE_SOURCE_DIR}/kernels
     ${CMAKE_CURRENT_BINARY_DIR}
     ${CMAKE_CURRENT_SOURCE_DIR}
 )
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index 4e361aece2..e526eb2d01 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -63,12 +63,12 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) {
     }
 }
 
-static std::vector<std::string> get_arch_list(struct volk_func_desc desc) {
+static std::vector<std::string> get_arch_list(volk_func_desc_t desc) {
     std::vector<std::string> archlist;
 
-    for(int i = 0; i < desc.n_archs; i++) {
+    for(size_t i = 0; i < desc.n_impls; i++) {
         //if(!(archs[i+1] & volk_get_lvarch())) continue; //this arch isn't available on this pc
-        archlist.push_back(std::string(desc.indices[i]));
+        archlist.push_back(std::string(desc.impl_names[i]));
     }
 
     return archlist;
@@ -256,7 +256,7 @@ public:
 private: std::list<std::vector<char> > _mems;
 };
 
-bool run_volk_tests(struct volk_func_desc desc,
+bool run_volk_tests(volk_func_desc_t desc,
                     void (*manual_func)(),
                     std::string name,
                     float tol,
@@ -442,22 +442,32 @@ bool run_volk_tests(struct volk_func_desc desc,
         arch_results.push_back(!fail);
     }
 
-    double best_time = std::numeric_limits<double>::max();
-    std::string best_arch = "generic";
-    for(size_t i=0; i < arch_list.size(); i++) {
-        if((profile_times[i] < best_time) && arch_results[i]) {
-            best_time = profile_times[i];
-            best_arch = arch_list[i];
+    double best_time_a = std::numeric_limits<double>::max();
+    double best_time_u = std::numeric_limits<double>::max();
+    std::string best_arch_a = "generic";
+    std::string best_arch_u = "generic";
+    for(size_t i=0; i < arch_list.size(); i++)
+    {
+        if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0)
+        {
+            best_time_u = profile_times[i];
+            best_arch_u = arch_list[i];
+        }
+        if((profile_times[i] < best_time_a) && arch_results[i])
+        {
+            best_time_a = profile_times[i];
+            best_arch_a = arch_list[i];
         }
     }
 
-    std::cout << "Best arch: " << best_arch << std::endl;
+    std::cout << "Best aligned arch: " << best_arch_a << std::endl;
+    std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
     if(best_arch_vector) {
         if(puppet_master_name == "NULL") {
-            best_arch_vector->push_back(name + std::string(" ") + best_arch);
+            best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u);
         }
         else {
-            best_arch_vector->push_back(puppet_master_name + std::string(" ") + best_arch);
+            best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u);
         }
     }
 
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
index 1e639ac3c6..0f17cdaa34 100644
--- a/volk/lib/qa_utils.h
+++ b/volk/lib/qa_utils.h
@@ -21,7 +21,7 @@ volk_type_t volk_type_from_string(std::string);
 float uniform(void);
 void random_floats(float *buf, unsigned n);
 
-bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string);
+bool run_volk_tests(volk_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string);
 
 
 #define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); }
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 9b7d9da5e3..f133897cba 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -2,109 +2,89 @@
 #include <volk/volk.h>
 #include <boost/test/unit_test.hpp>
 
-//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000);
-//VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000);
-VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a, 1, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 20460, 1);
-VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 20460, 1);
-VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 20460, 1);
-VOLK_RUN_TESTS(volk_16i_convert_8i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1);
-//VOLK_RUN_TESTS(volk_16i_max_star_16i_a, 0, 0, 20460, 10000);
-//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a, 0, 0, 20460, 10000);
-//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000);
-//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000);
-VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_16u_byteswap_u, 0, 0, 20460, 1);
-//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 20460, 1);
-//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000);
-VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
-VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_convert_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a, 1, 128, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 20460, 1);
-//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000);
-VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 2046, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1);
-//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000);
-VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32767, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32i_x2_and_32i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_32i_x2_or_32i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32u_byteswap_a, 0, 0, 20460, 1);
-//VOLK_RUN_TESTS(volk_32u_popcnt_a, 0, 0, 2046, 10000);
-VOLK_RUN_TESTS(volk_64f_convert_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_64f_x2_max_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_64f_x2_min_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_64u_byteswap_a, 0, 0, 20460, 1);
-//VOLK_RUN_TESTS(volk_64u_popcnt_a, 0, 0, 2046, 10000);
-VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a, 0, 256, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_conjugate_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_conjugate_32fc_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000);
+//VOLK_RUN_TESTS(volk_16i_branch_4_state_8, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f, 1e-5, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_magnitude_16i, 1, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f, 1e-5, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_s32f_convert_32f, 1e-4, 32768.0, 20460, 1);
+VOLK_RUN_TESTS(volk_16i_convert_8i, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_max_star_16i, 0, 0, 20460, 10000);
+//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i, 0, 0, 20460, 10000);
+//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 1000);
+//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 1000);
+VOLK_RUN_TESTS(volk_16u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_accumulator_s32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20460, 1);
+//VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i, 0, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc, 1e-4, 0, 2046000, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32fc_index_max_16u, 3, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_magnitude_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_16i, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_32i, 1, 2<<31, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_convert_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_convert_8i, 1, 128, 20460, 1);
+//VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f, 1e-4, 0, 2046, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, 1e-4, 10, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_divide_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i, 1e-4, 0, 204600, 1);
+//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000);
+VOLK_RUN_TESTS(volk_32f_index_max_16u, 3, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic, 1, 32767, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_max_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_min_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_normalize, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_power_32f, 1e-4, 4, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_sqrt_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_subtract_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_32u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_64f_convert_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_x2_max_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64f_x2_min_64f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_64u_byteswap, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_64u_popcnt, 0, 0, 2046, 10000);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i, 0, 256, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_convert_16i, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_8i_s32f_convert_32f, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_conjugate_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_multiply_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1);
diff --git a/volk/lib/volk_prefs.c b/volk/lib/volk_prefs.c
index 5e5c9dfff7..f787b5e2aa 100644
--- a/volk/lib/volk_prefs.c
+++ b/volk/lib/volk_prefs.c
@@ -7,7 +7,8 @@
 //#include <Windows.h>
 //#endif
 
-void get_config_path(char *path) {
+void volk_get_config_path(char *path)
+{
     const char *suffix = "/.volk/volk_config";
     char *home = NULL;
     if (home == NULL) home = getenv("HOME");
@@ -20,38 +21,52 @@ void get_config_path(char *path) {
     strcat(path, suffix);
 }
 
-//passing by reference in C can (***********)
-int load_preferences(struct volk_arch_pref **prefs) {
+//Load the kernel implementation preferences from the volk config file.
+//
+//Each config line of the form "<kernel> <aligned impl> <unaligned impl>"
+//whose kernel name starts with "volk_" becomes one entry of a heap array.
+//Lines that do not parse, or whose tokens would not fit the fixed-size
+//fields of an entry, are skipped.
+//
+//prefs_res: receives the array (NULL when no entry was stored); it is
+//           left untouched when no config file could be opened.
+//returns:   the number of entries stored in the array.
+size_t volk_load_preferences(volk_arch_pref_t **prefs_res)
+{
     FILE *config_file;
-    char path[512], line[512], function[128], arch[32];
-    int n_arch_prefs = 0;
-    struct volk_arch_pref *t_pref;
+    //path starts out empty so that "nothing was written" is detectable (an array is never NULL)
+    char path[512] = "", line[512];
+    //scratch tokens are as large as line, so "%s" cannot overflow them
+    char name[sizeof(line)], impl_a[sizeof(line)], impl_u[sizeof(line)];
+    size_t n_arch_prefs = 0;
+    volk_arch_pref_t *prefs = NULL;
 
     //get the config path
-    get_config_path(path);
-    if (path == NULL) return n_arch_prefs; //no prefs found
+    volk_get_config_path(path);
+    if (path[0] == '\0') return n_arch_prefs; //no prefs found
     config_file = fopen(path, "r");
     if(!config_file) return n_arch_prefs; //no prefs found
 
-    while(fgets(line, 512, config_file) != NULL) {
-        if(sscanf(line, "%s %s", function, arch) == 2 && !strncmp(function, "volk_", 5)) {
-            n_arch_prefs++;
-        }
-    }
-
-    //now allocate the memory required for volk_arch_prefs
-    (*prefs) = (struct volk_arch_pref *) malloc(n_arch_prefs * sizeof(struct volk_arch_pref));
-    t_pref = (*prefs);
-
-    //reset the file pointer and write the prefs into volk_arch_prefs
-    rewind(config_file);
-    while(fgets(line, 512, config_file) != NULL) {
-        if(sscanf(line, "%s %s", function, arch) == 2 && !strncmp(function, "volk_", 5)) {
-            strncpy(t_pref->name, function, 128);
-            strncpy(t_pref->arch, arch, 32);
-            t_pref++;
-        }
+    //parse line by line, growing the array by one entry per accepted line
+    while(fgets(line, sizeof(line), config_file) != NULL)
+    {
+        volk_arch_pref_t *grown;
+        if(sscanf(line, "%s %s %s", name, impl_a, impl_u) != 3) continue;
+        if(strncmp(name, "volk_", 5)) continue;
+        //skip tokens too long for the destination fields instead of overflowing them
+        if(strlen(name) >= sizeof(prefs->name)) continue;
+        if(strlen(impl_a) >= sizeof(prefs->impl_a)) continue;
+        if(strlen(impl_u) >= sizeof(prefs->impl_u)) continue;
+
+        grown = (volk_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs));
+        if(grown == NULL) break; //out of memory: keep the entries parsed so far
+        prefs = grown;
+        strcpy(prefs[n_arch_prefs].name, name);
+        strcpy(prefs[n_arch_prefs].impl_a, impl_a);
+        strcpy(prefs[n_arch_prefs].impl_u, impl_u);
+        n_arch_prefs++;
     }
     fclose(config_file);
+    *prefs_res = prefs;
     return n_arch_prefs;
 }
diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c
index 865d60955c..dfe44e2924 100644
--- a/volk/lib/volk_rank_archs.c
+++ b/volk/lib/volk_rank_archs.c
@@ -1,43 +1,120 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
 #include <volk_rank_archs.h>
 #include <volk/volk_prefs.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name) {
+#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4
+    #define __popcnt __builtin_popcount
+#elif HAVE_INTRIN_H
+    #include <intrin.h> //defines __popcnt
+#else
+    #error no popcnt for your compiler, add one here
+#endif
+
+//Find the index of the implementation called impl_name in impl_names.
+//Names are compared over at most their first 20 characters.
+//A name that is not listed falls back to the index of "generic" after
+//printing a warning; -1 is returned when "generic" is not listed either.
+int volk_get_index(
+    const char *impl_names[], //list of implementations by name
+    const size_t n_impls,     //number of implementations available
+    const char *impl_name     //the implementation name to find
+){
     unsigned int i;
-    for(i=0; i<n_archs; i++) {
-        if(!strncmp(indices[i], arch_name, 20)) {
+    for (i = 0; i < n_impls; i++) {
+        if(!strncmp(impl_names[i], impl_name, 20)) {
             return i;
         }
     }
+    //a missing "generic" has no further fallback: stop here rather than recurse forever
+    if(!strncmp(impl_name, "generic", 20)) {
+        printf("Volk warning: no generic impl found\n");
+        return -1;
+    }
     //something terrible should happen here
     printf("Volk warning: no arch found, returning generic impl\n");
-    return get_index(indices, n_archs, "generic"); //but we'll fake it for now
+    return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
 }
 
-unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsigned int n_archs, const char* name, unsigned int arch) {
-  unsigned int i;
-  unsigned int best_val = 0;
-  static struct volk_arch_pref *volk_arch_prefs;
-  static unsigned int n_arch_prefs = 0;
+//Pick the implementation to use for the kernel kern_name.
+//An entry for the kernel in the user's preferences (loaded once, on the
+//first call) wins when it resolves to a listed implementation; otherwise
+//the implementation whose requirement mask has the most bits set is chosen:
+//the best aligned one if align is set and one exists, else the best unaligned.
+int volk_rank_archs(
+    const char *kern_name,    //name of the kernel to rank
+    const char *impl_names[], //list of implementations by name
+    const int* impl_deps,     //requirement mask per implementation
+    const bool* alignment,    //alignment status of each implementation
+    size_t n_impls,            //number of implementations available
+    const bool align          //if false, filter aligned implementations
+){
+  size_t i;
+  static volk_arch_pref_t *volk_arch_prefs;
+  static size_t n_arch_prefs = 0;
   static int prefs_loaded = 0;
   if(!prefs_loaded) {
-      n_arch_prefs = load_preferences(&volk_arch_prefs);
+      n_arch_prefs = volk_load_preferences(&volk_arch_prefs);
       prefs_loaded = 1;
   }
 
-  //now look for the function name in the prefs list
-  for(i=0; i < n_arch_prefs; i++) {
-      if(!strncmp(name, volk_arch_prefs[i].name, 128)) { //found it
-        return get_index(indices, n_archs, volk_arch_prefs[i].arch);
-      }
-  }
+    //now look for the function name in the prefs list
+    for(i = 0; i < n_arch_prefs; i++)
+    {
+        if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it
+        {
+            const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
+            const int pref_index = volk_get_index(impl_names, n_impls, impl_name);
+            if (pref_index >= 0) return pref_index;
+            break; //nothing usable behind this preference: rank by requirements instead
+        }
+    }
 
-  for(i=1; i < n_archs; ++i) {
-    if((arch_defs[i]&(!arch)) == 0) {
-      best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? i-1 : best_val;
+    //return the best index with the largest deps
+    size_t best_index_a = 0;
+    size_t best_index_u = 0;
+    int best_value_a = -1;
+    int best_value_u = -1;
+    for(i = 0; i < n_impls; i++)
+    {
+        const signed val = __popcnt(impl_deps[i]);
+        if (alignment[i] && val > best_value_a)
+        {
+            best_index_a = i;
+            best_value_a = val;
+        }
+        if (!alignment[i] && val > best_value_u)
+        {
+            best_index_u = i;
+            best_value_u = val;
+        }
     }
-  }
-  return best_val;
+
+    //when align and we found a best aligned, use it
+    if (align && best_value_a != -1) return best_index_a;
+
+    //otherwise return the best unaligned
+    return best_index_u;
 }
diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h
index 546240d2c6..b3bf8ff17c 100644
--- a/volk/lib/volk_rank_archs.h
+++ b/volk/lib/volk_rank_archs.h
@@ -1,12 +1,48 @@
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
 #ifndef INCLUDED_VOLK_RANK_ARCHS_H
 #define INCLUDED_VOLK_RANK_ARCHS_H
 
+#include <stdlib.h>
+#include <stdbool.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-unsigned int get_index(const char *indices[], unsigned int n_archs, const char *arch_name);
-unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsigned int n_archs, const char *name, unsigned int arch);
+int volk_get_index(
+    const char *impl_names[], //list of implementations by name
+    const size_t n_impls,     //number of implementations available
+    const char *impl_name     //the implementation name to find
+);
+
+int volk_rank_archs(
+    const char *kern_name,    //name of the kernel to rank
+    const char *impl_names[], //list of implementations by name
+    const int* impl_deps,     //requirement mask per implementation
+    const bool* alignment,    //alignment status of each implementation
+    size_t n_impls,            //number of implementations available
+    const bool align          //if false, filter aligned implementations
+);
 
 #ifdef __cplusplus
 }
diff --git a/volk/tmpl/volk.tmpl.c b/volk/tmpl/volk.tmpl.c
index c3a1544ff8..f915f157f6 100644
--- a/volk/tmpl/volk.tmpl.c
+++ b/volk/tmpl/volk.tmpl.c
@@ -27,6 +27,10 @@
 #include <volk/volk.h>
 #include <stdio.h>
 #include <string.h>
+#include <assert.h>
+
+static size_t __alignment = 0;
+static intptr_t __alignment_mask = 0;
 
 struct volk_machine *get_machine(void) {
     extern struct volk_machine *volk_machines[];
@@ -46,45 +50,118 @@ struct volk_machine *get_machine(void) {
             }
         }
         printf("Using Volk machine: %s\n", machine->name);
+        __alignment = machine->alignment;
+        __alignment_mask = (intptr_t)(__alignment-1);
         return machine;
     }
 }
 
-unsigned int volk_get_alignment(void) {
-    return get_machine()->alignment;
+size_t volk_get_alignment(void)
+{
+    get_machine(); //ensures alignment is set
+    return __alignment;
+}
+
+bool volk_is_aligned(const void *ptr)
+{
+    return ((intptr_t)(ptr) & __alignment_mask) == 0;
 }
 
+#define LV_HAVE_GENERIC
+#define LV_HAVE_DISPATCHER
+
 #for $kern in $kernels
 
-void get_$(kern.name)($kern.arglist_namedefs) {
-    $kern.name = get_machine()->$(kern.name)_archs[volk_rank_archs(
-        get_machine()->$(kern.name)_indices,
-        get_machine()->$(kern.name)_arch_defs,
-        get_machine()->$(kern.name)_n_archs,
-        get_machine()->$(kern.name)_name,
-        volk_get_lvarch()
-    )];
+#if $kern.has_dispatcher
+#include <volk/$(kern.name).h> //pulls in the dispatcher
+#end if
+
+static inline void __$(kern.name)_d($kern.arglist_full)
+{
+    #if $kern.has_dispatcher
+    $(kern.name)_dispatcher($kern.arglist_names);
+    return;
+    #end if
+
+    if (volk_is_aligned(
+    #set $num_open_parens = 0
+    #for $arg_type, $arg_name in $kern.args
+        #if '*' in $arg_type
+        VOLK_OR_PTR($arg_name,
+        #set $num_open_parens += 1
+        #end if
+    #end for
+        0$(')'*$num_open_parens)
+    )){
+        $(kern.name)_a($kern.arglist_names);
+    }
+    else{
+        $(kern.name)_u($kern.arglist_names);
+    }
+}
+
+static inline void __init_$(kern.name)(void)
+{
+    const char *name = get_machine()->$(kern.name)_name;
+    const char **impl_names = get_machine()->$(kern.name)_impl_names;
+    const int *impl_deps = get_machine()->$(kern.name)_impl_deps;
+    const bool *alignment = get_machine()->$(kern.name)_impl_alignment;
+    const size_t n_impls = get_machine()->$(kern.name)_n_impls;
+    const size_t index_a = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/);
+    const size_t index_u = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/);
+    $(kern.name)_a = get_machine()->$(kern.name)_impls[index_a];
+    $(kern.name)_u = get_machine()->$(kern.name)_impls[index_u];
+
+    assert($(kern.name)_a);
+    assert($(kern.name)_u);
+
+    $(kern.name) = &__$(kern.name)_d;
+}
+
+static inline void __$(kern.name)_a($kern.arglist_full)
+{
+    __init_$(kern.name)();
+    $(kern.name)_a($kern.arglist_names);
+}
+
+static inline void __$(kern.name)_u($kern.arglist_full)
+{
+    __init_$(kern.name)();
+    $(kern.name)_u($kern.arglist_names);
+}
+
+static inline void __$(kern.name)($kern.arglist_full)
+{
+    __init_$(kern.name)();
     $(kern.name)($kern.arglist_names);
 }
 
-$kern.pname $kern.name = &get_$(kern.name);
+$kern.pname $(kern.name)_a = &__$(kern.name)_a;
+$kern.pname $(kern.name)_u = &__$(kern.name)_u;
+$kern.pname $(kern.name)   = &__$(kern.name);
 
-void $(kern.name)_manual($kern.arglist_namedefs, const char* arch) {
-    const size_t index = get_index(
-        get_machine()->$(kern.name)_indices,
-        get_machine()->$(kern.name)_n_archs,
-        arch
+void $(kern.name)_manual($kern.arglist_full, const char* impl_name)
+{
+    const int index = volk_get_index(
+        get_machine()->$(kern.name)_impl_names,
+        get_machine()->$(kern.name)_n_impls,
+        impl_name
     );
-    get_machine()->$(kern.name)_archs[index](
+    get_machine()->$(kern.name)_impls[index](
         $kern.arglist_names
     );
 }
 
-struct volk_func_desc $(kern.name)_get_func_desc(void) {
-    struct volk_func_desc desc = {
-        get_machine()->$(kern.name)_indices,
-        get_machine()->$(kern.name)_arch_defs,
-        get_machine()->$(kern.name)_n_archs
+volk_func_desc_t $(kern.name)_get_func_desc(void) {
+    const char **impl_names = get_machine()->$(kern.name)_impl_names;
+    const int *impl_deps = get_machine()->$(kern.name)_impl_deps;
+    const bool *alignment = get_machine()->$(kern.name)_impl_alignment;
+    const size_t n_impls = get_machine()->$(kern.name)_n_impls;
+    volk_func_desc_t desc = {
+        impl_names,
+        impl_deps,
+        alignment,
+        n_impls
     };
     return desc;
 }
diff --git a/volk/tmpl/volk.tmpl.h b/volk/tmpl/volk.tmpl.h
index 161579e46d..464b65598a 100644
--- a/volk/tmpl/volk.tmpl.h
+++ b/volk/tmpl/volk.tmpl.h
@@ -27,20 +27,59 @@
 #include <volk/volk_common.h>
 #include <volk/volk_complex.h>
 
+#include <stdlib.h>
+#include <stdbool.h>
+
 __VOLK_DECL_BEGIN
 
-struct volk_func_desc {
-    const char **indices;
-    const int *arch_defs;
-    const int n_archs;
-};
+typedef struct volk_func_desc
+{
+    const char **impl_names;
+    const int *impl_deps;
+    const bool *impl_alignment;
+    const size_t n_impls;
+} volk_func_desc_t;
+
+//! Get the machine alignment in bytes
+VOLK_API size_t volk_get_alignment(void);
+
+/*!
+ * The VOLK_OR_PTR macro is a convenience macro
+ * for checking the alignment of a set of pointers.
+ * Example usage:
+ * volk_is_aligned(VOLK_OR_PTR(VOLK_OR_PTR(p0, p1), p2))
+ */
+#define VOLK_OR_PTR(ptr0, ptr1) \
+    (const void *)(((intptr_t)(ptr0)) | ((intptr_t)(ptr1)))
 
-VOLK_API unsigned int volk_get_alignment(void);
+/*!
+ * Is the pointer on a machine alignment boundary?
+ *
+ * Note: for performance reasons, this function does not initialize
+ * the library itself; call another volk API function first (e.g.
+ * volk_get_alignment()) so that the machine alignment has been set.
+ *
+ * \param ptr the pointer to some memory buffer
+ * \return true if ptr is on an alignment boundary, else false
+ */
+VOLK_API bool volk_is_aligned(const void *ptr);
 
 #for $kern in $kernels
+
+//! A function pointer to the dispatcher implementation
 extern VOLK_API $kern.pname $kern.name;
-extern VOLK_API void $(kern.name)_manual($kern.arglist_namedefs, const char* arch);
-extern VOLK_API struct volk_func_desc $(kern.name)_get_func_desc(void);
+
+//! A function pointer to the fastest aligned implementation
+extern VOLK_API $kern.pname $(kern.name)_a;
+
+//! A function pointer to the fastest unaligned implementation
+extern VOLK_API $kern.pname $(kern.name)_u;
+
+//! Call into a specific implementation given by name
+extern VOLK_API void $(kern.name)_manual($kern.arglist_full, const char* impl_name);
+
+//! Get description parameters for this kernel
+extern VOLK_API volk_func_desc_t $(kern.name)_get_func_desc(void);
 #end for
 
 __VOLK_DECL_END
diff --git a/volk/tmpl/volk_machine_xxx.tmpl.c b/volk/tmpl/volk_machine_xxx.tmpl.c
index e405bd6938..68d7f3eba2 100644
--- a/volk/tmpl/volk_machine_xxx.tmpl.c
+++ b/volk/tmpl/volk_machine_xxx.tmpl.c
@@ -44,18 +44,23 @@ $(' | '.join(['(1 << LV_%s)'%a.name.upper() for a in $archs]))#slurp
 #end def
 
 ########################################################################
-#def make_tag_str_list($tags)
-{$(', '.join(['"%s"'%a for a in $tags]))}#slurp
+#def make_impl_name_list($impls)
+{$(', '.join(['"%s"'%i.name for i in $impls]))}#slurp
 #end def
 
 ########################################################################
-#def make_tag_have_list($deps)
-{$(', '.join([' | '.join(['(1 << LV_%s)'%a.upper() for a in d]) for d in $deps]))}#slurp
+#def make_impl_align_list($impls)
+{$(', '.join(['true' if i.is_aligned else 'false' for i in $impls]))}#slurp
 #end def
 
 ########################################################################
-#def make_tag_kern_list($name, $tags)
-{$(', '.join(['%s_%s'%($name, a) for a in $tags]))}#slurp
+#def make_impl_deps_list($impls)
+{$(', '.join([' | '.join(['(1 << LV_%s)'%d.upper() for d in i.deps]) for i in $impls]))}#slurp
+#end def
+
+########################################################################
+#def make_impl_fcn_list($name, $impls)
+{$(', '.join(['%s_%s'%($name, i.name) for i in $impls]))}#slurp
 #end def
 
 struct volk_machine volk_machine_$(this_machine.name) = {
@@ -63,11 +68,12 @@ struct volk_machine volk_machine_$(this_machine.name) = {
     "$this_machine.name",
     $this_machine.alignment,
     #for $kern in $kernels
-        #set $taglist, $tagdeps = $kern.get_tags($arch_names)
-    "$kern.name",
-    $make_tag_str_list($taglist),
-    $make_tag_have_list($tagdeps),
-    $make_tag_kern_list($kern.name, $taglist),
-    $(len($taglist)),
+        #set $impls = $kern.get_impls($arch_names)
+    "$kern.name",                                   ##//kernel name
+    $make_impl_name_list($impls),                   ##//list of kernel implementations by name
+    $make_impl_deps_list($impls),                   ##//list of arch dependencies per implementation
+    $make_impl_align_list($impls),                  ##//alignment required? for each implementation
+    $make_impl_fcn_list($kern.name, $impls),        ##//pointer to each implementation
+    $(len($impls)),                                 ##//number of implementations listed here
     #end for
 };
diff --git a/volk/tmpl/volk_machines.tmpl.h b/volk/tmpl/volk_machines.tmpl.h
index b30e600ed8..7e11b10795 100644
--- a/volk/tmpl/volk_machines.tmpl.h
+++ b/volk/tmpl/volk_machines.tmpl.h
@@ -25,18 +25,22 @@
 #include <volk/volk_common.h>
 #include <volk/volk_typedefs.h>
 
+#include <stdbool.h>
+#include <stdlib.h>
+
 __VOLK_DECL_BEGIN
 
 struct volk_machine {
     const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format)
     const char *name;
-    const unsigned int alignment; //the maximum byte alignment required for functions in this library
+    const size_t alignment; //the maximum byte alignment required for functions in this library
     #for $kern in $kernels
     const char *$(kern.name)_name;
-    const char *$(kern.name)_indices[$(len($archs))];
-    const int $(kern.name)_arch_defs[$(len($archs))];
-    const $(kern.pname) $(kern.name)_archs[$(len($archs))];
-    const int $(kern.name)_n_archs;
+    const char *$(kern.name)_impl_names[$(len($archs))];
+    const int $(kern.name)_impl_deps[$(len($archs))];
+    const bool $(kern.name)_impl_alignment[$(len($archs))];
+    const $(kern.pname) $(kern.name)_impls[$(len($archs))];
+    const size_t $(kern.name)_n_impls;
     #end for
 };
 
diff --git a/volk/tmpl/volk_typedefs.tmpl.h b/volk/tmpl/volk_typedefs.tmpl.h
index 52a87242fe..6f5426965f 100644
--- a/volk/tmpl/volk_typedefs.tmpl.h
+++ b/volk/tmpl/volk_typedefs.tmpl.h
@@ -26,7 +26,7 @@
 #include <volk/volk_complex.h>
 
 #for $kern in $kernels
-typedef $kern.rettype (*$(kern.pname))($kern.arglist_defs);
+typedef void (*$(kern.pname))($kern.arglist_types);
 #end for
 
 #endif /*INCLUDED_VOLK_TYPEDEFS*/