From bef3db60e73953f2d2ecdc6a86a81e11df3b103d Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Mon, 13 Dec 2010 19:18:45 -0800
Subject: volk: committed some stuff i neglected

---
 volk/include/volk/Makefile.am               |  2 +-
 volk/include/volk/archs.xml                 |  4 ++++
 volk/include/volk/make_set_simd.py          |  3 +++
 volk/include/volk/volk_32f_sqrt_aligned16.h | 13 +++++++++++++
 4 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 04a43bd34b..99276ab87b 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -179,4 +179,4 @@ distclean-local:
 	rm -f Makefile.in
 	rm -f volk_environment_init.h
 	rm -f volk_mktables
-	rm -f $(BUILT_SOURCES)
\ No newline at end of file
+	rm -f $(BUILT_SOURCES)
diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml
index b7c98500f8..a828e5ad09 100644
--- a/volk/include/volk/archs.xml
+++ b/volk/include/volk/archs.xml
@@ -5,6 +5,10 @@
   <flag>none</flag>
 </arch>
 
+<arch name="orc" type="all">
+  <flag>lorc-0.4</flag>
+</arch>
+
 <arch name="altivec" type="powerpc">
   <flag>maltivec</flag>
 </arch>
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
index 842366b181..e568aebfaf 100644
--- a/volk/include/volk/make_set_simd.py
+++ b/volk/include/volk/make_set_simd.py
@@ -111,6 +111,9 @@ def make_set_simd(dom) :
         tempstring = tempstring + "  AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n";
     tempstring = tempstring + "  ADDONS=\"\"\n";
     tempstring = tempstring + "  BUILT_ARCHS=\"generic\"\n";
+    tempstring = tempstring + "  if test $HAVE_ORC = yes; then\n";
+    tempstring = tempstring + "    BUILT_ARCHS=\"${BUILT_ARCHS} orc\"\n";
+    tempstring = tempstring + "  fi\n";
     tempstring = tempstring + "  _MAKE_FAKE_PROCCPU\n";
     tempstring = tempstring + "  OVERRULE_FLAG=\"no\"\n";
     tempstring = tempstring + "  if test -z \"$cf_with_lv_arch\"; then\n";
diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h
index 0b2eaf2519..f6996ad5fc 100644
--- a/volk/include/volk/volk_32f_sqrt_aligned16.h
+++ b/volk/include/volk/volk_32f_sqrt_aligned16.h
@@ -58,6 +58,19 @@ static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float*
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+extern void volk_32f_sqrt_aligned16_orc_impl(float *, const float*, unsigned int);
+/*!
+  \brief Sqrts the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be sqrted
+  \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
+*/
+static inline void volk_32f_sqrt_aligned16_orc(float* cVector, const float* aVector, unsigned int num_points){
+    volk_32f_sqrt_aligned16_orc_impl(cVector, aVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
 
 
 
-- 
cgit v1.2.3


From 05f4bced29987a0a573d1fc5b214f3fa01dc84bd Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Tue, 14 Dec 2010 13:36:55 -0800
Subject: Volk: More autotools stuff for Orc. Should build OK with or without
 Orc now.

---
 volk/Makefile.am                   |  7 ++++++-
 volk/config/lv_configure.m4        |  4 ----
 volk/config/orc.m4                 |  1 -
 volk/configure.ac                  |  5 ++++-
 volk/include/volk/make_set_simd.py | 10 ++++++++--
 volk/lib/Makefile.am               | 18 +++++++++++++-----
 6 files changed, 31 insertions(+), 14 deletions(-)

(limited to 'volk/include')

diff --git a/volk/Makefile.am b/volk/Makefile.am
index 608aa2e27e..d1ebdc78dc 100644
--- a/volk/Makefile.am
+++ b/volk/Makefile.am
@@ -24,7 +24,12 @@ ACLOCAL_AMFLAGS = -I config
 include $(top_srcdir)/Makefile.common
 
 EXTRA_DIST = bootstrap configure config.h.in volk_config.h
-SUBDIRS = config orc include lib
+SUBDIRS = config
+if HAVE_ORC
+SUBDIRS += orc
+endif
+SUBDIRS += include lib
+
 #if USE_PYTHON
 #SUBDIRS += python 
 #endif
diff --git a/volk/config/lv_configure.m4 b/volk/config/lv_configure.m4
index 47ad543f8a..f98b2dc5b1 100644
--- a/volk/config/lv_configure.m4
+++ b/volk/config/lv_configure.m4
@@ -104,10 +104,6 @@ dnl  AM_CONDITIONAL([USE_PYTHON], [test "$with_python" = yes])
   ORC_CHECK
   
   LDFLAGS="$LDFLAGS $LIBGNURADIO_CORE_EXTRA_LDFLAGS"
-  
-  if test HAVE_ORC = true; then
-    LDFLAGS="$LDFLAGS $ORC_LDFLAGS"
-  fi
 
   AC_CHECK_PROG([XMLTO],[xmlto],[yes],[])
   AM_CONDITIONAL([HAS_XMLTO], [test x$XMLTO = xyes])
diff --git a/volk/config/orc.m4 b/volk/config/orc.m4
index 03c12a8f9f..fa48b10df8 100644
--- a/volk/config/orc.m4
+++ b/volk/config/orc.m4
@@ -45,7 +45,6 @@ AC_DEFUN([ORC_CHECK],
     AC_DEFINE(DISABLE_ORC, 1, [Disable Orc])
     HAVE_ORC=no
     HAVE_ORCC=no
-    ORC_LDFLAGS=""
   fi
   AM_CONDITIONAL(HAVE_ORC, [test "x$HAVE_ORC" = "xyes"])
   AM_CONDITIONAL(HAVE_ORCC, [test "x$HAVE_ORCC" = "xyes"])
diff --git a/volk/configure.ac b/volk/configure.ac
index 8e2f5b8b90..5a1eac3f2f 100644
--- a/volk/configure.ac
+++ b/volk/configure.ac
@@ -76,9 +76,12 @@ AC_CONFIG_FILES([\
 	  include/Makefile \
 	  include/volk/Makefile \
 	  lib/Makefile \
-	  orc/Makefile \
 	  volk.pc \
 	])
+	
+if test "$HAVE_ORC" = yes; then
+  AC_CONFIG_FILES([orc/Makefile])
+fi
 
 AC_OUTPUT
 
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
index e568aebfaf..78a00476d5 100644
--- a/volk/include/volk/make_set_simd.py
+++ b/volk/include/volk/make_set_simd.py
@@ -183,8 +183,14 @@ def make_set_simd(dom) :
             tempstring = tempstring + "    indCXX=no\n"
             tempstring = tempstring + "    indLV_ARCH=no\n"
         elif atype == "all":
-            tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
-            tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
+            if arch == "orc":
+                tempstring = tempstring + "      if test $HAVE_ORC = yes; then\n";
+                tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+                tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
+                tempstring = tempstring + "      fi\n";
+            else:
+                tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+                tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
     tempstring = tempstring + "  ;;\n"
         
     tempstring = tempstring + "  (powerpc)\n"
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am
index 649d461e0c..385401ae13 100644
--- a/volk/lib/Makefile.am
+++ b/volk/lib/Makefile.am
@@ -138,10 +138,13 @@ endif
 
 
 
-libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(ORC_LDFLAGS) -lorc-0.4
-libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(ORC_LDFLAGS) -lorc-0.4
+libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
+libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
+if HAVE_ORC
 libvolk_la_LIBADD = ../orc/libvolk_orc.la
-
+libvolk_la_LDFLAGS += -lorc-0.4
+libvolk_runtime_la_LDFLAGS += -lorc-0.4
+endif
 
 
 # ----------------------------------------------------------------
@@ -237,13 +240,18 @@ libvolk_qa_la_SOURCES = \
 	qa_32f_stddev_aligned16.cc \
 	qa_32f_stddev_and_mean_aligned16.cc
 
-libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(ORC_LDFLAGS) -lorc-0.4
+libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
 
 libvolk_qa_la_LIBADD = \
 	libvolk.la \
 	libvolk_runtime.la \
-	../orc/libvolk_orc.la \
 	$(CPPUNIT_LIBS)
+	
+if HAVE_ORC
+libvolk_qa_la_LIBADD += \
+    ../orc/libvolk_orc.la
+    libvolk_qa_la_LDFLAGS += -lorc-0.4
+endif
 
 # ----------------------------------------------------------------
 # headers that don't get installed
-- 
cgit v1.2.3


From be78b530701850b964118fd0f63ba2bbdca9759d Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Tue, 14 Dec 2010 14:14:00 -0800
Subject: pre-patch...

---
 volk/include/volk/volk_32s_and_aligned16.h        | 14 +++++++++++++-
 volk/include/volk/volk_8s_convert_16s_aligned16.h | 12 ++++++++++++
 volk/include/volk/volk_8s_convert_32f_aligned16.h | 13 +++++++++++++
 volk/orc/Makefile.am                              |  5 ++++-
 4 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32s_and_aligned16.h b/volk/include/volk/volk_32s_and_aligned16.h
index e9f1e3a437..16c63fd482 100644
--- a/volk/include/volk/volk_32s_and_aligned16.h
+++ b/volk/include/volk/volk_32s_and_aligned16.h
@@ -63,7 +63,19 @@ static inline void volk_32s_and_aligned16_generic(int32_t* cVector, const int32_
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+extern void volk_32s_and_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32s_and_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    volk_32s_and_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_32s_AND_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8s_convert_16s_aligned16.h b/volk/include/volk/volk_8s_convert_16s_aligned16.h
index 0efe3c6a14..c52c64eaef 100644
--- a/volk/include/volk/volk_8s_convert_16s_aligned16.h
+++ b/volk/include/volk/volk_8s_convert_16s_aligned16.h
@@ -65,6 +65,18 @@ static inline void volk_8s_convert_16s_aligned16_generic(int16_t* outputVector,
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+  */
+extern void volk_8s_convert_16s_aligned16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
+static inline void volk_8s_convert_16s_aligned16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    volk_8s_convert_16s_aligned16_orc_impl(outputVector, inputVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 
diff --git a/volk/include/volk/volk_8s_convert_32f_aligned16.h b/volk/include/volk/volk_8s_convert_32f_aligned16.h
index 54b66ef8f0..700a0fa422 100644
--- a/volk/include/volk/volk_8s_convert_32f_aligned16.h
+++ b/volk/include/volk/volk_8s_convert_32f_aligned16.h
@@ -86,6 +86,19 @@ static inline void volk_8s_convert_32f_aligned16_generic(float* outputVector, co
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+extern void volk_8s_convert_32f_aligned16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
+static inline void volk_8s_convert_32f_aligned16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    volk_8s_convert_32f_aligned16_orc_impl(outputVector, inputVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 
diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am
index 092e665340..a4af7dac69 100644
--- a/volk/orc/Makefile.am
+++ b/volk/orc/Makefile.am
@@ -23,7 +23,10 @@ lib_LTLIBRARIES = libvolk_orc.la
 libvolk_orc_la_LDFLAGS = $(ORC_LDFLAGS)
 
 libvolk_orc_la_SOURCES = \
-volk_32f_sqrt_aligned16_orc_impl.orc
+volk_32f_sqrt_aligned16_orc_impl.orc \
+volk_8s_convert_16s_aligned16_orc_impl.orc \
+volk_8s_convert_32f_aligned16_orc_impl.orc \
+volk_32s_and_aligned16_orc_impl
 
 
 
-- 
cgit v1.2.3


From 2e9a7d350713b4e1b21458db8f3fce8a557858ae Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Tue, 14 Dec 2010 17:13:40 -0800
Subject: Volk: Added QA tests for all the Orc stuff. Added a 16u_byteswap but
 it's broken right now.

---
 volk/include/volk/volk_16u_byteswap_aligned16.h   | 12 ++++++++++++
 volk/include/volk/volk_32f_add_aligned16.h        | 14 +++++++++++++-
 volk/lib/qa_16u_byteswap_aligned16.cc             |  9 +++++++++
 volk/lib/qa_32f_add_aligned16.cc                  |  9 +++++++++
 volk/lib/qa_32s_and_aligned16.cc                  |  9 +++++++++
 volk/lib/qa_8s_convert_32f_aligned16.cc           |  8 ++++++++
 volk/orc/Makefile.am                              |  3 ++-
 volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc |  4 ++++
 8 files changed, 66 insertions(+), 2 deletions(-)
 create mode 100644 volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h
index 698e958e4c..8e628ab17a 100644
--- a/volk/include/volk/volk_16u_byteswap_aligned16.h
+++ b/volk/include/volk/volk_16u_byteswap_aligned16.h
@@ -61,5 +61,17 @@ static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, uns
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int16_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, uint16_t* intsToSwapAgain, unsigned int num_points);
+static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){
+    volk_16u_byteswap_aligned16_orc_impl(intsToSwap, intsToSwap, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
 
 #endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h
index 721c60fd6e..e7d8de265d 100644
--- a/volk/include/volk/volk_32f_add_aligned16.h
+++ b/volk/include/volk/volk_32f_add_aligned16.h
@@ -63,7 +63,19 @@ static inline void volk_32f_add_aligned16_generic(float* cVector, const float* a
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+extern void volk_32f_add_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_add_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_add_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */
diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc
index 6b19828a4c..c30b6ba41f 100644
--- a/volk/lib/qa_16u_byteswap_aligned16.cc
+++ b/volk/lib/qa_16u_byteswap_aligned16.cc
@@ -24,6 +24,7 @@ void qa_16u_byteswap_aligned16::t1() {
   
   uint16_t output0[vlen] __attribute__ ((aligned (16)));
   uint16_t output01[vlen] __attribute__ ((aligned (16)));
+  uint16_t output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     output0[i] = (uint16_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2));
@@ -40,6 +41,13 @@ void qa_16u_byteswap_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16u_byteswap_aligned16_manual(output02, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16u_byteswap_aligned16_manual(output01, vlen, "sse2");
   }
@@ -54,6 +62,7 @@ void qa_16u_byteswap_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);    
   }
 }
 
diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc
index 002aebfc97..d9214e8a25 100644
--- a/volk/lib/qa_32f_add_aligned16.cc
+++ b/volk/lib/qa_32f_add_aligned16.cc
@@ -78,6 +78,7 @@ void qa_32f_add_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -93,6 +94,13 @@ void qa_32f_add_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_add_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_32f_add_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
@@ -107,6 +115,7 @@ void qa_32f_add_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc
index 72d05cf6f0..5720ee8690 100644
--- a/volk/lib/qa_32s_and_aligned16.cc
+++ b/volk/lib/qa_32s_and_aligned16.cc
@@ -25,6 +25,7 @@ void qa_32s_and_aligned16::t1() {
   
   int32_t output0[vlen] __attribute__ ((aligned (16)));
   int32_t output01[vlen] __attribute__ ((aligned (16)));
+  int32_t output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((int32_t) (rand() - (RAND_MAX/2)));
@@ -40,6 +41,13 @@ void qa_32s_and_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32s_and_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_32s_and_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
@@ -54,6 +62,7 @@ void qa_32s_and_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc
index 522da0b9dc..3b3aa69196 100644
--- a/volk/lib/qa_8s_convert_32f_aligned16.cc
+++ b/volk/lib/qa_8s_convert_32f_aligned16.cc
@@ -40,6 +40,14 @@ void qa_8s_convert_32f_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
+  
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8s_convert_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
 
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am
index 5d82f540f9..3f155d0be6 100644
--- a/volk/orc/Makefile.am
+++ b/volk/orc/Makefile.am
@@ -29,7 +29,8 @@ volk_32f_sqrt_aligned16_orc_impl.orc \
 volk_8s_convert_16s_aligned16_orc_impl.orc \
 volk_8s_convert_32f_aligned16_orc_impl.orc \
 volk_32s_and_aligned16_orc_impl.orc \
-volk_32f_add_aligned16_orc_impl.orc
+volk_32f_add_aligned16_orc_impl.orc \
+volk_16u_byteswap_aligned16_orc_impl.orc
 
 
 
diff --git a/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc b/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..73c3e345ea
--- /dev/null
+++ b/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc
@@ -0,0 +1,4 @@
+.function volk_16u_byteswap_aligned16_orc_impl
+.dest 2 dst
+.source 2 src
+swapw dst, src
-- 
cgit v1.2.3


From 26415a1445490cc3230c5d793a41703931ae9d01 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Tue, 14 Dec 2010 17:23:20 -0800
Subject: Volk: Nick's commits to make adding Orc a little more structurally
 sound

---
 volk/include/volk/archs.xml        |  2 ++
 volk/include/volk/make_set_simd.py | 22 ++++++++++------------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml
index a828e5ad09..b61bfca098 100644
--- a/volk/include/volk/archs.xml
+++ b/volk/include/volk/archs.xml
@@ -7,6 +7,8 @@
 
 <arch name="orc" type="all">
   <flag>lorc-0.4</flag>
+  <overrule>HAVE_ORC</overrule>
+  <overrule_val>no</overrule_val>
 </arch>
 
 <arch name="altivec" type="powerpc">
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
index 78a00476d5..d7109cfcba 100644
--- a/volk/include/volk/make_set_simd.py
+++ b/volk/include/volk/make_set_simd.py
@@ -110,10 +110,7 @@ def make_set_simd(dom) :
         arch = str(domarch.attributes["name"].value);    
         tempstring = tempstring + "  AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n";
     tempstring = tempstring + "  ADDONS=\"\"\n";
-    tempstring = tempstring + "  BUILT_ARCHS=\"generic\"\n";
-    tempstring = tempstring + "  if test $HAVE_ORC = yes; then\n";
-    tempstring = tempstring + "    BUILT_ARCHS=\"${BUILT_ARCHS} orc\"\n";
-    tempstring = tempstring + "  fi\n";
+    tempstring = tempstring + "  BUILT_ARCHS=\"\"\n";
     tempstring = tempstring + "  _MAKE_FAKE_PROCCPU\n";
     tempstring = tempstring + "  OVERRULE_FLAG=\"no\"\n";
     tempstring = tempstring + "  if test -z \"$cf_with_lv_arch\"; then\n";
@@ -183,14 +180,11 @@ def make_set_simd(dom) :
             tempstring = tempstring + "    indCXX=no\n"
             tempstring = tempstring + "    indLV_ARCH=no\n"
         elif atype == "all":
-            if arch == "orc":
-                tempstring = tempstring + "      if test $HAVE_ORC = yes; then\n";
-                tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
-                tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
-                tempstring = tempstring + "      fi\n";
-            else:
-                tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
-                tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
+            tempstring = tempstring + "    if  test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n"
+            tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+            tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
+            tempstring = tempstring + "      BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n";
+            tempstring = tempstring + "    fi\n"
     tempstring = tempstring + "  ;;\n"
         
     tempstring = tempstring + "  (powerpc)\n"
@@ -234,11 +228,15 @@ def make_set_simd(dom) :
             tempstring = tempstring + "    indCXX=no\n"
             tempstring = tempstring + "    indLV_ARCH=no\n"
         elif atype == "all":
+            tempstring = tempstring + "    if  test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n"
             tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
             tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
+            tempstring = tempstring + "      BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n";
+            tempstring = tempstring + "    fi\n"
     tempstring = tempstring + "  ;;\n"
     tempstring = tempstring + "  esac\n"
     tempstring = tempstring + "  LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n"
+    tempstring = tempstring + "  AM_CONDITIONAL(LV_HAVE_ORC, [test \"$LV_HAVE_ORC\" = \"yes\"])\n";
     tempstring = tempstring + "])\n"
    
     return tempstring;
-- 
cgit v1.2.3


From 21426265324c883c91eeaaf75a81f2ccdc6e249d Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Tue, 14 Dec 2010 21:12:49 -0800
Subject: Volk: Build fixes to work with/without Orc.

---
 volk/Makefile.am                   |  2 +-
 volk/config/orc.m4                 | 14 ++---------
 volk/include/volk/archs.xml        |  2 +-
 volk/include/volk/make_set_simd.py | 51 +++++++++++++++++++++++++++++++++++---
 volk/lib/Makefile.am               |  2 +-
 5 files changed, 52 insertions(+), 19 deletions(-)

(limited to 'volk/include')

diff --git a/volk/Makefile.am b/volk/Makefile.am
index d1ebdc78dc..271d495cde 100644
--- a/volk/Makefile.am
+++ b/volk/Makefile.am
@@ -25,7 +25,7 @@ include $(top_srcdir)/Makefile.common
 
 EXTRA_DIST = bootstrap configure config.h.in volk_config.h
 SUBDIRS = config
-if HAVE_ORC
+if LV_HAVE_ORC
 SUBDIRS += orc
 endif
 SUBDIRS += include lib
diff --git a/volk/config/orc.m4 b/volk/config/orc.m4
index 7845a940c8..9645661b01 100644
--- a/volk/config/orc.m4
+++ b/volk/config/orc.m4
@@ -6,18 +6,8 @@ dnl ORC_CHECK([REQUIRED_VERSION])
 AC_DEFUN([ORC_CHECK],
 [
   ORC_REQ=ifelse([$1], , "0.4.6", [$1])
-
-  AC_ARG_ENABLE(orc,
-  AC_HELP_STRING([--enable-orc],[use Orc if installed]),
-  [case "${enableval}" in
-    auto) enable_orc=auto ;;
-    yes) enable_orc=yes ;;
-    no)  enable_orc=no ;;
-    *) AC_MSG_ERROR(bad value ${enableval} for --enable-orc) ;;
-  esac
-  ],
-  [enable_orc=auto]) dnl Default value
-
+  
+  enable_orc = auto
   if test "x$enable_orc" != "xno" ; then
     PKG_CHECK_MODULES(ORC, orc-0.4 >= $ORC_REQ, [
       AC_DEFINE(HAVE_ORC, 1, [Use Orc])
diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml
index b61bfca098..a19a5add98 100644
--- a/volk/include/volk/archs.xml
+++ b/volk/include/volk/archs.xml
@@ -7,7 +7,7 @@
 
 <arch name="orc" type="all">
   <flag>lorc-0.4</flag>
-  <overrule>HAVE_ORC</overrule>
+  <overrule>LV_HAVE_ORC</overrule>
   <overrule_val>no</overrule_val>
 </arch>
 
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
index d7109cfcba..f2b7c06562 100644
--- a/volk/include/volk/make_set_simd.py
+++ b/volk/include/volk/make_set_simd.py
@@ -180,11 +180,22 @@ def make_set_simd(dom) :
             tempstring = tempstring + "    indCXX=no\n"
             tempstring = tempstring + "    indLV_ARCH=no\n"
         elif atype == "all":
-            tempstring = tempstring + "    if  test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n"
+            tempstring = tempstring + "    for i in $cf_with_lv_arch\n"
+            tempstring = tempstring + "    do\n"
+            tempstring = tempstring + "      if test \"X$i\" = X" + arch + "; then\n";
+            tempstring = tempstring + "        indLV_ARCH=yes\n"
+            tempstring = tempstring + "      fi\n"
+            tempstring = tempstring + "    done\n"
+            tempstring = tempstring + "    if  test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n"
+            tempstring = tempstring + "      indLV_ARCH=no\n"
+            tempstring = tempstring + "    fi\n"
+            tempstring = tempstring + "    if test \"$indLV_ARCH\" == \"yes\"; then\n"        
             tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
             tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
             tempstring = tempstring + "      BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n";
             tempstring = tempstring + "    fi\n"
+            tempstring = tempstring + "    indLV_ARCH=no\n"
+            
     tempstring = tempstring + "  ;;\n"
         
     tempstring = tempstring + "  (powerpc)\n"
@@ -228,11 +239,44 @@ def make_set_simd(dom) :
             tempstring = tempstring + "    indCXX=no\n"
             tempstring = tempstring + "    indLV_ARCH=no\n"
         elif atype == "all":
-            tempstring = tempstring + "    if  test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n"
+            tempstring = tempstring + "    for i in $cf_with_lv_arch\n"
+            tempstring = tempstring + "    do\n"
+            tempstring = tempstring + "      if test \"X$i\" = X" + arch + "; then\n";
+            tempstring = tempstring + "        indLV_ARCH=yes\n"
+            tempstring = tempstring + "      fi\n"
+            tempstring = tempstring + "    done\n"
+            tempstring = tempstring + "    if  test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n"
+            tempstring = tempstring + "      indLV_ARCH=no\n"
+            tempstring = tempstring + "    fi\n"
+            tempstring = tempstring + "    if test \"$indLV_ARCH\" == \"yes\"; then\n"
+            tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+            tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
+            tempstring = tempstring + "      BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n";
+            tempstring = tempstring + "    fi\n"
+            tempstring = tempstring + "    indLV_ARCH=no\n"
+    tempstring = tempstring + "  ;;\n"
+    tempstring = tempstring + "  (*)\n"
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        atype = str(domarch.attributes["type"].value);
+        flag = domarch.getElementsByTagName("flag");
+        flag = str(flag[0].firstChild.data);
+        if atype == "all":
+            tempstring = tempstring + "    for i in $cf_with_lv_arch\n"
+            tempstring = tempstring + "    do\n"
+            tempstring = tempstring + "      if test \"X$i\" = X" + arch + "; then\n";
+            tempstring = tempstring + "        indLV_ARCH=yes\n"
+            tempstring = tempstring + "      fi\n"
+            tempstring = tempstring + "    done\n"
+            tempstring = tempstring + "    if  test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n"
+            tempstring = tempstring + "      indLV_ARCH=no\n"
+            tempstring = tempstring + "    fi\n"
+            tempstring = tempstring + "    if test \"$indLV_ARCH\" == \"yes\"; then\n"        
             tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
             tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
             tempstring = tempstring + "      BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n";
             tempstring = tempstring + "    fi\n"
+            tempstring = tempstring + "    indLV_ARCH=no\n"
     tempstring = tempstring + "  ;;\n"
     tempstring = tempstring + "  esac\n"
     tempstring = tempstring + "  LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n"
@@ -240,6 +284,5 @@ def make_set_simd(dom) :
     tempstring = tempstring + "])\n"
    
     return tempstring;
-                
-                
+
         
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am
index faab4a010c..2530334616 100644
--- a/volk/lib/Makefile.am
+++ b/volk/lib/Makefile.am
@@ -140,7 +140,7 @@ volk_orc_LDFLAGS = \
 volk_orc_LIBADD = \
 	../orc/libvolk_orc.la
 
-if HAVE_ORC
+if LV_HAVE_ORC
 libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS)
 libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS)
 libvolk_la_LIBADD = $(volk_orc_LIBADD)
-- 
cgit v1.2.3


From f9ee6a55cb397f9302769a25a8c959fa162354f0 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Tue, 14 Dec 2010 22:58:33 -0800
Subject: Volk: Some new basic Orc implementations with QA code

---
 volk/include/volk/volk_16u_byteswap_aligned16.h   |  4 ++--
 volk/include/volk/volk_32f_divide_aligned16.h     | 13 +++++++++++++
 volk/include/volk/volk_32f_multiply_aligned16.h   | 14 +++++++++++++-
 volk/include/volk/volk_32f_subtract_aligned16.h   | 14 ++++++++++++++
 volk/lib/qa_16u_byteswap_aligned16.cc             |  1 +
 volk/lib/qa_32f_divide_aligned16.cc               | 10 ++++++++++
 volk/lib/qa_32f_multiply_aligned16.cc             |  9 +++++++++
 volk/lib/qa_32f_subtract_aligned16.cc             |  9 +++++++++
 volk/orc/Makefile.am                              |  7 +++++--
 volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc |  3 +--
 volk/orc/volk_32f_divide_aligned16_orc_impl.orc   |  5 +++++
 volk/orc/volk_32f_multiply_aligned16_orc_impl.orc |  5 +++++
 volk/orc/volk_32f_subtract_aligned16_orc_impl.orc |  5 +++++
 13 files changed, 92 insertions(+), 7 deletions(-)
 create mode 100644 volk/orc/volk_32f_divide_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_multiply_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_subtract_aligned16_orc_impl.orc

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h
index 8e628ab17a..9d19d1a456 100644
--- a/volk/include/volk/volk_16u_byteswap_aligned16.h
+++ b/volk/include/volk/volk_16u_byteswap_aligned16.h
@@ -67,9 +67,9 @@ static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, uns
   \param intsToSwap The vector of data to byte swap
   \param numDataPoints The number of data points
 */
-extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, uint16_t* intsToSwapAgain, unsigned int num_points);
+extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
 static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){
-    volk_16u_byteswap_aligned16_orc_impl(intsToSwap, intsToSwap, num_points);
+    volk_16u_byteswap_aligned16_orc_impl(intsToSwap, num_points);
 }
 #endif /* LV_HAVE_ORC */
 
diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h
index c00700cd8a..c595b5e92c 100644
--- a/volk/include/volk/volk_32f_divide_aligned16.h
+++ b/volk/include/volk/volk_32f_divide_aligned16.h
@@ -63,6 +63,19 @@ static inline void volk_32f_divide_aligned16_generic(float* cVector, const float
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divideed
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+extern void volk_32f_divide_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_divide_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_divide_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 
diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h
index b557580aba..87ae7bcf8c 100644
--- a/volk/include/volk/volk_32f_multiply_aligned16.h
+++ b/volk/include/volk/volk_32f_multiply_aligned16.h
@@ -63,7 +63,19 @@ static inline void volk_32f_multiply_aligned16_generic(float* cVector, const flo
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+extern void volk_32f_multiply_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_multiply_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h
index ac3f5e5d17..e152429016 100644
--- a/volk/include/volk/volk_32f_subtract_aligned16.h
+++ b/volk/include/volk/volk_32f_subtract_aligned16.h
@@ -63,5 +63,19 @@ static inline void volk_32f_subtract_aligned16_generic(float* cVector, const flo
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+/*!
+  \brief Subtracts bVector form aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted
+  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+extern void volk_32f_subtract_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_subtract_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_subtract_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
 
 #endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */
diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc
index c30b6ba41f..b740f91df9 100644
--- a/volk/lib/qa_16u_byteswap_aligned16.cc
+++ b/volk/lib/qa_16u_byteswap_aligned16.cc
@@ -30,6 +30,7 @@ void qa_16u_byteswap_aligned16::t1() {
     output0[i] = (uint16_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2));
   }
   memcpy(output01, output0, vlen*sizeof(uint16_t));
+  memcpy(output02, output0, vlen*sizeof(uint16_t));
 
   printf("16u_byteswap_aligned\n");
 
diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc
index 8826bf94f5..f104e04439 100644
--- a/volk/lib/qa_32f_divide_aligned16.cc
+++ b/volk/lib/qa_32f_divide_aligned16.cc
@@ -35,6 +35,7 @@ void qa_32f_divide_aligned16::t1() {
   float input1[vlen] __attribute__ ((aligned (16)));
   
   float output0[vlen] __attribute__ ((aligned (16)));
+  float output1[vlen] __attribute__ ((aligned (16)));
   float output_known[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
@@ -51,6 +52,14 @@ void qa_32f_divide_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
+  
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_divide_aligned16_manual(output1, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
 
   /*
   for(int i = 0; i < 10; ++i) {
@@ -61,6 +70,7 @@ void qa_32f_divide_aligned16::t1() {
   
   for(int i = 0; i < vlen; ++i) {
     CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]);
+    CPPUNIT_ASSERT_EQUAL(output1[i], output_known[i]);
   }
 }
 
diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc
index e527484663..f9c034d706 100644
--- a/volk/lib/qa_32f_multiply_aligned16.cc
+++ b/volk/lib/qa_32f_multiply_aligned16.cc
@@ -78,6 +78,7 @@ void qa_32f_multiply_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -93,6 +94,13 @@ void qa_32f_multiply_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_multiply_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_32f_multiply_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
@@ -107,6 +115,7 @@ void qa_32f_multiply_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32f_subtract_aligned16.cc b/volk/lib/qa_32f_subtract_aligned16.cc
index a7e1b5ae34..5a5a7c9b68 100644
--- a/volk/lib/qa_32f_subtract_aligned16.cc
+++ b/volk/lib/qa_32f_subtract_aligned16.cc
@@ -25,6 +25,7 @@ void qa_32f_subtract_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -40,6 +41,13 @@ void qa_32f_subtract_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_subtract_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_32f_subtract_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
@@ -54,6 +62,7 @@ void qa_32f_subtract_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am
index 3f155d0be6..c71625d87b 100644
--- a/volk/orc/Makefile.am
+++ b/volk/orc/Makefile.am
@@ -25,12 +25,15 @@ lib_LTLIBRARIES = libvolk_orc.la
 libvolk_orc_la_LDFLAGS = $(ORC_LDFLAGS)
 
 libvolk_orc_la_SOURCES = \
-volk_32f_sqrt_aligned16_orc_impl.orc \
 volk_8s_convert_16s_aligned16_orc_impl.orc \
 volk_8s_convert_32f_aligned16_orc_impl.orc \
+volk_16u_byteswap_aligned16_orc_impl.orc \
 volk_32s_and_aligned16_orc_impl.orc \
 volk_32f_add_aligned16_orc_impl.orc \
-volk_16u_byteswap_aligned16_orc_impl.orc
+volk_32f_subtract_aligned16_orc_impl.orc \
+volk_32f_divide_aligned16_orc_impl.orc \
+volk_32f_multiply_aligned16_orc_impl.orc \
+volk_32f_sqrt_aligned16_orc_impl.orc
 
 
 
diff --git a/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc b/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc
index 73c3e345ea..3ffd12ec0e 100644
--- a/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc
+++ b/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc
@@ -1,4 +1,3 @@
 .function volk_16u_byteswap_aligned16_orc_impl
 .dest 2 dst
-.source 2 src
-swapw dst, src
+swapw dst, dst
diff --git a/volk/orc/volk_32f_divide_aligned16_orc_impl.orc b/volk/orc/volk_32f_divide_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..870843f2af
--- /dev/null
+++ b/volk/orc/volk_32f_divide_aligned16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_divide_aligned16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+divf dst, src1, src2
diff --git a/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc b/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..23619af4ed
--- /dev/null
+++ b/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_multiply_aligned16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+mulf dst, src1, src2
diff --git a/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc b/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..17dbcad465
--- /dev/null
+++ b/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_subtract_aligned16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+subf dst, src1, src2
-- 
cgit v1.2.3


From 15ad4b5398e474bfb52fdb7e826b69f3e398c0b0 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Wed, 15 Dec 2010 16:27:42 -0800
Subject: Volk: A bunch of new ORC routines plus tests. Also fixed a typo in
 the generic version of 16sc_magnitude_16s_a16.

---
 .../volk/volk_16sc_magnitude_16s_aligned16.h       | 15 +++++++++++--
 .../volk/volk_16sc_magnitude_32f_aligned16.h       | 14 +++++++++++-
 .../volk/volk_32fc_magnitude_16s_aligned16.h       | 14 +++++++++++-
 .../volk/volk_32fc_magnitude_32f_aligned16.h       | 13 ++++++++++-
 volk/include/volk/volk_32s_or_aligned16.h          | 14 +++++++++++-
 volk/lib/qa_16sc_magnitude_16s_aligned16.cc        |  9 ++++++++
 volk/lib/qa_16sc_magnitude_32f_aligned16.cc        | 20 +++++++++++++++++
 volk/lib/qa_32f_divide_aligned16.cc                |  9 ++++++++
 volk/lib/qa_32fc_magnitude_16s_aligned16.cc        |  9 ++++++++
 volk/lib/qa_32fc_magnitude_32f_aligned16.cc        |  9 ++++++++
 volk/lib/qa_32s_or_aligned16.cc                    |  9 ++++++++
 volk/orc/Makefile.am                               |  6 +++++-
 .../volk_16sc_magnitude_16s_aligned16_orc_impl.orc | 24 +++++++++++++++++++++
 .../volk_16sc_magnitude_32f_aligned16_orc_impl.orc | 25 ++++++++++++++++++++++
 .../volk_32fc_magnitude_16s_aligned16_orc_impl.orc | 25 ++++++++++++++++++++++
 .../volk_32fc_magnitude_32f_aligned16_orc_impl.orc | 21 ++++++++++++++++++
 volk/orc/volk_32s_or_aligned16_orc_impl.orc        |  5 +++++
 17 files changed, 234 insertions(+), 7 deletions(-)
 create mode 100644 volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32s_or_aligned16_orc_impl.orc

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
index 1482ab82e9..9f3222aa63 100644
--- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
+++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
@@ -164,7 +164,7 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV
   const int16_t* complexVectorPtr = (const int16_t*)complexVector;
   int16_t* magnitudeVectorPtr = magnitudeVector;
   unsigned int number = 0;
-  const float scalar = 32786.0;
+  const float scalar = 32768.0;
   for(number = 0; number < num_points; number++){
     float real = ((float)(*complexVectorPtr++)) / scalar;
     float imag = ((float)(*complexVectorPtr++)) / scalar;
@@ -173,7 +173,18 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
index 9c2a488359..e063ae4328 100644
--- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
+++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
@@ -161,7 +161,19 @@ static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVec
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_16sc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16sc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_16sc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
index 4e64d8c22c..4e590e120e 100644
--- a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
+++ b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
@@ -140,7 +140,19 @@ static inline void volk_32fc_magnitude_16s_aligned16_generic(int16_t* magnitudeV
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_32fc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_32fc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_32fc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
index 7a8fd1ef98..3ea62da6a6 100644
--- a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
+++ b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
@@ -115,7 +115,18 @@ static inline void volk_32fc_magnitude_32f_aligned16_generic(float* magnitudeVec
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+extern void volk_32fc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points);
+static inline void volk_32fc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    volk_32fc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32s_or_aligned16.h b/volk/include/volk/volk_32s_or_aligned16.h
index f4c427c4d0..64748d5354 100644
--- a/volk/include/volk/volk_32s_or_aligned16.h
+++ b/volk/include/volk/volk_32s_or_aligned16.h
@@ -63,7 +63,19 @@ static inline void volk_32s_or_aligned16_generic(int32_t* cVector, const int32_t
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+/*!
+  \brief Ors the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+extern void volk_32s_or_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32s_or_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    volk_32s_or_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_32s_OR_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
index b146107571..c8f13ff84a 100644
--- a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
+++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
@@ -23,6 +23,7 @@ void qa_16sc_magnitude_16s_aligned16::t1() {
   std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
   
   int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_orc[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse3[vlen] __attribute__ ((aligned (16)));
 
@@ -40,6 +41,13 @@ void qa_16sc_magnitude_16s_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_16s_aligned16_manual(output_orc, input0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse");
   }
@@ -64,6 +72,7 @@ void qa_16sc_magnitude_16s_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1);
   }
 }
 
diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
index 2c9e48f6e1..e7178863c9 100644
--- a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
+++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
@@ -15,6 +15,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
   
   float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_orc[vlen] __attribute__ ((aligned (16)));
   float output_known[vlen] __attribute__ ((aligned (16)));
 
   int16_t* inputLoad = (int16_t*)input0;
@@ -37,6 +38,14 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
+  
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_32f_aligned16_manual(output_orc, input0, scale, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
 
   /*
   for(int i = 0; i < 100; ++i) {
@@ -48,6 +57,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_known[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_orc[i], output_known[i], fabs(output_generic[i])*1e-4);
   }
 }
 
@@ -63,6 +73,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
   
   float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_orc[vlen] __attribute__ ((aligned (16)));
   float output_sse[vlen] __attribute__ ((aligned (16)));
   float output_sse3[vlen] __attribute__ ((aligned (16)));
 
@@ -79,6 +90,14 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_32f_aligned16_manual(output_orc, input0, 32768.0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
@@ -104,6 +123,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4);
   }
 }
 
diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc
index f104e04439..b2c2ecf9a5 100644
--- a/volk/lib/qa_32f_divide_aligned16.cc
+++ b/volk/lib/qa_32f_divide_aligned16.cc
@@ -88,6 +88,7 @@ void qa_32f_divide_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -103,6 +104,13 @@ void qa_32f_divide_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_divide_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_32f_divide_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
@@ -117,6 +125,7 @@ void qa_32f_divide_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
index a4be1616b1..c3e65866bc 100644
--- a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
+++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
@@ -23,6 +23,7 @@ void qa_32fc_magnitude_16s_aligned16::t1() {
   std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
   
   int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_orc[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse3[vlen] __attribute__ ((aligned (16)));
 
@@ -40,6 +41,13 @@ void qa_32fc_magnitude_16s_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_magnitude_16s_aligned16_manual(output_orc, input0, 32768.0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_32fc_magnitude_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
   }
@@ -64,6 +72,7 @@ void qa_32fc_magnitude_16s_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1);
   }
 }
 
diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc
index d69ada408e..6a1d46c7af 100644
--- a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc
+++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc
@@ -23,6 +23,7 @@ void qa_32fc_magnitude_32f_aligned16::t1() {
   std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
   
   float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_orc[vlen] __attribute__ ((aligned (16)));
   float output_sse[vlen] __attribute__ ((aligned (16)));
   float output_sse3[vlen] __attribute__ ((aligned (16)));
 
@@ -40,6 +41,13 @@ void qa_32fc_magnitude_32f_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_magnitude_32f_aligned16_manual(output_orc, input0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_32fc_magnitude_32f_aligned16_manual(output_sse, input0, vlen, "sse");
   }
@@ -64,6 +72,7 @@ void qa_32fc_magnitude_32f_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4);
   }
 }
 
diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc
index e09dfb91c7..9ea5283a6e 100644
--- a/volk/lib/qa_32s_or_aligned16.cc
+++ b/volk/lib/qa_32s_or_aligned16.cc
@@ -25,6 +25,7 @@ void qa_32s_or_aligned16::t1() {
   
   int32_t output0[vlen] __attribute__ ((aligned (16)));
   int32_t output01[vlen] __attribute__ ((aligned (16)));
+  int32_t output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((int32_t) (rand() - (RAND_MAX/2)));
@@ -40,6 +41,13 @@ void qa_32s_or_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32s_or_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_32s_or_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
@@ -54,6 +62,7 @@ void qa_32s_or_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am
index c71625d87b..3f105fd807 100644
--- a/volk/orc/Makefile.am
+++ b/volk/orc/Makefile.am
@@ -29,11 +29,15 @@ volk_8s_convert_16s_aligned16_orc_impl.orc \
 volk_8s_convert_32f_aligned16_orc_impl.orc \
 volk_16u_byteswap_aligned16_orc_impl.orc \
 volk_32s_and_aligned16_orc_impl.orc \
+volk_32s_or_aligned16_orc_impl.orc \
 volk_32f_add_aligned16_orc_impl.orc \
 volk_32f_subtract_aligned16_orc_impl.orc \
 volk_32f_divide_aligned16_orc_impl.orc \
 volk_32f_multiply_aligned16_orc_impl.orc \
-volk_32f_sqrt_aligned16_orc_impl.orc
+volk_32f_sqrt_aligned16_orc_impl.orc \
+volk_16sc_magnitude_32f_aligned16_orc_impl.orc \
+volk_32fc_magnitude_32f_aligned16_orc_impl.orc \
+volk_32fc_magnitude_16s_aligned16_orc_impl.orc
 
 
 
diff --git a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..f6c959c00f
--- /dev/null
+++ b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc
@@ -0,0 +1,24 @@
+.function volk_16sc_magnitude_16s_aligned16_orc_impl
+.source 4 src
+.dest 2 dst
+.temp 2 reals
+.temp 2 imags
+.temp 4 reall
+.temp 4 imagl
+.temp 4 realf
+.temp 4 imagf
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+
+splitlw reals, imags, src
+convwl reall, reals
+convwl imagl, imags
+convlf realf, reall
+convlf imagf, imagl
+mulf realf, realf, (1.0 / 32768.0)
+mulf imagf, imagf, (1.0 / 32768.0)
+addf sumf, realf, imagf
+sqrtf rootf, sumf
+convfl rootl, rootf
+conflw dst, rootl
diff --git a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..44654ad8e7
--- /dev/null
+++ b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
@@ -0,0 +1,25 @@
+.function volk_16sc_magnitude_32f_aligned16_orc_impl
+.source 4 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 invscalar
+.temp 4 reall
+.temp 4 imagl
+.temp 2 reals
+.temp 2 imags
+.temp 4 realf
+.temp 4 imagf
+.temp 4 sumf
+
+divf invscalar, 1.0, scalar
+splitlw reals, imags, src
+convswl reall, reals
+convswl imagl, imags
+convlf realf, reall
+convlf imagf, imagl
+mulf realf, realf, invscalar
+mulf imagf, imagf, invscalar
+mulf realf, realf, realf
+mulf imagf, imagf, imagf
+addf sumf, realf, imagf
+sqrtf dst, sumf
diff --git a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..db8405e596
--- /dev/null
+++ b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc
@@ -0,0 +1,25 @@
+.function volk_32fc_magnitude_16s_aligned16_orc_impl
+.source 8 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 4 invscalar
+.temp 4 reall
+.temp 4 imagl
+.temp 4 realf
+.temp 4 imagf
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+
+divf invscalar, 1.0, scalar
+splitql reall, imagl, src
+convlf realf, reall
+convlf imagf, imagl
+mulf realf, realf, invscalar
+mulf imagf, imagf, invscalar
+mulf realf, realf, realf
+mulf imagf, imagf, imagf
+addf sumf, realf, imagf
+sqrtf rootf, sumf
+convfl rootl, rootf
+convlw dst, rootl
diff --git a/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..cc5c85b455
--- /dev/null
+++ b/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc
@@ -0,0 +1,21 @@
+.function volk_32fc_magnitude_32f_aligned16_orc_impl
+.source 8 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 invscalar
+.temp 4 reall
+.temp 4 imagl
+.temp 4 realf
+.temp 4 imagf
+.temp 4 sumf
+
+divf invscalar, 1.0, scalar
+splitql reall, imagl, src
+convlf realf, reall
+convlf imagf, imagl
+mulf realf, realf, invscalar
+mulf imagf, imagf, invscalar
+mulf realf, realf, realf
+mulf imagf, imagf, imagf
+addf sumf, realf, imagf
+sqrtf dst, sumf
diff --git a/volk/orc/volk_32s_or_aligned16_orc_impl.orc b/volk/orc/volk_32s_or_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..6d2a3859a3
--- /dev/null
+++ b/volk/orc/volk_32s_or_aligned16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32s_or_aligned16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+orl dst, src1, src2
-- 
cgit v1.2.3


From c6fff77de9b686761f93f0e1de237f8543f5e919 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Fri, 17 Dec 2010 11:14:41 -0800
Subject: Volk: A bunch of new Orc routines plus a couple of build changes.
 32fc_magnitude_16s fails test_all right now.

---
 volk/config/orc.m4                                 |  2 +-
 .../volk/volk_16sc_deinterleave_16s_aligned16.h    | 14 ++++++++++++-
 .../volk/volk_16sc_deinterleave_32f_aligned16.h    | 15 +++++++++++++-
 .../volk_16sc_deinterleave_real_8s_aligned16.h     | 13 +++++++++++-
 .../volk/volk_16sc_magnitude_16s_aligned16.h       |  6 +++---
 .../volk/volk_16sc_magnitude_32f_aligned16.h       |  2 +-
 volk/include/volk/volk_32f_max_aligned16.h         | 14 +++++++++++++
 volk/include/volk/volk_32f_min_aligned16.h         | 14 +++++++++++++
 volk/lib/qa_16sc_deinterleave_16s_aligned16.cc     | 12 +++++++++++
 volk/lib/qa_16sc_deinterleave_32f_aligned16.cc     | 11 ++++++++++
 volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc |  9 ++++++++
 volk/lib/qa_16sc_magnitude_16s_aligned16.cc        |  5 +++--
 volk/lib/qa_16sc_magnitude_32f_aligned16.cc        |  6 +++---
 volk/lib/qa_32f_max_aligned16.cc                   |  9 ++++++++
 volk/lib/qa_32f_min_aligned16.cc                   |  9 ++++++++
 volk/lib/qa_32fc_magnitude_16s_aligned16.cc        |  8 ++++----
 volk/lib/qa_volk.cc                                |  1 -
 volk/orc/Makefile.am                               | 10 ++++++---
 ...lk_16sc_deinterleave_16s_aligned16_orc_impl.orc |  5 +++++
 ...lk_16sc_deinterleave_32f_aligned16_orc_impl.orc | 12 +++++++++++
 ...6sc_deinterleave_real_8s_aligned16_orc_impl.orc |  6 ++++++
 .../volk_16sc_magnitude_16s_aligned16_orc_impl.orc | 14 ++++++++-----
 .../volk_16sc_magnitude_32f_aligned16_orc_impl.orc |  8 ++++----
 volk/orc/volk_32f_max_aligned16_orc_impl.orc       |  5 +++++
 volk/orc/volk_32f_min_aligned16_orc_impl.orc       |  5 +++++
 .../volk_32fc_magnitude_16s_aligned16_orc_impl.orc | 24 ++++++++--------------
 .../volk_32fc_magnitude_32f_aligned16_orc_impl.orc | 22 +++++++-------------
 volk/volk.pc.in                                    |  2 +-
 28 files changed, 202 insertions(+), 61 deletions(-)
 create mode 100644 volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_max_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_min_aligned16_orc_impl.orc

(limited to 'volk/include')

diff --git a/volk/config/orc.m4 b/volk/config/orc.m4
index 9645661b01..a4653400cc 100644
--- a/volk/config/orc.m4
+++ b/volk/config/orc.m4
@@ -5,7 +5,7 @@ dnl ORC_CHECK([REQUIRED_VERSION])
 
 AC_DEFUN([ORC_CHECK],
 [
-  ORC_REQ=ifelse([$1], , "0.4.6", [$1])
+  ORC_REQ=ifelse([$1], , "0.4.10", [$1])
   
   enable_orc = auto
   if test "x$enable_orc" != "xno" ; then
diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
index 32e13df983..cf94a3f385 100644
--- a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
+++ b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
@@ -140,7 +140,19 @@ static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16sc_deinterleave_16s_aligned16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16sc_deinterleave_16s_aligned16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16sc_deinterleave_16s_aligned16_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
index 86f67437d3..50b8b62d5d 100644
--- a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
+++ b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
@@ -89,7 +89,20 @@ static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer,
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+    \param complexVector The complex input vector
+    \param iBuffer The I buffer output data
+    \param qBuffer The Q buffer output data
+    \param scalar The data value to be divided against each input data value of the input complex vector
+    \param num_points The number of complex data values to be deinterleaved
+  */
+extern void volk_16sc_deinterleave_32f_aligned16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16sc_deinterleave_32f_aligned16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_16sc_deinterleave_32f_aligned16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
index c0d1e941a9..2dd85a4225 100644
--- a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
+++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
@@ -77,7 +77,18 @@ static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuf
 }
 #endif /* LV_HAVE_GENERIC */
 
-
+#if LV_HAVE_ORC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16sc_deinterleave_real_8s_aligned16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16sc_deinterleave_real_8s_aligned16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16sc_deinterleave_real_8s_aligned16_orc_impl(iBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
 
 
 #endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
index 9f3222aa63..41e8751d61 100644
--- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
+++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
@@ -173,16 +173,16 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV
 }
 #endif /* LV_HAVE_GENERIC */
 
-#if LV_HAVE_ORC
+#if LV_HAVE_ORC_DISABLED
 /*!
   \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
   \param complexVector The vector containing the complex input values
   \param magnitudeVector The vector containing the real output values
   \param num_points The number of complex values in complexVector to be calculated and stored into cVector
 */
-extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points);
+extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
 static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-    volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, num_points);
+    volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
 }
 #endif /* LV_HAVE_ORC */
 
diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
index e063ae4328..c2605d5519 100644
--- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
+++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
@@ -161,7 +161,7 @@ static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVec
 }
 #endif /* LV_HAVE_GENERIC */
 
-#if LV_HAVE_ORC
+#if LV_HAVE_ORC_DISABLED
 /*!
   \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
   \param complexVector The vector containing the complex input values
diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h
index 96aafb2bf8..d4e30fba81 100644
--- a/volk/include/volk/volk_32f_max_aligned16.h
+++ b/volk/include/volk/volk_32f_max_aligned16.h
@@ -67,5 +67,19 @@ static inline void volk_32f_max_aligned16_generic(float* cVector, const float* a
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_max_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_max_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_max_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
 
 #endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h
index e247f4213a..55daafb6a9 100644
--- a/volk/include/volk/volk_32f_min_aligned16.h
+++ b/volk/include/volk/volk_32f_min_aligned16.h
@@ -67,5 +67,19 @@ static inline void volk_32f_min_aligned16_generic(float* cVector, const float* a
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_min_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_min_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_min_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
 
 #endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc
index e700ac72ce..7e9e31df57 100644
--- a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc
+++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc
@@ -26,6 +26,8 @@ void qa_16sc_deinterleave_16s_aligned16::t1() {
   int16_t output_generic1[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse21[vlen] __attribute__ ((aligned (16)));
+  int16_t output_orc[vlen] __attribute__ ((aligned (16)));
+  int16_t output_orc1[vlen] __attribute__ ((aligned (16)));
   int16_t output_ssse3[vlen] __attribute__ ((aligned (16)));
   int16_t output_ssse31[vlen] __attribute__ ((aligned (16)));
 
@@ -43,6 +45,13 @@ void qa_16sc_deinterleave_16s_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_16s_aligned16_manual(output_orc, output_orc1, input0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16sc_deinterleave_16s_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2");
   }
@@ -70,6 +79,9 @@ void qa_16sc_deinterleave_16s_aligned16::t1() {
 
     CPPUNIT_ASSERT_EQUAL(output_generic[i],  output_ssse3[i]);
     CPPUNIT_ASSERT_EQUAL(output_generic1[i],  output_ssse31[i]);
+    
+    CPPUNIT_ASSERT_EQUAL(output_generic[i],  output_orc[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic1[i],  output_orc1[i]);
   }
 }
 
diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc
index 6ee0769983..45100206d9 100644
--- a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc
+++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc
@@ -26,6 +26,8 @@ void qa_16sc_deinterleave_32f_aligned16::t1() {
   float output_generic1[vlen] __attribute__ ((aligned (16)));
   float output_sse2[vlen] __attribute__ ((aligned (16)));
   float output_sse21[vlen] __attribute__ ((aligned (16)));
+  float output_orc[vlen] __attribute__ ((aligned (16)));
+  float output_orc1[vlen] __attribute__ ((aligned (16)));
 
   int16_t* loadInput = (int16_t*)input0;
   for(int i = 0; i < vlen*2; ++i) {   
@@ -41,6 +43,13 @@ void qa_16sc_deinterleave_32f_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_32f_aligned16_manual(output_orc, output_orc1, input0, 32768.0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16sc_deinterleave_32f_aligned16_manual(output_sse2, output_sse21, input0, 32768.0, vlen, "sse");
   }
@@ -57,6 +66,8 @@ void qa_16sc_deinterleave_32f_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i],  output_sse21[i], fabs(output_generic1[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i],  output_orc1[i], fabs(output_generic1[i])*1e-4);
   }
 }
 
diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc
index 5ab458bc90..d187d20c35 100644
--- a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc
+++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc
@@ -24,6 +24,7 @@ void qa_16sc_deinterleave_real_8s_aligned16::t1() {
   
   int8_t output_generic[vlen] __attribute__ ((aligned (16)));
   int8_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+  int8_t output_orc[vlen] __attribute__ ((aligned (16)));
 
   int16_t* loadInput = (int16_t*)input0;
   for(int i = 0; i < vlen*2; ++i) {   
@@ -39,6 +40,13 @@ void qa_16sc_deinterleave_real_8s_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_8s_aligned16_manual(output_orc, input0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3");
   }
@@ -54,6 +62,7 @@ void qa_16sc_deinterleave_real_8s_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_orc[i]);
   }
 }
 
diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
index c8f13ff84a..dd4ae75ff2 100644
--- a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
+++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
@@ -40,13 +40,14 @@ void qa_16sc_magnitude_16s_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
-  start = clock();
+/*  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16sc_magnitude_16s_aligned16_manual(output_orc, input0, vlen, "orc");
   }
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("orc_time: %f\n", total);
+*/
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse");
@@ -72,7 +73,7 @@ void qa_16sc_magnitude_16s_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1);
-    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1);
+    //CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1);
   }
 }
 
diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
index e7178863c9..53d42e28ce 100644
--- a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
+++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
@@ -90,14 +90,14 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
-  start = clock();
+/*  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16sc_magnitude_32f_aligned16_manual(output_orc, input0, 32768.0, vlen, "orc");
   }
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("orc_time: %f\n", total);
-
+*/
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
@@ -123,7 +123,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4);
-    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4);
+//    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4);
   }
 }
 
diff --git a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc
index 3ef3751768..cb1fd3627a 100644
--- a/volk/lib/qa_32f_max_aligned16.cc
+++ b/volk/lib/qa_32f_max_aligned16.cc
@@ -25,6 +25,7 @@ void qa_32f_max_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -40,6 +41,13 @@ void qa_32f_max_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_max_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_32f_max_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
@@ -54,6 +62,7 @@ void qa_32f_max_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc
index 617e18b249..bf453f3606 100644
--- a/volk/lib/qa_32f_min_aligned16.cc
+++ b/volk/lib/qa_32f_min_aligned16.cc
@@ -25,6 +25,7 @@ void qa_32f_min_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -40,6 +41,13 @@ void qa_32f_min_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_min_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_32f_min_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
@@ -54,6 +62,7 @@ void qa_32f_min_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
index c3e65866bc..105d32d0c6 100644
--- a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
+++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
@@ -63,10 +63,10 @@ void qa_32fc_magnitude_16s_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("sse3_time: %f\n", total);
 
-  for(int i = 0; i < 1; ++i) {
-    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
-    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
-  }
+  //for(int i = 0; i < 10; ++i) {
+  //  printf("inputs: %f, %f\n", input0[i].real(), input0[i].imag());
+  //  printf("generic... %i, sse3... %i, orc... %i\n", output_generic[i], output_sse3[i], output_orc[i]);
+  //}
   
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc
index c3c27b69b1..f6a334da7c 100644
--- a/volk/lib/qa_volk.cc
+++ b/volk/lib/qa_volk.cc
@@ -118,7 +118,6 @@ CppUnit::TestSuite *
 qa_volk::suite()
 {
   CppUnit::TestSuite *s = new CppUnit::TestSuite("volk");
-
   s->addTest(qa_16s_quad_max_star_aligned16::suite());
   s->addTest(qa_32fc_dot_prod_aligned16::suite());
   s->addTest(qa_32fc_square_dist_scalar_mult_aligned16::suite());
diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am
index 3f105fd807..797efee189 100644
--- a/volk/orc/Makefile.am
+++ b/volk/orc/Makefile.am
@@ -35,13 +35,17 @@ volk_32f_subtract_aligned16_orc_impl.orc \
 volk_32f_divide_aligned16_orc_impl.orc \
 volk_32f_multiply_aligned16_orc_impl.orc \
 volk_32f_sqrt_aligned16_orc_impl.orc \
-volk_16sc_magnitude_32f_aligned16_orc_impl.orc \
+volk_32f_max_aligned16_orc_impl.orc \
+volk_32f_min_aligned16_orc_impl.orc \
 volk_32fc_magnitude_32f_aligned16_orc_impl.orc \
-volk_32fc_magnitude_16s_aligned16_orc_impl.orc
+volk_32fc_magnitude_16s_aligned16_orc_impl.orc \
+volk_16sc_deinterleave_16s_aligned16_orc_impl.orc \
+volk_16sc_deinterleave_32f_aligned16_orc_impl.orc \
+volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc
 
 
 
-my_ORCC_FLAGS = --implementation --lazy-init $(ORCC_FLAGS)
+my_ORCC_FLAGS = --implementation $(ORCC_FLAGS)
 
 .orc.c:
 	$(ORCC) $(my_ORCC_FLAGS) -o $@ $<
diff --git a/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..d226064a73
--- /dev/null
+++ b/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_16sc_deinterleave_16s_aligned16_orc_impl
+.dest 2 idst
+.dest 2 qdst
+.source 4 src
+splitlw qdst, idst, src
diff --git a/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..dddf682ca0
--- /dev/null
+++ b/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc
@@ -0,0 +1,12 @@
+.function volk_16sc_deinterleave_32f_aligned16_orc_impl
+.dest 4 idst
+.dest 4 qdst
+.source 4 src
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+splitql qdst, idst, iqf
diff --git a/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..609750096a
--- /dev/null
+++ b/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc
@@ -0,0 +1,6 @@
+.function volk_16sc_deinterleave_real_8s_aligned16_orc_impl
+.dest 1 dst
+.source 4 src
+.temp 2 iw
+select0lw iw, src
+convhwb dst, iw
diff --git a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc
index f6c959c00f..83b867dca3 100644
--- a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc
+++ b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc
@@ -1,6 +1,7 @@
 .function volk_16sc_magnitude_16s_aligned16_orc_impl
 .source 4 src
 .dest 2 dst
+.floatparam 4 scalar
 .temp 2 reals
 .temp 2 imags
 .temp 4 reall
@@ -12,13 +13,16 @@
 .temp 4 rootl
 
 splitlw reals, imags, src
-convwl reall, reals
-convwl imagl, imags
+convswl reall, reals
+convswl imagl, imags
 convlf realf, reall
 convlf imagf, imagl
-mulf realf, realf, (1.0 / 32768.0)
-mulf imagf, imagf, (1.0 / 32768.0)
+divf realf, realf, scalar
+divf imagf, imagf, scalar
+mulf realf, realf, realf
+mulf imagf, imagf, imagf
 addf sumf, realf, imagf
 sqrtf rootf, sumf
+mulf rootf, rootf, scalar
 convfl rootl, rootf
-conflw dst, rootl
+convlw dst, rootl
diff --git a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
index 44654ad8e7..6d2ed81978 100644
--- a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
+++ b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
@@ -2,7 +2,6 @@
 .source 4 src
 .dest 4 dst
 .floatparam 4 scalar
-.temp 4 invscalar
 .temp 4 reall
 .temp 4 imagl
 .temp 2 reals
@@ -11,14 +10,15 @@
 .temp 4 imagf
 .temp 4 sumf
 
-divf invscalar, 1.0, scalar
+
+
 splitlw reals, imags, src
 convswl reall, reals
 convswl imagl, imags
 convlf realf, reall
 convlf imagf, imagl
-mulf realf, realf, invscalar
-mulf imagf, imagf, invscalar
+divf realf, realf, scalar
+divf imagf, imagf, scalar
 mulf realf, realf, realf
 mulf imagf, imagf, imagf
 addf sumf, realf, imagf
diff --git a/volk/orc/volk_32f_max_aligned16_orc_impl.orc b/volk/orc/volk_32f_max_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..97f48ba4ad
--- /dev/null
+++ b/volk/orc/volk_32f_max_aligned16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_max_aligned16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+maxf dst, src1, src2
diff --git a/volk/orc/volk_32f_min_aligned16_orc_impl.orc b/volk/orc/volk_32f_min_aligned16_orc_impl.orc
new file mode 100644
index 0000000000..a597933deb
--- /dev/null
+++ b/volk/orc/volk_32f_min_aligned16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_min_aligned16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+minf dst, src1, src2
diff --git a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc
index db8405e596..f71dd9a37c 100644
--- a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc
+++ b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc
@@ -2,24 +2,18 @@
 .source 8 src
 .dest 2 dst
 .floatparam 4 scalar
-.temp 4 invscalar
-.temp 4 reall
-.temp 4 imagl
-.temp 4 realf
-.temp 4 imagf
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
 .temp 4 sumf
 .temp 4 rootf
 .temp 4 rootl
 
-divf invscalar, 1.0, scalar
-splitql reall, imagl, src
-convlf realf, reall
-convlf imagf, imagl
-mulf realf, realf, invscalar
-mulf imagf, imagf, invscalar
-mulf realf, realf, realf
-mulf imagf, imagf, imagf
-addf sumf, realf, imagf
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
 sqrtf rootf, sumf
+mulf rootf, rootf, scalar
 convfl rootl, rootf
-convlw dst, rootl
+convssslw dst, rootl
diff --git a/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc
index cc5c85b455..47a10531d7 100644
--- a/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc
+++ b/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc
@@ -1,21 +1,13 @@
 .function volk_32fc_magnitude_32f_aligned16_orc_impl
 .source 8 src
 .dest 4 dst
-.floatparam 4 scalar
-.temp 4 invscalar
-.temp 4 reall
-.temp 4 imagl
-.temp 4 realf
-.temp 4 imagf
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
 .temp 4 sumf
 
-divf invscalar, 1.0, scalar
-splitql reall, imagl, src
-convlf realf, reall
-convlf imagf, imagl
-mulf realf, realf, invscalar
-mulf imagf, imagf, invscalar
-mulf realf, realf, realf
-mulf imagf, imagf, imagf
-addf sumf, realf, imagf
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
 sqrtf dst, sumf
diff --git a/volk/volk.pc.in b/volk/volk.pc.in
index a242988563..b03dbdada5 100644
--- a/volk/volk.pc.in
+++ b/volk/volk.pc.in
@@ -10,6 +10,6 @@ Name: volk
 Description: VOLK.. Vector Optimized Library of Kernels
 Requires:
 Version: @VERSION@
-Libs: -lvolk -lvolk_runtime
+Libs: -lvolk -lvolk_runtime -lvolk_orc
 Cflags: -I${includedir} ${LV_CXXFLAGS}
 
-- 
cgit v1.2.3


From 5b45b875ed58fd66234764a05da42c6eaff22c4d Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Tue, 11 Jan 2011 15:17:55 -0800
Subject: Volk: Added more Orc routines (including complex multiply). Started
 redoing the testing framework so it's easier to add new archs to tests.

---
 volk/include/volk/volk_32f_normalize_aligned16.h   | 15 ++++
 .../volk/volk_32fc_32f_multiply_aligned16.h        | 13 ++++
 volk/include/volk/volk_32fc_multiply_aligned16.h   | 17 +++++
 volk/lib/Makefile.am                               |  2 +
 volk/lib/qa_32f_normalize_aligned16.cc             | 13 ++++
 volk/lib/qa_32fc_32f_multiply_aligned16.cc         | 84 ++++++++++------------
 volk/lib/qa_32fc_multiply_aligned16.cc             | 12 ++++
 volk/orc/Makefile.am                               |  3 +
 8 files changed, 112 insertions(+), 47 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h
index 1aabb1d9d3..27fb5f7faa 100644
--- a/volk/include/volk/volk_32f_normalize_aligned16.h
+++ b/volk/include/volk/volk_32f_normalize_aligned16.h
@@ -60,6 +60,21 @@ static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+/*!
+  \brief Normalizes the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be normalizeed
+  \param bVector One of the vectors to be normalizeed
+  \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
+*/
+extern void volk_32f_normalize_aligned16_orc_impl(float* vecBuffer, const float scalar, unsigned int num_points);
+static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){
+    float invscalar = 1.0 / scalar;
+    volk_32f_normalize_aligned16_orc_impl(vecBuffer, invscalar, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+
 
 
 
diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
index 436656ca06..304ed8e2d0 100644
--- a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
+++ b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
@@ -76,6 +76,19 @@ static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector,
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+  /*!
+    \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector The complex vector to be multiplied
+    \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector
+    \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+  */
+extern void volk_32fc_32f_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32fc_32f_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+    volk_32fc_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
 
 
 
diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h
index 6a1649fdb4..c8f2418c32 100644
--- a/volk/include/volk/volk_32fc_multiply_aligned16.h
+++ b/volk/include/volk/volk_32fc_multiply_aligned16.h
@@ -4,6 +4,7 @@
 #include <inttypes.h>
 #include <stdio.h>
 #include <volk/volk_complex.h>
+#include <float.h>
 
 #if LV_HAVE_SSE3
 #include <pmmintrin.h>
@@ -72,6 +73,22 @@ static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, cons
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_ORC
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+extern void volk_32fc_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points);
+static inline void volk_32fc_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    static const float mask = -0.0;
+    volk_32fc_multiply_aligned16_orc_impl(cVector, aVector, bVector, mask, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
 
 
 
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am
index 2530334616..0aeafe4aaf 100644
--- a/volk/lib/Makefile.am
+++ b/volk/lib/Makefile.am
@@ -156,6 +156,7 @@ endif
 # ----------------------------------------------------------------
 libvolk_qa_la_SOURCES = \
 	qa_volk.cc \
+	qa_utils.cc \
 	qa_16s_quad_max_star_aligned16.cc \
 	qa_32fc_dot_prod_aligned16.cc \
 	qa_32fc_square_dist_aligned16.cc \
@@ -257,6 +258,7 @@ libvolk_qa_la_LIBADD = \
 noinst_HEADERS = \
 	volk_init.h \
 	qa_volk.h \
+	qa_utils.h \
 	assembly.h \
 	qa_16s_quad_max_star_aligned16.h \
 	qa_32fc_dot_prod_aligned16.h \
diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc
index 1c7b485a6a..0da43ecff7 100644
--- a/volk/lib/qa_32f_normalize_aligned16.cc
+++ b/volk/lib/qa_32f_normalize_aligned16.cc
@@ -26,13 +26,16 @@ void qa_32f_normalize_aligned16::t1() {
 
   float* output0;
   float* output01;
+  float* output02;
   ret = posix_memalign((void**)&output0, 16, vlen*sizeof(float));
   ret = posix_memalign((void**)&output01, 16, vlen*sizeof(float));
+  ret = posix_memalign((void**)&output02, 16, vlen*sizeof(float));
 
   for(int i = 0; i < vlen; ++i) {   
     output0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
   }
   memcpy(output01, output0, vlen*sizeof(float));
+  memcpy(output02, output0, vlen*sizeof(float));
   printf("32f_normalize_aligned\n");
 
   start = clock();
@@ -49,6 +52,14 @@ void qa_32f_normalize_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("sse_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_normalize_aligned16_manual(output02, 1.15, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  
   for(int i = 0; i < 1; ++i) {
     //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
     //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
@@ -57,10 +68,12 @@ void qa_32f_normalize_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     // printf("%e...%e\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output02[i], fabs(output0[i])*1e-4);
   }
 
   free(output0);
   free(output01);
+  free(output02);
 }
 
 #endif
diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.cc b/volk/lib/qa_32fc_32f_multiply_aligned16.cc
index 4eba0a3cda..7bb8d21c1d 100644
--- a/volk/lib/qa_32fc_32f_multiply_aligned16.cc
+++ b/volk/lib/qa_32fc_32f_multiply_aligned16.cc
@@ -2,28 +2,12 @@
 #include <volk/volk.h>
 #include <qa_32fc_32f_multiply_aligned16.h>
 #include <stdlib.h>
-#include <math.h>
 #include <time.h>
-
-#define assertcomplexEqual(expected, actual, delta)			\
-  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
-  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);	
+#include <string.h>
+#include <qa_utils.h>
 
 #define	ERR_DELTA	(1e-4)
 
-//test for sse
-static float uniform() {
-  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
-}
-
-static void
-random_floats (float *buf, unsigned n)
-{
-  for (unsigned i = 0; i < n; i++)
-    buf[i] = uniform ();
-}
-
-#ifdef LV_HAVE_SSE3
 void qa_32fc_32f_multiply_aligned16::t1() {
 
   const int vlen = 2046;
@@ -36,50 +20,56 @@ void qa_32fc_32f_multiply_aligned16::t1() {
   std::complex<float>* input;
   float * taps;
   int i;
+  std::vector<std::string> archs;
+  archs.push_back("generic");
+#ifdef LV_HAVE_SSE3
+  archs.push_back("sse3");
+#endif
+#ifdef LV_HAVE_ORC
+  archs.push_back("orc");
+#endif
   
-  std::complex<float>* result_generic;
-  std::complex<float>* result_sse3;
+  std::vector<std::complex<float>* > results;
 
   ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float));
   ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float));
-  ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float));
-  ret = posix_memalign((void**)&result_sse3, 16, vlen * 2 * sizeof(float));
+  
+  for(i=0; i < archs.size(); i++) {
+      std::complex<float> *ptr;
+      ret = posix_memalign((void**)&ptr, 16, vlen * 2 * sizeof(float));
+      if(ret) {
+          printf("Couldn't allocate memory\n");
+          exit(1);
+      }
+      results.push_back(ptr);
+  }
 
   random_floats((float*)input, vlen * 2);
   random_floats(taps, vlen);
   
   printf("32fc_32f_multiply_aligned16\n");
 
-  start = clock();
-  for(int count = 0; count < ITERS; ++count) {
-    volk_32fc_32f_multiply_aligned16_manual(result_generic, input, taps, vlen,  "generic");
-  }
-  end = clock();
-  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
-  printf("generic_time: %f\n", total);
-
-  start = clock();
-  for(int count = 0; count < ITERS; ++count) {
-    volk_32fc_32f_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3");
+  for(i=0; i < archs.size(); i++) {
+    start = clock();
+    for(int count = 0; count < ITERS; ++count) {
+      volk_32fc_32f_multiply_aligned16_manual(results[i], input, taps, vlen, archs[i].c_str());
+    }
+    end = clock();
+    total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+    printf("%s_time: %f\n", archs[i].c_str(), total);
   }
-  end = clock();
-  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
-  printf("sse3_time: %f\n", total);
 
-  for(i = 0; i < vlen; i++){
-    assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA);
+  for(i=0; i < vlen; i++) {
+      int j = 1;
+      for(j; j < archs.size(); j++) {
+          assertcomplexEqual(results[0][i], results[j][i], ERR_DELTA);
+      }
   }
 
   free(input);
   free(taps);
-  free(result_generic);
-  free(result_sse3);
-  
-}
-#else
-void qa_32fc_32f_multiply_aligned16::t1() {
-  printf("sse3 not available... no test performed\n");
+  for(i=0; i < archs.size(); i++) {      
+    free(results[i]);
+  }
 }
 
-#endif /* LV_HAVE_SSE3 */
-
diff --git a/volk/lib/qa_32fc_multiply_aligned16.cc b/volk/lib/qa_32fc_multiply_aligned16.cc
index e1f7eab3d0..022b58ad61 100644
--- a/volk/lib/qa_32fc_multiply_aligned16.cc
+++ b/volk/lib/qa_32fc_multiply_aligned16.cc
@@ -41,11 +41,13 @@ void qa_32fc_multiply_aligned16::t1() {
   
   std::complex<float>* result_generic;
   std::complex<float>* result_sse3;
+  std::complex<float>* result_orc;
 
   ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(float));
   ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(float));
   ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float));
   ret = posix_memalign((void**)&result_sse3, 16, vlen*2*sizeof(float));
+  ret = posix_memalign((void**)&result_orc, 16, vlen*2*sizeof(float));
   
   random_floats((float*)input, vlen * 2);
   random_floats((float*)taps, vlen * 2);
@@ -67,15 +69,25 @@ void qa_32fc_multiply_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("sse3_time: %f\n", total);
+  
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_multiply_aligned16_manual(result_orc, input, taps, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
 
   for(i = 0; i < vlen; i++){
     assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA);
+    assertcomplexEqual(result_generic[i], result_orc[i], ERR_DELTA);
   }
 
   free(input);
   free(taps);
   free(result_generic);
   free(result_sse3);
+  free(result_orc);
   
 }
 #else
diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am
index a469638c1a..066050a7c4 100644
--- a/volk/orc/Makefile.am
+++ b/volk/orc/Makefile.am
@@ -34,9 +34,12 @@ volk_32f_add_aligned16_orc_impl.orc \
 volk_32f_subtract_aligned16_orc_impl.orc \
 volk_32f_divide_aligned16_orc_impl.orc \
 volk_32f_multiply_aligned16_orc_impl.orc \
+volk_32fc_multiply_aligned16_orc_impl.orc \
+volk_32fc_32f_multiply_aligned16_orc_impl.orc \
 volk_32f_sqrt_aligned16_orc_impl.orc \
 volk_32f_max_aligned16_orc_impl.orc \
 volk_32f_min_aligned16_orc_impl.orc \
+volk_32f_normalize_aligned16_orc_impl.orc \
 volk_32fc_magnitude_32f_aligned16_orc_impl.orc \
 volk_32fc_magnitude_16s_aligned16_orc_impl.orc \
 volk_16sc_magnitude_16s_aligned16_orc_impl.orc \
-- 
cgit v1.2.3


From c501dc110d3cc7cfcfff178fecb21f30ac9bd54c Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Tue, 11 Jan 2011 15:35:04 -0800
Subject: Volk: fixed normalize.

---
 volk/include/volk/volk_32f_normalize_aligned16.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h
index 27fb5f7faa..323d0513c2 100644
--- a/volk/include/volk/volk_32f_normalize_aligned16.h
+++ b/volk/include/volk/volk_32f_normalize_aligned16.h
@@ -68,10 +68,10 @@ static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const
   \param bVector One of the vectors to be normalizeed
   \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
 */
-extern void volk_32f_normalize_aligned16_orc_impl(float* vecBuffer, const float scalar, unsigned int num_points);
+extern void volk_32f_normalize_aligned16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
 static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){
     float invscalar = 1.0 / scalar;
-    volk_32f_normalize_aligned16_orc_impl(vecBuffer, invscalar, num_points);
+    volk_32f_normalize_aligned16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
 }
 #endif /* LV_HAVE_GENERIC */
 
-- 
cgit v1.2.3


From d486ff4b4c039c8b3b06b6519839d522cf69be69 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Sun, 16 Jan 2011 14:03:16 -0800
Subject: volk_rename: renamed basically everything in the volk lib to have
 logically consistent function names

---
 volk/config.guess                                  | 1506 +----------------
 volk/config.sub                                    | 1740 +-------------------
 volk/include/volk/Makefile.am                      |  174 +-
 volk/include/volk/make_c.py                        |    3 +-
 volk/include/volk/volk_16s_add_quad_a16.h          |  136 ++
 volk/include/volk/volk_16s_add_quad_aligned16.h    |  136 --
 volk/include/volk/volk_16s_branch_4_state_8_a16.h  |  194 +++
 .../volk/volk_16s_branch_4_state_8_aligned16.h     |  194 ---
 volk/include/volk/volk_16s_convert_32f_aligned16.h |  119 --
 .../volk/volk_16s_convert_32f_unaligned16.h        |  122 --
 volk/include/volk/volk_16s_convert_8s_a16.h        |   69 +
 volk/include/volk/volk_16s_convert_8s_aligned16.h  |   69 -
 volk/include/volk/volk_16s_convert_8s_ua16.h       |   71 +
 .../include/volk/volk_16s_convert_8s_unaligned16.h |   71 -
 volk/include/volk/volk_16s_max_star_16s_a16.h      |  108 ++
 volk/include/volk/volk_16s_max_star_aligned16.h    |  108 --
 .../volk/volk_16s_max_star_horizontal_16s_a16.h    |  130 ++
 .../volk/volk_16s_max_star_horizontal_aligned16.h  |  130 --
 .../volk/volk_16s_permute_and_scalar_add_a16.h     |  139 ++
 .../volk_16s_permute_and_scalar_add_aligned16.h    |  139 --
 volk/include/volk/volk_16s_quad_max_star_16s_a16.h |  191 +++
 .../volk/volk_16s_quad_max_star_aligned16.h        |  191 ---
 volk/include/volk/volk_16s_s32f_convert_32f_a16.h  |  119 ++
 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h |  122 ++
 .../volk/volk_16sc_deinterleave_16s_16s_a16.h      |  158 ++
 .../volk/volk_16sc_deinterleave_16s_aligned16.h    |  158 --
 .../volk/volk_16sc_deinterleave_32f_aligned16.h    |  108 --
 .../volk/volk_16sc_deinterleave_real_16s_a16.h     |  120 ++
 .../volk_16sc_deinterleave_real_16s_aligned16.h    |  120 --
 .../volk_16sc_deinterleave_real_32f_aligned16.h    |  125 --
 .../volk/volk_16sc_deinterleave_real_8s_a16.h      |   94 ++
 .../volk_16sc_deinterleave_real_8s_aligned16.h     |   94 --
 volk/include/volk/volk_16sc_magnitude_16s_a16.h    |  190 +++
 .../volk/volk_16sc_magnitude_16s_aligned16.h       |  190 ---
 .../volk/volk_16sc_magnitude_32f_aligned16.h       |  179 --
 .../volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h |  108 ++
 .../volk_16sc_s32f_deinterleave_real_32f_a16.h     |  125 ++
 .../volk/volk_16sc_s32f_magnitude_32f_a16.h        |  179 ++
 volk/include/volk/volk_16u_byteswap_a16.h          |   77 +
 volk/include/volk/volk_16u_byteswap_aligned16.h    |   77 -
 .../volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h    |  151 ++
 volk/include/volk/volk_32f_32f_add_32f_a16.h       |   81 +
 volk/include/volk/volk_32f_32f_divide_32f_a16.h    |   82 +
 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h  |  184 +++
 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h |  184 +++
 .../volk/volk_32f_32f_interleave_32fc_a16.h        |   75 +
 volk/include/volk/volk_32f_32f_max_32f_a16.h       |   85 +
 volk/include/volk/volk_32f_32f_min_32f_a16.h       |   85 +
 volk/include/volk/volk_32f_32f_multiply_32f_a16.h  |   81 +
 .../volk/volk_32f_32f_s32f_interleave_16sc_a16.h   |  155 ++
 volk/include/volk/volk_32f_32f_subtract_32f_a16.h  |   81 +
 volk/include/volk/volk_32f_accumulator_aligned16.h |   67 -
 volk/include/volk/volk_32f_accumulator_s32f_a16.h  |   67 +
 volk/include/volk/volk_32f_add_aligned16.h         |   81 -
 .../volk/volk_32f_calc_spectral_noise_floor_a16.h  |  167 ++
 .../volk_32f_calc_spectral_noise_floor_aligned16.h |  167 --
 volk/include/volk/volk_32f_convert_16s_aligned16.h |  110 --
 .../volk/volk_32f_convert_16s_unaligned16.h        |  113 --
 volk/include/volk/volk_32f_convert_32s_aligned16.h |  106 --
 .../volk/volk_32f_convert_32s_unaligned16.h        |  109 --
 volk/include/volk/volk_32f_convert_64f_a16.h       |   70 +
 volk/include/volk/volk_32f_convert_64f_aligned16.h |   70 -
 volk/include/volk/volk_32f_convert_64f_ua16.h      |   70 +
 .../volk/volk_32f_convert_64f_unaligned16.h        |   70 -
 volk/include/volk/volk_32f_convert_8s_aligned16.h  |  117 --
 .../include/volk/volk_32f_convert_8s_unaligned16.h |  120 --
 volk/include/volk/volk_32f_divide_aligned16.h      |   82 -
 volk/include/volk/volk_32f_dot_prod_aligned16.h    |  184 ---
 volk/include/volk/volk_32f_dot_prod_unaligned16.h  |  184 ---
 volk/include/volk/volk_32f_fm_detect_aligned16.h   |  120 --
 volk/include/volk/volk_32f_index_max_16u_a16.h     |  148 ++
 volk/include/volk/volk_32f_index_max_aligned16.h   |  148 --
 .../volk/volk_32f_interleave_16sc_aligned16.h      |  155 --
 .../volk/volk_32f_interleave_32fc_aligned16.h      |   75 -
 volk/include/volk/volk_32f_max_aligned16.h         |   85 -
 volk/include/volk/volk_32f_min_aligned16.h         |   85 -
 volk/include/volk/volk_32f_multiply_aligned16.h    |   81 -
 volk/include/volk/volk_32f_normalize_aligned16.h   |   81 -
 volk/include/volk/volk_32f_power_aligned16.h       |  144 --
 .../volk/volk_32f_s32f_32f_fm_detect_32f_a16.h     |  120 ++
 volk/include/volk/volk_32f_s32f_convert_16s_a16.h  |  110 ++
 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h |  113 ++
 volk/include/volk/volk_32f_s32f_convert_32s_a16.h  |  106 ++
 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h |  109 ++
 volk/include/volk/volk_32f_s32f_convert_8s_a16.h   |  117 ++
 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h  |  120 ++
 volk/include/volk/volk_32f_s32f_normalize_a16.h    |   81 +
 volk/include/volk/volk_32f_s32f_power_32f_a16.h    |  144 ++
 volk/include/volk/volk_32f_s32f_stddev_32f_a16.h   |  144 ++
 volk/include/volk/volk_32f_sqrt_32f_a16.h          |   77 +
 volk/include/volk/volk_32f_sqrt_aligned16.h        |   77 -
 volk/include/volk/volk_32f_stddev_aligned16.h      |  144 --
 .../volk/volk_32f_stddev_and_mean_32f_32f_a16.h    |  169 ++
 .../volk/volk_32f_stddev_and_mean_aligned16.h      |  169 --
 volk/include/volk/volk_32f_subtract_aligned16.h    |   81 -
 volk/include/volk/volk_32f_sum_of_poly_aligned16.h |  151 --
 .../include/volk/volk_32fc_32f_multiply_32fc_a16.h |   95 ++
 .../volk/volk_32fc_32f_multiply_aligned16.h        |   95 --
 volk/include/volk/volk_32fc_32f_power_32fc_a16.h   |  109 ++
 .../volk/volk_32fc_32f_power_32fc_aligned16.h      |  109 --
 .../volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h   |  344 ++++
 .../volk/volk_32fc_32fc_dot_prod_32fc_a16.h        |  468 ++++++
 .../volk/volk_32fc_32fc_multiply_32fc_a16.h        |   95 ++
 ...2fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h |  126 ++
 .../volk/volk_32fc_32fc_square_dist_32f_a16.h      |  112 ++
 volk/include/volk/volk_32fc_atan2_32f_aligned16.h  |  158 --
 .../volk/volk_32fc_conjugate_dot_prod_aligned16.h  |  344 ----
 .../volk/volk_32fc_deinterleave_32f_32f_a16.h      |   75 +
 .../volk/volk_32fc_deinterleave_32f_aligned16.h    |   75 -
 .../volk/volk_32fc_deinterleave_64f_64f_a16.h      |   78 +
 .../volk/volk_32fc_deinterleave_64f_aligned16.h    |   78 -
 .../volk/volk_32fc_deinterleave_real_16s_a16.h     |   80 +
 .../volk_32fc_deinterleave_real_16s_aligned16.h    |   80 -
 .../volk/volk_32fc_deinterleave_real_32f_a16.h     |   68 +
 .../volk_32fc_deinterleave_real_32f_aligned16.h    |   68 -
 .../volk/volk_32fc_deinterleave_real_64f_a16.h     |   66 +
 .../volk_32fc_deinterleave_real_64f_aligned16.h    |   66 -
 volk/include/volk/volk_32fc_dot_prod_aligned16.h   |  468 ------
 volk/include/volk/volk_32fc_index_max_16u_a16.h    |  215 +++
 volk/include/volk/volk_32fc_index_max_aligned16.h  |  215 ---
 .../volk/volk_32fc_magnitude_16s_aligned16.h       |  158 --
 volk/include/volk/volk_32fc_magnitude_32f_a16.h    |  132 ++
 .../volk/volk_32fc_magnitude_32f_aligned16.h       |  132 --
 volk/include/volk/volk_32fc_multiply_aligned16.h   |   95 --
 ...olk_32fc_power_spectral_density_32f_aligned16.h |  134 --
 .../volk/volk_32fc_power_spectrum_32f_aligned16.h  |  126 --
 volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h   |  158 ++
 .../volk/volk_32fc_s32f_magnitude_16s_a16.h        |  158 ++
 .../volk/volk_32fc_s32f_power_spectrum_32f_a16.h   |  126 ++
 ...32fc_s32f_s32f_power_spectral_density_32f_a16.h |  134 ++
 .../include/volk/volk_32fc_square_dist_aligned16.h |  112 --
 .../volk_32fc_square_dist_scalar_mult_aligned16.h  |  126 --
 volk/include/volk/volk_32s_32s_and_32s_a16.h       |   81 +
 volk/include/volk/volk_32s_32s_or_32s_a16.h        |   81 +
 volk/include/volk/volk_32s_and_aligned16.h         |   81 -
 volk/include/volk/volk_32s_convert_32f_aligned16.h |   73 -
 .../volk/volk_32s_convert_32f_unaligned16.h        |   75 -
 volk/include/volk/volk_32s_or_aligned16.h          |   81 -
 volk/include/volk/volk_32s_s32f_convert_32f_a16.h  |   73 +
 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h |   75 +
 volk/include/volk/volk_32u_byteswap_a16.h          |   77 +
 volk/include/volk/volk_32u_byteswap_aligned16.h    |   77 -
 volk/include/volk/volk_32u_popcnt_a16.h            |   36 +
 volk/include/volk/volk_32u_popcnt_aligned16.h      |   36 -
 volk/include/volk/volk_64f_64f_max_64f_a16.h       |   71 +
 volk/include/volk/volk_64f_64f_min_64f_a16.h       |   71 +
 volk/include/volk/volk_64f_convert_32f_a16.h       |   67 +
 volk/include/volk/volk_64f_convert_32f_aligned16.h |   67 -
 volk/include/volk/volk_64f_convert_32f_ua16.h      |   67 +
 .../volk/volk_64f_convert_32f_unaligned16.h        |   67 -
 volk/include/volk/volk_64f_max_aligned16.h         |   71 -
 volk/include/volk/volk_64f_min_aligned16.h         |   71 -
 volk/include/volk/volk_64u_byteswap_a16.h          |   88 +
 volk/include/volk/volk_64u_byteswap_aligned16.h    |   88 -
 volk/include/volk/volk_64u_popcnt_a16.h            |   50 +
 volk/include/volk/volk_64u_popcnt_aligned16.h      |   50 -
 volk/include/volk/volk_8s_convert_16s_a16.h        |   83 +
 volk/include/volk/volk_8s_convert_16s_aligned16.h  |   83 -
 volk/include/volk/volk_8s_convert_16s_ua16.h       |   73 +
 .../include/volk/volk_8s_convert_16s_unaligned16.h |   73 -
 volk/include/volk/volk_8s_convert_32f_aligned16.h  |  105 --
 .../include/volk/volk_8s_convert_32f_unaligned16.h |   94 --
 volk/include/volk/volk_8s_s32f_convert_32f_a16.h   |  105 ++
 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h  |   94 ++
 .../volk_8sc_8sc_multiply_conjugate_16sc_a16.h     |  102 ++
 ...volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h |  122 ++
 .../volk/volk_8sc_deinterleave_16s_16s_a16.h       |   77 +
 .../volk/volk_8sc_deinterleave_16s_aligned16.h     |   77 -
 .../volk/volk_8sc_deinterleave_32f_aligned16.h     |  164 --
 .../volk/volk_8sc_deinterleave_real_16s_a16.h      |   66 +
 .../volk_8sc_deinterleave_real_16s_aligned16.h     |   66 -
 .../volk_8sc_deinterleave_real_32f_aligned16.h     |  133 --
 .../volk/volk_8sc_deinterleave_real_8s_a16.h       |   67 +
 .../volk/volk_8sc_deinterleave_real_8s_aligned16.h |   67 -
 .../volk_8sc_multiply_conjugate_16sc_aligned16.h   |  102 --
 .../volk_8sc_multiply_conjugate_32fc_aligned16.h   |  122 --
 .../volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h  |  164 ++
 .../volk/volk_8sc_s32f_deinterleave_real_32f_a16.h |  133 ++
 volk/include/volk/volk_register.py                 |    5 +-
 volk/lib/Makefile.am                               |    3 +-
 volk/lib/qa_utils.cc                               |   53 +-
 volk/lib/qa_utils.h                                |    2 +-
 volk/orc/Makefile.am                               |   42 +-
 ...volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc |    5 +
 ...lk_16sc_deinterleave_16s_aligned16_orc_impl.orc |    5 -
 ...lk_16sc_deinterleave_32f_aligned16_orc_impl.orc |   12 -
 ...volk_16sc_deinterleave_real_8s_a16_orc_impl.orc |    6 +
 ...6sc_deinterleave_real_8s_aligned16_orc_impl.orc |    6 -
 volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc  |   23 +
 .../volk_16sc_magnitude_16s_aligned16_orc_impl.orc |   23 -
 ...16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc |   12 +
 volk/orc/volk_16u_byteswap_a16_orc_impl.orc        |    3 +
 volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc  |    3 -
 volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc     |    5 +
 volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc  |    5 +
 volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc     |    5 +
 volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc     |    5 +
 .../orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc |    5 +
 .../orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc |    5 +
 volk/orc/volk_32f_add_aligned16_orc_impl.orc       |    5 -
 volk/orc/volk_32f_divide_aligned16_orc_impl.orc    |    5 -
 volk/orc/volk_32f_max_aligned16_orc_impl.orc       |    5 -
 volk/orc/volk_32f_min_aligned16_orc_impl.orc       |    5 -
 volk/orc/volk_32f_multiply_aligned16_orc_impl.orc  |    5 -
 volk/orc/volk_32f_normalize_aligned16_orc_impl.orc |    5 -
 volk/orc/volk_32f_s32f_normalize_a16_orc_impl.orc  |    5 +
 volk/orc/volk_32f_sqrt_32f_a16_orc_impl.orc        |    4 +
 volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc      |    4 -
 volk/orc/volk_32f_subtract_aligned16_orc_impl.orc  |    5 -
 .../volk_32fc_32f_multiply_32fc_a16_orc_impl.orc   |    7 +
 .../volk_32fc_32f_multiply_aligned16_orc_impl.orc  |    7 -
 .../volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc  |    6 +
 .../volk_32fc_magnitude_16s_aligned16_orc_impl.orc |   23 -
 volk/orc/volk_32fc_magnitude_32f_a16_orc_impl.orc  |   13 +
 .../volk_32fc_magnitude_32f_aligned16_orc_impl.orc |   13 -
 volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc |    6 -
 .../volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc  |   23 +
 volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc     |    5 +
 volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc      |    5 +
 volk/orc/volk_32s_and_aligned16_orc_impl.orc       |    5 -
 volk/orc/volk_32s_or_aligned16_orc_impl.orc        |    5 -
 volk/orc/volk_8s_convert_16s_a16_orc_impl.orc      |    5 +
 .../orc/volk_8s_convert_16s_aligned16_orc_impl.orc |    5 -
 .../orc/volk_8s_convert_32f_aligned16_orc_impl.orc |    9 -
 volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc |    9 +
 225 files changed, 10382 insertions(+), 13604 deletions(-)
 mode change 100755 => 120000 volk/config.guess
 mode change 100755 => 120000 volk/config.sub
 create mode 100644 volk/include/volk/volk_16s_add_quad_a16.h
 delete mode 100644 volk/include/volk/volk_16s_add_quad_aligned16.h
 create mode 100644 volk/include/volk/volk_16s_branch_4_state_8_a16.h
 delete mode 100644 volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
 delete mode 100644 volk/include/volk/volk_16s_convert_32f_aligned16.h
 delete mode 100644 volk/include/volk/volk_16s_convert_32f_unaligned16.h
 create mode 100644 volk/include/volk/volk_16s_convert_8s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_convert_8s_aligned16.h
 create mode 100644 volk/include/volk/volk_16s_convert_8s_ua16.h
 delete mode 100644 volk/include/volk/volk_16s_convert_8s_unaligned16.h
 create mode 100644 volk/include/volk/volk_16s_max_star_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_max_star_aligned16.h
 create mode 100644 volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
 create mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
 delete mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
 create mode 100644 volk/include/volk/volk_16s_quad_max_star_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_quad_max_star_aligned16.h
 create mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_a16.h
 create mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
 create mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
 create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
 create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
 create mode 100644 volk/include/volk/volk_16sc_magnitude_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
 delete mode 100644 volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
 create mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
 create mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
 create mode 100644 volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
 create mode 100644 volk/include/volk/volk_16u_byteswap_a16.h
 delete mode 100644 volk/include/volk/volk_16u_byteswap_aligned16.h
 create mode 100644 volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_32f_add_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_32f_divide_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
 create mode 100644 volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32f_32f_max_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_32f_min_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_32f_multiply_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
 create mode 100644 volk/include/volk/volk_32f_32f_subtract_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_accumulator_aligned16.h
 create mode 100644 volk/include/volk/volk_32f_accumulator_s32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_add_aligned16.h
 create mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h
 delete mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_convert_16s_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_convert_16s_unaligned16.h
 delete mode 100644 volk/include/volk/volk_32f_convert_32s_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_convert_32s_unaligned16.h
 create mode 100644 volk/include/volk/volk_32f_convert_64f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_convert_64f_aligned16.h
 create mode 100644 volk/include/volk/volk_32f_convert_64f_ua16.h
 delete mode 100644 volk/include/volk/volk_32f_convert_64f_unaligned16.h
 delete mode 100644 volk/include/volk/volk_32f_convert_8s_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_convert_8s_unaligned16.h
 delete mode 100644 volk/include/volk/volk_32f_divide_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_dot_prod_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_dot_prod_unaligned16.h
 delete mode 100644 volk/include/volk/volk_32f_fm_detect_aligned16.h
 create mode 100644 volk/include/volk/volk_32f_index_max_16u_a16.h
 delete mode 100644 volk/include/volk/volk_32f_index_max_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_interleave_16sc_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_interleave_32fc_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_max_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_min_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_multiply_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_normalize_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_power_aligned16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_normalize_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_power_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_stddev_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_sqrt_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_sqrt_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_stddev_aligned16.h
 create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_subtract_aligned16.h
 delete mode 100644 volk/include/volk/volk_32f_sum_of_poly_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32f_multiply_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_atan2_32f_aligned16.h
 delete mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h
 delete mode 100644 volk/include/volk/volk_32fc_dot_prod_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_index_max_16u_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_index_max_aligned16.h
 delete mode 100644 volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
 delete mode 100644 volk/include/volk/volk_32fc_multiply_aligned16.h
 delete mode 100644 volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h
 delete mode 100644 volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h
 create mode 100644 volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h
 create mode 100644 volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_square_dist_aligned16.h
 delete mode 100644 volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h
 create mode 100644 volk/include/volk/volk_32s_32s_and_32s_a16.h
 create mode 100644 volk/include/volk/volk_32s_32s_or_32s_a16.h
 delete mode 100644 volk/include/volk/volk_32s_and_aligned16.h
 delete mode 100644 volk/include/volk/volk_32s_convert_32f_aligned16.h
 delete mode 100644 volk/include/volk/volk_32s_convert_32f_unaligned16.h
 delete mode 100644 volk/include/volk/volk_32s_or_aligned16.h
 create mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_a16.h
 create mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h
 create mode 100644 volk/include/volk/volk_32u_byteswap_a16.h
 delete mode 100644 volk/include/volk/volk_32u_byteswap_aligned16.h
 create mode 100644 volk/include/volk/volk_32u_popcnt_a16.h
 delete mode 100644 volk/include/volk/volk_32u_popcnt_aligned16.h
 create mode 100644 volk/include/volk/volk_64f_64f_max_64f_a16.h
 create mode 100644 volk/include/volk/volk_64f_64f_min_64f_a16.h
 create mode 100644 volk/include/volk/volk_64f_convert_32f_a16.h
 delete mode 100644 volk/include/volk/volk_64f_convert_32f_aligned16.h
 create mode 100644 volk/include/volk/volk_64f_convert_32f_ua16.h
 delete mode 100644 volk/include/volk/volk_64f_convert_32f_unaligned16.h
 delete mode 100644 volk/include/volk/volk_64f_max_aligned16.h
 delete mode 100644 volk/include/volk/volk_64f_min_aligned16.h
 create mode 100644 volk/include/volk/volk_64u_byteswap_a16.h
 delete mode 100644 volk/include/volk/volk_64u_byteswap_aligned16.h
 create mode 100644 volk/include/volk/volk_64u_popcnt_a16.h
 delete mode 100644 volk/include/volk/volk_64u_popcnt_aligned16.h
 create mode 100644 volk/include/volk/volk_8s_convert_16s_a16.h
 delete mode 100644 volk/include/volk/volk_8s_convert_16s_aligned16.h
 create mode 100644 volk/include/volk/volk_8s_convert_16s_ua16.h
 delete mode 100644 volk/include/volk/volk_8s_convert_16s_unaligned16.h
 delete mode 100644 volk/include/volk/volk_8s_convert_32f_aligned16.h
 delete mode 100644 volk/include/volk/volk_8s_convert_32f_unaligned16.h
 create mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_a16.h
 create mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h
 create mode 100644 volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h
 create mode 100644 volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h
 create mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h
 create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h
 create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h
 delete mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h
 delete mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h
 create mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h
 create mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h
 create mode 100644 volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_16u_byteswap_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_add_aligned16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_divide_aligned16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_max_aligned16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_min_aligned16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_multiply_aligned16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_normalize_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_s32f_normalize_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_sqrt_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_subtract_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32fc_32f_multiply_32fc_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32fc_32f_multiply_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32fc_magnitude_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32s_and_aligned16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32s_or_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_8s_convert_16s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc
 delete mode 100644 volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc
 create mode 100644 volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc

(limited to 'volk/include')

diff --git a/volk/config.guess b/volk/config.guess
deleted file mode 100755
index 2852378467..0000000000
--- a/volk/config.guess
+++ /dev/null
@@ -1,1505 +0,0 @@
-#! /bin/sh
-# Attempt to guess a canonical system name.
-#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
-#   Free Software Foundation, Inc.
-
-timestamp='2010-08-21'
-
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
-# 02110-1301, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-
-# Originally written by Per Bothner.  Please send patches (context
-# diff format) to <config-patches@gnu.org> and include a ChangeLog
-# entry.
-#
-# This script attempts to guess a canonical system name similar to
-# config.sub.  If it succeeds, it prints the system name on stdout, and
-# exits with 0.  Otherwise, it exits with 1.
-#
-# You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
-
-me=`echo "$0" | sed -e 's,.*/,,'`
-
-usage="\
-Usage: $0 [OPTION]
-
-Output the configuration name of the system \`$me' is run on.
-
-Operation modes:
-  -h, --help         print this help, then exit
-  -t, --time-stamp   print date of last modification, then exit
-  -v, --version      print version number, then exit
-
-Report bugs and patches to <config-patches@gnu.org>."
-
-version="\
-GNU config.guess ($timestamp)
-
-Originally written by Per Bothner.
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
-Software Foundation, Inc.
-
-This is free software; see the source for copying conditions.  There is NO
-warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
-
-help="
-Try \`$me --help' for more information."
-
-# Parse command line
-while test $# -gt 0 ; do
-  case $1 in
-    --time-stamp | --time* | -t )
-       echo "$timestamp" ; exit ;;
-    --version | -v )
-       echo "$version" ; exit ;;
-    --help | --h* | -h )
-       echo "$usage"; exit ;;
-    -- )     # Stop option processing
-       shift; break ;;
-    - )	# Use stdin as input.
-       break ;;
-    -* )
-       echo "$me: invalid option $1$help" >&2
-       exit 1 ;;
-    * )
-       break ;;
-  esac
-done
-
-if test $# != 0; then
-  echo "$me: too many arguments$help" >&2
-  exit 1
-fi
-
-trap 'exit 1' HUP INT TERM
-
-# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
-# compiler to aid in system detection is discouraged as it requires
-# temporary files to be created and, as you can see below, it is a
-# headache to deal with in a portable fashion.
-
-# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
-# use `HOST_CC' if defined, but it is deprecated.
-
-# Portable tmp directory creation inspired by the Autoconf team.
-
-set_cc_for_build='
-trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
-trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" HUP INT PIPE TERM ;
-: ${TMPDIR=/tmp} ;
- { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
- { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
- { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
- { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
-dummy=$tmp/dummy ;
-tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
-case $CC_FOR_BUILD,$HOST_CC,$CC in
- ,,)    echo "int x;" > $dummy.c ;
-	for c in cc gcc c89 c99 ; do
-	  if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
-	     CC_FOR_BUILD="$c"; break ;
-	  fi ;
-	done ;
-	if test x"$CC_FOR_BUILD" = x ; then
-	  CC_FOR_BUILD=no_compiler_found ;
-	fi
-	;;
- ,,*)   CC_FOR_BUILD=$CC ;;
- ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
-esac ; set_cc_for_build= ;'
-
-# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
-# (ghazi@noc.rutgers.edu 1994-08-24)
-if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
-	PATH=$PATH:/.attbin ; export PATH
-fi
-
-UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
-UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
-UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
-UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
-
-# Note: order is significant - the case branches are not exclusive.
-
-case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
-    *:NetBSD:*:*)
-	# NetBSD (nbsd) targets should (where applicable) match one or
-	# more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*,
-	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
-	# switched to ELF, *-*-netbsd* would select the old
-	# object file format.  This provides both forward
-	# compatibility and a consistent mechanism for selecting the
-	# object file format.
-	#
-	# Note: NetBSD doesn't particularly care about the vendor
-	# portion of the name.  We always set it to "unknown".
-	sysctl="sysctl -n hw.machine_arch"
-	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
-	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
-	case "${UNAME_MACHINE_ARCH}" in
-	    armeb) machine=armeb-unknown ;;
-	    arm*) machine=arm-unknown ;;
-	    sh3el) machine=shl-unknown ;;
-	    sh3eb) machine=sh-unknown ;;
-	    sh5el) machine=sh5le-unknown ;;
-	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
-	esac
-	# The Operating System including object format, if it has switched
-	# to ELF recently, or will in the future.
-	case "${UNAME_MACHINE_ARCH}" in
-	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
-		eval $set_cc_for_build
-		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
-			| grep -q __ELF__
-		then
-		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
-		    # Return netbsd for either.  FIX?
-		    os=netbsd
-		else
-		    os=netbsdelf
-		fi
-		;;
-	    *)
-	        os=netbsd
-		;;
-	esac
-	# The OS release
-	# Debian GNU/NetBSD machines have a different userland, and
-	# thus, need a distinct triplet. However, they do not need
-	# kernel version information, so it can be replaced with a
-	# suitable tag, in the style of linux-gnu.
-	case "${UNAME_VERSION}" in
-	    Debian*)
-		release='-gnu'
-		;;
-	    *)
-		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
-		;;
-	esac
-	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
-	# contains redundant information, the shorter form:
-	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
-	echo "${machine}-${os}${release}"
-	exit ;;
-    *:OpenBSD:*:*)
-	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
-	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
-	exit ;;
-    *:ekkoBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
-	exit ;;
-    *:SolidBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
-	exit ;;
-    macppc:MirBSD:*:*)
-	echo powerpc-unknown-mirbsd${UNAME_RELEASE}
-	exit ;;
-    *:MirBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
-	exit ;;
-    alpha:OSF1:*:*)
-	case $UNAME_RELEASE in
-	*4.0)
-		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
-		;;
-	*5.*)
-	        UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
-		;;
-	esac
-	# According to Compaq, /usr/sbin/psrinfo has been available on
-	# OSF/1 and Tru64 systems produced since 1995.  I hope that
-	# covers most systems running today.  This code pipes the CPU
-	# types through head -n 1, so we only detect the type of CPU 0.
-	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
-	case "$ALPHA_CPU_TYPE" in
-	    "EV4 (21064)")
-		UNAME_MACHINE="alpha" ;;
-	    "EV4.5 (21064)")
-		UNAME_MACHINE="alpha" ;;
-	    "LCA4 (21066/21068)")
-		UNAME_MACHINE="alpha" ;;
-	    "EV5 (21164)")
-		UNAME_MACHINE="alphaev5" ;;
-	    "EV5.6 (21164A)")
-		UNAME_MACHINE="alphaev56" ;;
-	    "EV5.6 (21164PC)")
-		UNAME_MACHINE="alphapca56" ;;
-	    "EV5.7 (21164PC)")
-		UNAME_MACHINE="alphapca57" ;;
-	    "EV6 (21264)")
-		UNAME_MACHINE="alphaev6" ;;
-	    "EV6.7 (21264A)")
-		UNAME_MACHINE="alphaev67" ;;
-	    "EV6.8CB (21264C)")
-		UNAME_MACHINE="alphaev68" ;;
-	    "EV6.8AL (21264B)")
-		UNAME_MACHINE="alphaev68" ;;
-	    "EV6.8CX (21264D)")
-		UNAME_MACHINE="alphaev68" ;;
-	    "EV6.9A (21264/EV69A)")
-		UNAME_MACHINE="alphaev69" ;;
-	    "EV7 (21364)")
-		UNAME_MACHINE="alphaev7" ;;
-	    "EV7.9 (21364A)")
-		UNAME_MACHINE="alphaev79" ;;
-	esac
-	# A Pn.n version is a patched version.
-	# A Vn.n version is a released version.
-	# A Tn.n version is a released field test version.
-	# A Xn.n version is an unreleased experimental baselevel.
-	# 1.2 uses "1.2" for uname -r.
-	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
-	exit ;;
-    Alpha\ *:Windows_NT*:*)
-	# How do we know it's Interix rather than the generic POSIX subsystem?
-	# Should we change UNAME_MACHINE based on the output of uname instead
-	# of the specific Alpha model?
-	echo alpha-pc-interix
-	exit ;;
-    21064:Windows_NT:50:3)
-	echo alpha-dec-winnt3.5
-	exit ;;
-    Amiga*:UNIX_System_V:4.0:*)
-	echo m68k-unknown-sysv4
-	exit ;;
-    *:[Aa]miga[Oo][Ss]:*:*)
-	echo ${UNAME_MACHINE}-unknown-amigaos
-	exit ;;
-    *:[Mm]orph[Oo][Ss]:*:*)
-	echo ${UNAME_MACHINE}-unknown-morphos
-	exit ;;
-    *:OS/390:*:*)
-	echo i370-ibm-openedition
-	exit ;;
-    *:z/VM:*:*)
-	echo s390-ibm-zvmoe
-	exit ;;
-    *:OS400:*:*)
-        echo powerpc-ibm-os400
-	exit ;;
-    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
-	echo arm-acorn-riscix${UNAME_RELEASE}
-	exit ;;
-    arm:riscos:*:*|arm:RISCOS:*:*)
-	echo arm-unknown-riscos
-	exit ;;
-    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
-	echo hppa1.1-hitachi-hiuxmpp
-	exit ;;
-    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
-	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
-	if test "`(/bin/universe) 2>/dev/null`" = att ; then
-		echo pyramid-pyramid-sysv3
-	else
-		echo pyramid-pyramid-bsd
-	fi
-	exit ;;
-    NILE*:*:*:dcosx)
-	echo pyramid-pyramid-svr4
-	exit ;;
-    DRS?6000:unix:4.0:6*)
-	echo sparc-icl-nx6
-	exit ;;
-    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
-	case `/usr/bin/uname -p` in
-	    sparc) echo sparc-icl-nx7; exit ;;
-	esac ;;
-    s390x:SunOS:*:*)
-	echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    sun4H:SunOS:5.*:*)
-	echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
-	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
-	echo i386-pc-auroraux${UNAME_RELEASE}
-	exit ;;
-    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
-	eval $set_cc_for_build
-	SUN_ARCH="i386"
-	# If there is a compiler, see if it is configured for 64-bit objects.
-	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
-	# This test works for both compilers.
-	if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
-	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
-		(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
-		grep IS_64BIT_ARCH >/dev/null
-	    then
-		SUN_ARCH="x86_64"
-	    fi
-	fi
-	echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    sun4*:SunOS:6*:*)
-	# According to config.sub, this is the proper way to canonicalize
-	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
-	# it's likely to be more like Solaris than SunOS4.
-	echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    sun4*:SunOS:*:*)
-	case "`/usr/bin/arch -k`" in
-	    Series*|S4*)
-		UNAME_RELEASE=`uname -v`
-		;;
-	esac
-	# Japanese Language versions have a version number like `4.1.3-JL'.
-	echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
-	exit ;;
-    sun3*:SunOS:*:*)
-	echo m68k-sun-sunos${UNAME_RELEASE}
-	exit ;;
-    sun*:*:4.2BSD:*)
-	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
-	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
-	case "`/bin/arch`" in
-	    sun3)
-		echo m68k-sun-sunos${UNAME_RELEASE}
-		;;
-	    sun4)
-		echo sparc-sun-sunos${UNAME_RELEASE}
-		;;
-	esac
-	exit ;;
-    aushp:SunOS:*:*)
-	echo sparc-auspex-sunos${UNAME_RELEASE}
-	exit ;;
-    # The situation for MiNT is a little confusing.  The machine name
-    # can be virtually everything (everything which is not
-    # "atarist" or "atariste" at least should have a processor
-    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
-    # to the lowercase version "mint" (or "freemint").  Finally
-    # the system name "TOS" denotes a system which is actually not
-    # MiNT.  But MiNT is downward compatible to TOS, so this should
-    # be no problem.
-    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
-        echo m68k-atari-mint${UNAME_RELEASE}
-	exit ;;
-    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
-	echo m68k-atari-mint${UNAME_RELEASE}
-        exit ;;
-    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
-        echo m68k-atari-mint${UNAME_RELEASE}
-	exit ;;
-    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
-        echo m68k-milan-mint${UNAME_RELEASE}
-        exit ;;
-    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
-        echo m68k-hades-mint${UNAME_RELEASE}
-        exit ;;
-    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
-        echo m68k-unknown-mint${UNAME_RELEASE}
-        exit ;;
-    m68k:machten:*:*)
-	echo m68k-apple-machten${UNAME_RELEASE}
-	exit ;;
-    powerpc:machten:*:*)
-	echo powerpc-apple-machten${UNAME_RELEASE}
-	exit ;;
-    RISC*:Mach:*:*)
-	echo mips-dec-mach_bsd4.3
-	exit ;;
-    RISC*:ULTRIX:*:*)
-	echo mips-dec-ultrix${UNAME_RELEASE}
-	exit ;;
-    VAX*:ULTRIX*:*:*)
-	echo vax-dec-ultrix${UNAME_RELEASE}
-	exit ;;
-    2020:CLIX:*:* | 2430:CLIX:*:*)
-	echo clipper-intergraph-clix${UNAME_RELEASE}
-	exit ;;
-    mips:*:*:UMIPS | mips:*:*:RISCos)
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-#ifdef __cplusplus
-#include <stdio.h>  /* for printf() prototype */
-	int main (int argc, char *argv[]) {
-#else
-	int main (argc, argv) int argc; char *argv[]; {
-#endif
-	#if defined (host_mips) && defined (MIPSEB)
-	#if defined (SYSTYPE_SYSV)
-	  printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
-	#endif
-	#if defined (SYSTYPE_SVR4)
-	  printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
-	#endif
-	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
-	  printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
-	#endif
-	#endif
-	  exit (-1);
-	}
-EOF
-	$CC_FOR_BUILD -o $dummy $dummy.c &&
-	  dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
-	  SYSTEM_NAME=`$dummy $dummyarg` &&
-	    { echo "$SYSTEM_NAME"; exit; }
-	echo mips-mips-riscos${UNAME_RELEASE}
-	exit ;;
-    Motorola:PowerMAX_OS:*:*)
-	echo powerpc-motorola-powermax
-	exit ;;
-    Motorola:*:4.3:PL8-*)
-	echo powerpc-harris-powermax
-	exit ;;
-    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
-	echo powerpc-harris-powermax
-	exit ;;
-    Night_Hawk:Power_UNIX:*:*)
-	echo powerpc-harris-powerunix
-	exit ;;
-    m88k:CX/UX:7*:*)
-	echo m88k-harris-cxux7
-	exit ;;
-    m88k:*:4*:R4*)
-	echo m88k-motorola-sysv4
-	exit ;;
-    m88k:*:3*:R3*)
-	echo m88k-motorola-sysv3
-	exit ;;
-    AViiON:dgux:*:*)
-        # DG/UX returns AViiON for all architectures
-        UNAME_PROCESSOR=`/usr/bin/uname -p`
-	if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
-	then
-	    if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
-	       [ ${TARGET_BINARY_INTERFACE}x = x ]
-	    then
-		echo m88k-dg-dgux${UNAME_RELEASE}
-	    else
-		echo m88k-dg-dguxbcs${UNAME_RELEASE}
-	    fi
-	else
-	    echo i586-dg-dgux${UNAME_RELEASE}
-	fi
- 	exit ;;
-    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
-	echo m88k-dolphin-sysv3
-	exit ;;
-    M88*:*:R3*:*)
-	# Delta 88k system running SVR3
-	echo m88k-motorola-sysv3
-	exit ;;
-    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
-	echo m88k-tektronix-sysv3
-	exit ;;
-    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
-	echo m68k-tektronix-bsd
-	exit ;;
-    *:IRIX*:*:*)
-	echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
-	exit ;;
-    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
-	echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
-	exit ;;               # Note that: echo "'`uname -s`'" gives 'AIX '
-    i*86:AIX:*:*)
-	echo i386-ibm-aix
-	exit ;;
-    ia64:AIX:*:*)
-	if [ -x /usr/bin/oslevel ] ; then
-		IBM_REV=`/usr/bin/oslevel`
-	else
-		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
-	fi
-	echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
-	exit ;;
-    *:AIX:2:3)
-	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
-		eval $set_cc_for_build
-		sed 's/^		//' << EOF >$dummy.c
-		#include <sys/systemcfg.h>
-
-		main()
-			{
-			if (!__power_pc())
-				exit(1);
-			puts("powerpc-ibm-aix3.2.5");
-			exit(0);
-			}
-EOF
-		if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
-		then
-			echo "$SYSTEM_NAME"
-		else
-			echo rs6000-ibm-aix3.2.5
-		fi
-	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
-		echo rs6000-ibm-aix3.2.4
-	else
-		echo rs6000-ibm-aix3.2
-	fi
-	exit ;;
-    *:AIX:*:[4567])
-	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
-	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
-		IBM_ARCH=rs6000
-	else
-		IBM_ARCH=powerpc
-	fi
-	if [ -x /usr/bin/oslevel ] ; then
-		IBM_REV=`/usr/bin/oslevel`
-	else
-		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
-	fi
-	echo ${IBM_ARCH}-ibm-aix${IBM_REV}
-	exit ;;
-    *:AIX:*:*)
-	echo rs6000-ibm-aix
-	exit ;;
-    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
-	echo romp-ibm-bsd4.4
-	exit ;;
-    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
-	echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
-	exit ;;                             # report: romp-ibm BSD 4.3
-    *:BOSX:*:*)
-	echo rs6000-bull-bosx
-	exit ;;
-    DPX/2?00:B.O.S.:*:*)
-	echo m68k-bull-sysv3
-	exit ;;
-    9000/[34]??:4.3bsd:1.*:*)
-	echo m68k-hp-bsd
-	exit ;;
-    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
-	echo m68k-hp-bsd4.4
-	exit ;;
-    9000/[34678]??:HP-UX:*:*)
-	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
-	case "${UNAME_MACHINE}" in
-	    9000/31? )            HP_ARCH=m68000 ;;
-	    9000/[34]?? )         HP_ARCH=m68k ;;
-	    9000/[678][0-9][0-9])
-		if [ -x /usr/bin/getconf ]; then
-		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
-                    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
-                    case "${sc_cpu_version}" in
-                      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
-                      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
-                      532)                      # CPU_PA_RISC2_0
-                        case "${sc_kernel_bits}" in
-                          32) HP_ARCH="hppa2.0n" ;;
-                          64) HP_ARCH="hppa2.0w" ;;
-			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
-                        esac ;;
-                    esac
-		fi
-		if [ "${HP_ARCH}" = "" ]; then
-		    eval $set_cc_for_build
-		    sed 's/^              //' << EOF >$dummy.c
-
-              #define _HPUX_SOURCE
-              #include <stdlib.h>
-              #include <unistd.h>
-
-              int main ()
-              {
-              #if defined(_SC_KERNEL_BITS)
-                  long bits = sysconf(_SC_KERNEL_BITS);
-              #endif
-                  long cpu  = sysconf (_SC_CPU_VERSION);
-
-                  switch (cpu)
-              	{
-              	case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
-              	case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
-              	case CPU_PA_RISC2_0:
-              #if defined(_SC_KERNEL_BITS)
-              	    switch (bits)
-              		{
-              		case 64: puts ("hppa2.0w"); break;
-              		case 32: puts ("hppa2.0n"); break;
-              		default: puts ("hppa2.0"); break;
-              		} break;
-              #else  /* !defined(_SC_KERNEL_BITS) */
-              	    puts ("hppa2.0"); break;
-              #endif
-              	default: puts ("hppa1.0"); break;
-              	}
-                  exit (0);
-              }
-EOF
-		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
-		    test -z "$HP_ARCH" && HP_ARCH=hppa
-		fi ;;
-	esac
-	if [ ${HP_ARCH} = "hppa2.0w" ]
-	then
-	    eval $set_cc_for_build
-
-	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
-	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
-	    # generating 64-bit code.  GNU and HP use different nomenclature:
-	    #
-	    # $ CC_FOR_BUILD=cc ./config.guess
-	    # => hppa2.0w-hp-hpux11.23
-	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
-	    # => hppa64-hp-hpux11.23
-
-	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
-		grep -q __LP64__
-	    then
-		HP_ARCH="hppa2.0w"
-	    else
-		HP_ARCH="hppa64"
-	    fi
-	fi
-	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
-	exit ;;
-    ia64:HP-UX:*:*)
-	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
-	echo ia64-hp-hpux${HPUX_REV}
-	exit ;;
-    3050*:HI-UX:*:*)
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#include <unistd.h>
-	int
-	main ()
-	{
-	  long cpu = sysconf (_SC_CPU_VERSION);
-	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
-	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
-	     results, however.  */
-	  if (CPU_IS_PA_RISC (cpu))
-	    {
-	      switch (cpu)
-		{
-		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
-		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
-		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
-		  default: puts ("hppa-hitachi-hiuxwe2"); break;
-		}
-	    }
-	  else if (CPU_IS_HP_MC68K (cpu))
-	    puts ("m68k-hitachi-hiuxwe2");
-	  else puts ("unknown-hitachi-hiuxwe2");
-	  exit (0);
-	}
-EOF
-	$CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
-		{ echo "$SYSTEM_NAME"; exit; }
-	echo unknown-hitachi-hiuxwe2
-	exit ;;
-    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
-	echo hppa1.1-hp-bsd
-	exit ;;
-    9000/8??:4.3bsd:*:*)
-	echo hppa1.0-hp-bsd
-	exit ;;
-    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
-	echo hppa1.0-hp-mpeix
-	exit ;;
-    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
-	echo hppa1.1-hp-osf
-	exit ;;
-    hp8??:OSF1:*:*)
-	echo hppa1.0-hp-osf
-	exit ;;
-    i*86:OSF1:*:*)
-	if [ -x /usr/sbin/sysversion ] ; then
-	    echo ${UNAME_MACHINE}-unknown-osf1mk
-	else
-	    echo ${UNAME_MACHINE}-unknown-osf1
-	fi
-	exit ;;
-    parisc*:Lites*:*:*)
-	echo hppa1.1-hp-lites
-	exit ;;
-    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
-	echo c1-convex-bsd
-        exit ;;
-    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
-	if getsysinfo -f scalar_acc
-	then echo c32-convex-bsd
-	else echo c2-convex-bsd
-	fi
-        exit ;;
-    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
-	echo c34-convex-bsd
-        exit ;;
-    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
-	echo c38-convex-bsd
-        exit ;;
-    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
-	echo c4-convex-bsd
-        exit ;;
-    CRAY*Y-MP:*:*:*)
-	echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*[A-Z]90:*:*:*)
-	echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
-	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
-	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
-	      -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*TS:*:*:*)
-	echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*T3E:*:*:*)
-	echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*SV1:*:*:*)
-	echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    *:UNICOS/mp:*:*)
-	echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
-	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
-        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
-        FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
-        echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-        exit ;;
-    5000:UNIX_System_V:4.*:*)
-        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
-        FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
-        echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-	exit ;;
-    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
-	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
-	exit ;;
-    sparc*:BSD/OS:*:*)
-	echo sparc-unknown-bsdi${UNAME_RELEASE}
-	exit ;;
-    *:BSD/OS:*:*)
-	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
-	exit ;;
-    *:FreeBSD:*:*)
-	case ${UNAME_MACHINE} in
-	    pc98)
-		echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
-	    amd64)
-		echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
-	    *)
-		echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
-	esac
-	exit ;;
-    i*:CYGWIN*:*)
-	echo ${UNAME_MACHINE}-pc-cygwin
-	exit ;;
-    *:MINGW*:*)
-	echo ${UNAME_MACHINE}-pc-mingw32
-	exit ;;
-    i*:windows32*:*)
-    	# uname -m includes "-pc" on this system.
-    	echo ${UNAME_MACHINE}-mingw32
-	exit ;;
-    i*:PW*:*)
-	echo ${UNAME_MACHINE}-pc-pw32
-	exit ;;
-    *:Interix*:*)
-    	case ${UNAME_MACHINE} in
-	    x86)
-		echo i586-pc-interix${UNAME_RELEASE}
-		exit ;;
-	    authenticamd | genuineintel | EM64T)
-		echo x86_64-unknown-interix${UNAME_RELEASE}
-		exit ;;
-	    IA64)
-		echo ia64-unknown-interix${UNAME_RELEASE}
-		exit ;;
-	esac ;;
-    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
-	echo i${UNAME_MACHINE}-pc-mks
-	exit ;;
-    8664:Windows_NT:*)
-	echo x86_64-pc-mks
-	exit ;;
-    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
-	# How do we know it's Interix rather than the generic POSIX subsystem?
-	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
-	# UNAME_MACHINE based on the output of uname instead of i386?
-	echo i586-pc-interix
-	exit ;;
-    i*:UWIN*:*)
-	echo ${UNAME_MACHINE}-pc-uwin
-	exit ;;
-    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
-	echo x86_64-unknown-cygwin
-	exit ;;
-    p*:CYGWIN*:*)
-	echo powerpcle-unknown-cygwin
-	exit ;;
-    prep*:SunOS:5.*:*)
-	echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    *:GNU:*:*)
-	# the GNU system
-	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
-	exit ;;
-    *:GNU/*:*:*)
-	# other systems with GNU libc and userland
-	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
-	exit ;;
-    i*86:Minix:*:*)
-	echo ${UNAME_MACHINE}-pc-minix
-	exit ;;
-    alpha:Linux:*:*)
-	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
-	  EV5)   UNAME_MACHINE=alphaev5 ;;
-	  EV56)  UNAME_MACHINE=alphaev56 ;;
-	  PCA56) UNAME_MACHINE=alphapca56 ;;
-	  PCA57) UNAME_MACHINE=alphapca56 ;;
-	  EV6)   UNAME_MACHINE=alphaev6 ;;
-	  EV67)  UNAME_MACHINE=alphaev67 ;;
-	  EV68*) UNAME_MACHINE=alphaev68 ;;
-        esac
-	objdump --private-headers /bin/sh | grep -q ld.so.1
-	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
-	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
-	exit ;;
-    arm*:Linux:*:*)
-	eval $set_cc_for_build
-	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
-	    | grep -q __ARM_EABI__
-	then
-	    echo ${UNAME_MACHINE}-unknown-linux-gnu
-	else
-	    echo ${UNAME_MACHINE}-unknown-linux-gnueabi
-	fi
-	exit ;;
-    avr32*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    cris:Linux:*:*)
-	echo cris-axis-linux-gnu
-	exit ;;
-    crisv32:Linux:*:*)
-	echo crisv32-axis-linux-gnu
-	exit ;;
-    frv:Linux:*:*)
-    	echo frv-unknown-linux-gnu
-	exit ;;
-    i*86:Linux:*:*)
-	LIBC=gnu
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#ifdef __dietlibc__
-	LIBC=dietlibc
-	#endif
-EOF
-	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
-	echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
-	exit ;;
-    ia64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    m32r*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    m68*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    mips:Linux:*:* | mips64:Linux:*:*)
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#undef CPU
-	#undef ${UNAME_MACHINE}
-	#undef ${UNAME_MACHINE}el
-	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
-	CPU=${UNAME_MACHINE}el
-	#else
-	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
-	CPU=${UNAME_MACHINE}
-	#else
-	CPU=
-	#endif
-	#endif
-EOF
-	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
-	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
-	;;
-    or32:Linux:*:*)
-	echo or32-unknown-linux-gnu
-	exit ;;
-    padre:Linux:*:*)
-	echo sparc-unknown-linux-gnu
-	exit ;;
-    parisc64:Linux:*:* | hppa64:Linux:*:*)
-	echo hppa64-unknown-linux-gnu
-	exit ;;
-    parisc:Linux:*:* | hppa:Linux:*:*)
-	# Look for CPU level
-	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
-	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
-	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
-	  *)    echo hppa-unknown-linux-gnu ;;
-	esac
-	exit ;;
-    ppc64:Linux:*:*)
-	echo powerpc64-unknown-linux-gnu
-	exit ;;
-    ppc:Linux:*:*)
-	echo powerpc-unknown-linux-gnu
-	exit ;;
-    s390:Linux:*:* | s390x:Linux:*:*)
-	echo ${UNAME_MACHINE}-ibm-linux
-	exit ;;
-    sh64*:Linux:*:*)
-    	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    sh*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    sparc:Linux:*:* | sparc64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    tile*:Linux:*:*)
-	echo ${UNAME_MACHINE}-tilera-linux-gnu
-	exit ;;
-    vax:Linux:*:*)
-	echo ${UNAME_MACHINE}-dec-linux-gnu
-	exit ;;
-    x86_64:Linux:*:*)
-	echo x86_64-unknown-linux-gnu
-	exit ;;
-    xtensa*:Linux:*:*)
-    	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    i*86:DYNIX/ptx:4*:*)
-	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
-	# earlier versions are messed up and put the nodename in both
-	# sysname and nodename.
-	echo i386-sequent-sysv4
-	exit ;;
-    i*86:UNIX_SV:4.2MP:2.*)
-        # Unixware is an offshoot of SVR4, but it has its own version
-        # number series starting with 2...
-        # I am not positive that other SVR4 systems won't match this,
-	# I just have to hope.  -- rms.
-        # Use sysv4.2uw... so that sysv4* matches it.
-	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
-	exit ;;
-    i*86:OS/2:*:*)
-	# If we were able to find `uname', then EMX Unix compatibility
-	# is probably installed.
-	echo ${UNAME_MACHINE}-pc-os2-emx
-	exit ;;
-    i*86:XTS-300:*:STOP)
-	echo ${UNAME_MACHINE}-unknown-stop
-	exit ;;
-    i*86:atheos:*:*)
-	echo ${UNAME_MACHINE}-unknown-atheos
-	exit ;;
-    i*86:syllable:*:*)
-	echo ${UNAME_MACHINE}-pc-syllable
-	exit ;;
-    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
-	echo i386-unknown-lynxos${UNAME_RELEASE}
-	exit ;;
-    i*86:*DOS:*:*)
-	echo ${UNAME_MACHINE}-pc-msdosdjgpp
-	exit ;;
-    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
-	UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
-	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
-		echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
-	else
-		echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
-	fi
-	exit ;;
-    i*86:*:5:[678]*)
-    	# UnixWare 7.x, OpenUNIX and OpenServer 6.
-	case `/bin/uname -X | grep "^Machine"` in
-	    *486*)	     UNAME_MACHINE=i486 ;;
-	    *Pentium)	     UNAME_MACHINE=i586 ;;
-	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
-	esac
-	echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
-	exit ;;
-    i*86:*:3.2:*)
-	if test -f /usr/options/cb.name; then
-		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
-		echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
-	elif /bin/uname -X 2>/dev/null >/dev/null ; then
-		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
-		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
-		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
-			&& UNAME_MACHINE=i586
-		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
-			&& UNAME_MACHINE=i686
-		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
-			&& UNAME_MACHINE=i686
-		echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
-	else
-		echo ${UNAME_MACHINE}-pc-sysv32
-	fi
-	exit ;;
-    pc:*:*:*)
-	# Left here for compatibility:
-        # uname -m prints for DJGPP always 'pc', but it prints nothing about
-        # the processor, so we play safe by assuming i586.
-	# Note: whatever this is, it MUST be the same as what config.sub
-	# prints for the "djgpp" host, or else GDB configury will decide that
-	# this is a cross-build.
-	echo i586-pc-msdosdjgpp
-        exit ;;
-    Intel:Mach:3*:*)
-	echo i386-pc-mach3
-	exit ;;
-    paragon:*:*:*)
-	echo i860-intel-osf1
-	exit ;;
-    i860:*:4.*:*) # i860-SVR4
-	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
-	  echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
-	else # Add other i860-SVR4 vendors below as they are discovered.
-	  echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
-	fi
-	exit ;;
-    mini*:CTIX:SYS*5:*)
-	# "miniframe"
-	echo m68010-convergent-sysv
-	exit ;;
-    mc68k:UNIX:SYSTEM5:3.51m)
-	echo m68k-convergent-sysv
-	exit ;;
-    M680?0:D-NIX:5.3:*)
-	echo m68k-diab-dnix
-	exit ;;
-    M68*:*:R3V[5678]*:*)
-	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
-    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
-	OS_REL=''
-	test -r /etc/.relid \
-	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	  && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-	  && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
-    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
-        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-          && { echo i486-ncr-sysv4; exit; } ;;
-    NCR*:*:4.2:* | MPRAS*:*:4.2:*)
-	OS_REL='.3'
-	test -r /etc/.relid \
-	    && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	    && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-	    && { echo i586-ncr-sysv4.3${OS_REL}; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
-	    && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
-    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
-	echo m68k-unknown-lynxos${UNAME_RELEASE}
-	exit ;;
-    mc68030:UNIX_System_V:4.*:*)
-	echo m68k-atari-sysv4
-	exit ;;
-    TSUNAMI:LynxOS:2.*:*)
-	echo sparc-unknown-lynxos${UNAME_RELEASE}
-	exit ;;
-    rs6000:LynxOS:2.*:*)
-	echo rs6000-unknown-lynxos${UNAME_RELEASE}
-	exit ;;
-    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
-	echo powerpc-unknown-lynxos${UNAME_RELEASE}
-	exit ;;
-    SM[BE]S:UNIX_SV:*:*)
-	echo mips-dde-sysv${UNAME_RELEASE}
-	exit ;;
-    RM*:ReliantUNIX-*:*:*)
-	echo mips-sni-sysv4
-	exit ;;
-    RM*:SINIX-*:*:*)
-	echo mips-sni-sysv4
-	exit ;;
-    *:SINIX-*:*:*)
-	if uname -p 2>/dev/null >/dev/null ; then
-		UNAME_MACHINE=`(uname -p) 2>/dev/null`
-		echo ${UNAME_MACHINE}-sni-sysv4
-	else
-		echo ns32k-sni-sysv
-	fi
-	exit ;;
-    PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
-                      # says <Richard.M.Bartel@ccMail.Census.GOV>
-        echo i586-unisys-sysv4
-        exit ;;
-    *:UNIX_System_V:4*:FTX*)
-	# From Gerald Hewes <hewes@openmarket.com>.
-	# How about differentiating between stratus architectures? -djm
-	echo hppa1.1-stratus-sysv4
-	exit ;;
-    *:*:*:FTX*)
-	# From seanf@swdc.stratus.com.
-	echo i860-stratus-sysv4
-	exit ;;
-    i*86:VOS:*:*)
-	# From Paul.Green@stratus.com.
-	echo ${UNAME_MACHINE}-stratus-vos
-	exit ;;
-    *:VOS:*:*)
-	# From Paul.Green@stratus.com.
-	echo hppa1.1-stratus-vos
-	exit ;;
-    mc68*:A/UX:*:*)
-	echo m68k-apple-aux${UNAME_RELEASE}
-	exit ;;
-    news*:NEWS-OS:6*:*)
-	echo mips-sony-newsos6
-	exit ;;
-    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
-	if [ -d /usr/nec ]; then
-	        echo mips-nec-sysv${UNAME_RELEASE}
-	else
-	        echo mips-unknown-sysv${UNAME_RELEASE}
-	fi
-        exit ;;
-    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
-	echo powerpc-be-beos
-	exit ;;
-    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
-	echo powerpc-apple-beos
-	exit ;;
-    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
-	echo i586-pc-beos
-	exit ;;
-    BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
-	echo i586-pc-haiku
-	exit ;;
-    SX-4:SUPER-UX:*:*)
-	echo sx4-nec-superux${UNAME_RELEASE}
-	exit ;;
-    SX-5:SUPER-UX:*:*)
-	echo sx5-nec-superux${UNAME_RELEASE}
-	exit ;;
-    SX-6:SUPER-UX:*:*)
-	echo sx6-nec-superux${UNAME_RELEASE}
-	exit ;;
-    SX-7:SUPER-UX:*:*)
-	echo sx7-nec-superux${UNAME_RELEASE}
-	exit ;;
-    SX-8:SUPER-UX:*:*)
-	echo sx8-nec-superux${UNAME_RELEASE}
-	exit ;;
-    SX-8R:SUPER-UX:*:*)
-	echo sx8r-nec-superux${UNAME_RELEASE}
-	exit ;;
-    Power*:Rhapsody:*:*)
-	echo powerpc-apple-rhapsody${UNAME_RELEASE}
-	exit ;;
-    *:Rhapsody:*:*)
-	echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
-	exit ;;
-    *:Darwin:*:*)
-	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
-	case $UNAME_PROCESSOR in
-	    i386)
-		eval $set_cc_for_build
-		if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
-		  if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
-		      (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
-		      grep IS_64BIT_ARCH >/dev/null
-		  then
-		      UNAME_PROCESSOR="x86_64"
-		  fi
-		fi ;;
-	    unknown) UNAME_PROCESSOR=powerpc ;;
-	esac
-	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
-	exit ;;
-    *:procnto*:*:* | *:QNX:[0123456789]*:*)
-	UNAME_PROCESSOR=`uname -p`
-	if test "$UNAME_PROCESSOR" = "x86"; then
-		UNAME_PROCESSOR=i386
-		UNAME_MACHINE=pc
-	fi
-	echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
-	exit ;;
-    *:QNX:*:4*)
-	echo i386-pc-qnx
-	exit ;;
-    NSE-?:NONSTOP_KERNEL:*:*)
-	echo nse-tandem-nsk${UNAME_RELEASE}
-	exit ;;
-    NSR-?:NONSTOP_KERNEL:*:*)
-	echo nsr-tandem-nsk${UNAME_RELEASE}
-	exit ;;
-    *:NonStop-UX:*:*)
-	echo mips-compaq-nonstopux
-	exit ;;
-    BS2000:POSIX*:*:*)
-	echo bs2000-siemens-sysv
-	exit ;;
-    DS/*:UNIX_System_V:*:*)
-	echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
-	exit ;;
-    *:Plan9:*:*)
-	# "uname -m" is not consistent, so use $cputype instead. 386
-	# is converted to i386 for consistency with other x86
-	# operating systems.
-	if test "$cputype" = "386"; then
-	    UNAME_MACHINE=i386
-	else
-	    UNAME_MACHINE="$cputype"
-	fi
-	echo ${UNAME_MACHINE}-unknown-plan9
-	exit ;;
-    *:TOPS-10:*:*)
-	echo pdp10-unknown-tops10
-	exit ;;
-    *:TENEX:*:*)
-	echo pdp10-unknown-tenex
-	exit ;;
-    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
-	echo pdp10-dec-tops20
-	exit ;;
-    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
-	echo pdp10-xkl-tops20
-	exit ;;
-    *:TOPS-20:*:*)
-	echo pdp10-unknown-tops20
-	exit ;;
-    *:ITS:*:*)
-	echo pdp10-unknown-its
-	exit ;;
-    SEI:*:*:SEIUX)
-        echo mips-sei-seiux${UNAME_RELEASE}
-	exit ;;
-    *:DragonFly:*:*)
-	echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
-	exit ;;
-    *:*VMS:*:*)
-    	UNAME_MACHINE=`(uname -p) 2>/dev/null`
-	case "${UNAME_MACHINE}" in
-	    A*) echo alpha-dec-vms ; exit ;;
-	    I*) echo ia64-dec-vms ; exit ;;
-	    V*) echo vax-dec-vms ; exit ;;
-	esac ;;
-    *:XENIX:*:SysV)
-	echo i386-pc-xenix
-	exit ;;
-    i*86:skyos:*:*)
-	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
-	exit ;;
-    i*86:rdos:*:*)
-	echo ${UNAME_MACHINE}-pc-rdos
-	exit ;;
-    i*86:AROS:*:*)
-	echo ${UNAME_MACHINE}-pc-aros
-	exit ;;
-esac
-
-#echo '(No uname command or uname output not recognized.)' 1>&2
-#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
-
-eval $set_cc_for_build
-cat >$dummy.c <<EOF
-#ifdef _SEQUENT_
-# include <sys/types.h>
-# include <sys/utsname.h>
-#endif
-main ()
-{
-#if defined (sony)
-#if defined (MIPSEB)
-  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
-     I don't know....  */
-  printf ("mips-sony-bsd\n"); exit (0);
-#else
-#include <sys/param.h>
-  printf ("m68k-sony-newsos%s\n",
-#ifdef NEWSOS4
-          "4"
-#else
-	  ""
-#endif
-         ); exit (0);
-#endif
-#endif
-
-#if defined (__arm) && defined (__acorn) && defined (__unix)
-  printf ("arm-acorn-riscix\n"); exit (0);
-#endif
-
-#if defined (hp300) && !defined (hpux)
-  printf ("m68k-hp-bsd\n"); exit (0);
-#endif
-
-#if defined (NeXT)
-#if !defined (__ARCHITECTURE__)
-#define __ARCHITECTURE__ "m68k"
-#endif
-  int version;
-  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
-  if (version < 4)
-    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
-  else
-    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
-  exit (0);
-#endif
-
-#if defined (MULTIMAX) || defined (n16)
-#if defined (UMAXV)
-  printf ("ns32k-encore-sysv\n"); exit (0);
-#else
-#if defined (CMU)
-  printf ("ns32k-encore-mach\n"); exit (0);
-#else
-  printf ("ns32k-encore-bsd\n"); exit (0);
-#endif
-#endif
-#endif
-
-#if defined (__386BSD__)
-  printf ("i386-pc-bsd\n"); exit (0);
-#endif
-
-#if defined (sequent)
-#if defined (i386)
-  printf ("i386-sequent-dynix\n"); exit (0);
-#endif
-#if defined (ns32000)
-  printf ("ns32k-sequent-dynix\n"); exit (0);
-#endif
-#endif
-
-#if defined (_SEQUENT_)
-    struct utsname un;
-
-    uname(&un);
-
-    if (strncmp(un.version, "V2", 2) == 0) {
-	printf ("i386-sequent-ptx2\n"); exit (0);
-    }
-    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
-	printf ("i386-sequent-ptx1\n"); exit (0);
-    }
-    printf ("i386-sequent-ptx\n"); exit (0);
-
-#endif
-
-#if defined (vax)
-# if !defined (ultrix)
-#  include <sys/param.h>
-#  if defined (BSD)
-#   if BSD == 43
-      printf ("vax-dec-bsd4.3\n"); exit (0);
-#   else
-#    if BSD == 199006
-      printf ("vax-dec-bsd4.3reno\n"); exit (0);
-#    else
-      printf ("vax-dec-bsd\n"); exit (0);
-#    endif
-#   endif
-#  else
-    printf ("vax-dec-bsd\n"); exit (0);
-#  endif
-# else
-    printf ("vax-dec-ultrix\n"); exit (0);
-# endif
-#endif
-
-#if defined (alliant) && defined (i860)
-  printf ("i860-alliant-bsd\n"); exit (0);
-#endif
-
-  exit (1);
-}
-EOF
-
-$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
-	{ echo "$SYSTEM_NAME"; exit; }
-
-# Apollos put the system type in the environment.
-
-test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
-
-# Convex versions that predate uname can use getsysinfo(1)
-
-if [ -x /usr/convex/getsysinfo ]
-then
-    case `getsysinfo -f cpu_type` in
-    c1*)
-	echo c1-convex-bsd
-	exit ;;
-    c2*)
-	if getsysinfo -f scalar_acc
-	then echo c32-convex-bsd
-	else echo c2-convex-bsd
-	fi
-	exit ;;
-    c34*)
-	echo c34-convex-bsd
-	exit ;;
-    c38*)
-	echo c38-convex-bsd
-	exit ;;
-    c4*)
-	echo c4-convex-bsd
-	exit ;;
-    esac
-fi
-
-cat >&2 <<EOF
-$0: unable to guess system type
-
-This script, last modified $timestamp, has failed to recognize
-the operating system you are using. It is advised that you
-download the most up to date version of the config scripts from
-
-  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
-and
-  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
-
-If the version you run ($0) is already up to date, please
-send the following data and any information you think might be
-pertinent to <config-patches@gnu.org> in order to provide the needed
-information to handle your system.
-
-config.guess timestamp = $timestamp
-
-uname -m = `(uname -m) 2>/dev/null || echo unknown`
-uname -r = `(uname -r) 2>/dev/null || echo unknown`
-uname -s = `(uname -s) 2>/dev/null || echo unknown`
-uname -v = `(uname -v) 2>/dev/null || echo unknown`
-
-/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
-/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
-
-hostinfo               = `(hostinfo) 2>/dev/null`
-/bin/universe          = `(/bin/universe) 2>/dev/null`
-/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
-/bin/arch              = `(/bin/arch) 2>/dev/null`
-/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
-/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
-
-UNAME_MACHINE = ${UNAME_MACHINE}
-UNAME_RELEASE = ${UNAME_RELEASE}
-UNAME_SYSTEM  = ${UNAME_SYSTEM}
-UNAME_VERSION = ${UNAME_VERSION}
-EOF
-
-exit 1
-
-# Local variables:
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "timestamp='"
-# time-stamp-format: "%:y-%02m-%02d"
-# time-stamp-end: "'"
-# End:
diff --git a/volk/config.guess b/volk/config.guess
new file mode 120000
index 0000000000..405bc32359
--- /dev/null
+++ b/volk/config.guess
@@ -0,0 +1 @@
+/usr/share/automake-1.11/config.guess
\ No newline at end of file
diff --git a/volk/config.sub b/volk/config.sub
deleted file mode 100755
index 320e303881..0000000000
--- a/volk/config.sub
+++ /dev/null
@@ -1,1739 +0,0 @@
-#! /bin/sh
-# Configuration validation subroutine script.
-#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
-#   Free Software Foundation, Inc.
-
-timestamp='2010-09-11'
-
-# This file is (in principle) common to ALL GNU software.
-# The presence of a machine in this file suggests that SOME GNU software
-# can handle that machine.  It does not imply ALL GNU software can.
-#
-# This file is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
-# 02110-1301, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-
-# Please send patches to <config-patches@gnu.org>.  Submit a context
-# diff and a properly formatted GNU ChangeLog entry.
-#
-# Configuration subroutine to validate and canonicalize a configuration type.
-# Supply the specified configuration type as an argument.
-# If it is invalid, we print an error message on stderr and exit with code 1.
-# Otherwise, we print the canonical config type on stdout and succeed.
-
-# You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
-
-# This file is supposed to be the same for all GNU packages
-# and recognize all the CPU types, system types and aliases
-# that are meaningful with *any* GNU software.
-# Each package is responsible for reporting which valid configurations
-# it does not support.  The user should be able to distinguish
-# a failure to support a valid configuration from a meaningless
-# configuration.
-
-# The goal of this file is to map all the various variations of a given
-# machine specification into a single specification in the form:
-#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
-# or in some cases, the newer four-part form:
-#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
-# It is wrong to echo any other type of specification.
-
-me=`echo "$0" | sed -e 's,.*/,,'`
-
-usage="\
-Usage: $0 [OPTION] CPU-MFR-OPSYS
-       $0 [OPTION] ALIAS
-
-Canonicalize a configuration name.
-
-Operation modes:
-  -h, --help         print this help, then exit
-  -t, --time-stamp   print date of last modification, then exit
-  -v, --version      print version number, then exit
-
-Report bugs and patches to <config-patches@gnu.org>."
-
-version="\
-GNU config.sub ($timestamp)
-
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
-Software Foundation, Inc.
-
-This is free software; see the source for copying conditions.  There is NO
-warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
-
-help="
-Try \`$me --help' for more information."
-
-# Parse command line
-while test $# -gt 0 ; do
-  case $1 in
-    --time-stamp | --time* | -t )
-       echo "$timestamp" ; exit ;;
-    --version | -v )
-       echo "$version" ; exit ;;
-    --help | --h* | -h )
-       echo "$usage"; exit ;;
-    -- )     # Stop option processing
-       shift; break ;;
-    - )	# Use stdin as input.
-       break ;;
-    -* )
-       echo "$me: invalid option $1$help"
-       exit 1 ;;
-
-    *local*)
-       # First pass through any local machine types.
-       echo $1
-       exit ;;
-
-    * )
-       break ;;
-  esac
-done
-
-case $# in
- 0) echo "$me: missing argument$help" >&2
-    exit 1;;
- 1) ;;
- *) echo "$me: too many arguments$help" >&2
-    exit 1;;
-esac
-
-# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
-# Here we must recognize all the valid KERNEL-OS combinations.
-maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
-case $maybe_os in
-  nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
-  linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
-  knetbsd*-gnu* | netbsd*-gnu* | \
-  kopensolaris*-gnu* | \
-  storm-chaos* | os2-emx* | rtmk-nova*)
-    os=-$maybe_os
-    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
-    ;;
-  *)
-    basic_machine=`echo $1 | sed 's/-[^-]*$//'`
-    if [ $basic_machine != $1 ]
-    then os=`echo $1 | sed 's/.*-/-/'`
-    else os=; fi
-    ;;
-esac
-
-### Let's recognize common machines as not being operating systems so
-### that things like config.sub decstation-3100 work.  We also
-### recognize some manufacturers as not being operating systems, so we
-### can provide default operating systems below.
-case $os in
-	-sun*os*)
-		# Prevent following clause from handling this invalid input.
-		;;
-	-dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
-	-att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
-	-unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
-	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
-	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
-	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
-	-apple | -axis | -knuth | -cray | -microblaze)
-		os=
-		basic_machine=$1
-		;;
-        -bluegene*)
-	        os=-cnk
-		;;
-	-sim | -cisco | -oki | -wec | -winbond)
-		os=
-		basic_machine=$1
-		;;
-	-scout)
-		;;
-	-wrs)
-		os=-vxworks
-		basic_machine=$1
-		;;
-	-chorusos*)
-		os=-chorusos
-		basic_machine=$1
-		;;
- 	-chorusrdb)
- 		os=-chorusrdb
-		basic_machine=$1
- 		;;
-	-hiux*)
-		os=-hiuxwe2
-		;;
-	-sco6)
-		os=-sco5v6
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco5)
-		os=-sco3.2v5
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco4)
-		os=-sco3.2v4
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco3.2.[4-9]*)
-		os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco3.2v[4-9]*)
-		# Don't forget version if it is 3.2v4 or newer.
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco5v6*)
-		# Don't forget version if it is 3.2v4 or newer.
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco*)
-		os=-sco3.2v2
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-udk*)
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-isc)
-		os=-isc2.2
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-clix*)
-		basic_machine=clipper-intergraph
-		;;
-	-isc*)
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-lynx*)
-		os=-lynxos
-		;;
-	-ptx*)
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
-		;;
-	-windowsnt*)
-		os=`echo $os | sed -e 's/windowsnt/winnt/'`
-		;;
-	-psos*)
-		os=-psos
-		;;
-	-mint | -mint[0-9]*)
-		basic_machine=m68k-atari
-		os=-mint
-		;;
-esac
-
-# Decode aliases for certain CPU-COMPANY combinations.
-case $basic_machine in
-	# Recognize the basic CPU types without company name.
-	# Some are omitted here because they have special meanings below.
-	1750a | 580 \
-	| a29k \
-	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
-	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
-	| am33_2.0 \
-	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
-	| bfin \
-	| c4x | clipper \
-	| d10v | d30v | dlx | dsp16xx \
-	| fido | fr30 | frv \
-	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
-	| i370 | i860 | i960 | ia64 \
-	| ip2k | iq2000 \
-	| lm32 \
-	| m32c | m32r | m32rle | m68000 | m68k | m88k \
-	| maxq | mb | microblaze | mcore | mep | metag \
-	| mips | mipsbe | mipseb | mipsel | mipsle \
-	| mips16 \
-	| mips64 | mips64el \
-	| mips64octeon | mips64octeonel \
-	| mips64orion | mips64orionel \
-	| mips64r5900 | mips64r5900el \
-	| mips64vr | mips64vrel \
-	| mips64vr4100 | mips64vr4100el \
-	| mips64vr4300 | mips64vr4300el \
-	| mips64vr5000 | mips64vr5000el \
-	| mips64vr5900 | mips64vr5900el \
-	| mipsisa32 | mipsisa32el \
-	| mipsisa32r2 | mipsisa32r2el \
-	| mipsisa64 | mipsisa64el \
-	| mipsisa64r2 | mipsisa64r2el \
-	| mipsisa64sb1 | mipsisa64sb1el \
-	| mipsisa64sr71k | mipsisa64sr71kel \
-	| mipstx39 | mipstx39el \
-	| mn10200 | mn10300 \
-	| moxie \
-	| mt \
-	| msp430 \
-	| nds32 | nds32le | nds32be \
-	| nios | nios2 \
-	| ns16k | ns32k \
-	| or32 \
-	| pdp10 | pdp11 | pj | pjl \
-	| powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \
-	| pyramid \
-	| rx \
-	| score \
-	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
-	| sh64 | sh64le \
-	| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
-	| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
-	| spu | strongarm \
-	| tahoe | thumb | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
-	| ubicom32 \
-	| v850 | v850e \
-	| we32k \
-	| x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \
-	| z8k | z80)
-		basic_machine=$basic_machine-unknown
-		;;
-	c54x)
-		basic_machine=tic54x-unknown
-		;;
-	c55x)
-		basic_machine=tic55x-unknown
-		;;
-	c6x)
-		basic_machine=tic6x-unknown
-		;;
-	m6811 | m68hc11 | m6812 | m68hc12 | picochip)
-		# Motorola 68HC11/12.
-		basic_machine=$basic_machine-unknown
-		os=-none
-		;;
-	m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k)
-		;;
-	ms1)
-		basic_machine=mt-unknown
-		;;
-
-	# We use `pc' rather than `unknown'
-	# because (1) that's what they normally are, and
-	# (2) the word "unknown" tends to confuse beginning users.
-	i*86 | x86_64)
-	  basic_machine=$basic_machine-pc
-	  ;;
-	# Object if more than one company name word.
-	*-*-*)
-		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
-		exit 1
-		;;
-	# Recognize the basic CPU types with company name.
-	580-* \
-	| a29k-* \
-	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
-	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
-	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
-	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
-	| avr-* | avr32-* \
-	| bfin-* | bs2000-* \
-	| c[123]* | c30-* | [cjt]90-* | c4x-* \
-	| clipper-* | craynv-* | cydra-* \
-	| d10v-* | d30v-* | dlx-* \
-	| elxsi-* \
-	| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
-	| h8300-* | h8500-* \
-	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
-	| i*86-* | i860-* | i960-* | ia64-* \
-	| ip2k-* | iq2000-* \
-	| lm32-* \
-	| m32c-* | m32r-* | m32rle-* \
-	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
-	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
-	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
-	| mips16-* \
-	| mips64-* | mips64el-* \
-	| mips64octeon-* | mips64octeonel-* \
-	| mips64orion-* | mips64orionel-* \
-	| mips64r5900-* | mips64r5900el-* \
-	| mips64vr-* | mips64vrel-* \
-	| mips64vr4100-* | mips64vr4100el-* \
-	| mips64vr4300-* | mips64vr4300el-* \
-	| mips64vr5000-* | mips64vr5000el-* \
-	| mips64vr5900-* | mips64vr5900el-* \
-	| mipsisa32-* | mipsisa32el-* \
-	| mipsisa32r2-* | mipsisa32r2el-* \
-	| mipsisa64-* | mipsisa64el-* \
-	| mipsisa64r2-* | mipsisa64r2el-* \
-	| mipsisa64sb1-* | mipsisa64sb1el-* \
-	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
-	| mipstx39-* | mipstx39el-* \
-	| mmix-* \
-	| mt-* \
-	| msp430-* \
-	| nds32-* | nds32le-* | nds32be-* \
-	| nios-* | nios2-* \
-	| none-* | np1-* | ns16k-* | ns32k-* \
-	| orion-* \
-	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
-	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \
-	| pyramid-* \
-	| romp-* | rs6000-* | rx-* \
-	| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
-	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
-	| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
-	| sparclite-* \
-	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \
-	| tahoe-* | thumb-* \
-	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
-	| tile-* | tilegx-* \
-	| tron-* \
-	| ubicom32-* \
-	| v850-* | v850e-* | vax-* \
-	| we32k-* \
-	| x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \
-	| xstormy16-* | xtensa*-* \
-	| ymp-* \
-	| z8k-* | z80-*)
-		;;
-	# Recognize the basic CPU types without company name, with glob match.
-	xtensa*)
-		basic_machine=$basic_machine-unknown
-		;;
-	# Recognize the various machine names and aliases which stand
-	# for a CPU type and a company and sometimes even an OS.
-	386bsd)
-		basic_machine=i386-unknown
-		os=-bsd
-		;;
-	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
-		basic_machine=m68000-att
-		;;
-	3b*)
-		basic_machine=we32k-att
-		;;
-	a29khif)
-		basic_machine=a29k-amd
-		os=-udi
-		;;
-    	abacus)
-		basic_machine=abacus-unknown
-		;;
-	adobe68k)
-		basic_machine=m68010-adobe
-		os=-scout
-		;;
-	alliant | fx80)
-		basic_machine=fx80-alliant
-		;;
-	altos | altos3068)
-		basic_machine=m68k-altos
-		;;
-	am29k)
-		basic_machine=a29k-none
-		os=-bsd
-		;;
-	amd64)
-		basic_machine=x86_64-pc
-		;;
-	amd64-*)
-		basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	amdahl)
-		basic_machine=580-amdahl
-		os=-sysv
-		;;
-	amiga | amiga-*)
-		basic_machine=m68k-unknown
-		;;
-	amigaos | amigados)
-		basic_machine=m68k-unknown
-		os=-amigaos
-		;;
-	amigaunix | amix)
-		basic_machine=m68k-unknown
-		os=-sysv4
-		;;
-	apollo68)
-		basic_machine=m68k-apollo
-		os=-sysv
-		;;
-	apollo68bsd)
-		basic_machine=m68k-apollo
-		os=-bsd
-		;;
-	aros)
-		basic_machine=i386-pc
-		os=-aros
-		;;
-	aux)
-		basic_machine=m68k-apple
-		os=-aux
-		;;
-	balance)
-		basic_machine=ns32k-sequent
-		os=-dynix
-		;;
-	blackfin)
-		basic_machine=bfin-unknown
-		os=-linux
-		;;
-	blackfin-*)
-		basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'`
-		os=-linux
-		;;
-	bluegene*)
-		basic_machine=powerpc-ibm
-		os=-cnk
-		;;
-	c54x-*)
-		basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	c55x-*)
-		basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	c6x-*)
-		basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	c90)
-		basic_machine=c90-cray
-		os=-unicos
-		;;
-        cegcc)
-		basic_machine=arm-unknown
-		os=-cegcc
-		;;
-	convex-c1)
-		basic_machine=c1-convex
-		os=-bsd
-		;;
-	convex-c2)
-		basic_machine=c2-convex
-		os=-bsd
-		;;
-	convex-c32)
-		basic_machine=c32-convex
-		os=-bsd
-		;;
-	convex-c34)
-		basic_machine=c34-convex
-		os=-bsd
-		;;
-	convex-c38)
-		basic_machine=c38-convex
-		os=-bsd
-		;;
-	cray | j90)
-		basic_machine=j90-cray
-		os=-unicos
-		;;
-	craynv)
-		basic_machine=craynv-cray
-		os=-unicosmp
-		;;
-	cr16)
-		basic_machine=cr16-unknown
-		os=-elf
-		;;
-	crds | unos)
-		basic_machine=m68k-crds
-		;;
-	crisv32 | crisv32-* | etraxfs*)
-		basic_machine=crisv32-axis
-		;;
-	cris | cris-* | etrax*)
-		basic_machine=cris-axis
-		;;
-	crx)
-		basic_machine=crx-unknown
-		os=-elf
-		;;
-	da30 | da30-*)
-		basic_machine=m68k-da30
-		;;
-	decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
-		basic_machine=mips-dec
-		;;
-	decsystem10* | dec10*)
-		basic_machine=pdp10-dec
-		os=-tops10
-		;;
-	decsystem20* | dec20*)
-		basic_machine=pdp10-dec
-		os=-tops20
-		;;
-	delta | 3300 | motorola-3300 | motorola-delta \
-	      | 3300-motorola | delta-motorola)
-		basic_machine=m68k-motorola
-		;;
-	delta88)
-		basic_machine=m88k-motorola
-		os=-sysv3
-		;;
-	dicos)
-		basic_machine=i686-pc
-		os=-dicos
-		;;
-	djgpp)
-		basic_machine=i586-pc
-		os=-msdosdjgpp
-		;;
-	dpx20 | dpx20-*)
-		basic_machine=rs6000-bull
-		os=-bosx
-		;;
-	dpx2* | dpx2*-bull)
-		basic_machine=m68k-bull
-		os=-sysv3
-		;;
-	ebmon29k)
-		basic_machine=a29k-amd
-		os=-ebmon
-		;;
-	elxsi)
-		basic_machine=elxsi-elxsi
-		os=-bsd
-		;;
-	encore | umax | mmax)
-		basic_machine=ns32k-encore
-		;;
-	es1800 | OSE68k | ose68k | ose | OSE)
-		basic_machine=m68k-ericsson
-		os=-ose
-		;;
-	fx2800)
-		basic_machine=i860-alliant
-		;;
-	genix)
-		basic_machine=ns32k-ns
-		;;
-	gmicro)
-		basic_machine=tron-gmicro
-		os=-sysv
-		;;
-	go32)
-		basic_machine=i386-pc
-		os=-go32
-		;;
-	h3050r* | hiux*)
-		basic_machine=hppa1.1-hitachi
-		os=-hiuxwe2
-		;;
-	h8300hms)
-		basic_machine=h8300-hitachi
-		os=-hms
-		;;
-	h8300xray)
-		basic_machine=h8300-hitachi
-		os=-xray
-		;;
-	h8500hms)
-		basic_machine=h8500-hitachi
-		os=-hms
-		;;
-	harris)
-		basic_machine=m88k-harris
-		os=-sysv3
-		;;
-	hp300-*)
-		basic_machine=m68k-hp
-		;;
-	hp300bsd)
-		basic_machine=m68k-hp
-		os=-bsd
-		;;
-	hp300hpux)
-		basic_machine=m68k-hp
-		os=-hpux
-		;;
-	hp3k9[0-9][0-9] | hp9[0-9][0-9])
-		basic_machine=hppa1.0-hp
-		;;
-	hp9k2[0-9][0-9] | hp9k31[0-9])
-		basic_machine=m68000-hp
-		;;
-	hp9k3[2-9][0-9])
-		basic_machine=m68k-hp
-		;;
-	hp9k6[0-9][0-9] | hp6[0-9][0-9])
-		basic_machine=hppa1.0-hp
-		;;
-	hp9k7[0-79][0-9] | hp7[0-79][0-9])
-		basic_machine=hppa1.1-hp
-		;;
-	hp9k78[0-9] | hp78[0-9])
-		# FIXME: really hppa2.0-hp
-		basic_machine=hppa1.1-hp
-		;;
-	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
-		# FIXME: really hppa2.0-hp
-		basic_machine=hppa1.1-hp
-		;;
-	hp9k8[0-9][13679] | hp8[0-9][13679])
-		basic_machine=hppa1.1-hp
-		;;
-	hp9k8[0-9][0-9] | hp8[0-9][0-9])
-		basic_machine=hppa1.0-hp
-		;;
-	hppa-next)
-		os=-nextstep3
-		;;
-	hppaosf)
-		basic_machine=hppa1.1-hp
-		os=-osf
-		;;
-	hppro)
-		basic_machine=hppa1.1-hp
-		os=-proelf
-		;;
-	i370-ibm* | ibm*)
-		basic_machine=i370-ibm
-		;;
-# I'm not sure what "Sysv32" means.  Should this be sysv3.2?
-	i*86v32)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-sysv32
-		;;
-	i*86v4*)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-sysv4
-		;;
-	i*86v)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-sysv
-		;;
-	i*86sol2)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-solaris2
-		;;
-	i386mach)
-		basic_machine=i386-mach
-		os=-mach
-		;;
-	i386-vsta | vsta)
-		basic_machine=i386-unknown
-		os=-vsta
-		;;
-	iris | iris4d)
-		basic_machine=mips-sgi
-		case $os in
-		    -irix*)
-			;;
-		    *)
-			os=-irix4
-			;;
-		esac
-		;;
-	isi68 | isi)
-		basic_machine=m68k-isi
-		os=-sysv
-		;;
-	m68knommu)
-		basic_machine=m68k-unknown
-		os=-linux
-		;;
-	m68knommu-*)
-		basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'`
-		os=-linux
-		;;
-	m88k-omron*)
-		basic_machine=m88k-omron
-		;;
-	magnum | m3230)
-		basic_machine=mips-mips
-		os=-sysv
-		;;
-	merlin)
-		basic_machine=ns32k-utek
-		os=-sysv
-		;;
-        microblaze)
-		basic_machine=microblaze-xilinx
-		;;
-	mingw32)
-		basic_machine=i386-pc
-		os=-mingw32
-		;;
-	mingw32ce)
-		basic_machine=arm-unknown
-		os=-mingw32ce
-		;;
-	miniframe)
-		basic_machine=m68000-convergent
-		;;
-	*mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
-		basic_machine=m68k-atari
-		os=-mint
-		;;
-	mips3*-*)
-		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
-		;;
-	mips3*)
-		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
-		;;
-	monitor)
-		basic_machine=m68k-rom68k
-		os=-coff
-		;;
-	morphos)
-		basic_machine=powerpc-unknown
-		os=-morphos
-		;;
-	msdos)
-		basic_machine=i386-pc
-		os=-msdos
-		;;
-	ms1-*)
-		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
-		;;
-	mvs)
-		basic_machine=i370-ibm
-		os=-mvs
-		;;
-	ncr3000)
-		basic_machine=i486-ncr
-		os=-sysv4
-		;;
-	netbsd386)
-		basic_machine=i386-unknown
-		os=-netbsd
-		;;
-	netwinder)
-		basic_machine=armv4l-rebel
-		os=-linux
-		;;
-	news | news700 | news800 | news900)
-		basic_machine=m68k-sony
-		os=-newsos
-		;;
-	news1000)
-		basic_machine=m68030-sony
-		os=-newsos
-		;;
-	news-3600 | risc-news)
-		basic_machine=mips-sony
-		os=-newsos
-		;;
-	necv70)
-		basic_machine=v70-nec
-		os=-sysv
-		;;
-	next | m*-next )
-		basic_machine=m68k-next
-		case $os in
-		    -nextstep* )
-			;;
-		    -ns2*)
-		      os=-nextstep2
-			;;
-		    *)
-		      os=-nextstep3
-			;;
-		esac
-		;;
-	nh3000)
-		basic_machine=m68k-harris
-		os=-cxux
-		;;
-	nh[45]000)
-		basic_machine=m88k-harris
-		os=-cxux
-		;;
-	nindy960)
-		basic_machine=i960-intel
-		os=-nindy
-		;;
-	mon960)
-		basic_machine=i960-intel
-		os=-mon960
-		;;
-	nonstopux)
-		basic_machine=mips-compaq
-		os=-nonstopux
-		;;
-	np1)
-		basic_machine=np1-gould
-		;;
-        neo-tandem)
-		basic_machine=neo-tandem
-		;;
-        nse-tandem)
-		basic_machine=nse-tandem
-		;;
-	nsr-tandem)
-		basic_machine=nsr-tandem
-		;;
-	op50n-* | op60c-*)
-		basic_machine=hppa1.1-oki
-		os=-proelf
-		;;
-	openrisc | openrisc-*)
-		basic_machine=or32-unknown
-		;;
-	os400)
-		basic_machine=powerpc-ibm
-		os=-os400
-		;;
-	OSE68000 | ose68000)
-		basic_machine=m68000-ericsson
-		os=-ose
-		;;
-	os68k)
-		basic_machine=m68k-none
-		os=-os68k
-		;;
-	pa-hitachi)
-		basic_machine=hppa1.1-hitachi
-		os=-hiuxwe2
-		;;
-	paragon)
-		basic_machine=i860-intel
-		os=-osf
-		;;
-	parisc)
-		basic_machine=hppa-unknown
-		os=-linux
-		;;
-	parisc-*)
-		basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'`
-		os=-linux
-		;;
-	pbd)
-		basic_machine=sparc-tti
-		;;
-	pbb)
-		basic_machine=m68k-tti
-		;;
-	pc532 | pc532-*)
-		basic_machine=ns32k-pc532
-		;;
-	pc98)
-		basic_machine=i386-pc
-		;;
-	pc98-*)
-		basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pentium | p5 | k5 | k6 | nexgen | viac3)
-		basic_machine=i586-pc
-		;;
-	pentiumpro | p6 | 6x86 | athlon | athlon_*)
-		basic_machine=i686-pc
-		;;
-	pentiumii | pentium2 | pentiumiii | pentium3)
-		basic_machine=i686-pc
-		;;
-	pentium4)
-		basic_machine=i786-pc
-		;;
-	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
-		basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pentiumpro-* | p6-* | 6x86-* | athlon-*)
-		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
-		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pentium4-*)
-		basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pn)
-		basic_machine=pn-gould
-		;;
-	power)	basic_machine=power-ibm
-		;;
-	ppc)	basic_machine=powerpc-unknown
-		;;
-	ppc-*)	basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	ppcle | powerpclittle | ppc-le | powerpc-little)
-		basic_machine=powerpcle-unknown
-		;;
-	ppcle-* | powerpclittle-*)
-		basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	ppc64)	basic_machine=powerpc64-unknown
-		;;
-	ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	ppc64le | powerpc64little | ppc64-le | powerpc64-little)
-		basic_machine=powerpc64le-unknown
-		;;
-	ppc64le-* | powerpc64little-*)
-		basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	ps2)
-		basic_machine=i386-ibm
-		;;
-	pw32)
-		basic_machine=i586-unknown
-		os=-pw32
-		;;
-	rdos)
-		basic_machine=i386-pc
-		os=-rdos
-		;;
-	rom68k)
-		basic_machine=m68k-rom68k
-		os=-coff
-		;;
-	rm[46]00)
-		basic_machine=mips-siemens
-		;;
-	rtpc | rtpc-*)
-		basic_machine=romp-ibm
-		;;
-	s390 | s390-*)
-		basic_machine=s390-ibm
-		;;
-	s390x | s390x-*)
-		basic_machine=s390x-ibm
-		;;
-	sa29200)
-		basic_machine=a29k-amd
-		os=-udi
-		;;
-	sb1)
-		basic_machine=mipsisa64sb1-unknown
-		;;
-	sb1el)
-		basic_machine=mipsisa64sb1el-unknown
-		;;
-	sde)
-		basic_machine=mipsisa32-sde
-		os=-elf
-		;;
-	sei)
-		basic_machine=mips-sei
-		os=-seiux
-		;;
-	sequent)
-		basic_machine=i386-sequent
-		;;
-	sh)
-		basic_machine=sh-hitachi
-		os=-hms
-		;;
-	sh5el)
-		basic_machine=sh5le-unknown
-		;;
-	sh64)
-		basic_machine=sh64-unknown
-		;;
-	sparclite-wrs | simso-wrs)
-		basic_machine=sparclite-wrs
-		os=-vxworks
-		;;
-	sps7)
-		basic_machine=m68k-bull
-		os=-sysv2
-		;;
-	spur)
-		basic_machine=spur-unknown
-		;;
-	st2000)
-		basic_machine=m68k-tandem
-		;;
-	stratus)
-		basic_machine=i860-stratus
-		os=-sysv4
-		;;
-	sun2)
-		basic_machine=m68000-sun
-		;;
-	sun2os3)
-		basic_machine=m68000-sun
-		os=-sunos3
-		;;
-	sun2os4)
-		basic_machine=m68000-sun
-		os=-sunos4
-		;;
-	sun3os3)
-		basic_machine=m68k-sun
-		os=-sunos3
-		;;
-	sun3os4)
-		basic_machine=m68k-sun
-		os=-sunos4
-		;;
-	sun4os3)
-		basic_machine=sparc-sun
-		os=-sunos3
-		;;
-	sun4os4)
-		basic_machine=sparc-sun
-		os=-sunos4
-		;;
-	sun4sol2)
-		basic_machine=sparc-sun
-		os=-solaris2
-		;;
-	sun3 | sun3-*)
-		basic_machine=m68k-sun
-		;;
-	sun4)
-		basic_machine=sparc-sun
-		;;
-	sun386 | sun386i | roadrunner)
-		basic_machine=i386-sun
-		;;
-	sv1)
-		basic_machine=sv1-cray
-		os=-unicos
-		;;
-	symmetry)
-		basic_machine=i386-sequent
-		os=-dynix
-		;;
-	t3e)
-		basic_machine=alphaev5-cray
-		os=-unicos
-		;;
-	t90)
-		basic_machine=t90-cray
-		os=-unicos
-		;;
-        # This must be matched before tile*.
-        tilegx*)
-		basic_machine=tilegx-unknown
-		os=-linux-gnu
-		;;
-	tile*)
-		basic_machine=tile-unknown
-		os=-linux-gnu
-		;;
-	tx39)
-		basic_machine=mipstx39-unknown
-		;;
-	tx39el)
-		basic_machine=mipstx39el-unknown
-		;;
-	toad1)
-		basic_machine=pdp10-xkl
-		os=-tops20
-		;;
-	tower | tower-32)
-		basic_machine=m68k-ncr
-		;;
-	tpf)
-		basic_machine=s390x-ibm
-		os=-tpf
-		;;
-	udi29k)
-		basic_machine=a29k-amd
-		os=-udi
-		;;
-	ultra3)
-		basic_machine=a29k-nyu
-		os=-sym1
-		;;
-	v810 | necv810)
-		basic_machine=v810-nec
-		os=-none
-		;;
-	vaxv)
-		basic_machine=vax-dec
-		os=-sysv
-		;;
-	vms)
-		basic_machine=vax-dec
-		os=-vms
-		;;
-	vpp*|vx|vx-*)
-		basic_machine=f301-fujitsu
-		;;
-	vxworks960)
-		basic_machine=i960-wrs
-		os=-vxworks
-		;;
-	vxworks68)
-		basic_machine=m68k-wrs
-		os=-vxworks
-		;;
-	vxworks29k)
-		basic_machine=a29k-wrs
-		os=-vxworks
-		;;
-	w65*)
-		basic_machine=w65-wdc
-		os=-none
-		;;
-	w89k-*)
-		basic_machine=hppa1.1-winbond
-		os=-proelf
-		;;
-	xbox)
-		basic_machine=i686-pc
-		os=-mingw32
-		;;
-	xps | xps100)
-		basic_machine=xps100-honeywell
-		;;
-	ymp)
-		basic_machine=ymp-cray
-		os=-unicos
-		;;
-	z8k-*-coff)
-		basic_machine=z8k-unknown
-		os=-sim
-		;;
-	z80-*-coff)
-		basic_machine=z80-unknown
-		os=-sim
-		;;
-	none)
-		basic_machine=none-none
-		os=-none
-		;;
-
-# Here we handle the default manufacturer of certain CPU types.  It is in
-# some cases the only manufacturer, in others, it is the most popular.
-	w89k)
-		basic_machine=hppa1.1-winbond
-		;;
-	op50n)
-		basic_machine=hppa1.1-oki
-		;;
-	op60c)
-		basic_machine=hppa1.1-oki
-		;;
-	romp)
-		basic_machine=romp-ibm
-		;;
-	mmix)
-		basic_machine=mmix-knuth
-		;;
-	rs6000)
-		basic_machine=rs6000-ibm
-		;;
-	vax)
-		basic_machine=vax-dec
-		;;
-	pdp10)
-		# there are many clones, so DEC is not a safe bet
-		basic_machine=pdp10-unknown
-		;;
-	pdp11)
-		basic_machine=pdp11-dec
-		;;
-	we32k)
-		basic_machine=we32k-att
-		;;
-	sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele)
-		basic_machine=sh-unknown
-		;;
-	sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v)
-		basic_machine=sparc-sun
-		;;
-	cydra)
-		basic_machine=cydra-cydrome
-		;;
-	orion)
-		basic_machine=orion-highlevel
-		;;
-	orion105)
-		basic_machine=clipper-highlevel
-		;;
-	mac | mpw | mac-mpw)
-		basic_machine=m68k-apple
-		;;
-	pmac | pmac-mpw)
-		basic_machine=powerpc-apple
-		;;
-	*-unknown)
-		# Make sure to match an already-canonicalized machine name.
-		;;
-	*)
-		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
-		exit 1
-		;;
-esac
-
-# Here we canonicalize certain aliases for manufacturers.
-case $basic_machine in
-	*-digital*)
-		basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
-		;;
-	*-commodore*)
-		basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
-		;;
-	*)
-		;;
-esac
-
-# Decode manufacturer-specific aliases for certain operating systems.
-
-if [ x"$os" != x"" ]
-then
-case $os in
-        # First match some system type aliases
-        # that might get confused with valid system types.
-	# -solaris* is a basic system type, with this one exception.
-        -auroraux)
-	        os=-auroraux
-		;;
-	-solaris1 | -solaris1.*)
-		os=`echo $os | sed -e 's|solaris1|sunos4|'`
-		;;
-	-solaris)
-		os=-solaris2
-		;;
-	-svr4*)
-		os=-sysv4
-		;;
-	-unixware*)
-		os=-sysv4.2uw
-		;;
-	-gnu/linux*)
-		os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
-		;;
-	# First accept the basic system types.
-	# The portable systems comes first.
-	# Each alternative MUST END IN A *, to match a version number.
-	# -sysv* is not here because it comes later, after sysvr4.
-	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
-	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
-	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
-	      | -sym* | -kopensolaris* \
-	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
-	      | -aos* | -aros* \
-	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
-	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
-	      | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
-	      | -openbsd* | -solidbsd* \
-	      | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
-	      | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
-	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
-	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
-	      | -chorusos* | -chorusrdb* | -cegcc* \
-	      | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
-	      | -mingw32* | -linux-gnu* | -linux-android* \
-	      | -linux-newlib* | -linux-uclibc* \
-	      | -uxpv* | -beos* | -mpeix* | -udk* \
-	      | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
-	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
-	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
-	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
-	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
-	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
-	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
-	# Remember, each alternative MUST END IN *, to match a version number.
-		;;
-	-qnx*)
-		case $basic_machine in
-		    x86-* | i*86-*)
-			;;
-		    *)
-			os=-nto$os
-			;;
-		esac
-		;;
-	-nto-qnx*)
-		;;
-	-nto*)
-		os=`echo $os | sed -e 's|nto|nto-qnx|'`
-		;;
-	-sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
-	      | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \
-	      | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
-		;;
-	-mac*)
-		os=`echo $os | sed -e 's|mac|macos|'`
-		;;
-	-linux-dietlibc)
-		os=-linux-dietlibc
-		;;
-	-linux*)
-		os=`echo $os | sed -e 's|linux|linux-gnu|'`
-		;;
-	-sunos5*)
-		os=`echo $os | sed -e 's|sunos5|solaris2|'`
-		;;
-	-sunos6*)
-		os=`echo $os | sed -e 's|sunos6|solaris3|'`
-		;;
-	-opened*)
-		os=-openedition
-		;;
-        -os400*)
-		os=-os400
-		;;
-	-wince*)
-		os=-wince
-		;;
-	-osfrose*)
-		os=-osfrose
-		;;
-	-osf*)
-		os=-osf
-		;;
-	-utek*)
-		os=-bsd
-		;;
-	-dynix*)
-		os=-bsd
-		;;
-	-acis*)
-		os=-aos
-		;;
-	-atheos*)
-		os=-atheos
-		;;
-	-syllable*)
-		os=-syllable
-		;;
-	-386bsd)
-		os=-bsd
-		;;
-	-ctix* | -uts*)
-		os=-sysv
-		;;
-	-nova*)
-		os=-rtmk-nova
-		;;
-	-ns2 )
-		os=-nextstep2
-		;;
-	-nsk*)
-		os=-nsk
-		;;
-	# Preserve the version number of sinix5.
-	-sinix5.*)
-		os=`echo $os | sed -e 's|sinix|sysv|'`
-		;;
-	-sinix*)
-		os=-sysv4
-		;;
-        -tpf*)
-		os=-tpf
-		;;
-	-triton*)
-		os=-sysv3
-		;;
-	-oss*)
-		os=-sysv3
-		;;
-	-svr4)
-		os=-sysv4
-		;;
-	-svr3)
-		os=-sysv3
-		;;
-	-sysvr4)
-		os=-sysv4
-		;;
-	# This must come after -sysvr4.
-	-sysv*)
-		;;
-	-ose*)
-		os=-ose
-		;;
-	-es1800*)
-		os=-ose
-		;;
-	-xenix)
-		os=-xenix
-		;;
-	-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
-		os=-mint
-		;;
-	-aros*)
-		os=-aros
-		;;
-	-kaos*)
-		os=-kaos
-		;;
-	-zvmoe)
-		os=-zvmoe
-		;;
-	-dicos*)
-		os=-dicos
-		;;
-        -nacl*)
-	        ;;
-	-none)
-		;;
-	*)
-		# Get rid of the `-' at the beginning of $os.
-		os=`echo $os | sed 's/[^-]*-//'`
-		echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
-		exit 1
-		;;
-esac
-else
-
-# Here we handle the default operating systems that come with various machines.
-# The value should be what the vendor currently ships out the door with their
-# machine or put another way, the most popular os provided with the machine.
-
-# Note that if you're going to try to match "-MANUFACTURER" here (say,
-# "-sun"), then you have to tell the case statement up towards the top
-# that MANUFACTURER isn't an operating system.  Otherwise, code above
-# will signal an error saying that MANUFACTURER isn't an operating
-# system, and we'll never get to this point.
-
-case $basic_machine in
-        score-*)
-		os=-elf
-		;;
-        spu-*)
-		os=-elf
-		;;
-	*-acorn)
-		os=-riscix1.2
-		;;
-	arm*-rebel)
-		os=-linux
-		;;
-	arm*-semi)
-		os=-aout
-		;;
-        c4x-* | tic4x-*)
-        	os=-coff
-		;;
-	tic54x-*)
-		os=-coff
-		;;
-	tic55x-*)
-		os=-coff
-		;;
-	tic6x-*)
-		os=-coff
-		;;
-	# This must come before the *-dec entry.
-	pdp10-*)
-		os=-tops20
-		;;
-	pdp11-*)
-		os=-none
-		;;
-	*-dec | vax-*)
-		os=-ultrix4.2
-		;;
-	m68*-apollo)
-		os=-domain
-		;;
-	i386-sun)
-		os=-sunos4.0.2
-		;;
-	m68000-sun)
-		os=-sunos3
-		# This also exists in the configure program, but was not the
-		# default.
-		# os=-sunos4
-		;;
-	m68*-cisco)
-		os=-aout
-		;;
-        mep-*)
-		os=-elf
-		;;
-	mips*-cisco)
-		os=-elf
-		;;
-	mips*-*)
-		os=-elf
-		;;
-	or32-*)
-		os=-coff
-		;;
-	*-tti)	# must be before sparc entry or we get the wrong os.
-		os=-sysv3
-		;;
-	sparc-* | *-sun)
-		os=-sunos4.1.1
-		;;
-	*-be)
-		os=-beos
-		;;
-	*-haiku)
-		os=-haiku
-		;;
-	*-ibm)
-		os=-aix
-		;;
-    	*-knuth)
-		os=-mmixware
-		;;
-	*-wec)
-		os=-proelf
-		;;
-	*-winbond)
-		os=-proelf
-		;;
-	*-oki)
-		os=-proelf
-		;;
-	*-hp)
-		os=-hpux
-		;;
-	*-hitachi)
-		os=-hiux
-		;;
-	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
-		os=-sysv
-		;;
-	*-cbm)
-		os=-amigaos
-		;;
-	*-dg)
-		os=-dgux
-		;;
-	*-dolphin)
-		os=-sysv3
-		;;
-	m68k-ccur)
-		os=-rtu
-		;;
-	m88k-omron*)
-		os=-luna
-		;;
-	*-next )
-		os=-nextstep
-		;;
-	*-sequent)
-		os=-ptx
-		;;
-	*-crds)
-		os=-unos
-		;;
-	*-ns)
-		os=-genix
-		;;
-	i370-*)
-		os=-mvs
-		;;
-	*-next)
-		os=-nextstep3
-		;;
-	*-gould)
-		os=-sysv
-		;;
-	*-highlevel)
-		os=-bsd
-		;;
-	*-encore)
-		os=-bsd
-		;;
-	*-sgi)
-		os=-irix
-		;;
-	*-siemens)
-		os=-sysv4
-		;;
-	*-masscomp)
-		os=-rtu
-		;;
-	f30[01]-fujitsu | f700-fujitsu)
-		os=-uxpv
-		;;
-	*-rom68k)
-		os=-coff
-		;;
-	*-*bug)
-		os=-coff
-		;;
-	*-apple)
-		os=-macos
-		;;
-	*-atari*)
-		os=-mint
-		;;
-	*)
-		os=-none
-		;;
-esac
-fi
-
-# Here we handle the case where we know the os, and the CPU type, but not the
-# manufacturer.  We pick the logical manufacturer.
-vendor=unknown
-case $basic_machine in
-	*-unknown)
-		case $os in
-			-riscix*)
-				vendor=acorn
-				;;
-			-sunos*)
-				vendor=sun
-				;;
-			-cnk*|-aix*)
-				vendor=ibm
-				;;
-			-beos*)
-				vendor=be
-				;;
-			-hpux*)
-				vendor=hp
-				;;
-			-mpeix*)
-				vendor=hp
-				;;
-			-hiux*)
-				vendor=hitachi
-				;;
-			-unos*)
-				vendor=crds
-				;;
-			-dgux*)
-				vendor=dg
-				;;
-			-luna*)
-				vendor=omron
-				;;
-			-genix*)
-				vendor=ns
-				;;
-			-mvs* | -opened*)
-				vendor=ibm
-				;;
-			-os400*)
-				vendor=ibm
-				;;
-			-ptx*)
-				vendor=sequent
-				;;
-			-tpf*)
-				vendor=ibm
-				;;
-			-vxsim* | -vxworks* | -windiss*)
-				vendor=wrs
-				;;
-			-aux*)
-				vendor=apple
-				;;
-			-hms*)
-				vendor=hitachi
-				;;
-			-mpw* | -macos*)
-				vendor=apple
-				;;
-			-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
-				vendor=atari
-				;;
-			-vos*)
-				vendor=stratus
-				;;
-		esac
-		basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
-		;;
-esac
-
-echo $basic_machine$os
-exit
-
-# Local variables:
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "timestamp='"
-# time-stamp-format: "%:y-%02m-%02d"
-# time-stamp-end: "'"
-# End:
diff --git a/volk/config.sub b/volk/config.sub
new file mode 120000
index 0000000000..4d47fbcbc4
--- /dev/null
+++ b/volk/config.sub
@@ -0,0 +1 @@
+/usr/share/automake-1.11/config.sub
\ No newline at end of file
diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 99276ab87b..aef1d7ba8c 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -41,93 +41,93 @@ volkinclude_HEADERS = \
 	volk.h \
 	volk_cpu.h \
 	volk_environment_init.h \
-	volk_16s_add_quad_aligned16.h \
-	volk_16s_branch_4_state_8_aligned16.h \
-	volk_16sc_deinterleave_16s_aligned16.h \
-	volk_16sc_deinterleave_32f_aligned16.h \
-	volk_16sc_deinterleave_real_16s_aligned16.h \
-	volk_16sc_deinterleave_real_32f_aligned16.h \
-	volk_16sc_deinterleave_real_8s_aligned16.h \
-	volk_16sc_magnitude_16s_aligned16.h \
-	volk_16sc_magnitude_32f_aligned16.h \
-	volk_16s_convert_32f_aligned16.h \
-	volk_16s_convert_32f_unaligned16.h \
-	volk_16s_convert_8s_aligned16.h \
-	volk_16s_convert_8s_unaligned16.h \
-	volk_16s_max_star_aligned16.h \
-	volk_16s_max_star_horizontal_aligned16.h \
-	volk_16s_permute_and_scalar_add_aligned16.h \
-	volk_16s_quad_max_star_aligned16.h \
-	volk_16u_byteswap_aligned16.h \
-	volk_32f_accumulator_aligned16.h \
-	volk_32f_add_aligned16.h \
-	volk_32fc_32f_multiply_aligned16.h \
-	volk_32fc_32f_power_32fc_aligned16.h \
-	volk_32f_calc_spectral_noise_floor_aligned16.h \
-	volk_32fc_atan2_32f_aligned16.h \
-	volk_32fc_conjugate_dot_prod_aligned16.h \
-	volk_32fc_deinterleave_32f_aligned16.h \
-	volk_32fc_deinterleave_64f_aligned16.h \
-	volk_32fc_deinterleave_real_16s_aligned16.h \
-	volk_32fc_deinterleave_real_32f_aligned16.h \
-	volk_32fc_deinterleave_real_64f_aligned16.h \
-	volk_32fc_dot_prod_aligned16.h \
-	volk_32fc_index_max_aligned16.h \
-	volk_32fc_magnitude_16s_aligned16.h \
-	volk_32fc_magnitude_32f_aligned16.h \
-	volk_32fc_multiply_aligned16.h \
-	volk_32f_convert_16s_aligned16.h \
-	volk_32f_convert_16s_unaligned16.h \
-	volk_32f_convert_32s_aligned16.h \
-	volk_32f_convert_32s_unaligned16.h \
-	volk_32f_convert_64f_aligned16.h \
-	volk_32f_convert_64f_unaligned16.h \
-	volk_32f_convert_8s_aligned16.h \
-	volk_32f_convert_8s_unaligned16.h \
-	volk_32fc_power_spectral_density_32f_aligned16.h \
-	volk_32fc_power_spectrum_32f_aligned16.h \
-	volk_32fc_square_dist_aligned16.h \
-	volk_32fc_square_dist_scalar_mult_aligned16.h \
-	volk_32f_divide_aligned16.h \
-	volk_32f_dot_prod_aligned16.h \
-	volk_32f_dot_prod_unaligned16.h \
-	volk_32f_fm_detect_aligned16.h \
-	volk_32f_index_max_aligned16.h \
-	volk_32f_interleave_16sc_aligned16.h \
-	volk_32f_interleave_32fc_aligned16.h \
-	volk_32f_max_aligned16.h \
-	volk_32f_min_aligned16.h \
-	volk_32f_multiply_aligned16.h \
-	volk_32f_normalize_aligned16.h \
-	volk_32f_power_aligned16.h \
-	volk_32f_sqrt_aligned16.h \
-	volk_32f_stddev_aligned16.h \
-	volk_32f_stddev_and_mean_aligned16.h \
-	volk_32f_subtract_aligned16.h \
-	volk_32f_sum_of_poly_aligned16.h \
-	volk_32s_and_aligned16.h \
-	volk_32s_convert_32f_aligned16.h \
-	volk_32s_convert_32f_unaligned16.h \
-	volk_32s_or_aligned16.h \
-	volk_32u_byteswap_aligned16.h \
-	volk_32u_popcnt_aligned16.h \
-	volk_64f_convert_32f_aligned16.h \
-	volk_64f_convert_32f_unaligned16.h \
-	volk_64f_max_aligned16.h \
-	volk_64f_min_aligned16.h \
-	volk_64u_byteswap_aligned16.h \
-	volk_64u_popcnt_aligned16.h \
-	volk_8sc_deinterleave_16s_aligned16.h \
-	volk_8sc_deinterleave_32f_aligned16.h \
-	volk_8sc_deinterleave_real_16s_aligned16.h \
-	volk_8sc_deinterleave_real_32f_aligned16.h \
-	volk_8sc_deinterleave_real_8s_aligned16.h \
-	volk_8sc_multiply_conjugate_16sc_aligned16.h \
-	volk_8sc_multiply_conjugate_32fc_aligned16.h \
-	volk_8s_convert_16s_aligned16.h \
-	volk_8s_convert_16s_unaligned16.h \
-	volk_8s_convert_32f_aligned16.h \
-	volk_8s_convert_32f_unaligned16.h 
+	volk_16s_add_quad_a16.h \
+	volk_16s_branch_4_state_8_a16.h \
+	volk_16sc_deinterleave_16s_16s_a16.h \
+	volk_16sc_s32f_deinterleave_32f_32f_a16.h \
+	volk_16sc_deinterleave_real_16s_a16.h \
+	volk_16sc_s32f_deinterleave_real_32f_a16.h \
+	volk_16sc_deinterleave_real_8s_a16.h \
+	volk_16sc_magnitude_16s_a16.h \
+	volk_16sc_s32f_magnitude_32f_a16.h \
+	volk_16s_s32f_convert_32f_a16.h \
+	volk_16s_s32f_convert_32f_ua16.h \
+	volk_16s_convert_8s_a16.h \
+	volk_16s_convert_8s_ua16.h \
+	volk_16s_max_star_16s_a16.h \
+	volk_16s_max_star_horizontal_16s_a16.h \
+	volk_16s_permute_and_scalar_add_a16.h \
+	volk_16s_quad_max_star_16s_a16.h \
+	volk_16u_byteswap_a16.h \
+	volk_32f_accumulator_s32f_a16.h \
+	volk_32f_32f_add_32f_a16.h \
+	volk_32fc_32f_multiply_32fc_a16.h \
+	volk_32fc_32f_power_32fc_a16.h \
+	volk_32f_calc_spectral_noise_floor_a16.h \
+	volk_32fc_s32f_atan2_32f_a16.h \
+	volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h \
+	volk_32fc_deinterleave_32f_32f_a16.h \
+	volk_32fc_deinterleave_64f_64f_a16.h \
+	volk_32fc_deinterleave_real_16s_a16.h \
+	volk_32fc_deinterleave_real_32f_a16.h \
+	volk_32fc_deinterleave_real_64f_a16.h \
+	volk_32fc_32fc_dot_prod_32fc_a16.h \
+	volk_32fc_index_max_16u_a16.h \
+	volk_32fc_s32f_magnitude_16s_a16.h \
+	volk_32fc_magnitude_32f_a16.h \
+	volk_32fc_32fc_multiply_32fc_a16.h \
+	volk_32f_s32f_convert_16s_a16.h \
+	volk_32f_s32f_convert_16s_ua16.h \
+	volk_32f_s32f_convert_32s_a16.h \
+	volk_32f_s32f_convert_32s_ua16.h \
+	volk_32f_convert_64f_a16.h \
+	volk_32f_convert_64f_ua16.h \
+	volk_32f_s32f_convert_8s_a16.h \
+	volk_32f_s32f_convert_8s_ua16.h \
+	volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h \
+	volk_32fc_s32f_power_spectrum_32f_a16.h \
+	volk_32fc_32fc_square_dist_32f_a16.h \
+	volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h \
+	volk_32f_32f_divide_32f_a16.h \
+	volk_32f_32f_dot_prod_32f_a16.h \
+	volk_32f_32f_dot_prod_32f_ua16.h \
+	volk_32f_s32f_32f_fm_detect_32f_a16.h \
+	volk_32f_index_max_16u_a16.h \
+	volk_32f_32f_s32f_interleave_16sc_a16.h \
+	volk_32f_32f_interleave_32fc_a16.h \
+	volk_32f_32f_max_32f_a16.h \
+	volk_32f_32f_min_32f_a16.h \
+	volk_32f_32f_multiply_32f_a16.h \
+	volk_32f_s32f_normalize_a16.h \
+	volk_32f_s32f_power_32f_a16.h \
+	volk_32f_sqrt_32f_a16.h \
+	volk_32f_s32f_stddev_32f_a16.h \
+	volk_32f_stddev_and_mean_32f_32f_a16.h \
+	volk_32f_32f_subtract_32f_a16.h \
+	volk_32f_32f_32f_sum_of_poly_32f_a16.h \
+	volk_32s_32s_and_32s_a16.h \
+	volk_32s_s32f_convert_32f_a16.h \
+	volk_32s_s32f_convert_32f_ua16.h \
+	volk_32s_32s_or_32s_a16.h \
+	volk_32u_byteswap_a16.h \
+	volk_32u_popcnt_a16.h \
+	volk_64f_convert_32f_a16.h \
+	volk_64f_convert_32f_ua16.h \
+	volk_64f_64f_max_64f_a16.h \
+	volk_64f_64f_min_64f_a16.h \
+	volk_64u_byteswap_a16.h \
+	volk_64u_popcnt_a16.h \
+	volk_8sc_deinterleave_16s_16s_a16.h \
+	volk_8sc_s32f_deinterleave_32f_32f_a16.h \
+	volk_8sc_deinterleave_real_16s_a16.h \
+	volk_8sc_s32f_deinterleave_real_32f_a16.h \
+	volk_8sc_deinterleave_real_8s_a16.h \
+	volk_8sc_8sc_multiply_conjugate_16sc_a16.h \
+	volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h \
+	volk_8s_convert_16s_a16.h \
+	volk_8s_convert_16s_ua16.h \
+	volk_8s_s32f_convert_32f_a16.h \
+	volk_8s_s32f_convert_32f_ua16.h 
 
 VOLK_MKTABLES_SOURCES = \
 	$(top_srcdir)/lib/volk_rank_archs.c \
diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py
index f2432d7a4e..f708ba7d07 100644
--- a/volk/include/volk/make_c.py
+++ b/volk/include/volk/make_c.py
@@ -24,8 +24,7 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) :
     tempstring = tempstring + "    }\n"
     tempstring = tempstring + "    return 0;\n"
     tempstring = tempstring + "}\n"
-
-
+    
     for i in range(len(funclist)): 
         tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n";
         
diff --git a/volk/include/volk/volk_16s_add_quad_a16.h b/volk/include/volk/volk_16s_add_quad_a16.h
new file mode 100644
index 0000000000..67d0c55a3d
--- /dev/null
+++ b/volk/include/volk/volk_16s_add_quad_a16.h
@@ -0,0 +1,136 @@
+#ifndef INCLUDED_volk_16s_add_quad_a16_H
+#define INCLUDED_volk_16s_add_quad_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+
+#if LV_HAVE_SSE2
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+static inline  void volk_16s_add_quad_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+  
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+  __m128i *p_target0, *p_target1, *p_target2, *p_target3,  *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
+  p_target0 = (__m128i*)target0;
+  p_target1 = (__m128i*)target1;
+  p_target2 = (__m128i*)target2;
+  p_target3 = (__m128i*)target3;
+
+  p_src0 = (__m128i*)src0;
+  p_src1 = (__m128i*)src1;
+  p_src2 = (__m128i*)src2;
+  p_src3 = (__m128i*)src3;
+  p_src4 = (__m128i*)src4;
+
+  int i = 0;
+
+  int bound = (num_bytes >> 4);
+  int leftovers = (num_bytes >> 1) & 7;
+
+  for(; i < bound; ++i) {
+    xmm0 = _mm_load_si128(p_src0);
+    xmm1 = _mm_load_si128(p_src1);
+    xmm2 = _mm_load_si128(p_src2);
+    xmm3 = _mm_load_si128(p_src3);
+    xmm4 = _mm_load_si128(p_src4);
+    
+    p_src0 += 1;
+    p_src1 += 1;
+    
+    xmm1 = _mm_add_epi16(xmm0, xmm1);
+    xmm2 = _mm_add_epi16(xmm0, xmm2);
+    xmm3 = _mm_add_epi16(xmm0, xmm3);
+    xmm4 = _mm_add_epi16(xmm0, xmm4);
+    
+    
+    p_src2 += 1;
+    p_src3 += 1;
+    p_src4 += 1;
+
+    _mm_store_si128(p_target0, xmm1);
+    _mm_store_si128(p_target1, xmm2);
+    _mm_store_si128(p_target2, xmm3);
+    _mm_store_si128(p_target3, xmm4);
+    
+    p_target0 += 1;
+    p_target1 += 1;
+    p_target2 += 1;
+    p_target3 += 1;
+  }
+    /*asm volatile
+		(
+		 ".%=volk_16s_add_quad_a16_sse2_L1:\n\t"
+		 "cmp $0, %[bound]\n\t"
+		 "je .%=volk_16s_add_quad_a16_sse2_END\n\t"
+		 "movaps (%[src0]), %%xmm1\n\t"
+		 "movaps (%[src1]), %%xmm2\n\t"
+		 "movaps (%[src2]), %%xmm3\n\t"
+		 "movaps (%[src3]), %%xmm4\n\t"
+		 "movaps (%[src4]), %%xmm5\n\t"
+		 "add $16, %[src0]\n\t"
+		 "add $16, %[src1]\n\t"
+		 "add $16, %[src2]\n\t"
+		 "add $16, %[src3]\n\t"
+		 "add $16, %[src4]\n\t"
+		 "paddw %%xmm1, %%xmm2\n\t"
+		 "paddw %%xmm1, %%xmm3\n\t"
+		 "paddw %%xmm1, %%xmm4\n\t"
+		 "paddw %%xmm1, %%xmm5\n\t"
+		 "add $-1, %[bound]\n\t"
+		 "movaps %%xmm2, (%[target0])\n\t"
+		 "movaps %%xmm3, (%[target1])\n\t"
+		 "movaps %%xmm4, (%[target2])\n\t"
+		 "movaps %%xmm5, (%[target3])\n\t"
+		 "add $16, %[target0]\n\t"
+		 "add $16, %[target1]\n\t"
+		 "add $16, %[target2]\n\t"
+		 "add $16, %[target3]\n\t"
+		 "jmp .%=volk_16s_add_quad_a16_sse2_L1\n\t"
+		 ".%=volk_16s_add_quad_a16_sse2_END:\n\t"
+		 :
+		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
+		 :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+		 );
+		 
+    */
+	 
+
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target0[i] = src0[i] + src1[i];
+    target1[i] = src0[i] + src2[i];
+    target2[i] = src0[i] + src3[i];
+    target3[i] = src0[i] + src4[i];
+  }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16s_add_quad_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+	for(i = 0; i < bound; ++i) {
+		target0[i] = src0[i] + src1[i];
+		target1[i] = src0[i] + src2[i];
+		target2[i] = src0[i] + src3[i];
+		target3[i] = src0[i] + src4[i];
+	}
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /*INCLUDED_volk_16s_add_quad_a16_H*/
diff --git a/volk/include/volk/volk_16s_add_quad_aligned16.h b/volk/include/volk/volk_16s_add_quad_aligned16.h
deleted file mode 100644
index 63042bef18..0000000000
--- a/volk/include/volk/volk_16s_add_quad_aligned16.h
+++ /dev/null
@@ -1,136 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H
-#define INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-
-#if LV_HAVE_SSE2
-#include<xmmintrin.h>
-#include<emmintrin.h>
-
-static inline  void volk_16s_add_quad_aligned16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
-  
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
-  __m128i *p_target0, *p_target1, *p_target2, *p_target3,  *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
-  p_target0 = (__m128i*)target0;
-  p_target1 = (__m128i*)target1;
-  p_target2 = (__m128i*)target2;
-  p_target3 = (__m128i*)target3;
-
-  p_src0 = (__m128i*)src0;
-  p_src1 = (__m128i*)src1;
-  p_src2 = (__m128i*)src2;
-  p_src3 = (__m128i*)src3;
-  p_src4 = (__m128i*)src4;
-
-  int i = 0;
-
-  int bound = (num_bytes >> 4);
-  int leftovers = (num_bytes >> 1) & 7;
-
-  for(; i < bound; ++i) {
-    xmm0 = _mm_load_si128(p_src0);
-    xmm1 = _mm_load_si128(p_src1);
-    xmm2 = _mm_load_si128(p_src2);
-    xmm3 = _mm_load_si128(p_src3);
-    xmm4 = _mm_load_si128(p_src4);
-    
-    p_src0 += 1;
-    p_src1 += 1;
-    
-    xmm1 = _mm_add_epi16(xmm0, xmm1);
-    xmm2 = _mm_add_epi16(xmm0, xmm2);
-    xmm3 = _mm_add_epi16(xmm0, xmm3);
-    xmm4 = _mm_add_epi16(xmm0, xmm4);
-    
-    
-    p_src2 += 1;
-    p_src3 += 1;
-    p_src4 += 1;
-
-    _mm_store_si128(p_target0, xmm1);
-    _mm_store_si128(p_target1, xmm2);
-    _mm_store_si128(p_target2, xmm3);
-    _mm_store_si128(p_target3, xmm4);
-    
-    p_target0 += 1;
-    p_target1 += 1;
-    p_target2 += 1;
-    p_target3 += 1;
-  }
-    /*asm volatile
-		(
-		 ".%=volk_16s_add_quad_aligned16_sse2_L1:\n\t"
-		 "cmp $0, %[bound]\n\t"
-		 "je .%=volk_16s_add_quad_aligned16_sse2_END\n\t"
-		 "movaps (%[src0]), %%xmm1\n\t"
-		 "movaps (%[src1]), %%xmm2\n\t"
-		 "movaps (%[src2]), %%xmm3\n\t"
-		 "movaps (%[src3]), %%xmm4\n\t"
-		 "movaps (%[src4]), %%xmm5\n\t"
-		 "add $16, %[src0]\n\t"
-		 "add $16, %[src1]\n\t"
-		 "add $16, %[src2]\n\t"
-		 "add $16, %[src3]\n\t"
-		 "add $16, %[src4]\n\t"
-		 "paddw %%xmm1, %%xmm2\n\t"
-		 "paddw %%xmm1, %%xmm3\n\t"
-		 "paddw %%xmm1, %%xmm4\n\t"
-		 "paddw %%xmm1, %%xmm5\n\t"
-		 "add $-1, %[bound]\n\t"
-		 "movaps %%xmm2, (%[target0])\n\t"
-		 "movaps %%xmm3, (%[target1])\n\t"
-		 "movaps %%xmm4, (%[target2])\n\t"
-		 "movaps %%xmm5, (%[target3])\n\t"
-		 "add $16, %[target0]\n\t"
-		 "add $16, %[target1]\n\t"
-		 "add $16, %[target2]\n\t"
-		 "add $16, %[target3]\n\t"
-		 "jmp .%=volk_16s_add_quad_aligned16_sse2_L1\n\t"
-		 ".%=volk_16s_add_quad_aligned16_sse2_END:\n\t"
-		 :
-		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
-		 :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-		 );
-		 
-    */
-	 
-
-  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-    target0[i] = src0[i] + src1[i];
-    target1[i] = src0[i] + src2[i];
-    target2[i] = src0[i] + src3[i];
-    target3[i] = src0[i] + src4[i];
-  }
-}
-#endif /*LV_HAVE_SSE2*/
-
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_16s_add_quad_aligned16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-	for(i = 0; i < bound; ++i) {
-		target0[i] = src0[i] + src1[i];
-		target1[i] = src0[i] + src2[i];
-		target2[i] = src0[i] + src3[i];
-		target3[i] = src0[i] + src4[i];
-	}
-}
-
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-
-#endif /*INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_branch_4_state_8_a16.h b/volk/include/volk/volk_16s_branch_4_state_8_a16.h
new file mode 100644
index 0000000000..4c1af87297
--- /dev/null
+++ b/volk/include/volk/volk_16s_branch_4_state_8_a16.h
@@ -0,0 +1,194 @@
+#ifndef INCLUDED_volk_16s_branch_4_state_8_a16_H
+#define INCLUDED_volk_16s_branch_4_state_8_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline  void volk_16s_branch_4_state_8_a16_ssse3(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+	
+  
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+
+  __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+  
+  
+  p_target = (__m128i*)target;
+  p_src0 = (__m128i*)src0;
+  p_cntl2 = (__m128i*)cntl2;
+  p_cntl3 = (__m128i*)cntl3;
+  p_scalars = (__m128i*)scalars;
+  
+  int i = 0;
+  
+  int bound = 1;
+  
+  
+  xmm0 = _mm_load_si128(p_scalars);
+  
+  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+  
+  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+  xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+  xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+  xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+  xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+  for(; i < bound; ++i) {
+    
+    xmm5 = _mm_load_si128(p_src0);
+    
+    
+    
+    
+    
+    
+    
+
+
+    xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+    xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+    xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+    xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+    
+    p_src0 += 4;
+   
+    
+    xmm5 = _mm_add_epi16(xmm1, xmm2);
+    
+    xmm6 = _mm_add_epi16(xmm2, xmm6);
+    xmm8 = _mm_add_epi16(xmm1, xmm8);
+   
+     
+    xmm7 = _mm_load_si128(p_cntl2);
+    xmm9 = _mm_load_si128(p_cntl3);
+    
+    xmm0 = _mm_add_epi16(xmm5, xmm0);
+    
+    
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    xmm9 = _mm_and_si128(xmm9, xmm4);
+    
+    xmm5 = _mm_load_si128(&p_cntl2[1]);
+    xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+    xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+    xmm5 = _mm_and_si128(xmm5, xmm3);
+    xmm11 = _mm_and_si128(xmm11, xmm4);
+
+    xmm0 = _mm_add_epi16(xmm0, xmm7);
+   
+   
+ 
+    xmm7 = _mm_load_si128(&p_cntl2[2]);
+    xmm9 = _mm_load_si128(&p_cntl3[2]);
+    
+    xmm5 = _mm_add_epi16(xmm5, xmm11);
+    
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    xmm9 = _mm_and_si128(xmm9, xmm4);
+    
+    xmm6 = _mm_add_epi16(xmm6, xmm5);
+   
+    
+    xmm5 = _mm_load_si128(&p_cntl2[3]);
+    xmm11 = _mm_load_si128(&p_cntl3[3]);
+    
+    xmm7 = _mm_add_epi16(xmm7, xmm9);
+    
+    xmm5 = _mm_and_si128(xmm5, xmm3);
+    xmm11 = _mm_and_si128(xmm11, xmm4);
+    
+    xmm8 = _mm_add_epi16(xmm8, xmm7);
+    
+    xmm5 = _mm_add_epi16(xmm5, xmm11);
+    
+    _mm_store_si128(p_target, xmm0);
+    _mm_store_si128(&p_target[1], xmm6);
+
+    xmm10 = _mm_add_epi16(xmm5, xmm10);
+    
+    _mm_store_si128(&p_target[2], xmm8);
+    
+    _mm_store_si128(&p_target[3], xmm10);
+    
+    p_target += 3;   
+  }
+}
+	
+	
+#endif /*LV_HAVE_SSEs*/
+
+#if LV_HAVE_GENERIC
+static inline  void volk_16s_branch_4_state_8_a16_generic(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+	int i = 0;
+	
+	int bound = 4;
+	
+	for(; i < bound; ++i) {
+	  target[i* 8] = src0[((char)permuters[i][0])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8] & scalars[2])
+	    + (cntl3[i * 8] & scalars[3]);
+	  target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 1] & scalars[2])
+	    + (cntl3[i * 8 + 1] & scalars[3]);
+	  target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 2] & scalars[2])
+	    + (cntl3[i * 8 + 2] & scalars[3]);
+	  target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 3] & scalars[2])
+	    + (cntl3[i * 8 + 3] & scalars[3]);
+	  target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 4] & scalars[2])
+	    + (cntl3[i * 8 + 4] & scalars[3]);
+	  target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 5] & scalars[2])
+	    + (cntl3[i * 8 + 5] & scalars[3]);
+	  target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 6] & scalars[2])
+	    + (cntl3[i * 8 + 6] & scalars[3]);
+	  target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 7] & scalars[2])
+	    + (cntl3[i * 8 + 7] & scalars[3]);
+	  
+	}
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16s_branch_4_state_8_a16_H*/
diff --git a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
deleted file mode 100644
index fb9d7cb874..0000000000
--- a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
+++ /dev/null
@@ -1,194 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H
-#define INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-#if LV_HAVE_SSSE3
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
-
-static inline  void volk_16s_branch_4_state_8_aligned16_ssse3(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
-	
-  
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
-
-  __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
-
-  
-  
-  p_target = (__m128i*)target;
-  p_src0 = (__m128i*)src0;
-  p_cntl2 = (__m128i*)cntl2;
-  p_cntl3 = (__m128i*)cntl3;
-  p_scalars = (__m128i*)scalars;
-  
-  int i = 0;
-  
-  int bound = 1;
-  
-  
-  xmm0 = _mm_load_si128(p_scalars);
-  
-  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
-  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
-  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
-  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
-  
-  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
-  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
-  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
-  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
-
-  xmm0 = _mm_load_si128((__m128i*)permuters[0]);
-  xmm6 = _mm_load_si128((__m128i*)permuters[1]);
-  xmm8 = _mm_load_si128((__m128i*)permuters[2]);
-  xmm10 = _mm_load_si128((__m128i*)permuters[3]);
-
-  for(; i < bound; ++i) {
-    
-    xmm5 = _mm_load_si128(p_src0);
-    
-    
-    
-    
-    
-    
-    
-
-
-    xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
-    xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
-    xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
-    xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
-    
-    p_src0 += 4;
-   
-    
-    xmm5 = _mm_add_epi16(xmm1, xmm2);
-    
-    xmm6 = _mm_add_epi16(xmm2, xmm6);
-    xmm8 = _mm_add_epi16(xmm1, xmm8);
-   
-     
-    xmm7 = _mm_load_si128(p_cntl2);
-    xmm9 = _mm_load_si128(p_cntl3);
-    
-    xmm0 = _mm_add_epi16(xmm5, xmm0);
-    
-    
-    xmm7 = _mm_and_si128(xmm7, xmm3);
-    xmm9 = _mm_and_si128(xmm9, xmm4);
-    
-    xmm5 = _mm_load_si128(&p_cntl2[1]);
-    xmm11 = _mm_load_si128(&p_cntl3[1]);
-
-    xmm7 = _mm_add_epi16(xmm7, xmm9);
-
-    xmm5 = _mm_and_si128(xmm5, xmm3);
-    xmm11 = _mm_and_si128(xmm11, xmm4);
-
-    xmm0 = _mm_add_epi16(xmm0, xmm7);
-   
-   
- 
-    xmm7 = _mm_load_si128(&p_cntl2[2]);
-    xmm9 = _mm_load_si128(&p_cntl3[2]);
-    
-    xmm5 = _mm_add_epi16(xmm5, xmm11);
-    
-    xmm7 = _mm_and_si128(xmm7, xmm3);
-    xmm9 = _mm_and_si128(xmm9, xmm4);
-    
-    xmm6 = _mm_add_epi16(xmm6, xmm5);
-   
-    
-    xmm5 = _mm_load_si128(&p_cntl2[3]);
-    xmm11 = _mm_load_si128(&p_cntl3[3]);
-    
-    xmm7 = _mm_add_epi16(xmm7, xmm9);
-    
-    xmm5 = _mm_and_si128(xmm5, xmm3);
-    xmm11 = _mm_and_si128(xmm11, xmm4);
-    
-    xmm8 = _mm_add_epi16(xmm8, xmm7);
-    
-    xmm5 = _mm_add_epi16(xmm5, xmm11);
-    
-    _mm_store_si128(p_target, xmm0);
-    _mm_store_si128(&p_target[1], xmm6);
-
-    xmm10 = _mm_add_epi16(xmm5, xmm10);
-    
-    _mm_store_si128(&p_target[2], xmm8);
-    
-    _mm_store_si128(&p_target[3], xmm10);
-    
-    p_target += 3;   
-  }
-}
-	
-	
-#endif /*LV_HAVE_SSEs*/
-
-#if LV_HAVE_GENERIC
-static inline  void volk_16s_branch_4_state_8_aligned16_generic(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
-	int i = 0;
-	
-	int bound = 4;
-	
-	for(; i < bound; ++i) {
-	  target[i* 8] = src0[((char)permuters[i][0])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8] & scalars[2])
-	    + (cntl3[i * 8] & scalars[3]);
-	  target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 1] & scalars[2])
-	    + (cntl3[i * 8 + 1] & scalars[3]);
-	  target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 2] & scalars[2])
-	    + (cntl3[i * 8 + 2] & scalars[3]);
-	  target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 3] & scalars[2])
-	    + (cntl3[i * 8 + 3] & scalars[3]);
-	  target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 4] & scalars[2])
-	    + (cntl3[i * 8 + 4] & scalars[3]);
-	  target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 5] & scalars[2])
-	    + (cntl3[i * 8 + 5] & scalars[3]);
-	  target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 6] & scalars[2])
-	    + (cntl3[i * 8 + 6] & scalars[3]);
-	  target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 7] & scalars[2])
-	    + (cntl3[i * 8 + 7] & scalars[3]);
-	  
-	}
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_convert_32f_aligned16.h b/volk/include/volk/volk_16s_convert_32f_aligned16.h
deleted file mode 100644
index 126ce15282..0000000000
--- a/volk/include/volk/volk_16s_convert_32f_aligned16.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H
-#define INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16s_convert_32f_aligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int eighthPoints = num_points / 8;
-    
-     float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal;
-    __m128i inputVal2;
-    __m128 ret;
-
-    for(;number < eighthPoints; number++){
-
-      // Load the 8 values
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      // Shift the input data to the right by 64 bits ( 8 bytes )
-      inputVal2 = _mm_srli_si128(inputVal, 8);
-
-      // Convert the lower 4 values into 32 bit words
-      inputVal = _mm_cvtepi16_epi32(inputVal);
-      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-      
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      ret = _mm_cvtepi32_ps(inputVal2);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-
-      inputPtr += 8;
-    }
-
-    number = eighthPoints * 8;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16s_convert_32f_aligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-    float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-      
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      inputPtr += 4;
-      outputVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16s_convert_32f_aligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_convert_32f_unaligned16.h b/volk/include/volk/volk_16s_convert_32f_unaligned16.h
deleted file mode 100644
index d6212fba5e..0000000000
--- a/volk/include/volk/volk_16s_convert_32f_unaligned16.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H
-#define INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_16s_convert_32f_unaligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int eighthPoints = num_points / 8;
-    
-     float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal;
-    __m128i inputVal2;
-    __m128 ret;
-
-    for(;number < eighthPoints; number++){
-
-      // Load the 8 values
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      // Shift the input data to the right by 64 bits ( 8 bytes )
-      inputVal2 = _mm_srli_si128(inputVal, 8);
-
-      // Convert the lower 4 values into 32 bit words
-      inputVal = _mm_cvtepi16_epi32(inputVal);
-      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-      
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      ret = _mm_cvtepi32_ps(inputVal2);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-
-      inputPtr += 8;
-    }
-
-    number = eighthPoints * 8;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_16s_convert_32f_unaligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-    float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-      
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      inputPtr += 4;
-      outputVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_16s_convert_32f_unaligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_a16.h b/volk/include/volk/volk_16s_convert_8s_a16.h
new file mode 100644
index 0000000000..13db435def
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_8s_a16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_volk_16s_convert_8s_a16_H
+#define INCLUDED_volk_16s_convert_8s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16s_convert_8s_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+     int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal1;
+    __m128i inputVal2;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+
+      // Load the 16 values
+      inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+      inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+
+      inputVal1 = _mm_srai_epi16(inputVal1, 8);
+      inputVal2 = _mm_srai_epi16(inputVal2, 8);
+      
+      ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+      _mm_store_si128((__m128i*)outputVectorPtr, ret);
+
+      outputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] =(int8_t)(inputVector[number] >> 8);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16s_convert_8s_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16s_convert_8s_a16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_aligned16.h b/volk/include/volk/volk_16s_convert_8s_aligned16.h
deleted file mode 100644
index 64c368688f..0000000000
--- a/volk/include/volk/volk_16s_convert_8s_aligned16.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H
-#define INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-*/
-static inline void volk_16s_convert_8s_aligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-     int8_t* outputVectorPtr = outputVector;
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal1;
-    __m128i inputVal2;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-
-      // Load the 16 values
-      inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-      inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-
-      inputVal1 = _mm_srai_epi16(inputVal1, 8);
-      inputVal2 = _mm_srai_epi16(inputVal2, 8);
-      
-      ret = _mm_packs_epi16(inputVal1, inputVal2);
-
-      _mm_store_si128((__m128i*)outputVectorPtr, ret);
-
-      outputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] =(int8_t)(inputVector[number] >> 8);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-*/
-static inline void volk_16s_convert_8s_aligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_ua16.h b/volk/include/volk/volk_16s_convert_8s_ua16.h
new file mode 100644
index 0000000000..9941118ae6
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_8s_ua16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_16s_convert_8s_ua16_H
+#define INCLUDED_volk_16s_convert_8s_ua16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16s_convert_8s_ua16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+     int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal1;
+    __m128i inputVal2;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+
+      // Load the 16 values
+      inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+      inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+
+      inputVal1 = _mm_srai_epi16(inputVal1, 8);
+      inputVal2 = _mm_srai_epi16(inputVal2, 8);
+      
+      ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+      _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
+
+      outputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] =(int8_t)(inputVector[number] >> 8);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16s_convert_8s_ua16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  >> 8));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16s_convert_8s_ua16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_unaligned16.h b/volk/include/volk/volk_16s_convert_8s_unaligned16.h
deleted file mode 100644
index ca925de86f..0000000000
--- a/volk/include/volk/volk_16s_convert_8s_unaligned16.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H
-#define INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-  \note Input and output buffers do NOT need to be properly aligned
-*/
-static inline void volk_16s_convert_8s_unaligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-     int8_t* outputVectorPtr = outputVector;
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal1;
-    __m128i inputVal2;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-
-      // Load the 16 values
-      inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
-      inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
-
-      inputVal1 = _mm_srai_epi16(inputVal1, 8);
-      inputVal2 = _mm_srai_epi16(inputVal2, 8);
-      
-      ret = _mm_packs_epi16(inputVal1, inputVal2);
-
-      _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
-
-      outputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] =(int8_t)(inputVector[number] >> 8);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-  \note Input and output buffers do NOT need to be properly aligned
-*/
-static inline void volk_16s_convert_8s_unaligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  >> 8));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_max_star_16s_a16.h b/volk/include/volk/volk_16s_max_star_16s_a16.h
new file mode 100644
index 0000000000..b2ec90552c
--- /dev/null
+++ b/volk/include/volk/volk_16s_max_star_16s_a16.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_volk_16s_max_star_16s_a16_H
+#define INCLUDED_volk_16s_max_star_16s_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline  void volk_16s_max_star_16s_a16_ssse3(short* target, short* src0, unsigned int num_bytes) {
+
+
+  
+  short candidate = src0[0];
+  short cands[8];
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+  
+
+  __m128i *p_src0;
+  
+  p_src0 = (__m128i*)src0;
+
+  int bound = num_bytes >> 4;
+  int leftovers = (num_bytes >> 1) & 7;
+  
+  int i = 0;
+  
+  
+  xmm1 = _mm_setzero_si128();
+  xmm0 = _mm_setzero_si128();
+  //_mm_insert_epi16(xmm0, candidate, 0);
+  
+  xmm0 = _mm_shuffle_epi8(xmm0, xmm1); 
+
+  
+  for(i = 0; i < bound; ++i) {
+    xmm1 = _mm_load_si128(p_src0);
+    p_src0 += 1;
+    xmm2 = _mm_sub_epi16(xmm1, xmm0);
+  
+
+    
+  
+  
+  
+    xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
+    xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+    xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+
+    xmm6 = _mm_xor_si128(xmm4, xmm5);
+    
+    xmm3 = _mm_and_si128(xmm3, xmm0);
+    xmm4 = _mm_and_si128(xmm6, xmm1);
+    
+    xmm0 = _mm_add_epi16(xmm3, xmm4);
+    
+  
+  }
+  
+  _mm_store_si128((__m128i*)cands, xmm0);
+  
+  for(i = 0; i < 8; ++i) {
+    candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+  }
+  
+ 
+  
+  for(i = 0; i < leftovers; ++i) {
+  
+    candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
+  }
+
+  target[0] = candidate;
+  
+    
+    
+ 
+
+}   
+ 
+#endif /*LV_HAVE_SSSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16s_max_star_16s_a16_generic(short* target, short* src0, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+	short candidate = src0[0];
+	for(i = 1; i < bound; ++i) {
+	  candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
+	}
+	target[0] = candidate;
+	  
+}
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16s_max_star_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_max_star_aligned16.h b/volk/include/volk/volk_16s_max_star_aligned16.h
deleted file mode 100644
index ba4e979ecd..0000000000
--- a/volk/include/volk/volk_16s_max_star_aligned16.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H
-#define INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-#if LV_HAVE_SSSE3
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
-
-static inline  void volk_16s_max_star_aligned16_ssse3(short* target, short* src0, unsigned int num_bytes) {
-
-
-  
-  short candidate = src0[0];
-  short cands[8];
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
-  
-
-  __m128i *p_src0;
-  
-  p_src0 = (__m128i*)src0;
-
-  int bound = num_bytes >> 4;
-  int leftovers = (num_bytes >> 1) & 7;
-  
-  int i = 0;
-  
-  
-  xmm1 = _mm_setzero_si128();
-  xmm0 = _mm_setzero_si128();
-  //_mm_insert_epi16(xmm0, candidate, 0);
-  
-  xmm0 = _mm_shuffle_epi8(xmm0, xmm1); 
-
-  
-  for(i = 0; i < bound; ++i) {
-    xmm1 = _mm_load_si128(p_src0);
-    p_src0 += 1;
-    xmm2 = _mm_sub_epi16(xmm1, xmm0);
-  
-
-    
-  
-  
-  
-    xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
-    xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
-    xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
-
-    xmm6 = _mm_xor_si128(xmm4, xmm5);
-    
-    xmm3 = _mm_and_si128(xmm3, xmm0);
-    xmm4 = _mm_and_si128(xmm6, xmm1);
-    
-    xmm0 = _mm_add_epi16(xmm3, xmm4);
-    
-  
-  }
-  
-  _mm_store_si128((__m128i*)cands, xmm0);
-  
-  for(i = 0; i < 8; ++i) {
-    candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
-  }
-  
- 
-  
-  for(i = 0; i < leftovers; ++i) {
-  
-    candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
-  }
-
-  target[0] = candidate;
-  
-    
-    
- 
-
-}   
- 
-#endif /*LV_HAVE_SSSE3*/
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_16s_max_star_aligned16_generic(short* target, short* src0, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-	short candidate = src0[0];
-	for(i = 1; i < bound; ++i) {
-	  candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
-	}
-	target[0] = candidate;
-	  
-}
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
new file mode 100644
index 0000000000..68994593bc
--- /dev/null
+++ b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
@@ -0,0 +1,130 @@
+#ifndef INCLUDED_volk_16s_max_star_horizontal_16s_a16_H
+#define INCLUDED_volk_16s_max_star_horizontal_16s_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline  void volk_16s_max_star_horizontal_16s_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+
+  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
+  const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+  const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+
+  
+  
+  volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; 
+  __m128i  xmm5, xmm6, xmm7, xmm8;
+  
+  xmm4 = _mm_load_si128((__m128i*)shufmask0);
+  xmm5 = _mm_load_si128((__m128i*)shufmask1);
+  xmm6 = _mm_load_si128((__m128i*)andmask0);
+  xmm7 = _mm_load_si128((__m128i*)andmask1);
+  
+  __m128i *p_target, *p_src0;
+  
+  p_target = (__m128i*)target;
+  p_src0 = (__m128i*)src0;
+
+  int bound = num_bytes >> 5;
+  int intermediate = (num_bytes >> 4) & 1;
+  int leftovers = (num_bytes >> 1) & 7;
+  
+  int i = 0;
+  
+  
+  for(i = 0; i < bound; ++i) {
+     
+    xmm0 = _mm_load_si128(p_src0);
+    xmm1 = _mm_load_si128(&p_src0[1]);
+    
+    
+
+    xmm2 = _mm_xor_si128(xmm2, xmm2);
+    p_src0 += 2;
+    
+    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+    
+    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);   
+
+    xmm8 = _mm_and_si128(xmm2, xmm6);
+    xmm3 = _mm_and_si128(xmm2, xmm7);
+    
+
+    xmm8 = _mm_add_epi8(xmm8, xmm4);
+    xmm3 = _mm_add_epi8(xmm3, xmm5);
+
+    xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+    xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+    
+    
+    xmm3 = _mm_add_epi16(xmm0, xmm1);
+
+    
+    _mm_store_si128(p_target, xmm3);
+    
+    p_target += 1;
+  
+  }
+
+  for(i = 0; i < intermediate; ++i) {
+    
+    xmm0 = _mm_load_si128(p_src0);
+    
+    
+    xmm2 = _mm_xor_si128(xmm2, xmm2);
+    p_src0 += 1;
+    
+    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+    xmm8 = _mm_and_si128(xmm2, xmm6);
+    
+    xmm3 = _mm_add_epi8(xmm8, xmm4);
+    
+    xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+    
+
+    _mm_storel_pd((double*)p_target, (__m128d)xmm0);
+    
+    p_target = (__m128i*)((int8_t*)p_target + 8);
+
+  }
+    
+  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { 
+    target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+  }
+  
+
+}   
+ 
+#endif /*LV_HAVE_SSSE3*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_max_star_horizontal_16s_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+      
+	for(i = 0; i < bound; i += 2) {
+	  target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
+	}
+		
+}
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16s_max_star_horizontal_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
deleted file mode 100644
index 82d011677b..0000000000
--- a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H
-#define INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-#if LV_HAVE_SSSE3
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
-
-static inline  void volk_16s_max_star_horizontal_aligned16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
-
-  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
-  const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
-  const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
-
-  
-  
-  volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; 
-  __m128i  xmm5, xmm6, xmm7, xmm8;
-  
-  xmm4 = _mm_load_si128((__m128i*)shufmask0);
-  xmm5 = _mm_load_si128((__m128i*)shufmask1);
-  xmm6 = _mm_load_si128((__m128i*)andmask0);
-  xmm7 = _mm_load_si128((__m128i*)andmask1);
-  
-  __m128i *p_target, *p_src0;
-  
-  p_target = (__m128i*)target;
-  p_src0 = (__m128i*)src0;
-
-  int bound = num_bytes >> 5;
-  int intermediate = (num_bytes >> 4) & 1;
-  int leftovers = (num_bytes >> 1) & 7;
-  
-  int i = 0;
-  
-  
-  for(i = 0; i < bound; ++i) {
-     
-    xmm0 = _mm_load_si128(p_src0);
-    xmm1 = _mm_load_si128(&p_src0[1]);
-    
-    
-
-    xmm2 = _mm_xor_si128(xmm2, xmm2);
-    p_src0 += 2;
-    
-    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
-    
-    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);   
-
-    xmm8 = _mm_and_si128(xmm2, xmm6);
-    xmm3 = _mm_and_si128(xmm2, xmm7);
-    
-
-    xmm8 = _mm_add_epi8(xmm8, xmm4);
-    xmm3 = _mm_add_epi8(xmm3, xmm5);
-
-    xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
-    xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
-    
-    
-    xmm3 = _mm_add_epi16(xmm0, xmm1);
-
-    
-    _mm_store_si128(p_target, xmm3);
-    
-    p_target += 1;
-  
-  }
-
-  for(i = 0; i < intermediate; ++i) {
-    
-    xmm0 = _mm_load_si128(p_src0);
-    
-    
-    xmm2 = _mm_xor_si128(xmm2, xmm2);
-    p_src0 += 1;
-    
-    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
-    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
-
-    xmm8 = _mm_and_si128(xmm2, xmm6);
-    
-    xmm3 = _mm_add_epi8(xmm8, xmm4);
-    
-    xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
-    
-
-    _mm_storel_pd((double*)p_target, (__m128d)xmm0);
-    
-    p_target = (__m128i*)((int8_t*)p_target + 8);
-
-  }
-    
-  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { 
-    target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
-  }
-  
-
-}   
- 
-#endif /*LV_HAVE_SSSE3*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_max_star_horizontal_aligned16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-      
-	for(i = 0; i < bound; i += 2) {
-	  target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
-	}
-		
-}
-
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-#endif /*INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
new file mode 100644
index 0000000000..2e7586b575
--- /dev/null
+++ b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
@@ -0,0 +1,139 @@
+#ifndef INCLUDED_volk_16s_permute_and_scalar_add_a16_H
+#define INCLUDED_volk_16s_permute_and_scalar_add_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+static inline  void volk_16s_permute_and_scalar_add_a16_sse2(short* target,  short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+	
+
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
+
+  short* p_permute_indexes = permute_indexes;
+  
+  p_target = (__m128i*)target;
+  p_cntl0 = (__m128i*)cntl0;
+  p_cntl1 = (__m128i*)cntl1;
+  p_cntl2 = (__m128i*)cntl2;
+  p_cntl3 = (__m128i*)cntl3;
+  p_scalars = (__m128i*)scalars;
+  
+  int i = 0;
+  
+  int bound = (num_bytes >> 4);
+  int leftovers = (num_bytes >> 1) & 7;
+  
+  xmm0 = _mm_load_si128(p_scalars);
+  
+  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+  
+  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+
+  for(; i < bound; ++i) {
+    xmm0 = _mm_setzero_si128();
+    xmm5 = _mm_setzero_si128();
+    xmm6 = _mm_setzero_si128();
+    xmm7 = _mm_setzero_si128();
+
+    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+
+    xmm0 = _mm_add_epi16(xmm0, xmm5);
+    xmm6 = _mm_add_epi16(xmm6, xmm7);
+    
+    p_permute_indexes += 8;
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm6);
+    
+    xmm5 = _mm_load_si128(p_cntl0);
+    xmm6 = _mm_load_si128(p_cntl1);
+    xmm7 = _mm_load_si128(p_cntl2);
+    
+    xmm5 = _mm_and_si128(xmm5, xmm1);
+    xmm6 = _mm_and_si128(xmm6, xmm2);
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm5);
+    
+    xmm5 = _mm_load_si128(p_cntl3);
+    
+    xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+    p_cntl0 += 1;
+    
+    xmm5 = _mm_and_si128(xmm5, xmm4);
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm6);
+    
+    p_cntl1 += 1;
+    p_cntl2 += 1;
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm5); 
+    
+    p_cntl3 += 1;
+
+    _mm_store_si128(p_target, xmm0);
+    
+    p_target += 1;
+  }
+	
+	
+	
+	
+
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target[i] = src0[permute_indexes[i]] 
+      + (cntl0[i] & scalars[0])
+      + (cntl1[i] & scalars[1])
+      + (cntl2[i] & scalars[2])
+      + (cntl3[i] & scalars[3]);
+  }
+}
+#endif /*LV_HAVE_SSEs*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+	for(i = 0; i < bound; ++i) {
+		target[i] = src0[permute_indexes[i]] 
+			+ (cntl0[i] & scalars[0])
+			+ (cntl1[i] & scalars[1])
+			+ (cntl2[i] & scalars[2])
+			+ (cntl3[i] & scalars[3]);
+		
+	}
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16s_permute_and_scalar_add_a16_H*/
diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
deleted file mode 100644
index 452d05c4f2..0000000000
--- a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
+++ /dev/null
@@ -1,139 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
-#define INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-#if LV_HAVE_SSE2
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-
-static inline  void volk_16s_permute_and_scalar_add_aligned16_sse2(short* target,  short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
-	
-
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
-  __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
-
-  short* p_permute_indexes = permute_indexes;
-  
-  p_target = (__m128i*)target;
-  p_cntl0 = (__m128i*)cntl0;
-  p_cntl1 = (__m128i*)cntl1;
-  p_cntl2 = (__m128i*)cntl2;
-  p_cntl3 = (__m128i*)cntl3;
-  p_scalars = (__m128i*)scalars;
-  
-  int i = 0;
-  
-  int bound = (num_bytes >> 4);
-  int leftovers = (num_bytes >> 1) & 7;
-  
-  xmm0 = _mm_load_si128(p_scalars);
-  
-  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
-  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
-  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
-  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
-  
-  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
-  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
-  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
-  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
-
-
-  for(; i < bound; ++i) {
-    xmm0 = _mm_setzero_si128();
-    xmm5 = _mm_setzero_si128();
-    xmm6 = _mm_setzero_si128();
-    xmm7 = _mm_setzero_si128();
-
-    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
-    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
-    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
-    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
-    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
-    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
-    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
-    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
-
-    xmm0 = _mm_add_epi16(xmm0, xmm5);
-    xmm6 = _mm_add_epi16(xmm6, xmm7);
-    
-    p_permute_indexes += 8;
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm6);
-    
-    xmm5 = _mm_load_si128(p_cntl0);
-    xmm6 = _mm_load_si128(p_cntl1);
-    xmm7 = _mm_load_si128(p_cntl2);
-    
-    xmm5 = _mm_and_si128(xmm5, xmm1);
-    xmm6 = _mm_and_si128(xmm6, xmm2);
-    xmm7 = _mm_and_si128(xmm7, xmm3);
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm5);
-    
-    xmm5 = _mm_load_si128(p_cntl3);
-    
-    xmm6 = _mm_add_epi16(xmm6, xmm7);
-
-    p_cntl0 += 1;
-    
-    xmm5 = _mm_and_si128(xmm5, xmm4);
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm6);
-    
-    p_cntl1 += 1;
-    p_cntl2 += 1;
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm5); 
-    
-    p_cntl3 += 1;
-
-    _mm_store_si128(p_target, xmm0);
-    
-    p_target += 1;
-  }
-	
-	
-	
-	
-
-  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-    target[i] = src0[permute_indexes[i]] 
-      + (cntl0[i] & scalars[0])
-      + (cntl1[i] & scalars[1])
-      + (cntl2[i] & scalars[2])
-      + (cntl3[i] & scalars[3]);
-  }
-}
-#endif /*LV_HAVE_SSEs*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_permute_and_scalar_add_aligned16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-	for(i = 0; i < bound; ++i) {
-		target[i] = src0[permute_indexes[i]] 
-			+ (cntl0[i] & scalars[0])
-			+ (cntl1[i] & scalars[1])
-			+ (cntl2[i] & scalars[2])
-			+ (cntl3[i] & scalars[3]);
-		
-	}
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h
new file mode 100644
index 0000000000..3e89ff963d
--- /dev/null
+++ b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h
@@ -0,0 +1,191 @@
+#ifndef INCLUDED_volk_16s_quad_max_star_16s_a16_H
+#define INCLUDED_volk_16s_quad_max_star_16s_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include<emmintrin.h>
+
+static inline  void volk_16s_quad_max_star_16s_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+	
+
+
+
+	int i = 0;
+
+	int bound = (num_bytes >> 4);
+	int bound_copy = bound;
+	int leftovers = (num_bytes >> 1) & 7;
+	
+	__m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+	p_target = (__m128i*) target;
+	p_src0 =  (__m128i*)src0;
+	p_src1 =  (__m128i*)src1;
+	p_src2 =  (__m128i*)src2;
+	p_src3 =  (__m128i*)src3;
+	
+	
+
+	__m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+	while(bound_copy > 0) {
+	 
+	  xmm1 = _mm_load_si128(p_src0);
+	  xmm2 = _mm_load_si128(p_src1);
+	  xmm3 = _mm_load_si128(p_src2);
+	  xmm4 = _mm_load_si128(p_src3);
+	  
+	  xmm5 = _mm_setzero_si128();
+	  xmm6 = _mm_setzero_si128();
+	  xmm7 = xmm1;
+	  xmm8 = xmm3;
+	  
+	  
+	  xmm1 = _mm_sub_epi16(xmm2, xmm1);
+
+	  
+
+	  xmm3 = _mm_sub_epi16(xmm4, xmm3);
+
+	  xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+	  xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+
+	  
+
+	  xmm2 = _mm_and_si128(xmm5, xmm2);
+	  xmm4 = _mm_and_si128(xmm6, xmm4);
+	  xmm5 = _mm_andnot_si128(xmm5, xmm7);
+	  xmm6 = _mm_andnot_si128(xmm6, xmm8);
+
+	  xmm5 = _mm_add_epi16(xmm2, xmm5);
+	  xmm6 = _mm_add_epi16(xmm4, xmm6);
+
+	  
+	  xmm1 = _mm_xor_si128(xmm1, xmm1);
+	  xmm2 = xmm5;
+	  xmm5 = _mm_sub_epi16(xmm6, xmm5);
+	  p_src0 += 1;
+	  bound_copy -= 1;
+
+	  xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+	  p_src1 += 1;
+	  
+	  xmm6 = _mm_and_si128(xmm1, xmm6);
+	  	  
+	  xmm1 = _mm_andnot_si128(xmm1, xmm2);
+	  p_src2 += 1;
+
+
+	  
+	  xmm1 = _mm_add_epi16(xmm6, xmm1);
+	  p_src3 += 1;
+
+	  
+	  _mm_store_si128(p_target, xmm1);
+	  p_target += 1;
+      
+	}
+	
+
+	/*asm volatile
+		(
+		 "volk_16s_quad_max_star_16s_a16_sse2_L1:\n\t"
+		 "cmp $0, %[bound]\n\t"
+		 "je volk_16s_quad_max_star_16s_a16_sse2_END\n\t"
+
+		 "movaps (%[src0]), %%xmm1\n\t"
+		 "movaps (%[src1]), %%xmm2\n\t"
+		 "movaps (%[src2]), %%xmm3\n\t"
+		 "movaps (%[src3]), %%xmm4\n\t"
+
+		 "pxor %%xmm5, %%xmm5\n\t"
+		 "pxor %%xmm6, %%xmm6\n\t"
+		 "movaps %%xmm1, %%xmm7\n\t"
+		 "movaps %%xmm3, %%xmm8\n\t"
+		 "psubw %%xmm2, %%xmm1\n\t"
+		 "psubw %%xmm4, %%xmm3\n\t"
+		 
+		 "pcmpgtw %%xmm1, %%xmm5\n\t"
+		 "pcmpgtw %%xmm3, %%xmm6\n\t"
+		 
+		 "pand %%xmm5, %%xmm2\n\t"
+		 "pand %%xmm6, %%xmm4\n\t"
+		 "pandn %%xmm7, %%xmm5\n\t"
+		 "pandn %%xmm8, %%xmm6\n\t"
+		 
+		 "paddw %%xmm2, %%xmm5\n\t"
+		 "paddw %%xmm4, %%xmm6\n\t"
+
+		 "pxor %%xmm1, %%xmm1\n\t"
+		 "movaps %%xmm5, %%xmm2\n\t"
+		 
+		 "psubw %%xmm6, %%xmm5\n\t"
+		 "add $16, %[src0]\n\t"
+		 "add $-1, %[bound]\n\t"
+	  
+		 "pcmpgtw %%xmm5, %%xmm1\n\t"
+		 "add $16, %[src1]\n\t"
+
+		 "pand %%xmm1, %%xmm6\n\t"
+
+		 "pandn %%xmm2, %%xmm1\n\t"
+		 "add $16, %[src2]\n\t"
+
+		 "paddw %%xmm6, %%xmm1\n\t"
+		 "add $16, %[src3]\n\t"
+
+		 "movaps %%xmm1, (%[target])\n\t"
+		 "addw $16, %[target]\n\t"
+		 "jmp volk_16s_quad_max_star_16s_a16_sse2_L1\n\t"
+		 
+		 "volk_16s_quad_max_star_16s_a16_sse2_END:\n\t"
+		 :
+		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
+		 :
+		 );
+	*/ 
+
+	short temp0 = 0;
+	short temp1 = 0;
+	for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+	}
+	return;
+
+
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_quad_max_star_16s_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+	
+	short temp0 = 0;
+	short temp1 = 0;
+	for(i = 0; i < bound; ++i) {
+	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+	}
+}
+
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16s_quad_max_star_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_quad_max_star_aligned16.h b/volk/include/volk/volk_16s_quad_max_star_aligned16.h
deleted file mode 100644
index 1004c4d23e..0000000000
--- a/volk/include/volk/volk_16s_quad_max_star_aligned16.h
+++ /dev/null
@@ -1,191 +0,0 @@
-#ifndef INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H
-#define INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-
-#if LV_HAVE_SSE2
-
-#include<emmintrin.h>
-
-static inline  void volk_16s_quad_max_star_aligned16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-	
-
-
-
-	int i = 0;
-
-	int bound = (num_bytes >> 4);
-	int bound_copy = bound;
-	int leftovers = (num_bytes >> 1) & 7;
-	
-	__m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
-	p_target = (__m128i*) target;
-	p_src0 =  (__m128i*)src0;
-	p_src1 =  (__m128i*)src1;
-	p_src2 =  (__m128i*)src2;
-	p_src3 =  (__m128i*)src3;
-	
-	
-
-	__m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-
-	while(bound_copy > 0) {
-	 
-	  xmm1 = _mm_load_si128(p_src0);
-	  xmm2 = _mm_load_si128(p_src1);
-	  xmm3 = _mm_load_si128(p_src2);
-	  xmm4 = _mm_load_si128(p_src3);
-	  
-	  xmm5 = _mm_setzero_si128();
-	  xmm6 = _mm_setzero_si128();
-	  xmm7 = xmm1;
-	  xmm8 = xmm3;
-	  
-	  
-	  xmm1 = _mm_sub_epi16(xmm2, xmm1);
-
-	  
-
-	  xmm3 = _mm_sub_epi16(xmm4, xmm3);
-
-	  xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
-	  xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
-
-	  
-
-	  xmm2 = _mm_and_si128(xmm5, xmm2);
-	  xmm4 = _mm_and_si128(xmm6, xmm4);
-	  xmm5 = _mm_andnot_si128(xmm5, xmm7);
-	  xmm6 = _mm_andnot_si128(xmm6, xmm8);
-
-	  xmm5 = _mm_add_epi16(xmm2, xmm5);
-	  xmm6 = _mm_add_epi16(xmm4, xmm6);
-
-	  
-	  xmm1 = _mm_xor_si128(xmm1, xmm1);
-	  xmm2 = xmm5;
-	  xmm5 = _mm_sub_epi16(xmm6, xmm5);
-	  p_src0 += 1;
-	  bound_copy -= 1;
-
-	  xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
-	  p_src1 += 1;
-	  
-	  xmm6 = _mm_and_si128(xmm1, xmm6);
-	  	  
-	  xmm1 = _mm_andnot_si128(xmm1, xmm2);
-	  p_src2 += 1;
-
-
-	  
-	  xmm1 = _mm_add_epi16(xmm6, xmm1);
-	  p_src3 += 1;
-
-	  
-	  _mm_store_si128(p_target, xmm1);
-	  p_target += 1;
-      
-	}
-	
-
-	/*asm volatile
-		(
-		 "volk_16s_quad_max_star_aligned16_sse2_L1:\n\t"
-		 "cmp $0, %[bound]\n\t"
-		 "je volk_16s_quad_max_star_aligned16_sse2_END\n\t"
-
-		 "movaps (%[src0]), %%xmm1\n\t"
-		 "movaps (%[src1]), %%xmm2\n\t"
-		 "movaps (%[src2]), %%xmm3\n\t"
-		 "movaps (%[src3]), %%xmm4\n\t"
-
-		 "pxor %%xmm5, %%xmm5\n\t"
-		 "pxor %%xmm6, %%xmm6\n\t"
-		 "movaps %%xmm1, %%xmm7\n\t"
-		 "movaps %%xmm3, %%xmm8\n\t"
-		 "psubw %%xmm2, %%xmm1\n\t"
-		 "psubw %%xmm4, %%xmm3\n\t"
-		 
-		 "pcmpgtw %%xmm1, %%xmm5\n\t"
-		 "pcmpgtw %%xmm3, %%xmm6\n\t"
-		 
-		 "pand %%xmm5, %%xmm2\n\t"
-		 "pand %%xmm6, %%xmm4\n\t"
-		 "pandn %%xmm7, %%xmm5\n\t"
-		 "pandn %%xmm8, %%xmm6\n\t"
-		 
-		 "paddw %%xmm2, %%xmm5\n\t"
-		 "paddw %%xmm4, %%xmm6\n\t"
-
-		 "pxor %%xmm1, %%xmm1\n\t"
-		 "movaps %%xmm5, %%xmm2\n\t"
-		 
-		 "psubw %%xmm6, %%xmm5\n\t"
-		 "add $16, %[src0]\n\t"
-		 "add $-1, %[bound]\n\t"
-	  
-		 "pcmpgtw %%xmm5, %%xmm1\n\t"
-		 "add $16, %[src1]\n\t"
-
-		 "pand %%xmm1, %%xmm6\n\t"
-
-		 "pandn %%xmm2, %%xmm1\n\t"
-		 "add $16, %[src2]\n\t"
-
-		 "paddw %%xmm6, %%xmm1\n\t"
-		 "add $16, %[src3]\n\t"
-
-		 "movaps %%xmm1, (%[target])\n\t"
-		 "addw $16, %[target]\n\t"
-		 "jmp volk_16s_quad_max_star_aligned16_sse2_L1\n\t"
-		 
-		 "volk_16s_quad_max_star_aligned16_sse2_END:\n\t"
-		 :
-		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
-		 :
-		 );
-	*/ 
-
-	short temp0 = 0;
-	short temp1 = 0;
-	for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
-	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
-	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
-	}
-	return;
-
-
-}
-
-#endif /*LV_HAVE_SSE2*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_quad_max_star_aligned16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-	
-	short temp0 = 0;
-	short temp1 = 0;
-	for(i = 0; i < bound; ++i) {
-	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
-	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
-	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
-	}
-}
-
-
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-#endif /*INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h
new file mode 100644
index 0000000000..8f9b44478d
--- /dev/null
+++ b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h
@@ -0,0 +1,119 @@
+#ifndef INCLUDED_volk_16s_s32f_convert_32f_a16_H
+#define INCLUDED_volk_16s_s32f_convert_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    
+     float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m128i inputVal2;
+    __m128 ret;
+
+    for(;number < eighthPoints; number++){
+
+      // Load the 8 values
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      // Shift the input data to the right by 64 bits ( 8 bytes )
+      inputVal2 = _mm_srli_si128(inputVal, 8);
+
+      // Convert the lower 4 values into 32 bit words
+      inputVal = _mm_cvtepi16_epi32(inputVal);
+      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+      
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      ret = _mm_cvtepi32_ps(inputVal2);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+
+      inputPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16s_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+      
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      inputPtr += 4;
+      outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16s_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16s_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
new file mode 100644
index 0000000000..ad52aea1a5
--- /dev/null
+++ b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16s_s32f_convert_32f_ua16_H
+#define INCLUDED_volk_16s_s32f_convert_32f_ua16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    
+     float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m128i inputVal2;
+    __m128 ret;
+
+    for(;number < eighthPoints; number++){
+
+      // Load the 8 values
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      // Shift the input data to the right by 64 bits ( 8 bytes )
+      inputVal2 = _mm_srli_si128(inputVal, 8);
+
+      // Convert the lower 4 values into 32 bit words
+      inputVal = _mm_cvtepi16_epi32(inputVal);
+      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+      
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      ret = _mm_cvtepi32_ps(inputVal2);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+
+      inputPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16s_s32f_convert_32f_ua16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+      
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      inputPtr += 4;
+      outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16s_s32f_convert_32f_ua16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16s_s32f_convert_32f_ua16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
new file mode 100644
index 0000000000..8e5da24ec6
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H
+#define INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_16s_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+  __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
+    qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *int16ComplexVectorPtr++;
+    *qBufferPtr++ = *int16ComplexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_16s_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
+  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+  unsigned int eighthPoints = num_points / 8;
+ 
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+
+    iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16sc_deinterleave_16s_16s_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16sc_deinterleave_16s_16s_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16sc_deinterleave_16s_16s_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
deleted file mode 100644
index cf94a3f385..0000000000
--- a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_aligned16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
-  __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
-  __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
-  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
-    qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *int16ComplexVectorPtr++;
-    *qBufferPtr++ = *int16ComplexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_aligned16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
-  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
-  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
-
-  unsigned int eighthPoints = num_points / 8;
- 
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-
-    iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
-    qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
-
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-extern void volk_16sc_deinterleave_16s_aligned16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void volk_16sc_deinterleave_16s_aligned16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-    volk_16sc_deinterleave_16s_aligned16_orc_impl(iBuffer, qBuffer, complexVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
deleted file mode 100644
index 50b8b62d5d..0000000000
--- a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
-    \param complexVector The complex input vector
-    \param iBuffer The I buffer output data
-    \param qBuffer The Q buffer output data
-    \param scalar The data value to be divided against each input data value of the input complex vector
-    \param num_points The number of complex data values to be deinterleaved
-  */
-static inline void volk_16sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-    float* iBufferPtr = iBuffer;
-    float* qBufferPtr = qBuffer;
-
-    uint64_t number = 0;
-    const uint64_t quarterPoints = num_points / 4;    
-    __m128 cplxValue1, cplxValue2, iValue, qValue;
-
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-    float floatBuffer[8] __attribute__((aligned(128)));
-
-    for(;number < quarterPoints; number++){
-      
-      floatBuffer[0] = (float)(complexVectorPtr[0]);
-      floatBuffer[1] = (float)(complexVectorPtr[1]);
-      floatBuffer[2] = (float)(complexVectorPtr[2]);
-      floatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-      floatBuffer[4] = (float)(complexVectorPtr[4]);
-      floatBuffer[5] = (float)(complexVectorPtr[5]);
-      floatBuffer[6] = (float)(complexVectorPtr[6]);
-      floatBuffer[7] = (float)(complexVectorPtr[7]);
-
-      cplxValue1 = _mm_load_ps(&floatBuffer[0]);
-      cplxValue2 = _mm_load_ps(&floatBuffer[4]);
-
-      complexVectorPtr += 8;
-
-      cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-      cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-      // Arrange in i1i2i3i4 format
-      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-      // Arrange in q1q2q3q4 format
-      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-      _mm_store_ps(iBufferPtr, iValue);
-      _mm_store_ps(qBufferPtr, qValue);
-
-      iBufferPtr += 4;
-      qBufferPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    complexVectorPtr = (int16_t*)&complexVector[number];
-    for(; number < num_points; number++){
-      *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-      *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
-    \param complexVector The complex input vector
-    \param iBuffer The I buffer output data
-    \param qBuffer The Q buffer output data
-    \param scalar The data value to be divided against each input data value of the input complex vector
-    \param num_points The number of complex data values to be deinterleaved
-  */
-static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
-    \param complexVector The complex input vector
-    \param iBuffer The I buffer output data
-    \param qBuffer The Q buffer output data
-    \param scalar The data value to be divided against each input data value of the input complex vector
-    \param num_points The number of complex data values to be deinterleaved
-  */
-extern void volk_16sc_deinterleave_32f_aligned16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16sc_deinterleave_32f_aligned16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-    volk_16sc_deinterleave_32f_aligned16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
new file mode 100644
index 0000000000..068c1350c2
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_16sc_deinterleave_real_16s_a16_H
+#define INCLUDED_volk_16sc_deinterleave_real_16s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i complexVal1, complexVal2, iOutputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+
+    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+    iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  __m128i complexVal1, complexVal2, iOutputVal;
+  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+  unsigned int eighthPoints = num_points / 8;
+ 
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+
+    complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16sc_deinterleave_real_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
deleted file mode 100644
index b594c85b84..0000000000
--- a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_aligned16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
-  __m128i complexVal1, complexVal2, iOutputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-
-    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
-
-    iOutputVal = _mm_or_si128(complexVal1, complexVal2);
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_aligned16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m128i complexVal1, complexVal2, iOutputVal;
-  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
-  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
-
-  unsigned int eighthPoints = num_points / 8;
- 
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-
-    complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
deleted file mode 100644
index 3e7be1e642..0000000000
--- a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
+++ /dev/null
@@ -1,125 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-
-  __m128 iFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-
-  for(;number < quarterPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
-
-    iIntVal = _mm_cvtepi16_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
-    sixteenTComplexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 iValue;
-
-  const float iScalar = 1.0/scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
-    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-
-    iValue = _mm_load_ps(floatBuffer);
-
-    iValue = _mm_mul_ps(iValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  complexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
-    complexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
new file mode 100644
index 0000000000..afa21ebc43
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_volk_16sc_deinterleave_real_8s_a16_H
+#define INCLUDED_volk_16sc_deinterleave_real_8s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+  unsigned int sixteenthPoints = num_points / 16;
+
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+    complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+
+    complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+    complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+    complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+
+    complexVal1 = _mm_srai_epi16(complexVal1, 8);
+    complexVal3 = _mm_srai_epi16(complexVal3, 8);
+
+    iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
+    int16ComplexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256);
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16sc_deinterleave_real_8s_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16sc_deinterleave_real_8s_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16sc_deinterleave_real_8s_a16_orc_impl(iBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16sc_deinterleave_real_8s_a16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
deleted file mode 100644
index 2dd85a4225..0000000000
--- a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
-
-    complexVal1 = _mm_or_si128(complexVal1, complexVal2);
-
-    complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
-    complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
-
-    complexVal3 = _mm_or_si128(complexVal3, complexVal4);
-
-
-    complexVal1 = _mm_srai_epi16(complexVal1, 8);
-    complexVal3 = _mm_srai_epi16(complexVal3, 8);
-
-    iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
-    int16ComplexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256);
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-extern void volk_16sc_deinterleave_real_8s_aligned16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void volk_16sc_deinterleave_real_8s_aligned16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-    volk_16sc_deinterleave_real_8s_aligned16_orc_impl(iBuffer, complexVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_a16.h b/volk/include/volk/volk_16sc_magnitude_16s_a16.h
new file mode 100644
index 0000000000..d832de5fe7
--- /dev/null
+++ b/volk/include/volk/volk_16sc_magnitude_16s_a16.h
@@ -0,0 +1,190 @@
+#ifndef INCLUDED_volk_16sc_magnitude_16s_a16_H
+#define INCLUDED_volk_16sc_magnitude_16s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+   
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(32768.0);
+  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    result = _mm_mul_ps(result, vScalar); // Scale the results
+
+    _mm_store_ps(outputFloatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+    *magnitudeVectorPtr++ = (int16_t)(val1Result);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(32768.0);
+  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+  float inputFloatBuffer[4] __attribute__((aligned(128)));
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    cplxValue1 = _mm_load_ps(inputFloatBuffer);
+    complexVectorPtr += 4;
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+    cplxValue2 = _mm_load_ps(inputFloatBuffer);
+    complexVectorPtr += 4;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    result = _mm_mul_ps(result, vScalar); // Scale the results
+
+    _mm_store_ps(outputFloatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+    *magnitudeVectorPtr++ = (int16_t)(val1Result);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  const float scalar = 32768.0;
+  for(number = 0; number < num_points; number++){
+    float real = ((float)(*complexVectorPtr++)) / scalar;
+    float imag = ((float)(*complexVectorPtr++)) / scalar;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC_DISABLED
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_16sc_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
+static inline void volk_16sc_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16sc_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16sc_magnitude_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
deleted file mode 100644
index 41e8751d61..0000000000
--- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
+++ /dev/null
@@ -1,190 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-   
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(32768.0);
-  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
-
-  __m128 cplxValue1, cplxValue2, result;
-
-  float inputFloatBuffer[8] __attribute__((aligned(128)));
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    result = _mm_mul_ps(result, vScalar); // Scale the results
-
-    _mm_store_ps(outputFloatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
-    *magnitudeVectorPtr++ = (int16_t)(val1Result);
-  }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(32768.0);
-  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
-
-  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-
-  float inputFloatBuffer[4] __attribute__((aligned(128)));
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    cplxValue1 = _mm_load_ps(inputFloatBuffer);
-    complexVectorPtr += 4;
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
-    cplxValue2 = _mm_load_ps(inputFloatBuffer);
-    complexVectorPtr += 4;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
-    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    result = _mm_mul_ps(result, vScalar); // Scale the results
-
-    _mm_store_ps(outputFloatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
-    *magnitudeVectorPtr++ = (int16_t)(val1Result);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  const float scalar = 32768.0;
-  for(number = 0; number < num_points; number++){
-    float real = ((float)(*complexVectorPtr++)) / scalar;
-    float imag = ((float)(*complexVectorPtr++)) / scalar;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC_DISABLED
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
-static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-    volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
deleted file mode 100644
index c2605d5519..0000000000
--- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
+++ /dev/null
@@ -1,179 +0,0 @@
-#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H
-#define INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-
-  __m128 cplxValue1, cplxValue2, result;
-
-  float inputFloatBuffer[8] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    _mm_store_ps(magnitudeVectorPtr, result);
-      
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    float val1Real = (float)(*complexVectorPtr++) / scalar;
-    float val1Imag = (float)(*complexVectorPtr++) / scalar;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  const float iScalar = 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-
-  __m128 cplxValue1, cplxValue2, result, re, im;
-
-  float inputFloatBuffer[8] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-    
-    re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
-    im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(re, invScalar);
-    cplxValue2 = _mm_mul_ps(im, invScalar);
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    _mm_store_ps(magnitudeVectorPtr, result);
-      
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    float val1Real = (float)(*complexVectorPtr++) * iScalar;
-    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
-}
-
- 
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    float real = ( (float) (*complexVectorPtr++)) * invScalar;
-    float imag = ( (float) (*complexVectorPtr++)) * invScalar;
-    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC_DISABLED
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_16sc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16sc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-    volk_16sc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
new file mode 100644
index 0000000000..53e4253c44
--- /dev/null
+++ b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H
+#define INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+    \param complexVector The complex input vector
+    \param iBuffer The I buffer output data
+    \param qBuffer The Q buffer output data
+    \param scalar The data value to be divided against each input data value of the input complex vector
+    \param num_points The number of complex data values to be deinterleaved
+  */
+static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    uint64_t number = 0;
+    const uint64_t quarterPoints = num_points / 4;    
+    __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+    float floatBuffer[8] __attribute__((aligned(128)));
+
+    for(;number < quarterPoints; number++){
+      
+      floatBuffer[0] = (float)(complexVectorPtr[0]);
+      floatBuffer[1] = (float)(complexVectorPtr[1]);
+      floatBuffer[2] = (float)(complexVectorPtr[2]);
+      floatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+      floatBuffer[4] = (float)(complexVectorPtr[4]);
+      floatBuffer[5] = (float)(complexVectorPtr[5]);
+      floatBuffer[6] = (float)(complexVectorPtr[6]);
+      floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+      cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+      cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+      complexVectorPtr += 8;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+      cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      _mm_store_ps(iBufferPtr, iValue);
+      _mm_store_ps(qBufferPtr, qValue);
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)&complexVector[number];
+    for(; number < num_points; number++){
+      *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+      *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+    \param complexVector The complex input vector
+    \param iBuffer The I buffer output data
+    \param qBuffer The Q buffer output data
+    \param scalar The data value to be divided against each input data value of the input complex vector
+    \param num_points The number of complex data values to be deinterleaved
+  */
+static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+    \param complexVector The complex input vector
+    \param iBuffer The I buffer output data
+    \param qBuffer The Q buffer output data
+    \param scalar The data value to be divided against each input data value of the input complex vector
+    \param num_points The number of complex data values to be deinterleaved
+  */
+extern void volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
new file mode 100644
index 0000000000..7320db3688
--- /dev/null
+++ b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
@@ -0,0 +1,125 @@
+#ifndef INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H
+#define INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+
+  __m128 iFloatValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+  for(;number < quarterPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+    iIntVal = _mm_cvtepi16_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+    sixteenTComplexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 iValue;
+
+  const float iScalar = 1.0/scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
+    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+
+    iValue = _mm_load_ps(floatBuffer);
+
+    iValue = _mm_mul_ps(iValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  complexVectorPtr = (int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
+    complexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
new file mode 100644
index 0000000000..649b5cc96c
--- /dev/null
+++ b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
@@ -0,0 +1,179 @@
+#ifndef INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H
+#define INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    _mm_store_ps(magnitudeVectorPtr, result);
+      
+    magnitudeVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    float val1Real = (float)(*complexVectorPtr++) / scalar;
+    float val1Imag = (float)(*complexVectorPtr++) / scalar;
+    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+
+  const float iScalar = 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+
+  __m128 cplxValue1, cplxValue2, result, re, im;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+    
+    re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
+    im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(re, invScalar);
+    cplxValue2 = _mm_mul_ps(im, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    _mm_store_ps(magnitudeVectorPtr, result);
+      
+    magnitudeVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    float val1Real = (float)(*complexVectorPtr++) * iScalar;
+    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+  }
+}
+
+ 
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    float real = ( (float) (*complexVectorPtr++)) * invScalar;
+    float imag = ( (float) (*complexVectorPtr++)) * invScalar;
+    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC_DISABLED
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_16sc_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16sc_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_16sc_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H */
diff --git a/volk/include/volk/volk_16u_byteswap_a16.h b/volk/include/volk/volk_16u_byteswap_a16.h
new file mode 100644
index 0000000000..c8128dbab8
--- /dev/null
+++ b/volk/include/volk/volk_16u_byteswap_a16.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_16u_byteswap_a16_H
+#define INCLUDED_volk_16u_byteswap_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int16_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int number = 0;
+  uint16_t* inputPtr = intsToSwap;
+  __m128i input, left, right, output;
+
+  const unsigned int eighthPoints = num_points / 8;
+  for(;number < eighthPoints; number++){
+    // Load the 16t values, increment inputPtr later since we're doing it in-place.
+    input = _mm_load_si128((__m128i*)inputPtr);
+    // Do the two shifts
+    left = _mm_slli_epi16(input, 8);
+    right = _mm_srli_epi16(input, 8);
+    // Or the left and right halves together
+    output = _mm_or_si128(left, right);
+    // Store the results
+    _mm_store_si128((__m128i*)inputPtr, output);
+    inputPtr += 8;
+  }
+
+  
+  // Byteswap any remaining points:
+  number = eighthPoints*8;  
+  for(; number < num_points; number++){
+    uint16_t outputVal = *inputPtr;
+    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+    *inputPtr = outputVal;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int16_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int point;
+  uint16_t* inputPtr = intsToSwap;
+  for(point = 0; point < num_points; point++){
+    uint16_t output = *inputPtr;
+    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int16_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+extern void volk_16u_byteswap_a16_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
+static inline void volk_16u_byteswap_a16_orc(uint16_t* intsToSwap, unsigned int num_points){
+    volk_16u_byteswap_a16_orc_impl(intsToSwap, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16u_byteswap_a16_H */
diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h
deleted file mode 100644
index 9d19d1a456..0000000000
--- a/volk/include/volk/volk_16u_byteswap_aligned16.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H
-#define INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int16_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_16u_byteswap_aligned16_sse2(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int number = 0;
-  uint16_t* inputPtr = intsToSwap;
-  __m128i input, left, right, output;
-
-  const unsigned int eighthPoints = num_points / 8;
-  for(;number < eighthPoints; number++){
-    // Load the 16t values, increment inputPtr later since we're doing it in-place.
-    input = _mm_load_si128((__m128i*)inputPtr);
-    // Do the two shifts
-    left = _mm_slli_epi16(input, 8);
-    right = _mm_srli_epi16(input, 8);
-    // Or the left and right halves together
-    output = _mm_or_si128(left, right);
-    // Store the results
-    _mm_store_si128((__m128i*)inputPtr, output);
-    inputPtr += 8;
-  }
-
-  
-  // Byteswap any remaining points:
-  number = eighthPoints*8;  
-  for(; number < num_points; number++){
-    uint16_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int16_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, unsigned int num_points){
-  unsigned int point;
-  uint16_t* inputPtr = intsToSwap;
-  for(point = 0; point < num_points; point++){
-    uint16_t output = *inputPtr;
-    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
-    *inputPtr = output;
-    inputPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int16_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
-static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){
-    volk_16u_byteswap_aligned16_orc_impl(intsToSwap, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
new file mode 100644
index 0000000000..a0f97f94e2
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
@@ -0,0 +1,151 @@
+#ifndef INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H
+#define INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifndef MAX
+#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#endif
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+  
+  
+  float result = 0.0;
+  float fst = 0.0;
+  float sq = 0.0;
+  float thrd = 0.0;
+  float frth = 0.0;
+  //float fith = 0.0;
+  
+  
+  
+  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
+
+  xmm9 = _mm_setzero_ps();
+  xmm1 = _mm_setzero_ps();
+  
+  xmm0 = _mm_load1_ps(&center_point_array[0]);
+  xmm6 = _mm_load1_ps(&center_point_array[1]);
+  xmm7 = _mm_load1_ps(&center_point_array[2]);
+  xmm8 = _mm_load1_ps(&center_point_array[3]);
+  //xmm11 = _mm_load1_ps(&center_point_array[4]);
+  xmm10 = _mm_load1_ps(cutoff);
+  
+  int bound = num_bytes >> 4;
+  int leftovers = (num_bytes >> 2) & 3;
+  int i = 0;
+  
+  for(; i < bound; ++i) {
+    xmm2 = _mm_load_ps(src0);
+    xmm2 = _mm_max_ps(xmm10, xmm2);
+    xmm3 = _mm_mul_ps(xmm2, xmm2);
+    xmm4 = _mm_mul_ps(xmm2, xmm3);
+    xmm5 = _mm_mul_ps(xmm3, xmm3);
+    //xmm12 = _mm_mul_ps(xmm3, xmm4);
+
+    xmm2 = _mm_mul_ps(xmm2, xmm0);
+    xmm3 = _mm_mul_ps(xmm3, xmm6);
+    xmm4 = _mm_mul_ps(xmm4, xmm7);
+    xmm5 = _mm_mul_ps(xmm5, xmm8);
+    //xmm12 = _mm_mul_ps(xmm12, xmm11);
+
+    xmm2 = _mm_add_ps(xmm2, xmm3);
+    xmm3 = _mm_add_ps(xmm4, xmm5);
+    
+    src0 += 4;
+    
+    xmm9 = _mm_add_ps(xmm2, xmm9);
+    
+    xmm1 = _mm_add_ps(xmm3, xmm1);
+
+    //xmm9 = _mm_add_ps(xmm12, xmm9);
+  }
+  
+  xmm2 = _mm_hadd_ps(xmm9, xmm1);
+  xmm3 = _mm_hadd_ps(xmm2, xmm2);
+  xmm4 = _mm_hadd_ps(xmm3, xmm3);
+
+  _mm_store_ss(&result, xmm4);
+    
+  
+
+  for(i = 0; i < leftovers; ++i) {
+    fst = src0[i];
+    fst = MAX(fst, *cutoff);
+    sq = fst * fst;
+    thrd = fst * sq;
+    frth = sq * sq;
+    //fith = sq * thrd;
+    
+    result += (center_point_array[0] * fst + 
+	       center_point_array[1] * sq + 
+	       center_point_array[2] * thrd + 
+	       center_point_array[3] * frth);// + 
+	       //center_point_array[4] * fith);
+  }
+
+  result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
+
+  target[0] = result;
+}
+ 
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+
+
+    
+  float result = 0.0;
+  float fst = 0.0;
+  float sq = 0.0;
+  float thrd = 0.0;
+  float frth = 0.0;
+  //float fith = 0.0;
+  
+
+
+  int i = 0; 
+  
+  for(; i < num_bytes >> 2; ++i) {
+    fst = src0[i];
+    fst = MAX(fst, *cutoff);
+    
+    sq = fst * fst;
+    thrd = fst * sq;
+    frth = sq * sq;
+    //fith = sq * thrd;
+    
+    result += (center_point_array[0] * fst + 
+	       center_point_array[1] * sq + 
+	       center_point_array[2] * thrd + 
+	       center_point_array[3] * frth); //+
+	       //center_point_array[4] * fith);
+    /*printf("%f12...%d\n", (center_point_array[0] * fst + 
+		  center_point_array[1] * sq + 
+		  center_point_array[2] * thrd + 
+			 center_point_array[3] * frth) +
+	   //center_point_array[4] * fith) + 
+	   (center_point_array[4]), i);
+    */
+  }
+
+  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
+
+  
+  
+  *target = result;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H*/
diff --git a/volk/include/volk/volk_32f_32f_add_32f_a16.h b/volk/include/volk/volk_32f_32f_add_32f_a16.h
new file mode 100644
index 0000000000..ba38c310f7
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_add_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_32f_add_32f_a16_H
+#define INCLUDED_volk_32f_32f_add_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_32f_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_add_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_32f_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+extern void volk_32f_32f_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_32f_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_32f_add_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_divide_32f_a16.h b/volk/include/volk/volk_32f_32f_divide_32f_a16.h
new file mode 100644
index 0000000000..a0995e631b
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_divide_32f_a16.h
@@ -0,0 +1,82 @@
+#ifndef INCLUDED_volk_32f_32f_divide_32f_a16_H
+#define INCLUDED_volk_32f_32f_divide_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divideed
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+static inline void volk_32f_32f_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_div_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divideed
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+static inline void volk_32f_32f_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divideed
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+extern void volk_32f_32f_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_32f_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32f_32f_divide_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
new file mode 100644
index 0000000000..63f5221d32
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_a16_H
+#define INCLUDED_volk_32f_32f_dot_prod_32f_a16_H
+
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_32f_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+  
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+
+
+static inline void volk_32f_32f_dot_prod_32f_a16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+  
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_load_ps(aPtr); 
+    bVal = _mm_load_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_add_ps(cVal, dotProdVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_32f_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_load_ps(aPtr); 
+    bVal = _mm_load_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_32f_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal1, bVal1, cVal1;
+  __m128 aVal2, bVal2, cVal2;
+  __m128 aVal3, bVal3, cVal3;
+  __m128 aVal4, bVal4, cVal4;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){      
+
+    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+
+    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+    
+    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+    cVal1 = _mm_or_ps(cVal1, cVal2);
+    cVal3 = _mm_or_ps(cVal3, cVal4);
+    cVal1 = _mm_or_ps(cVal1, cVal3);
+
+    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints * 16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_a16_H*/
diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
new file mode 100644
index 0000000000..b5fa7d7a44
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H
+#define INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H
+
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_32f_dot_prod_32f_ua16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+  
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+
+
+static inline void volk_32f_32f_dot_prod_32f_ua16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+  
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_loadu_ps(aPtr); 
+    bVal = _mm_loadu_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_add_ps(cVal, dotProdVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_32f_dot_prod_32f_ua16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_loadu_ps(aPtr); 
+    bVal = _mm_loadu_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_32f_dot_prod_32f_ua16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal1, bVal1, cVal1;
+  __m128 aVal2, bVal2, cVal2;
+  __m128 aVal3, bVal3, cVal3;
+  __m128 aVal4, bVal4, cVal4;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+      
+    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+
+    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+    
+    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+    cVal1 = _mm_or_ps(cVal1, cVal2);
+    cVal3 = _mm_or_ps(cVal3, cVal4);
+    cVal1 = _mm_or_ps(cVal1, cVal3);
+
+    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints * 16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H*/
diff --git a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
new file mode 100644
index 0000000000..34ea933491
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32f_32f_interleave_32fc_a16_H
+#define INCLUDED_volk_32f_32f_interleave_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Interleaves the I & Q vector data into the complex vector
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param complexVector The complex output vector
+  \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_32f_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  unsigned int number = 0;
+  float* complexVectorPtr = (float*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+
+  const uint64_t quarterPoints = num_points / 4;
+    
+  __m128 iValue, qValue, cplxValue;
+  for(;number < quarterPoints; number++){
+    iValue = _mm_load_ps(iBufferPtr);
+    qValue = _mm_load_ps(qBufferPtr);
+
+    // Interleaves the lower two values in the i and q variables into one buffer
+    cplxValue = _mm_unpacklo_ps(iValue, qValue);
+    _mm_store_ps(complexVectorPtr, cplxValue);
+    complexVectorPtr += 4;
+
+    // Interleaves the upper two values in the i and q variables into one buffer
+    cplxValue = _mm_unpackhi_ps(iValue, qValue);
+    _mm_store_ps(complexVectorPtr, cplxValue);
+    complexVectorPtr += 4;
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *complexVectorPtr++ = *iBufferPtr++;
+    *complexVectorPtr++ = *qBufferPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Interleaves the I & Q vector data into the complex vector.
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param complexVector The complex output vector
+  \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_32f_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  float* complexVectorPtr = (float*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+  unsigned int number;
+
+  for(number = 0; number < num_points; number++){
+    *complexVectorPtr++ = *iBufferPtr++;
+    *complexVectorPtr++ = *qBufferPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_32f_interleave_32fc_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_max_32f_a16.h b/volk/include/volk/volk_32f_32f_max_32f_a16.h
new file mode 100644
index 0000000000..8ca7a5ba8b
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_max_32f_a16.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_32f_max_32f_a16_H
+#define INCLUDED_volk_32f_32f_max_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_32f_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_max_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_32f_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_32f_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_32f_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_32f_max_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_min_32f_a16.h b/volk/include/volk/volk_32f_32f_min_32f_a16.h
new file mode 100644
index 0000000000..dd05988bea
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_min_32f_a16.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_32f_min_32f_a16_H
+#define INCLUDED_volk_32f_32f_min_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_32f_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_min_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_32f_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_32f_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_32f_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_32f_min_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h
new file mode 100644
index 0000000000..2d004db105
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_32f_multiply_32f_a16_H
+#define INCLUDED_volk_32f_32f_multiply_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_32f_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_mul_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_32f_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+extern void volk_32f_32f_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_32f_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_32f_multiply_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
new file mode 100644
index 0000000000..207382a199
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
@@ -0,0 +1,155 @@
+#ifndef INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H
+#define INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+    \param iBuffer The I buffer data to be interleaved
+    \param qBuffer The Q buffer data to be interleaved
+    \param complexVector The complex output vector
+    \param scalar The scaling value being multiplied against each data point
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+
+    const unsigned int quarterPoints = num_points / 4;
+    
+    __m128 iValue, qValue, cplxValue1, cplxValue2;
+    __m128i intValue1, intValue2;
+
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+    for(;number < quarterPoints; number++){
+      iValue = _mm_load_ps(iBufferPtr);
+      qValue = _mm_load_ps(qBufferPtr);
+
+      // Interleaves the lower two values in the i and q variables into one buffer
+      cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+      cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+      // Interleaves the upper two values in the i and q variables into one buffer
+      cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+      cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+
+      intValue1 = _mm_cvtps_epi32(cplxValue1);
+      intValue2 = _mm_cvtps_epi32(cplxValue2);
+
+      intValue1 = _mm_packs_epi32(intValue1, intValue2);
+
+      _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+      complexVectorPtr += 8;
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for(; number < num_points; number++){
+      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+    }
+    
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+    \param iBuffer The I buffer data to be interleaved
+    \param qBuffer The Q buffer data to be interleaved
+    \param complexVector The complex output vector
+    \param scalar The scaling value being multiplied against each data point
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+
+    const unsigned int quarterPoints = num_points / 4;
+    
+    __m128 iValue, qValue, cplxValue;
+
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+    float floatBuffer[4] __attribute__((aligned(128)));
+
+    for(;number < quarterPoints; number++){
+      iValue = _mm_load_ps(iBufferPtr);
+      qValue = _mm_load_ps(qBufferPtr);
+
+      // Interleaves the lower two values in the i and q variables into one buffer
+      cplxValue = _mm_unpacklo_ps(iValue, qValue);
+      cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+      _mm_store_ps(floatBuffer, cplxValue);
+
+      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+      // Interleaves the upper two values in the i and q variables into one buffer
+      cplxValue = _mm_unpackhi_ps(iValue, qValue);
+      cplxValue = _mm_mul_ps(cplxValue, vScalar);
+ 
+      _mm_store_ps(floatBuffer, cplxValue);
+      
+      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for(; number < num_points; number++){
+      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+    }
+    
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+    \param iBuffer The I buffer data to be interleaved
+    \param qBuffer The Q buffer data to be interleaved
+    \param complexVector The complex output vector
+    \param scalar The scaling value being multiplied against each data point
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_32f_s32f_interleave_16sc_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+  int16_t* complexVectorPtr = (int16_t*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+    *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h
new file mode 100644
index 0000000000..9fea6aa27e
--- /dev/null
+++ b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_32f_subtract_32f_a16_H
+#define INCLUDED_volk_32f_32f_subtract_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Subtracts bVector form aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted
+  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_32f_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_sub_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Subtracts bVector form aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted
+  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_32f_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Subtracts bVector form aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted
+  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+extern void volk_32f_32f_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_32f_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_32f_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_32f_subtract_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_accumulator_aligned16.h b/volk/include/volk/volk_32f_accumulator_aligned16.h
deleted file mode 100644
index 7e395cf501..0000000000
--- a/volk/include/volk/volk_32f_accumulator_aligned16.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H
-#define INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Accumulates the values in the input buffer
-  \param result The accumulated result
-  \param inputBuffer The buffer of data to be accumulated
-  \param num_points The number of values in inputBuffer to be accumulated
-*/
-static inline void volk_32f_accumulator_aligned16_sse(float* result, const float* inputBuffer, unsigned int num_points){
-  float returnValue = 0;
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* aPtr = inputBuffer;
-  float tempBuffer[4] __attribute__((aligned(128)));
-  
-  __m128 accumulator = _mm_setzero_ps();
-  __m128 aVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-    aVal = _mm_load_ps(aPtr);
-    accumulator = _mm_add_ps(accumulator, aVal); 
-    aPtr += 4;
-  }
-  _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
-  returnValue = tempBuffer[0];
-  returnValue += tempBuffer[1];
-  returnValue += tempBuffer[2];
-  returnValue += tempBuffer[3];
-  
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    returnValue += (*aPtr++);
-  }
-  *result = returnValue;
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Accumulates the values in the input buffer
-  \param result The accumulated result
-  \param inputBuffer The buffer of data to be accumulated
-  \param num_points The number of values in inputBuffer to be accumulated
-*/
-static inline void volk_32f_accumulator_aligned16_generic(float* result, const float* inputBuffer, unsigned int num_points){
-  const float* aPtr = inputBuffer;
-  unsigned int number = 0;
-  float returnValue = 0;
-
-  for(;number < num_points; number++){
-    returnValue += (*aPtr++);
-  }
-  *result = returnValue;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h
new file mode 100644
index 0000000000..4a3588e6d2
--- /dev/null
+++ b/volk/include/volk/volk_32f_accumulator_s32f_a16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H
+#define INCLUDED_volk_32f_accumulator_s32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Accumulates the values in the input buffer
+  \param result The accumulated result
+  \param inputBuffer The buffer of data to be accumulated
+  \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* aPtr = inputBuffer;
+  float tempBuffer[4] __attribute__((aligned(128)));
+  
+  __m128 accumulator = _mm_setzero_ps();
+  __m128 aVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+    aVal = _mm_load_ps(aPtr);
+    accumulator = _mm_add_ps(accumulator, aVal); 
+    aPtr += 4;
+  }
+  _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
+  returnValue = tempBuffer[0];
+  returnValue += tempBuffer[1];
+  returnValue += tempBuffer[2];
+  returnValue += tempBuffer[3];
+  
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    returnValue += (*aPtr++);
+  }
+  *result = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Accumulates the values in the input buffer
+  \param result The accumulated result
+  \param inputBuffer The buffer of data to be accumulated
+  \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const float* inputBuffer, unsigned int num_points){
+  const float* aPtr = inputBuffer;
+  unsigned int number = 0;
+  float returnValue = 0;
+
+  for(;number < num_points; number++){
+    returnValue += (*aPtr++);
+  }
+  *result = returnValue;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_accumulator_s32f_a16_H */
diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h
deleted file mode 100644
index e7d8de265d..0000000000
--- a/volk/include/volk/volk_32f_add_aligned16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_ADD_ALIGNED16_H
-#define INCLUDED_VOLK_32f_ADD_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_add_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_add_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) + (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_add_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) + (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-extern void volk_32f_add_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_add_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_add_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h
new file mode 100644
index 0000000000..fce77cd045
--- /dev/null
+++ b/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h
@@ -0,0 +1,167 @@
+#ifndef INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H
+#define INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the spectral noise floor of an input power spectrum
+
+  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB).  Provides a rough estimation of the signal noise floor.
+
+  \param realDataPoints The input power spectrum
+  \param num_points The number of data points in the input power spectrum vector
+  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_calc_spectral_noise_floor_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* dataPointsPtr = realDataPoints;
+  float avgPointsVector[4] __attribute__((aligned(128)));
+    
+  __m128 dataPointsVal;
+  __m128 avgPointsVal = _mm_setzero_ps();
+  // Calculate the sum (for mean) for all points
+  for(; number < quarterPoints; number++){
+
+    dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+    dataPointsPtr += 4;
+
+    avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+  }
+
+  _mm_store_ps(avgPointsVector, avgPointsVal);
+
+  float sumMean = 0.0;
+  sumMean += avgPointsVector[0];
+  sumMean += avgPointsVector[1];
+  sumMean += avgPointsVector[2];
+  sumMean += avgPointsVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    sumMean += realDataPoints[number];
+  }
+
+  // calculate the spectral mean
+  // +20 because for the comparison below we only want to throw out bins
+  // that are significantly higher (and would, thus, affect the mean more
+  const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+  dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+  __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+  __m128 vOnesVector = _mm_set_ps1(1.0);
+  __m128 vValidBinCount = _mm_setzero_ps();
+  avgPointsVal = _mm_setzero_ps();
+  __m128 compareMask;
+  number = 0;
+  // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+  for(; number < quarterPoints; number++){
+
+    dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+    dataPointsPtr += 4;
+
+    // Identify which items do not exceed the mean amplitude
+    compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+
+    // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
+    avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+      
+    // Count the number of bins which do not exceed the mean amplitude
+    vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
+  }
+    
+  // Calculate the mean from the remaining data points
+  _mm_store_ps(avgPointsVector, avgPointsVal);
+
+  sumMean = 0.0;
+  sumMean += avgPointsVector[0];
+  sumMean += avgPointsVector[1];
+  sumMean += avgPointsVector[2];
+  sumMean += avgPointsVector[3];
+
+  // Calculate the number of valid bins from the remaning count
+  float validBinCountVector[4] __attribute__((aligned(128)));
+  _mm_store_ps(validBinCountVector, vValidBinCount);
+
+  float validBinCount = 0;
+  validBinCount += validBinCountVector[0];
+  validBinCount += validBinCountVector[1];
+  validBinCount += validBinCountVector[2];
+  validBinCount += validBinCountVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    if(realDataPoints[number] <= meanAmplitude){
+      sumMean += realDataPoints[number];
+      validBinCount += 1.0;
+    }
+  }
+    
+  float localNoiseFloorAmplitude = 0;
+  if(validBinCount > 0.0){
+    localNoiseFloorAmplitude = sumMean / validBinCount;
+  }
+  else{
+    localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
+  }
+
+  *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the spectral noise floor of an input power spectrum
+
+  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB).  Provides a rough estimation of the signal noise floor.
+
+  \param realDataPoints The input power spectrum
+  \param num_points The number of data points in the input power spectrum vector
+  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_calc_spectral_noise_floor_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+  float sumMean = 0.0;
+  unsigned int number;
+  // find the sum (for mean), etc
+  for(number = 0; number < num_points; number++){
+    // sum (for mean)
+    sumMean += realDataPoints[number];
+  }
+
+  // calculate the spectral mean
+  // +20 because for the comparison below we only want to throw out bins
+  // that are significantly higher (and would, thus, affect the mean more)
+  const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+
+  // now throw out any bins higher than the mean
+  sumMean = 0.0;
+  unsigned int newNumDataPoints = num_points;
+  for(number = 0; number < num_points; number++){
+    if (realDataPoints[number] <= meanAmplitude)
+      sumMean += realDataPoints[number];
+    else
+      newNumDataPoints--;
+  }
+
+  float localNoiseFloorAmplitude = 0.0;
+  if (newNumDataPoints == 0)             // in the odd case that all
+    localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
+  else
+    localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
+
+  *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H */
diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
deleted file mode 100644
index ff917525ff..0000000000
--- a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
+++ /dev/null
@@ -1,167 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
-#define INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the spectral noise floor of an input power spectrum
-
-  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB).  Provides a rough estimation of the signal noise floor.
-
-  \param realDataPoints The input power spectrum
-  \param num_points The number of data points in the input power spectrum vector
-  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
-  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
-*/
-static inline void volk_32f_calc_spectral_noise_floor_aligned16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* dataPointsPtr = realDataPoints;
-  float avgPointsVector[4] __attribute__((aligned(128)));
-    
-  __m128 dataPointsVal;
-  __m128 avgPointsVal = _mm_setzero_ps();
-  // Calculate the sum (for mean) for all points
-  for(; number < quarterPoints; number++){
-
-    dataPointsVal = _mm_load_ps(dataPointsPtr);
-
-    dataPointsPtr += 4;
-
-    avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
-  }
-
-  _mm_store_ps(avgPointsVector, avgPointsVal);
-
-  float sumMean = 0.0;
-  sumMean += avgPointsVector[0];
-  sumMean += avgPointsVector[1];
-  sumMean += avgPointsVector[2];
-  sumMean += avgPointsVector[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    sumMean += realDataPoints[number];
-  }
-
-  // calculate the spectral mean
-  // +20 because for the comparison below we only want to throw out bins
-  // that are significantly higher (and would, thus, affect the mean more
-  const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
-
-  dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
-  __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
-  __m128 vOnesVector = _mm_set_ps1(1.0);
-  __m128 vValidBinCount = _mm_setzero_ps();
-  avgPointsVal = _mm_setzero_ps();
-  __m128 compareMask;
-  number = 0;
-  // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
-  for(; number < quarterPoints; number++){
-
-    dataPointsVal = _mm_load_ps(dataPointsPtr);
-
-    dataPointsPtr += 4;
-
-    // Identify which items do not exceed the mean amplitude
-    compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
-
-    // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
-    avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
-      
-    // Count the number of bins which do not exceed the mean amplitude
-    vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
-  }
-    
-  // Calculate the mean from the remaining data points
-  _mm_store_ps(avgPointsVector, avgPointsVal);
-
-  sumMean = 0.0;
-  sumMean += avgPointsVector[0];
-  sumMean += avgPointsVector[1];
-  sumMean += avgPointsVector[2];
-  sumMean += avgPointsVector[3];
-
-  // Calculate the number of valid bins from the remaning count
-  float validBinCountVector[4] __attribute__((aligned(128)));
-  _mm_store_ps(validBinCountVector, vValidBinCount);
-
-  float validBinCount = 0;
-  validBinCount += validBinCountVector[0];
-  validBinCount += validBinCountVector[1];
-  validBinCount += validBinCountVector[2];
-  validBinCount += validBinCountVector[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    if(realDataPoints[number] <= meanAmplitude){
-      sumMean += realDataPoints[number];
-      validBinCount += 1.0;
-    }
-  }
-    
-  float localNoiseFloorAmplitude = 0;
-  if(validBinCount > 0.0){
-    localNoiseFloorAmplitude = sumMean / validBinCount;
-  }
-  else{
-    localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
-  }
-
-  *noiseFloorAmplitude = localNoiseFloorAmplitude;
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the spectral noise floor of an input power spectrum
-
-  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB).  Provides a rough estimation of the signal noise floor.
-
-  \param realDataPoints The input power spectrum
-  \param num_points The number of data points in the input power spectrum vector
-  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
-  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
-*/
-static inline void volk_32f_calc_spectral_noise_floor_aligned16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
-  float sumMean = 0.0;
-  unsigned int number;
-  // find the sum (for mean), etc
-  for(number = 0; number < num_points; number++){
-    // sum (for mean)
-    sumMean += realDataPoints[number];
-  }
-
-  // calculate the spectral mean
-  // +20 because for the comparison below we only want to throw out bins
-  // that are significantly higher (and would, thus, affect the mean more)
-  const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
-
-  // now throw out any bins higher than the mean
-  sumMean = 0.0;
-  unsigned int newNumDataPoints = num_points;
-  for(number = 0; number < num_points; number++){
-    if (realDataPoints[number] <= meanAmplitude)
-      sumMean += realDataPoints[number];
-    else
-      newNumDataPoints--;
-  }
-
-  float localNoiseFloorAmplitude = 0.0;
-  if (newNumDataPoints == 0)             // in the odd case that all
-    localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
-  else
-    localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
-
-  *noiseFloorAmplitude = localNoiseFloorAmplitude;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_16s_aligned16.h b/volk/include/volk/volk_32f_convert_16s_aligned16.h
deleted file mode 100644
index 7fbabd9c38..0000000000
--- a/volk/include/volk/volk_32f_convert_16s_aligned16.h
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_convert_16s_aligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;    
-  for(; number < num_points; number++){
-    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_convert_16s_aligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_convert_16s_aligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_16s_unaligned16.h b/volk/include/volk/volk_32f_convert_16s_unaligned16.h
deleted file mode 100644
index d2bbdf13ad..0000000000
--- a/volk/include/volk/volk_32f_convert_16s_unaligned16.h
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_convert_16s_unaligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_convert_16s_unaligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_convert_16s_unaligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_32s_aligned16.h b/volk/include/volk/volk_32f_convert_32s_aligned16.h
deleted file mode 100644
index 011ef5d0e4..0000000000
--- a/volk/include/volk/volk_32f_convert_32s_aligned16.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_convert_32s_aligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1;
-  __m128i intInputVal1;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_convert_32s_aligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_convert_32s_aligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int32_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_32s_unaligned16.h b/volk/include/volk/volk_32f_convert_32s_unaligned16.h
deleted file mode 100644
index a6df826c72..0000000000
--- a/volk/include/volk/volk_32f_convert_32s_unaligned16.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_convert_32s_unaligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1;
-  __m128i intInputVal1;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_convert_32s_unaligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_convert_32s_unaligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int32_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_a16.h b/volk/include/volk/volk_32f_convert_64f_a16.h
new file mode 100644
index 0000000000..c303dc1188
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_a16.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_volk_32f_convert_64f_a16_H
+#define INCLUDED_volk_32f_convert_64f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the float values into double values
+    \param dVector The converted double vector values
+    \param fVector The float vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  double* outputVectorPtr = outputVector;
+  __m128d ret;
+  __m128 inputVal;
+
+  for(;number < quarterPoints; number++){
+    inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ 
+    ret = _mm_cvtps_pd(inputVal);
+
+    _mm_store_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+
+    inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+    ret = _mm_cvtps_pd(inputVal);
+
+    _mm_store_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (double)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values
+  \param dVector The converted double vector values
+  \param fVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  double* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_a16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_aligned16.h b/volk/include/volk/volk_32f_convert_64f_aligned16.h
deleted file mode 100644
index 91a8558135..0000000000
--- a/volk/include/volk/volk_32f_convert_64f_aligned16.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the float values into double values
-    \param dVector The converted double vector values
-    \param fVector The float vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_32f_convert_64f_aligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  double* outputVectorPtr = outputVector;
-  __m128d ret;
-  __m128 inputVal;
-
-  for(;number < quarterPoints; number++){
-    inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- 
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_store_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-
-    inputVal = _mm_movehl_ps(inputVal, inputVal);
-
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_store_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (double)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the float values into double values
-  \param dVector The converted double vector values
-  \param fVector The float vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_32f_convert_64f_aligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
-  double* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_ua16.h b/volk/include/volk/volk_32f_convert_64f_ua16.h
new file mode 100644
index 0000000000..c8de768dc9
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_ua16.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_volk_32f_convert_64f_ua16_H
+#define INCLUDED_volk_32f_convert_64f_ua16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the float values into double values
+    \param dVector The converted double vector values
+    \param fVector The float vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_32f_convert_64f_ua16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  double* outputVectorPtr = outputVector;
+  __m128d ret;
+  __m128 inputVal;
+
+  for(;number < quarterPoints; number++){
+    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ 
+    ret = _mm_cvtps_pd(inputVal);
+
+    _mm_storeu_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+
+    inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+    ret = _mm_cvtps_pd(inputVal);
+
+    _mm_storeu_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (double)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values
+  \param dVector The converted double vector values
+  \param fVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_ua16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  double* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_ua16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_unaligned16.h b/volk/include/volk/volk_32f_convert_64f_unaligned16.h
deleted file mode 100644
index 698e0d4460..0000000000
--- a/volk/include/volk/volk_32f_convert_64f_unaligned16.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the float values into double values
-    \param dVector The converted double vector values
-    \param fVector The float vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_32f_convert_64f_unaligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  double* outputVectorPtr = outputVector;
-  __m128d ret;
-  __m128 inputVal;
-
-  for(;number < quarterPoints; number++){
-    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- 
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_storeu_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-
-    inputVal = _mm_movehl_ps(inputVal, inputVal);
-
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_storeu_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (double)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the float values into double values
-  \param dVector The converted double vector values
-  \param fVector The float vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_32f_convert_64f_unaligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
-  double* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_8s_aligned16.h b/volk/include/volk/volk_32f_convert_8s_aligned16.h
deleted file mode 100644
index b9487b6229..0000000000
--- a/volk/include/volk/volk_32f_convert_8s_aligned16.h
+++ /dev/null
@@ -1,117 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_convert_8s_aligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
-    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
-    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_convert_8s_aligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_convert_8s_aligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_8s_unaligned16.h b/volk/include/volk/volk_32f_convert_8s_unaligned16.h
deleted file mode 100644
index e986dbc872..0000000000
--- a/volk/include/volk/volk_32f_convert_8s_unaligned16.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H
-#define INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_convert_8s_unaligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
-    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
-    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_convert_8s_unaligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_convert_8s_unaligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h
deleted file mode 100644
index c595b5e92c..0000000000
--- a/volk/include/volk/volk_32f_divide_aligned16.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H
-#define INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Divides the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be divideed
-  \param bVector The divisor vector
-  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
-*/
-static inline void volk_32f_divide_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_div_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) / (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Divides the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be divideed
-  \param bVector The divisor vector
-  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
-*/
-static inline void volk_32f_divide_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) / (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Divides the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be divideed
-  \param bVector The divisor vector
-  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
-*/
-extern void volk_32f_divide_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_divide_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_divide_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_dot_prod_aligned16.h b/volk/include/volk/volk_32f_dot_prod_aligned16.h
deleted file mode 100644
index 3aee1136ac..0000000000
--- a/volk/include/volk/volk_32f_dot_prod_aligned16.h
+++ /dev/null
@@ -1,184 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H
-#define INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H
-
-#include<stdio.h>
-
-
-#if LV_HAVE_GENERIC
-
-
-static inline void volk_32f_dot_prod_aligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-  
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE
-
-
-static inline void volk_32f_dot_prod_aligned16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_load_ps(aPtr); 
-    bVal = _mm_load_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-
-    dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-#if LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32f_dot_prod_aligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_load_ps(aPtr); 
-    bVal = _mm_load_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32f_dot_prod_aligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal1, bVal1, cVal1;
-  __m128 aVal2, bVal2, cVal2;
-  __m128 aVal3, bVal3, cVal3;
-  __m128 aVal4, bVal4, cVal4;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){      
-
-    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-
-    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
-    
-    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
-    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
-    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
-    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-
-    cVal1 = _mm_or_ps(cVal1, cVal2);
-    cVal3 = _mm_or_ps(cVal3, cVal4);
-    cVal1 = _mm_or_ps(cVal1, cVal3);
-
-    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_dot_prod_unaligned16.h b/volk/include/volk/volk_32f_dot_prod_unaligned16.h
deleted file mode 100644
index bce6aa15fd..0000000000
--- a/volk/include/volk/volk_32f_dot_prod_unaligned16.h
+++ /dev/null
@@ -1,184 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H
-#define INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H
-
-#include<stdio.h>
-
-
-#if LV_HAVE_GENERIC
-
-
-static inline void volk_32f_dot_prod_unaligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-  
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE
-
-
-static inline void volk_32f_dot_prod_unaligned16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_loadu_ps(aPtr); 
-    bVal = _mm_loadu_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-
-    dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-#if LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32f_dot_prod_unaligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_loadu_ps(aPtr); 
-    bVal = _mm_loadu_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32f_dot_prod_unaligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal1, bVal1, cVal1;
-  __m128 aVal2, bVal2, cVal2;
-  __m128 aVal3, bVal3, cVal3;
-  __m128 aVal4, bVal4, cVal4;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-      
-    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
-
-    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
-    
-    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
-    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
-    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
-    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-
-    cVal1 = _mm_or_ps(cVal1, cVal2);
-    cVal3 = _mm_or_ps(cVal3, cVal4);
-    cVal1 = _mm_or_ps(cVal1, cVal3);
-
-    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_fm_detect_aligned16.h b/volk/include/volk/volk_32f_fm_detect_aligned16.h
deleted file mode 100644
index c82239d749..0000000000
--- a/volk/include/volk/volk_32f_fm_detect_aligned16.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H
-#define INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector.
-  \param outputVector The byte-aligned vector where the results will be stored.
-  \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] )
-  \param bound The interval that the input phase data is in, which is used to modulo the differentiation
-  \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
-  \param num_noints The number of real values in the input vector.
-*/
-static inline void volk_32f_fm_detect_aligned16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
-  if (num_points < 1) {
-    return;
-  }
-  unsigned int number = 1;
-  unsigned int j = 0;
-  // num_points-1 keeps Fedora 7's gcc from crashing...
-  // num_points won't work.  :(
-  const unsigned int quarterPoints = (num_points-1) / 4;
-
-  float* outPtr = outputVector;
-  const float* inPtr = inputVector;
-  __m128 upperBound = _mm_set_ps1(bound);
-  __m128 lowerBound = _mm_set_ps1(-bound);
-  __m128 next3old1;
-  __m128 next4;
-  __m128 boundAdjust;
-  __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
-  __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
-  // Do the first 4 by hand since we're going in from the saveValue:
-  *outPtr = *inPtr - *saveValue;
-  if (*outPtr >  bound) *outPtr -= 2*bound;
-  if (*outPtr < -bound) *outPtr += 2*bound;
-  inPtr++;
-  outPtr++;
-  for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) {
-    *outPtr = *(inPtr) - *(inPtr-1);
-    if (*outPtr >  bound) *outPtr -= 2*bound;
-    if (*outPtr < -bound) *outPtr += 2*bound;
-    inPtr++;
-    outPtr++;
-  }
-    
-  for (; number < quarterPoints; number++) {
-    // Load data
-    next3old1 = _mm_loadu_ps((float*) (inPtr-1));
-    next4 = _mm_load_ps(inPtr);
-    inPtr += 4;
-    // Subtract and store:
-    next3old1 = _mm_sub_ps(next4, next3old1);
-    // Bound:
-    boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
-    boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
-    next4 = _mm_cmplt_ps(next3old1, lowerBound);
-    next4 = _mm_and_ps(next4, negBoundAdjust);
-    boundAdjust = _mm_or_ps(next4, boundAdjust);
-    // Make sure we're in the bounding interval:
-    next3old1 = _mm_add_ps(next3old1, boundAdjust);
-    _mm_store_ps(outPtr,next3old1); // Store the results back into the output
-    outPtr += 4;
-  }
-    
-  for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
-    *outPtr = *(inPtr) - *(inPtr-1);
-    if (*outPtr >  bound) *outPtr -= 2*bound;
-    if (*outPtr < -bound) *outPtr += 2*bound;
-    inPtr++;
-    outPtr++;
-  }
-    
-  *saveValue = inputVector[num_points-1];
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector.
-  \param outputVector The byte-aligned vector where the results will be stored.
-  \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] )
-  \param bound The interval that the input phase data is in, which is used to modulo the differentiation
-  \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
-  \param num_points The number of real values in the input vector.
-*/
-static inline void volk_32f_fm_detect_aligned16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
-  if (num_points < 1) {
-    return;
-  }
-  unsigned int number = 0;
-  float* outPtr = outputVector;
-  const float* inPtr = inputVector;
-      
-  // Do the first 1 by hand since we're going in from the saveValue:
-  *outPtr = *inPtr - *saveValue;
-  if (*outPtr >  bound) *outPtr -= 2*bound;
-  if (*outPtr < -bound) *outPtr += 2*bound;
-  inPtr++;
-  outPtr++;
-    
-  for (number = 1; number < num_points; number++) {
-    *outPtr = *(inPtr) - *(inPtr-1);
-    if (*outPtr >  bound) *outPtr -= 2*bound;
-    if (*outPtr < -bound) *outPtr += 2*bound;
-    inPtr++;
-    outPtr++;
-  }
-    
-  *saveValue = inputVector[num_points-1];
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h
new file mode 100644
index 0000000000..d070e17d56
--- /dev/null
+++ b/volk/include/volk/volk_32f_index_max_16u_a16.h
@@ -0,0 +1,148 @@
+#ifndef INCLUDED_volk_32f_index_max_16u_a16_H
+#define INCLUDED_volk_32f_index_max_16u_a16_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include<smmintrin.h>
+
+static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* inputPtr = (float*)src0;
+
+    __m128 indexIncrementValues = _mm_set1_ps(4);
+    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+    float max = src0[0];
+    float index = 0;
+    __m128 maxValues = _mm_set1_ps(max);
+    __m128 maxValuesIndex = _mm_setzero_ps();
+    __m128 compareResults;
+    __m128 currentValues;
+
+    float maxValuesBuffer[4] __attribute__((aligned(16)));
+    float maxIndexesBuffer[4] __attribute__((aligned(16)));
+
+    for(;number < quarterPoints; number++){
+
+      currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
+      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+      compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+      maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
+      maxValues      = _mm_blendv_ps(currentValues, maxValues, compareResults);
+    }
+
+    // Calculate the largest value from the remaining 4 points
+    _mm_store_ps(maxValuesBuffer, maxValues);
+    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+    for(number = 0; number < 4; number++){
+      if(maxValuesBuffer[number] > max){
+	index = maxIndexesBuffer[number];
+	max = maxValuesBuffer[number];
+      }
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){      
+      if(src0[number] > max){
+	index = number;
+	max = src0[number];
+      }
+    }
+    target[0] = (unsigned int)index;
+  }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#if LV_HAVE_SSE
+#include<xmmintrin.h>
+
+static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) {
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* inputPtr = (float*)src0;
+
+    __m128 indexIncrementValues = _mm_set1_ps(4);
+    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+    float max = src0[0];
+    float index = 0;
+    __m128 maxValues = _mm_set1_ps(max);
+    __m128 maxValuesIndex = _mm_setzero_ps();
+    __m128 compareResults;
+    __m128 currentValues;
+
+    float maxValuesBuffer[4] __attribute__((aligned(16)));
+    float maxIndexesBuffer[4] __attribute__((aligned(16)));
+
+    for(;number < quarterPoints; number++){
+
+      currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
+      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+      compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+      maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
+
+      maxValues      = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
+    }
+
+    // Calculate the largest value from the remaining 4 points
+    _mm_store_ps(maxValuesBuffer, maxValues);
+    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+    for(number = 0; number < 4; number++){
+      if(maxValuesBuffer[number] > max){
+	index = maxIndexesBuffer[number];
+	max = maxValuesBuffer[number];
+      }
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){      
+      if(src0[number] > max){
+	index = number;
+	max = src0[number];
+      }
+    }
+    target[0] = (unsigned int)index;
+  }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) {
+  if(num_points > 0){
+    float max = src0[0];
+    unsigned int index = 0;
+    
+    int i = 1; 
+    
+    for(; i < num_points; ++i) {
+      
+      if(src0[i] > max){
+	index = i;
+	max = src0[i];
+      }
+
+    }
+    target[0] = index;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_max_16u_a16_H*/
diff --git a/volk/include/volk/volk_32f_index_max_aligned16.h b/volk/include/volk/volk_32f_index_max_aligned16.h
deleted file mode 100644
index 26322bfa26..0000000000
--- a/volk/include/volk/volk_32f_index_max_aligned16.h
+++ /dev/null
@@ -1,148 +0,0 @@
-#ifndef INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H
-#define INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H
-
-#include <volk/volk_common.h>
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include<smmintrin.h>
-
-static inline void volk_32f_index_max_aligned16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* inputPtr = (float*)src0;
-
-    __m128 indexIncrementValues = _mm_set1_ps(4);
-    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-
-    float max = src0[0];
-    float index = 0;
-    __m128 maxValues = _mm_set1_ps(max);
-    __m128 maxValuesIndex = _mm_setzero_ps();
-    __m128 compareResults;
-    __m128 currentValues;
-
-    float maxValuesBuffer[4] __attribute__((aligned(16)));
-    float maxIndexesBuffer[4] __attribute__((aligned(16)));
-
-    for(;number < quarterPoints; number++){
-
-      currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
-      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
-
-      compareResults = _mm_cmpgt_ps(maxValues, currentValues);
-
-      maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
-      maxValues      = _mm_blendv_ps(currentValues, maxValues, compareResults);
-    }
-
-    // Calculate the largest value from the remaining 4 points
-    _mm_store_ps(maxValuesBuffer, maxValues);
-    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
-    for(number = 0; number < 4; number++){
-      if(maxValuesBuffer[number] > max){
-	index = maxIndexesBuffer[number];
-	max = maxValuesBuffer[number];
-      }
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){      
-      if(src0[number] > max){
-	index = number;
-	max = src0[number];
-      }
-    }
-    target[0] = (unsigned int)index;
-  }
-}
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#if LV_HAVE_SSE
-#include<xmmintrin.h>
-
-static inline void volk_32f_index_max_aligned16_sse(unsigned int* target, const float* src0, unsigned int num_points) {
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* inputPtr = (float*)src0;
-
-    __m128 indexIncrementValues = _mm_set1_ps(4);
-    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-
-    float max = src0[0];
-    float index = 0;
-    __m128 maxValues = _mm_set1_ps(max);
-    __m128 maxValuesIndex = _mm_setzero_ps();
-    __m128 compareResults;
-    __m128 currentValues;
-
-    float maxValuesBuffer[4] __attribute__((aligned(16)));
-    float maxIndexesBuffer[4] __attribute__((aligned(16)));
-
-    for(;number < quarterPoints; number++){
-
-      currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
-      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
-
-      compareResults = _mm_cmpgt_ps(maxValues, currentValues);
-
-      maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
-
-      maxValues      = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
-    }
-
-    // Calculate the largest value from the remaining 4 points
-    _mm_store_ps(maxValuesBuffer, maxValues);
-    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
-    for(number = 0; number < 4; number++){
-      if(maxValuesBuffer[number] > max){
-	index = maxIndexesBuffer[number];
-	max = maxValuesBuffer[number];
-      }
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){      
-      if(src0[number] > max){
-	index = number;
-	max = src0[number];
-      }
-    }
-    target[0] = (unsigned int)index;
-  }
-}
-
-#endif /*LV_HAVE_SSE*/
-
-#if LV_HAVE_GENERIC
-static inline void volk_32f_index_max_aligned16_generic(unsigned int* target, const float* src0, unsigned int num_points) {
-  if(num_points > 0){
-    float max = src0[0];
-    unsigned int index = 0;
-    
-    int i = 1; 
-    
-    for(; i < num_points; ++i) {
-      
-      if(src0[i] > max){
-	index = i;
-	max = src0[i];
-      }
-
-    }
-    target[0] = index;
-  }
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_interleave_16sc_aligned16.h b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h
deleted file mode 100644
index 476946b887..0000000000
--- a/volk/include/volk/volk_32f_interleave_16sc_aligned16.h
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H
-#define INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
-    \param iBuffer The I buffer data to be interleaved
-    \param qBuffer The Q buffer data to be interleaved
-    \param complexVector The complex output vector
-    \param scalar The scaling value being multiplied against each data point
-    \param num_points The number of complex data values to be interleaved
-  */
-static inline void volk_32f_interleave_16sc_aligned16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const float* iBufferPtr = iBuffer;
-    const float* qBufferPtr = qBuffer;
-
-    __m128 vScalar = _mm_set_ps1(scalar);
-
-    const unsigned int quarterPoints = num_points / 4;
-    
-    __m128 iValue, qValue, cplxValue1, cplxValue2;
-    __m128i intValue1, intValue2;
-
-    int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-    for(;number < quarterPoints; number++){
-      iValue = _mm_load_ps(iBufferPtr);
-      qValue = _mm_load_ps(qBufferPtr);
-
-      // Interleaves the lower two values in the i and q variables into one buffer
-      cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
-      cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
-
-      // Interleaves the upper two values in the i and q variables into one buffer
-      cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
-      cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
-
-      intValue1 = _mm_cvtps_epi32(cplxValue1);
-      intValue2 = _mm_cvtps_epi32(cplxValue2);
-
-      intValue1 = _mm_packs_epi32(intValue1, intValue2);
-
-      _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
-      complexVectorPtr += 8;
-
-      iBufferPtr += 4;
-      qBufferPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    complexVectorPtr = (int16_t*)(&complexVector[number]);
-    for(; number < num_points; number++){
-      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
-      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
-    }
-    
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
-    \param iBuffer The I buffer data to be interleaved
-    \param qBuffer The Q buffer data to be interleaved
-    \param complexVector The complex output vector
-    \param scalar The scaling value being multiplied against each data point
-    \param num_points The number of complex data values to be interleaved
-  */
-static inline void volk_32f_interleave_16sc_aligned16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const float* iBufferPtr = iBuffer;
-    const float* qBufferPtr = qBuffer;
-
-    __m128 vScalar = _mm_set_ps1(scalar);
-
-    const unsigned int quarterPoints = num_points / 4;
-    
-    __m128 iValue, qValue, cplxValue;
-
-    int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-    float floatBuffer[4] __attribute__((aligned(128)));
-
-    for(;number < quarterPoints; number++){
-      iValue = _mm_load_ps(iBufferPtr);
-      qValue = _mm_load_ps(qBufferPtr);
-
-      // Interleaves the lower two values in the i and q variables into one buffer
-      cplxValue = _mm_unpacklo_ps(iValue, qValue);
-      cplxValue = _mm_mul_ps(cplxValue, vScalar);
-
-      _mm_store_ps(floatBuffer, cplxValue);
-
-      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
-
-      // Interleaves the upper two values in the i and q variables into one buffer
-      cplxValue = _mm_unpackhi_ps(iValue, qValue);
-      cplxValue = _mm_mul_ps(cplxValue, vScalar);
- 
-      _mm_store_ps(floatBuffer, cplxValue);
-      
-      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
-
-      iBufferPtr += 4;
-      qBufferPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    complexVectorPtr = (int16_t*)(&complexVector[number]);
-    for(; number < num_points; number++){
-      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
-      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
-    }
-    
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
-    \param iBuffer The I buffer data to be interleaved
-    \param qBuffer The Q buffer data to be interleaved
-    \param complexVector The complex output vector
-    \param scalar The scaling value being multiplied against each data point
-    \param num_points The number of complex data values to be interleaved
-  */
-static inline void volk_32f_interleave_16sc_aligned16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
-    *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_interleave_32fc_aligned16.h b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h
deleted file mode 100644
index 859c6a0efa..0000000000
--- a/volk/include/volk/volk_32f_interleave_32fc_aligned16.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H
-#define INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Interleaves the I & Q vector data into the complex vector
-  \param iBuffer The I buffer data to be interleaved
-  \param qBuffer The Q buffer data to be interleaved
-  \param complexVector The complex output vector
-  \param num_points The number of complex data values to be interleaved
-*/
-static inline void volk_32f_interleave_32fc_aligned16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
-  unsigned int number = 0;
-  float* complexVectorPtr = (float*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-
-  const uint64_t quarterPoints = num_points / 4;
-    
-  __m128 iValue, qValue, cplxValue;
-  for(;number < quarterPoints; number++){
-    iValue = _mm_load_ps(iBufferPtr);
-    qValue = _mm_load_ps(qBufferPtr);
-
-    // Interleaves the lower two values in the i and q variables into one buffer
-    cplxValue = _mm_unpacklo_ps(iValue, qValue);
-    _mm_store_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 4;
-
-    // Interleaves the upper two values in the i and q variables into one buffer
-    cplxValue = _mm_unpackhi_ps(iValue, qValue);
-    _mm_store_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 4;
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *complexVectorPtr++ = *iBufferPtr++;
-    *complexVectorPtr++ = *qBufferPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Interleaves the I & Q vector data into the complex vector.
-  \param iBuffer The I buffer data to be interleaved
-  \param qBuffer The Q buffer data to be interleaved
-  \param complexVector The complex output vector
-  \param num_points The number of complex data values to be interleaved
-*/
-static inline void volk_32f_interleave_32fc_aligned16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
-  float* complexVectorPtr = (float*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-  unsigned int number;
-
-  for(number = 0; number < num_points; number++){
-    *complexVectorPtr++ = *iBufferPtr++;
-    *complexVectorPtr++ = *qBufferPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h
deleted file mode 100644
index d4e30fba81..0000000000
--- a/volk/include/volk/volk_32f_max_aligned16.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_MAX_ALIGNED16_H
-#define INCLUDED_VOLK_32f_MAX_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_max_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_max_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_max_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-extern void volk_32f_max_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_max_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_max_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h
deleted file mode 100644
index 55daafb6a9..0000000000
--- a/volk/include/volk/volk_32f_min_aligned16.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_MIN_ALIGNED16_H
-#define INCLUDED_VOLK_32f_MIN_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_min_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_min_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_min_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-extern void volk_32f_min_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_min_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_min_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h
deleted file mode 100644
index 87ae7bcf8c..0000000000
--- a/volk/include/volk/volk_32f_multiply_aligned16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H
-#define INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Multiplys the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_multiply_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_mul_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Multiplys the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_multiply_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Multiplys the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-extern void volk_32f_multiply_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_multiply_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h
deleted file mode 100644
index 323d0513c2..0000000000
--- a/volk/include/volk/volk_32f_normalize_aligned16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H
-#define INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value )
-  \param vecBuffer The buffer of values to be vectorized
-  \param num_points The number of values in vecBuffer
-  \param scalar The scale value to be applied to each buffer value
-*/
-static inline void volk_32f_normalize_aligned16_sse(float* vecBuffer, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  float* inputPtr = vecBuffer;
-
-  const float invScalar = 1.0 / scalar;
-  __m128 vecScalar = _mm_set_ps1(invScalar);
-
-  __m128 input1;
-
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-
-    input1 = _mm_load_ps(inputPtr);
-
-    input1 = _mm_mul_ps(input1, vecScalar);
-
-    _mm_store_ps(inputPtr, input1);
-
-    inputPtr += 4;
-  }
-
-  number = quarterPoints*4;
-  for(; number < num_points; number++){
-    *inputPtr *= invScalar;
-    inputPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Normalizes the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be normalizeed
-  \param bVector One of the vectors to be normalizeed
-  \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
-*/
-static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  float* inputPtr = vecBuffer;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *inputPtr *= invScalar;
-    inputPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Normalizes the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be normalizeed
-  \param bVector One of the vectors to be normalizeed
-  \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
-*/
-extern void volk_32f_normalize_aligned16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
-static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){
-    float invscalar = 1.0 / scalar;
-    volk_32f_normalize_aligned16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_power_aligned16.h b/volk/include/volk/volk_32f_power_aligned16.h
deleted file mode 100644
index 2ecd8eecbd..0000000000
--- a/volk/include/volk/volk_32f_power_aligned16.h
+++ /dev/null
@@ -1,144 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_POWER_ALIGNED16_H
-#define INCLUDED_VOLK_32f_POWER_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE4_1
-#include <tmmintrin.h>
-
-#if LV_HAVE_LIB_SIMDMATH
-#include <simdmath.h>
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-/*!
-  \brief Takes each the input vector value to the specified power and stores the results in the return vector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector of values to be taken to a power
-  \param power The power value to be applied to each data point
-  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
-*/
-static inline void volk_32f_power_aligned16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-
-#if LV_HAVE_LIB_SIMDMATH
-  __m128 vPower = _mm_set_ps1(power);
-  __m128 zeroValue = _mm_setzero_ps();
-  __m128 signMask;
-  __m128 negatedValues;
-  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
-  __m128 onesMask = _mm_set_ps1(1);
-  
-  __m128 aVal, cVal;
-  for(;number < quarterPoints; number++){
-    
-    aVal = _mm_load_ps(aPtr);
-    signMask = _mm_cmplt_ps(aVal, zeroValue);
-    negatedValues = _mm_sub_ps(zeroValue, aVal);
-    aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
-    
-    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
-    cVal = powf4(aVal, vPower); // Takes each input value to the specified power
-
-    cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
-
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-    
-    aPtr += 4;
-    cPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-  for(;number < num_points; number++){
-    *cPtr++ = powf((*aPtr++), power);
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-
-#if LV_HAVE_LIB_SIMDMATH
-#include <simdmath.h>
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-/*!
-  \brief Takes each the input vector value to the specified power and stores the results in the return vector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector of values to be taken to a power
-  \param power The power value to be applied to each data point
-  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
-*/
-static inline void volk_32f_power_aligned16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-
-#if LV_HAVE_LIB_SIMDMATH
-  __m128 vPower = _mm_set_ps1(power);
-  __m128 zeroValue = _mm_setzero_ps();
-  __m128 signMask;
-  __m128 negatedValues;
-  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
-  __m128 onesMask = _mm_set_ps1(1);
-  
-  __m128 aVal, cVal;
-  for(;number < quarterPoints; number++){
-    
-    aVal = _mm_load_ps(aPtr);
-    signMask = _mm_cmplt_ps(aVal, zeroValue);
-    negatedValues = _mm_sub_ps(zeroValue, aVal);
-    aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );
-    
-    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
-    cVal = powf4(aVal, vPower); // Takes each input value to the specified power
-
-    cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);
-
-    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-    
-    aPtr += 4;
-    cPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-  for(;number < num_points; number++){
-    *cPtr++ = powf((*aPtr++), power);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Takes each the input vector value to the specified power and stores the results in the return vector
-    \param cVector The vector where the results will be stored
-    \param aVector The vector of values to be taken to a power
-    \param power The power value to be applied to each data point
-    \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
-  */
-static inline void volk_32f_power_aligned16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
-  float* cPtr = cVector;
-  const float* aPtr = aVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = powf((*aPtr++), power);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_POWER_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h
new file mode 100644
index 0000000000..ff4d5b19c3
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H
+#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector.
+  \param outputVector The byte-aligned vector where the results will be stored.
+  \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] )
+  \param bound The interval that the input phase data is in, which is used to modulo the differentiation
+  \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
+  \param num_noints The number of real values in the input vector.
+*/
+static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  unsigned int number = 1;
+  unsigned int j = 0;
+  // num_points-1 keeps Fedora 7's gcc from crashing...
+  // num_points won't work.  :(
+  const unsigned int quarterPoints = (num_points-1) / 4;
+
+  float* outPtr = outputVector;
+  const float* inPtr = inputVector;
+  __m128 upperBound = _mm_set_ps1(bound);
+  __m128 lowerBound = _mm_set_ps1(-bound);
+  __m128 next3old1;
+  __m128 next4;
+  __m128 boundAdjust;
+  __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
+  __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
+  // Do the first 4 by hand since we're going in from the saveValue:
+  *outPtr = *inPtr - *saveValue;
+  if (*outPtr >  bound) *outPtr -= 2*bound;
+  if (*outPtr < -bound) *outPtr += 2*bound;
+  inPtr++;
+  outPtr++;
+  for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr >  bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+    
+  for (; number < quarterPoints; number++) {
+    // Load data
+    next3old1 = _mm_loadu_ps((float*) (inPtr-1));
+    next4 = _mm_load_ps(inPtr);
+    inPtr += 4;
+    // Subtract and store:
+    next3old1 = _mm_sub_ps(next4, next3old1);
+    // Bound:
+    boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
+    boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
+    next4 = _mm_cmplt_ps(next3old1, lowerBound);
+    next4 = _mm_and_ps(next4, negBoundAdjust);
+    boundAdjust = _mm_or_ps(next4, boundAdjust);
+    // Make sure we're in the bounding interval:
+    next3old1 = _mm_add_ps(next3old1, boundAdjust);
+    _mm_store_ps(outPtr,next3old1); // Store the results back into the output
+    outPtr += 4;
+  }
+    
+  for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr >  bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+    
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector.
+  \param outputVector The byte-aligned vector where the results will be stored.
+  \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] )
+  \param bound The interval that the input phase data is in, which is used to modulo the differentiation
+  \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
+  \param num_points The number of real values in the input vector.
+*/
+static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  unsigned int number = 0;
+  float* outPtr = outputVector;
+  const float* inPtr = inputVector;
+      
+  // Do the first 1 by hand since we're going in from the saveValue:
+  *outPtr = *inPtr - *saveValue;
+  if (*outPtr >  bound) *outPtr -= 2*bound;
+  if (*outPtr < -bound) *outPtr += 2*bound;
+  inPtr++;
+  outPtr++;
+    
+  for (number = 1; number < num_points; number++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr >  bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+    
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h b/volk/include/volk/volk_32f_s32f_convert_16s_a16.h
new file mode 100644
index 0000000000..cf51cf9c53
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_16s_a16.h
@@ -0,0 +1,110 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_16s_a16_H
+#define INCLUDED_volk_32f_s32f_convert_16s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16s_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2;
+  __m128i intInputVal1, intInputVal2;
+
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;    
+  for(; number < num_points; number++){
+    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16s_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16s_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16s_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h
new file mode 100644
index 0000000000..53d159f82a
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h
@@ -0,0 +1,113 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_16s_ua16_H
+#define INCLUDED_volk_32f_s32f_convert_16s_ua16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_16s_ua16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2;
+  __m128i intInputVal1, intInputVal2;
+
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int16_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_16s_ua16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int16_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_16s_ua16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16s_ua16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h b/volk/include/volk/volk_32f_s32f_convert_32s_a16.h
new file mode 100644
index 0000000000..0be649418a
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_32s_a16.h
@@ -0,0 +1,106 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32s_a16_H
+#define INCLUDED_volk_32f_s32f_convert_32s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_32s_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1;
+  __m128i intInputVal1;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_32s_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_32s_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32s_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h
new file mode 100644
index 0000000000..efb2c3a20d
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32s_ua16_H
+#define INCLUDED_volk_32f_s32f_convert_32s_ua16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32s_ua16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1;
+  __m128i intInputVal1;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32s_ua16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32s_ua16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32s_ua16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h b/volk/include/volk/volk_32f_s32f_convert_8s_a16.h
new file mode 100644
index 0000000000..69ccec5c63
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_8s_a16.h
@@ -0,0 +1,117 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_8s_a16_H
+#define INCLUDED_volk_32f_s32f_convert_8s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8s_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int sixteenthPoints = num_points / 16;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
+    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8s_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8s_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8s_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h
new file mode 100644
index 0000000000..af1652b194
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_8s_ua16_H
+#define INCLUDED_volk_32f_s32f_convert_8s_ua16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_8s_ua16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int sixteenthPoints = num_points / 16;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
+    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_8s_ua16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_8s_ua16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8s_ua16_H */
diff --git a/volk/include/volk/volk_32f_s32f_normalize_a16.h b/volk/include/volk/volk_32f_s32f_normalize_a16.h
new file mode 100644
index 0000000000..0850cddf79
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_normalize_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_s32f_normalize_a16_H
+#define INCLUDED_volk_32f_s32f_normalize_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value )
+  \param vecBuffer The buffer of values to be vectorized
+  \param num_points The number of values in vecBuffer
+  \param scalar The scale value to be applied to each buffer value
+*/
+static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  float* inputPtr = vecBuffer;
+
+  const float invScalar = 1.0 / scalar;
+  __m128 vecScalar = _mm_set_ps1(invScalar);
+
+  __m128 input1;
+
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+
+    input1 = _mm_load_ps(inputPtr);
+
+    input1 = _mm_mul_ps(input1, vecScalar);
+
+    _mm_store_ps(inputPtr, input1);
+
+    inputPtr += 4;
+  }
+
+  number = quarterPoints*4;
+  for(; number < num_points; number++){
+    *inputPtr *= invScalar;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Normalizes the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be normalizeed
+  \param bVector One of the vectors to be normalizeed
+  \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
+*/
+static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  float* inputPtr = vecBuffer;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *inputPtr *= invScalar;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Normalizes the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be normalizeed
+  \param bVector One of the vectors to be normalizeed
+  \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
+*/
+extern void volk_32f_s32f_normalize_a16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
+static inline void volk_32f_s32f_normalize_a16_orc(float* vecBuffer, const float scalar, unsigned int num_points){
+    float invscalar = 1.0 / scalar;
+    volk_32f_s32f_normalize_a16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_normalize_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a16.h b/volk/include/volk/volk_32f_s32f_power_32f_a16.h
new file mode 100644
index 0000000000..3ed594d9a7
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_power_32f_a16.h
@@ -0,0 +1,144 @@
+#ifndef INCLUDED_volk_32f_s32f_power_32f_a16_H
+#define INCLUDED_volk_32f_s32f_power_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <tmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Takes each the input vector value to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values to be taken to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+*/
+static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 vPower = _mm_set_ps1(power);
+  __m128 zeroValue = _mm_setzero_ps();
+  __m128 signMask;
+  __m128 negatedValues;
+  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+  __m128 onesMask = _mm_set_ps1(1);
+  
+  __m128 aVal, cVal;
+  for(;number < quarterPoints; number++){
+    
+    aVal = _mm_load_ps(aPtr);
+    signMask = _mm_cmplt_ps(aVal, zeroValue);
+    negatedValues = _mm_sub_ps(zeroValue, aVal);
+    aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
+    
+    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+    cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+    cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
+
+    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+    
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  for(;number < num_points; number++){
+    *cPtr++ = powf((*aPtr++), power);
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Takes each the input vector value to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values to be taken to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+*/
+static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 vPower = _mm_set_ps1(power);
+  __m128 zeroValue = _mm_setzero_ps();
+  __m128 signMask;
+  __m128 negatedValues;
+  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+  __m128 onesMask = _mm_set_ps1(1);
+  
+  __m128 aVal, cVal;
+  for(;number < quarterPoints; number++){
+    
+    aVal = _mm_load_ps(aPtr);
+    signMask = _mm_cmplt_ps(aVal, zeroValue);
+    negatedValues = _mm_sub_ps(zeroValue, aVal);
+    aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );
+    
+    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+    cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+    cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);
+
+    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+    
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  for(;number < num_points; number++){
+    *cPtr++ = powf((*aPtr++), power);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Takes each the input vector value to the specified power and stores the results in the return vector
+    \param cVector The vector where the results will be stored
+    \param aVector The vector of values to be taken to a power
+    \param power The power value to be applied to each data point
+    \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+  */
+static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *cPtr++ = powf((*aPtr++), power);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_power_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h
new file mode 100644
index 0000000000..32f4fa0672
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h
@@ -0,0 +1,144 @@
+#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H
+#define INCLUDED_volk_32f_s32f_stddev_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the standard deviation of the input buffer using the supplied mean
+  \param stddev The calculated standard deviation
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param mean The mean of the input buffer
+  \param num_points The number of values in input buffer to used in the stddev calculation
+*/
+static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  float returnValue = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* aPtr = inputBuffer;
+
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal1, aVal2, aVal3, aVal4;
+    __m128 cVal1, cVal2, cVal3, cVal4;
+    for(;number < sixteenthPoints; number++) {
+      aVal1 = _mm_load_ps(aPtr); aPtr += 4;    
+      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+
+      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+
+      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+
+      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+
+      cVal1 = _mm_or_ps(cVal1, cVal2);
+      cVal3 = _mm_or_ps(cVal3, cVal4);
+      cVal1 = _mm_or_ps(cVal1, cVal3);
+
+      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    }
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+  
+    number = sixteenthPoints * 16;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      aPtr++;
+    }
+    returnValue /= num_points;
+    returnValue -= (mean * mean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the standard deviation of the input buffer using the supplied mean
+  \param stddev The calculated standard deviation
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param mean The mean of the input buffer
+  \param num_points The number of values in input buffer to used in the stddev calculation
+*/
+static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  float returnValue = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* aPtr = inputBuffer;
+
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal = _mm_setzero_ps();
+    for(;number < quarterPoints; number++) {
+      aVal = _mm_load_ps(aPtr);                     // aVal = x
+      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
+      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+      aPtr += 4;
+    }
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+  
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      aPtr++;
+    }
+    returnValue /= num_points;
+    returnValue -= (mean * mean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the standard deviation of the input buffer using the supplied mean
+  \param stddev The calculated standard deviation
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param mean The mean of the input buffer
+  \param num_points The number of values in input buffer to used in the stddev calculation
+*/
+static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  float returnValue = 0;
+  if(num_points > 0){
+    const float* aPtr = inputBuffer;
+    unsigned int number = 0;
+      
+    for(number = 0; number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      aPtr++;
+    }
+
+    returnValue /= num_points;
+    returnValue -= (mean * mean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_sqrt_32f_a16.h b/volk/include/volk/volk_32f_sqrt_32f_a16.h
new file mode 100644
index 0000000000..513c2cffec
--- /dev/null
+++ b/volk/include/volk/volk_32f_sqrt_32f_a16.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_32f_sqrt_32f_a16_H
+#define INCLUDED_volk_32f_sqrt_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Sqrts the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be sqrted
+  \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
+*/
+static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m128 aVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      
+      cVal = _mm_sqrt_ps(aVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = sqrtf(*aPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Sqrts the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be sqrted
+  \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
+*/
+static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = sqrtf(*aPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int);
+/*!
+  \brief Sqrts the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be sqrted
+  \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
+*/
+static inline void volk_32f_sqrt_32f_a16_orc(float* cVector, const float* aVector, unsigned int num_points){
+    volk_32f_sqrt_32f_a16_orc_impl(cVector, aVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32f_sqrt_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h
deleted file mode 100644
index f6996ad5fc..0000000000
--- a/volk/include/volk/volk_32f_sqrt_aligned16.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_SQRT_ALIGNED16_H
-#define INCLUDED_VOLK_32f_SQRT_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Sqrts the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be sqrted
-  \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
-*/
-static inline void volk_32f_sqrt_aligned16_sse(float* cVector, const float* aVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-
-    __m128 aVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      
-      cVal = _mm_sqrt_ps(aVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = sqrtf(*aPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Sqrts the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be sqrted
-  \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
-*/
-static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float* aVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = sqrtf(*aPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-extern void volk_32f_sqrt_aligned16_orc_impl(float *, const float*, unsigned int);
-/*!
-  \brief Sqrts the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be sqrted
-  \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
-*/
-static inline void volk_32f_sqrt_aligned16_orc(float* cVector, const float* aVector, unsigned int num_points){
-    volk_32f_sqrt_aligned16_orc_impl(cVector, aVector, num_points);
-}
-
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_VOLK_32f_SQRT_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_stddev_aligned16.h b/volk/include/volk/volk_32f_stddev_aligned16.h
deleted file mode 100644
index 1c6a08437e..0000000000
--- a/volk/include/volk/volk_32f_stddev_aligned16.h
+++ /dev/null
@@ -1,144 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H
-#define INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Calculates the standard deviation of the input buffer using the supplied mean
-  \param stddev The calculated standard deviation
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param mean The mean of the input buffer
-  \param num_points The number of values in input buffer to used in the stddev calculation
-*/
-static inline void volk_32f_stddev_aligned16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
-  float returnValue = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const float* aPtr = inputBuffer;
-
-    float squareBuffer[4] __attribute__((aligned(128)));
-
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal1, aVal2, aVal3, aVal4;
-    __m128 cVal1, cVal2, cVal3, cVal4;
-    for(;number < sixteenthPoints; number++) {
-      aVal1 = _mm_load_ps(aPtr); aPtr += 4;    
-      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
-
-      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
-
-      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
-
-      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
-
-      cVal1 = _mm_or_ps(cVal1, cVal2);
-      cVal3 = _mm_or_ps(cVal3, cVal4);
-      cVal1 = _mm_or_ps(cVal1, cVal3);
-
-      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
-    }
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-  
-    number = sixteenthPoints * 16;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      aPtr++;
-    }
-    returnValue /= num_points;
-    returnValue -= (mean * mean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the standard deviation of the input buffer using the supplied mean
-  \param stddev The calculated standard deviation
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param mean The mean of the input buffer
-  \param num_points The number of values in input buffer to used in the stddev calculation
-*/
-static inline void volk_32f_stddev_aligned16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
-  float returnValue = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* aPtr = inputBuffer;
-
-    float squareBuffer[4] __attribute__((aligned(128)));
-
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal = _mm_setzero_ps();
-    for(;number < quarterPoints; number++) {
-      aVal = _mm_load_ps(aPtr);                     // aVal = x
-      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
-      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
-      aPtr += 4;
-    }
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-  
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      aPtr++;
-    }
-    returnValue /= num_points;
-    returnValue -= (mean * mean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the standard deviation of the input buffer using the supplied mean
-  \param stddev The calculated standard deviation
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param mean The mean of the input buffer
-  \param num_points The number of values in input buffer to used in the stddev calculation
-*/
-static inline void volk_32f_stddev_aligned16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
-  float returnValue = 0;
-  if(num_points > 0){
-    const float* aPtr = inputBuffer;
-    unsigned int number = 0;
-      
-    for(number = 0; number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      aPtr++;
-    }
-
-    returnValue /= num_points;
-    returnValue -= (mean * mean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h
new file mode 100644
index 0000000000..2ba8098454
--- /dev/null
+++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h
@@ -0,0 +1,169 @@
+#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H
+#define INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation
+  \param mean The mean of the input buffer
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* aPtr = inputBuffer;
+    float meanBuffer[4] __attribute__((aligned(128)));
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal1, aVal2, aVal3, aVal4;
+    __m128 cVal1, cVal2, cVal3, cVal4;
+    for(;number < sixteenthPoints; number++) {
+      aVal1 = _mm_load_ps(aPtr); aPtr += 4;   
+      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+      accumulator = _mm_add_ps(accumulator, aVal1);  // accumulator += x
+
+      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+      accumulator = _mm_add_ps(accumulator, aVal2);  // accumulator += x
+
+      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+      accumulator = _mm_add_ps(accumulator, aVal3);  // accumulator += x
+
+      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+      accumulator = _mm_add_ps(accumulator, aVal4);  // accumulator += x
+
+      cVal1 = _mm_or_ps(cVal1, cVal2);
+      cVal3 = _mm_or_ps(cVal3, cVal4);
+      cVal1 = _mm_or_ps(cVal1, cVal3);
+
+      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
+    newMean = meanBuffer[0];
+    newMean += meanBuffer[1];
+    newMean += meanBuffer[2];
+    newMean += meanBuffer[3];
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+  
+    number = sixteenthPoints * 16;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation
+  \param mean The mean of the input buffer
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* aPtr = inputBuffer;
+    float meanBuffer[4] __attribute__((aligned(128)));
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal = _mm_setzero_ps();
+    for(;number < quarterPoints; number++) {
+      aVal = _mm_load_ps(aPtr);                     // aVal = x
+      accumulator = _mm_add_ps(accumulator, aVal);  // accumulator += x
+      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
+      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+      aPtr += 4;
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
+    newMean = meanBuffer[0];
+    newMean += meanBuffer[1];
+    newMean += meanBuffer[2];
+    newMean += meanBuffer[3];
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+  
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation
+  \param mean The mean of the input buffer
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_32f_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    const float* aPtr = inputBuffer;
+    unsigned int number = 0;
+    
+    for(number = 0; number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
deleted file mode 100644
index 1cd502257c..0000000000
--- a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
+++ /dev/null
@@ -1,169 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H
-#define INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Calculates the standard deviation and mean of the input buffer
-  \param stddev The calculated standard deviation
-  \param mean The mean of the input buffer
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param num_points The number of values in input buffer to used in the stddev and mean calculations
-*/
-static inline void volk_32f_stddev_and_mean_aligned16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const float* aPtr = inputBuffer;
-    float meanBuffer[4] __attribute__((aligned(128)));
-    float squareBuffer[4] __attribute__((aligned(128)));
-
-    __m128 accumulator = _mm_setzero_ps();
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal1, aVal2, aVal3, aVal4;
-    __m128 cVal1, cVal2, cVal3, cVal4;
-    for(;number < sixteenthPoints; number++) {
-      aVal1 = _mm_load_ps(aPtr); aPtr += 4;   
-      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
-      accumulator = _mm_add_ps(accumulator, aVal1);  // accumulator += x
-
-      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
-      accumulator = _mm_add_ps(accumulator, aVal2);  // accumulator += x
-
-      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
-      accumulator = _mm_add_ps(accumulator, aVal3);  // accumulator += x
-
-      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
-      accumulator = _mm_add_ps(accumulator, aVal4);  // accumulator += x
-
-      cVal1 = _mm_or_ps(cVal1, cVal2);
-      cVal3 = _mm_or_ps(cVal3, cVal4);
-      cVal1 = _mm_or_ps(cVal1, cVal3);
-
-      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
-    }
-    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
-    newMean = meanBuffer[0];
-    newMean += meanBuffer[1];
-    newMean += meanBuffer[2];
-    newMean += meanBuffer[3];
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-  
-    number = sixteenthPoints * 16;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
-    }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the standard deviation and mean of the input buffer
-  \param stddev The calculated standard deviation
-  \param mean The mean of the input buffer
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param num_points The number of values in input buffer to used in the stddev and mean calculations
-*/
-static inline void volk_32f_stddev_and_mean_aligned16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* aPtr = inputBuffer;
-    float meanBuffer[4] __attribute__((aligned(128)));
-    float squareBuffer[4] __attribute__((aligned(128)));
-
-    __m128 accumulator = _mm_setzero_ps();
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal = _mm_setzero_ps();
-    for(;number < quarterPoints; number++) {
-      aVal = _mm_load_ps(aPtr);                     // aVal = x
-      accumulator = _mm_add_ps(accumulator, aVal);  // accumulator += x
-      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
-      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
-      aPtr += 4;
-    }
-    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
-    newMean = meanBuffer[0];
-    newMean += meanBuffer[1];
-    newMean += meanBuffer[2];
-    newMean += meanBuffer[3];
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-  
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
-    }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the standard deviation and mean of the input buffer
-  \param stddev The calculated standard deviation
-  \param mean The mean of the input buffer
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param num_points The number of values in input buffer to used in the stddev and mean calculations
-*/
-static inline void volk_32f_stddev_and_mean_aligned16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    const float* aPtr = inputBuffer;
-    unsigned int number = 0;
-    
-    for(number = 0; number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
-    }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h
deleted file mode 100644
index e152429016..0000000000
--- a/volk/include/volk/volk_32f_subtract_aligned16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H
-#define INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Subtracts bVector form aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The initial vector
-  \param bVector The vector to be subtracted
-  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
-*/
-static inline void volk_32f_subtract_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_sub_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) - (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Subtracts bVector form aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The initial vector
-  \param bVector The vector to be subtracted
-  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
-*/
-static inline void volk_32f_subtract_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) - (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Subtracts bVector form aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The initial vector
-  \param bVector The vector to be subtracted
-  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
-*/
-extern void volk_32f_subtract_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_subtract_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_subtract_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_sum_of_poly_aligned16.h b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h
deleted file mode 100644
index a326e62b1d..0000000000
--- a/volk/include/volk/volk_32f_sum_of_poly_aligned16.h
+++ /dev/null
@@ -1,151 +0,0 @@
-#ifndef INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H
-#define INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H
-
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
-
-#ifndef MAX
-#define MAX(X,Y) ((X) > (Y)?(X):(Y))
-#endif
-
-#if LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void volk_32f_sum_of_poly_aligned16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
-  
-  
-  float result = 0.0;
-  float fst = 0.0;
-  float sq = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-  //float fith = 0.0;
-  
-  
-  
-  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
-
-  xmm9 = _mm_setzero_ps();
-  xmm1 = _mm_setzero_ps();
-  
-  xmm0 = _mm_load1_ps(&center_point_array[0]);
-  xmm6 = _mm_load1_ps(&center_point_array[1]);
-  xmm7 = _mm_load1_ps(&center_point_array[2]);
-  xmm8 = _mm_load1_ps(&center_point_array[3]);
-  //xmm11 = _mm_load1_ps(&center_point_array[4]);
-  xmm10 = _mm_load1_ps(cutoff);
-  
-  int bound = num_bytes >> 4;
-  int leftovers = (num_bytes >> 2) & 3;
-  int i = 0;
-  
-  for(; i < bound; ++i) {
-    xmm2 = _mm_load_ps(src0);
-    xmm2 = _mm_max_ps(xmm10, xmm2);
-    xmm3 = _mm_mul_ps(xmm2, xmm2);
-    xmm4 = _mm_mul_ps(xmm2, xmm3);
-    xmm5 = _mm_mul_ps(xmm3, xmm3);
-    //xmm12 = _mm_mul_ps(xmm3, xmm4);
-
-    xmm2 = _mm_mul_ps(xmm2, xmm0);
-    xmm3 = _mm_mul_ps(xmm3, xmm6);
-    xmm4 = _mm_mul_ps(xmm4, xmm7);
-    xmm5 = _mm_mul_ps(xmm5, xmm8);
-    //xmm12 = _mm_mul_ps(xmm12, xmm11);
-
-    xmm2 = _mm_add_ps(xmm2, xmm3);
-    xmm3 = _mm_add_ps(xmm4, xmm5);
-    
-    src0 += 4;
-    
-    xmm9 = _mm_add_ps(xmm2, xmm9);
-    
-    xmm1 = _mm_add_ps(xmm3, xmm1);
-
-    //xmm9 = _mm_add_ps(xmm12, xmm9);
-  }
-  
-  xmm2 = _mm_hadd_ps(xmm9, xmm1);
-  xmm3 = _mm_hadd_ps(xmm2, xmm2);
-  xmm4 = _mm_hadd_ps(xmm3, xmm3);
-
-  _mm_store_ss(&result, xmm4);
-    
-  
-
-  for(i = 0; i < leftovers; ++i) {
-    fst = src0[i];
-    fst = MAX(fst, *cutoff);
-    sq = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    //fith = sq * thrd;
-    
-    result += (center_point_array[0] * fst + 
-	       center_point_array[1] * sq + 
-	       center_point_array[2] * thrd + 
-	       center_point_array[3] * frth);// + 
-	       //center_point_array[4] * fith);
-  }
-
-  result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
-
-  target[0] = result;
-}
- 
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_32f_sum_of_poly_aligned16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
-
-
-    
-  float result = 0.0;
-  float fst = 0.0;
-  float sq = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-  //float fith = 0.0;
-  
-
-
-  int i = 0; 
-  
-  for(; i < num_bytes >> 2; ++i) {
-    fst = src0[i];
-    fst = MAX(fst, *cutoff);
-    
-    sq = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    //fith = sq * thrd;
-    
-    result += (center_point_array[0] * fst + 
-	       center_point_array[1] * sq + 
-	       center_point_array[2] * thrd + 
-	       center_point_array[3] * frth); //+
-	       //center_point_array[4] * fith);
-    /*printf("%f12...%d\n", (center_point_array[0] * fst + 
-		  center_point_array[1] * sq + 
-		  center_point_array[2] * thrd + 
-			 center_point_array[3] * frth) +
-	   //center_point_array[4] * fith) + 
-	   (center_point_array[4]), i);
-    */
-  }
-
-  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
-
-  
-  
-  *target = result;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h
new file mode 100644
index 0000000000..514998800f
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h
@@ -0,0 +1,95 @@
+#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a16_H
+#define INCLUDED_volk_32fc_32f_multiply_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies the input complex vector with the input float vector and store their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector The complex vector to be multiplied
+    \param bVector The vectors containing the float values to be multiplied against each complex value in aVector
+    \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal1 = _mm_load_ps((const float*)aPtr);
+      aPtr += 2;
+ 
+      aVal2 = _mm_load_ps((const float*)aPtr); 
+      aPtr += 2;
+
+      bVal = _mm_load_ps(bPtr);
+      bPtr += 4;
+
+      bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
+      bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
+
+      cVal = _mm_mul_ps(aVal1, bVal1); 
+      
+      _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+      cPtr += 2;
+
+      cVal = _mm_mul_ps(aVal2, bVal2); 
+      
+      _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+
+      cPtr += 2;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr);
+      bPtr++;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector The complex vector to be multiplied
+    \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector
+    \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+  lv_32fc_t* cPtr = cVector;
+  const lv_32fc_t* aPtr = aVector;
+  const float* bPtr=  bVector;
+  unsigned int number = 0;
+  
+  for(number = 0; number < num_points; number++){
+    *cPtr++ = (*aPtr++) * (*bPtr++);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector The complex vector to be multiplied
+    \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector
+    \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+  */
+extern void volk_32fc_32f_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32fc_32f_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+    volk_32fc_32f_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a16_H */
diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
deleted file mode 100644
index 304ed8e2d0..0000000000
--- a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
+++ /dev/null
@@ -1,95 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies the input complex vector with the input float vector and store their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector The complex vector to be multiplied
-    \param bVector The vectors containing the float values to be multiplied against each complex value in aVector
-    \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_32f_multiply_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    lv_32fc_t* cPtr = cVector;
-    const lv_32fc_t* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal1 = _mm_load_ps((const float*)aPtr);
-      aPtr += 2;
- 
-      aVal2 = _mm_load_ps((const float*)aPtr); 
-      aPtr += 2;
-
-      bVal = _mm_load_ps(bPtr);
-      bPtr += 4;
-
-      bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
-      bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
-
-      cVal = _mm_mul_ps(aVal1, bVal1); 
-      
-      _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
-      cPtr += 2;
-
-      cVal = _mm_mul_ps(aVal2, bVal2); 
-      
-      _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
-
-      cPtr += 2;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr);
-      bPtr++;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector The complex vector to be multiplied
-    \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector
-    \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  const float* bPtr=  bVector;
-  unsigned int number = 0;
-  
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = (*aPtr++) * (*bPtr++);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector The complex vector to be multiplied
-    \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector
-    \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-  */
-extern void volk_32fc_32f_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32fc_32f_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
-    volk_32fc_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-#endif /* INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_32f_power_32fc_a16.h
new file mode 100644
index 0000000000..6f9e9e3ee1
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_power_32fc_a16.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_volk_32fc_32f_power_32fc_a16_H
+#define INCLUDED_volk_32fc_32f_power_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Takes each the input complex vector value to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The complex vector of values to be taken to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+*/
+static inline void volk_32fc_32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  
+  lv_32fc_t* cPtr = cVector;
+  const lv_32fc_t* aPtr = aVector;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 vPower = _mm_set_ps1(power);
+  
+  __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
+  for(;number < quarterPoints; number++){
+    
+    cplxValue1 = _mm_load_ps((float*)aPtr); 
+    aPtr += 2;
+    
+    cplxValue2 = _mm_load_ps((float*)aPtr); 
+    aPtr += 2;
+    
+    // Convert to polar coordinates
+    
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+    
+    phase = atan2f4(qValue, iValue); // Calculate the Phase
+    
+    magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
+    
+    // Now calculate the power of the polar coordinate data
+    magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
+    
+    phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
+    
+    // Convert back to cartesian coordinates
+    iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
+    qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
+    
+    cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
+    cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
+    
+    _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
+    
+    cPtr += 2;
+    
+    _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
+    
+    cPtr += 2;
+  }
+  
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  lv_32fc_t complexPower;
+  ((float*)&complexPower)[0] = power;
+  ((float*)&complexPower)[1] = 0;
+  for(;number < num_points; number++){
+    *cPtr++ = lv_cpow((*aPtr++), complexPower);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Takes each the input complex vector value to the specified power and stores the results in the return vector
+    \param cVector The vector where the results will be stored
+    \param aVector The complex vector of values to be taken to a power
+    \param power The power value to be applied to each data point
+    \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+  */
+static inline void volk_32fc_32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+  lv_32fc_t* cPtr = cVector;
+  const lv_32fc_t* aPtr = aVector;
+  unsigned int number = 0;
+  lv_32fc_t complexPower;
+  ((float*)&complexPower)[0] = power;
+  ((float*)&complexPower)[1] = 0.0;
+
+  for(number = 0; number < num_points; number++){
+    *cPtr++ = lv_cpow((*aPtr++), complexPower);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_32f_power_32fc_a16_H */
diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h
deleted file mode 100644
index 2d71ee4f88..0000000000
--- a/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-
-#if LV_HAVE_LIB_SIMDMATH
-#include <simdmath.h>
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-/*!
-  \brief Takes each the input complex vector value to the specified power and stores the results in the return vector
-  \param cVector The vector where the results will be stored
-  \param aVector The complex vector of values to be taken to a power
-  \param power The power value to be applied to each data point
-  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
-*/
-static inline void volk_32fc_32f_power_32fc_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-
-#if LV_HAVE_LIB_SIMDMATH
-  __m128 vPower = _mm_set_ps1(power);
-  
-  __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
-  for(;number < quarterPoints; number++){
-    
-    cplxValue1 = _mm_load_ps((float*)aPtr); 
-    aPtr += 2;
-    
-    cplxValue2 = _mm_load_ps((float*)aPtr); 
-    aPtr += 2;
-    
-    // Convert to polar coordinates
-    
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-    
-    phase = atan2f4(qValue, iValue); // Calculate the Phase
-    
-    magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
-    
-    // Now calculate the power of the polar coordinate data
-    magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
-    
-    phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
-    
-    // Convert back to cartesian coordinates
-    iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
-    qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
-    
-    cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
-    cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
-    
-    _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
-    
-    cPtr += 2;
-    
-    _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
-    
-    cPtr += 2;
-  }
-  
-  number = quarterPoints * 4;
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-  lv_32fc_t complexPower;
-  ((float*)&complexPower)[0] = power;
-  ((float*)&complexPower)[1] = 0;
-  for(;number < num_points; number++){
-    *cPtr++ = lv_cpow((*aPtr++), complexPower);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Takes each the input complex vector value to the specified power and stores the results in the return vector
-    \param cVector The vector where the results will be stored
-    \param aVector The complex vector of values to be taken to a power
-    \param power The power value to be applied to each data point
-    \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
-  */
-static inline void volk_32fc_32f_power_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  unsigned int number = 0;
-  lv_32fc_t complexPower;
-  ((float*)&complexPower)[0] = power;
-  ((float*)&complexPower)[1] = 0.0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = lv_cpow((*aPtr++), complexPower);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h
new file mode 100644
index 0000000000..cd9cc81609
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h
@@ -0,0 +1,344 @@
+#ifndef INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H
+#define INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H
+
+#include<volk/volk_complex.h>
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  float * res = (float*) result;
+  float * in = (float*) input;
+  float * tp = (float*) taps;
+  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+  unsigned int isodd = (num_bytes >> 3) &1;
+  
+  
+  
+  float sum0[2] = {0,0};
+  float sum1[2] = {0,0};
+  int i = 0;
+
+  
+  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+    
+
+    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+    
+    
+    in += 4;
+    tp += 4;
+
+  }
+ 
+  
+  res[0] = sum0[0] + sum1[0];
+  res[1] = sum0[1] + sum1[1];
+  
+  
+  
+  for(i = 0; i < isodd; ++i) {
+
+
+    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+
+  }
+  /*
+  for(i = 0; i < num_bytes >> 3; ++i) {
+    *result += input[i] * conjf(taps[i]);
+  }
+  */
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+  
+
+
+
+  asm volatile 
+    (
+     "#  ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+     "#                         const float *taps, unsigned num_bytes)\n\t"
+     "#    float sum0 = 0;\n\t"
+     "#    float sum1 = 0;\n\t"
+     "#    float sum2 = 0;\n\t"
+     "#    float sum3 = 0;\n\t"
+     "#    do {\n\t"
+     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+     "#      input += 4;\n\t"
+     "#      taps += 4;  \n\t"
+     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+     "#    result[0] = sum0 + sum2;\n\t"
+     "#    result[1] = sum1 + sum3;\n\t"
+     "# TODO: prefetch and better scheduling\n\t"
+     "  xor    %%r9,  %%r9\n\t"
+     "  xor    %%r10, %%r10\n\t"
+     "  movq   %[conjugator], %%r9\n\t"
+     "  movq   %%rcx, %%rax\n\t"
+     "  movaps 0(%%r9), %%xmm8\n\t"
+     "  movq   %%rcx, %%r8\n\t"
+     "  movq   %[rsi],  %%r9\n\t"
+     "  movq   %[rdx], %%r10\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%r9), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movups	0(%%r10), %%xmm2\n\t"
+     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
+     "  shr     $4, %%r8\n\t"
+     "  xorps  %%xmm8, %%xmm2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%r9), %%xmmA\n\t"
+     "#	movaps	(%%r10), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%r9), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%r10), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "  xorps   %%xmm8, %%xmm3\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%r9), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	add	$32, %%r9\n\t"
+     "	movaps	32(%%r10), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	add	$32, %%r10\n\t"
+     "  xorps   %%xmm8, %%xmm2\n\t"
+     ".%=L1_test:\n\t"
+     "	dec	%%rax\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	and	$1, %%r8\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "	mov	$0x80000000, %%r9\n\t"
+     "	movd	%%r9, %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
+     :
+     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
+     :"rax", "r8", "r9", "r10"
+     );
+  
+  
+  int getem = num_bytes % 16;
+  
+  
+  for(; getem > 0; getem -= 8) {
+  
+    
+    *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+  
+  }
+
+  return;
+}  
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+  int bound = num_bytes >> 4;
+  int leftovers = num_bytes % 16;
+
+  
+  asm volatile 
+    (
+     "	#pushl	%%ebp\n\t"
+     "	#movl	%%esp, %%ebp\n\t"
+     "	#movl	12(%%ebp), %%eax		# input\n\t"
+     "	#movl	16(%%ebp), %%edx		# taps\n\t"
+     "	#movl	20(%%ebp), %%ecx                # n_bytes\n\t"
+     "  movaps  0(%[conjugator]), %%xmm1\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%[eax]), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%[edx]), %%xmm2\n\t"
+     "  movl    %[ecx], (%[out])\n\t"
+     "	shrl	$5, %[ecx]		# ecx = n_2_ccomplex_blocks / 2\n\t"
+     
+     "  xorps   %%xmm1, %%xmm2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%[eax]), %%xmmA\n\t"
+     "#	movaps	(%[edx]), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%[edx]), %%xmm3\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "  xorps   %%xmm1, %%xmm3\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	movaps	16(%[eax]), %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "  movaps  0(%[conjugator]), %%xmm1\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%[eax]), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	addl	$32, %[eax]\n\t"
+     "	movaps	32(%[edx]), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "  xorps   %%xmm1, %%xmm2\n\t"
+     "	addl	$32, %[edx]\n\t"
+     ".%=L1_test:\n\t"
+     "	decl	%[ecx]\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	movl	0(%[out]), %[ecx]		# n_2_ccomplex_blocks\n\t"
+     "  shrl    $4, %[ecx]\n\t"
+     "	andl	$1, %[ecx]\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "  #movl 8(%%ebp), %[eax] \n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "  movl	$0x80000000, (%[out])\n\t"
+     "	movss	(%[out]), %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	#movl	8(%%ebp), %[eax]		# @result\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[out])		# store low 2x32 bits (complex) to memory\n\t"
+     "	#popl	%%ebp\n\t"
+     :
+     : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
+     );
+
+  
+  
+  
+  printf("%d, %d\n", leftovers, bound);
+  
+  for(; leftovers > 0; leftovers -= 8) {
+    
+    
+    *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+    
+  }
+  
+  return;
+  
+  
+  
+
+  
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+
+
+#endif /*INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h
new file mode 100644
index 0000000000..2ccfcf2f27
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h
@@ -0,0 +1,468 @@
+#ifndef INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H
+#define INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#if LV_HAVE_GENERIC 
+
+
+static inline void volk_32fc_32fc_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  float * res = (float*) result;
+  float * in = (float*) input;
+  float * tp = (float*) taps;
+  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+  unsigned int isodd = (num_bytes >> 3) &1;
+  
+  
+  
+  float sum0[2] = {0,0};
+  float sum1[2] = {0,0};
+  int i = 0;
+
+  
+  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+    
+
+    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+    
+    
+    in += 4;
+    tp += 4;
+
+  }
+
+  
+  res[0] = sum0[0] + sum1[0];
+  res[1] = sum0[1] + sum1[1];
+  
+  
+  
+  for(i = 0; i < isodd; ++i) {
+
+
+    *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
+
+  }
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+
+  asm 
+    (
+     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+     "#                         const float *taps, unsigned num_bytes)\n\t"
+     "#    float sum0 = 0;\n\t"
+     "#    float sum1 = 0;\n\t"
+     "#    float sum2 = 0;\n\t"
+     "#    float sum3 = 0;\n\t"
+     "#    do {\n\t"
+     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+     "#      input += 4;\n\t"
+     "#      taps += 4;  \n\t"
+     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+     "#    result[0] = sum0 + sum2;\n\t"
+     "#    result[1] = sum1 + sum3;\n\t"
+     "# TODO: prefetch and better scheduling\n\t"
+     "  xor    %%r9,  %%r9\n\t"
+     "  xor    %%r10, %%r10\n\t"
+     "  movq   %%rcx, %%rax\n\t"
+     "  movq   %%rcx, %%r8\n\t"
+     "  movq   %[rsi],  %%r9\n\t"
+     "  movq   %[rdx], %%r10\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%r9), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%%r10), %%xmm2\n\t"
+     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
+     "  shr     $4, %%r8\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%r9), %%xmmA\n\t"
+     "#	movaps	(%%r10), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%r9), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%r10), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%r9), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	add	$32, %%r9\n\t"
+     "	movaps	32(%%r10), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	add	$32, %%r10\n\t"
+     ".%=L1_test:\n\t"
+     "	dec	%%rax\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	and	$1, %%r8\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "	mov	$0x80000000, %%r9\n\t"
+     "	movd	%%r9, %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
+     :
+     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+     :"rax", "r8", "r9", "r10"
+     );
+  
+  
+  int getem = num_bytes % 16;
+  
+  
+  for(; getem > 0; getem -= 8) {
+  
+    
+    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+  
+  }
+
+  return;
+  
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+
+static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  asm volatile 
+    (
+     "	#pushl	%%ebp\n\t"
+     "	#movl	%%esp, %%ebp\n\t"
+     "	movl	12(%%ebp), %%eax		# input\n\t"
+     "	movl	16(%%ebp), %%edx		# taps\n\t"
+     "	movl	20(%%ebp), %%ecx                # n_bytes\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%eax), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%%edx), %%xmm2\n\t"
+     "	shrl	$5, %%ecx		# ecx = n_2_ccomplex_blocks / 2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%eax), %%xmmA\n\t"
+     "#	movaps	(%%edx), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%eax), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%edx), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%eax), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	addl	$32, %%eax\n\t"
+     "	movaps	32(%%edx), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	addl	$32, %%edx\n\t"
+     ".%=L1_test:\n\t"
+     "	decl	%%ecx\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	movl	20(%%ebp), %%ecx		# n_2_ccomplex_blocks\n\t"
+     "  shrl    $4, %%ecx\n\t"
+     "	andl	$1, %%ecx\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "  movl 8(%%ebp), %%eax \n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "  movl	$0x80000000, (%%eax)\n\t"
+     "	movss	(%%eax), %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	#movl	8(%%ebp), %%eax		# @result\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%%eax)		# store low 2x32 bits (complex) to memory\n\t"
+     "	#popl	%%ebp\n\t"
+     :
+     :
+     : "eax", "ecx", "edx"
+     );
+
+  
+  int getem = num_bytes % 16;
+  
+  for(; getem > 0; getem -= 8) {
+    
+    
+    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+    
+  }
+  
+  return;
+  
+  
+  
+
+  
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+
+  lv_32fc_t dotProduct;
+  memset(&dotProduct, 0x0, 2*sizeof(float));
+
+  unsigned int number = 0;
+  const unsigned int halfPoints = num_bytes >> 4;
+
+  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+  const lv_32fc_t* a = input;
+  const lv_32fc_t* b = taps;
+
+  dotProdVal = _mm_setzero_ps();
+
+  for(;number < halfPoints; number++){
+      
+    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+      
+    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+      
+    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+    a += 2;
+    b += 2;
+  }
+
+  lv_32fc_t dotProductVector[2] __attribute__((aligned(16)));
+
+  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+
+  if((num_bytes >> 2) != 0) {
+    dotProduct += (*a) * (*b);
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  volk_32fc_32fc_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes);
+  // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
+   /* 
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+    float *p_input, *p_taps;
+    __m64 *p_result;
+
+    p_result = (__m64*)result;
+    p_input = (float*)input;
+    p_taps = (float*)taps;
+
+    static const __m128i neg = {0x000000000000000080000000};
+
+    int i = 0;
+  
+    int bound = (num_bytes >> 5);
+    int leftovers = (num_bytes & 24) >> 3;
+
+    real0 = _mm_sub_ps(real0, real0);
+    real1 = _mm_sub_ps(real1, real1);
+    im0 = _mm_sub_ps(im0, im0);
+    im1 = _mm_sub_ps(im1, im1);
+  
+    for(; i < bound; ++i) {
+  
+    
+    xmm0 = _mm_load_ps(p_input);
+    xmm1 = _mm_load_ps(p_taps);
+    
+    p_input += 4;
+    p_taps += 4;
+    
+    xmm2 = _mm_load_ps(p_input);
+    xmm3 = _mm_load_ps(p_taps);
+    
+    p_input += 4;
+    p_taps += 4;
+    
+    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+    
+    //imaginary vector from input
+    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+    //real vector from input
+    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+    //imaginary vector from taps
+    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+    //real vector from taps
+    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+    
+    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+    
+    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+    
+    real0 = _mm_add_ps(xmm4, real0);
+    real1 = _mm_add_ps(xmm5, real1);
+    im0 = _mm_add_ps(xmm6, im0);
+    im1 = _mm_add_ps(xmm7, im1);
+    
+    }
+
+
+    
+    
+    real1 = _mm_xor_ps(real1, (__m128)neg);
+    
+  
+    im0 = _mm_add_ps(im0, im1);
+    real0 = _mm_add_ps(real0, real1);
+  
+    im0 = _mm_add_ps(im0, real0);
+  
+    _mm_storel_pi(p_result, im0);
+  
+    for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
+    
+    *result += input[i] * taps[i];
+    }
+  */
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h
new file mode 100644
index 0000000000..59259882ca
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h
@@ -0,0 +1,95 @@
+#ifndef INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H
+#define INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_32fc_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for(;number < halfPoints; number++){
+      
+      x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+      y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+      
+      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+      
+      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    
+      _mm_store_ps((float*)c,z); // Store the results back into the C container
+
+      a += 2;
+      b += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) {
+      *c = (*a) * (*b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_32fc_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+extern void volk_32fc_32fc_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points);
+static inline void volk_32fc_32fc_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    static const float mask = -0.0;
+    volk_32fc_32fc_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H */
diff --git a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h
new file mode 100644
index 0000000000..14f5116970
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H
+#define INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+#include <string.h>
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+  
+
+  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+  lv_32fc_t diff;
+  memset(&diff, 0x0, 2*sizeof(float));
+
+  float sq_dist = 0.0;
+  int bound = num_bytes >> 5;
+  int leftovers0 = (num_bytes >> 4) & 1;
+  int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+  
+  
+  
+  xmm1 = _mm_setzero_ps();
+  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
+  xmm2 = _mm_load_ps((float*)&points[0]);
+  xmm8 = _mm_load1_ps(&scalar);
+  xmm1 = _mm_movelh_ps(xmm1, xmm1);
+  xmm3 = _mm_load_ps((float*)&points[2]);
+  
+  
+  for(; i < bound - 1; ++i) {
+  
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    xmm5 = _mm_sub_ps(xmm1, xmm3);
+    points += 4;
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+    xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+    xmm3 = _mm_load_ps((float*)&points[2]);
+    
+    xmm4 = _mm_mul_ps(xmm4, xmm8);
+
+    _mm_store_ps(target, xmm4);
+
+    target += 4;
+
+  }
+  
+  xmm4 = _mm_sub_ps(xmm1, xmm2);
+  xmm5 = _mm_sub_ps(xmm1, xmm3);
+  
+  
+
+  points += 4;
+  xmm6 = _mm_mul_ps(xmm4, xmm4);
+  xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+  xmm4 = _mm_hadd_ps(xmm6, xmm7);
+  
+  xmm4 = _mm_mul_ps(xmm4, xmm8);
+   
+  _mm_store_ps(target, xmm4);
+  
+  target += 4;
+  
+
+  for(i = 0; i < leftovers0; ++i) {
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    
+    points += 2;
+    
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+    xmm4 = _mm_mul_ps(xmm4, xmm8);
+    
+    _mm_storeh_pi((__m64*)target, xmm4);
+
+    target += 2;
+  }
+
+  for(i = 0; i < leftovers1; ++i) {
+    
+    diff = src0[0] - points[0];
+
+    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+
+    target[0] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+  lv_32fc_t diff;
+  float sq_dist;
+  int i = 0; 
+  
+  for(; i < num_bytes >> 3; ++i) {
+    diff = src0[0] - points[i];
+
+    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+    
+    target[i] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h
new file mode 100644
index 0000000000..b6c72adbf2
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h
@@ -0,0 +1,112 @@
+#ifndef INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H
+#define INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32fc_32fc_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+  
+
+  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  lv_32fc_t diff;
+  float sq_dist;
+  int bound = num_bytes >> 5;
+  int leftovers0 = (num_bytes >> 4) & 1;
+  int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+
+  xmm1 = _mm_setzero_ps();
+  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
+  xmm2 = _mm_load_ps((float*)&points[0]);
+  xmm1 = _mm_movelh_ps(xmm1, xmm1);
+  xmm3 = _mm_load_ps((float*)&points[2]);
+  
+
+  for(; i < bound - 1; ++i) {
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    xmm5 = _mm_sub_ps(xmm1, xmm3);
+    points += 4;
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+    xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+    xmm3 = _mm_load_ps((float*)&points[2]);
+
+    _mm_store_ps(target, xmm4);
+
+    target += 4;
+
+  }
+  
+  xmm4 = _mm_sub_ps(xmm1, xmm2);
+  xmm5 = _mm_sub_ps(xmm1, xmm3);
+  
+  
+
+  points += 4;
+  xmm6 = _mm_mul_ps(xmm4, xmm4);
+  xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+  xmm4 = _mm_hadd_ps(xmm6, xmm7);
+   
+  _mm_store_ps(target, xmm4);
+  
+  target += 4;
+
+  for(i = 0; i < leftovers0; ++i) {
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    
+    points += 2;
+    
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+    
+    _mm_storeh_pi((__m64*)target, xmm4);
+
+    target += 2;
+  }
+
+  for(i = 0; i < leftovers1; ++i) {
+    
+    diff = src0[0] - points[0];
+
+    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+    target[0] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32fc_32fc_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+  lv_32fc_t diff;
+  float sq_dist;
+  int i = 0; 
+  
+  for(; i < num_bytes >> 3; ++i) {
+    diff = src0[0] - points[i];
+
+    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+    
+    target[i] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H*/
diff --git a/volk/include/volk/volk_32fc_atan2_32f_aligned16.h b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h
deleted file mode 100644
index df0ebb987b..0000000000
--- a/volk/include/volk/volk_32fc_atan2_32f_aligned16.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-#if LV_HAVE_LIB_SIMDMATH
-#include <simdmath.h>
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-/*!
-  \brief performs the atan2 on the input vector and stores the results in the output vector.
-  \param outputVector The byte-aligned vector where the results will be stored.
-  \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
-  \param normalizeFactor The atan2 results will be divided by this normalization factor.
-  \param num_points The number of complex values in the input vector.
-*/
-static inline void volk_32fc_atan2_32f_aligned16_sse4_1(float* outputVector,  const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* outPtr = outputVector;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  const float invNormalizeFactor = 1.0 / normalizeFactor;
-
-#if LV_HAVE_LIB_SIMDMATH
-  __m128 testVector = _mm_set_ps1(2*M_PI);
-  __m128 correctVector = _mm_set_ps1(M_PI);
-  __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
-  __m128 phase;
-  __m128 complex1, complex2, iValue, qValue;
-  __m128 keepMask;
-    
-  for (; number < quarterPoints; number++) {
-    // Load IQ data:
-    complex1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-    complex2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-    // Deinterleave IQ data:
-    iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); 
-    qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); 
-    // Arctan to get phase:
-    phase = atan2f4(qValue, iValue);
-    // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
-    // Compare to 2pi:
-    keepMask = _mm_cmpneq_ps(phase,testVector);
-    phase = _mm_blendv_ps(correctVector, phase, keepMask);
-    // done with above correction.      
-    phase = _mm_mul_ps(phase, vNormalizeFactor);
-    _mm_store_ps((float*)outPtr, phase);
-    outPtr += 4;
-  }
-  number = quarterPoints * 4;
-#endif /* LV_HAVE_SIMDMATH_H */
-
-  for (; number < num_points; number++) {
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-
-#if LV_HAVE_LIB_SIMDMATH
-#include <simdmath.h>
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-/*!
-  \brief performs the atan2 on the input vector and stores the results in the output vector.
-  \param outputVector The byte-aligned vector where the results will be stored.
-  \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
-  \param normalizeFactor The atan2 results will be divided by this normalization factor.
-  \param num_points The number of complex values in the input vector.
-*/
-static inline void volk_32fc_atan2_32f_aligned16_sse(float* outputVector,  const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* outPtr = outputVector;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  const float invNormalizeFactor = 1.0 / normalizeFactor;
-
-#if LV_HAVE_LIB_SIMDMATH
-  __m128 testVector = _mm_set_ps1(2*M_PI);
-  __m128 correctVector = _mm_set_ps1(M_PI);
-  __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
-  __m128 phase;
-  __m128 complex1, complex2, iValue, qValue;
-  __m128 mask;
-  __m128 keepMask;
-    
-  for (; number < quarterPoints; number++) {
-    // Load IQ data:
-    complex1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-    complex2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-    // Deinterleave IQ data:
-    iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); 
-    qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); 
-    // Arctan to get phase:
-    phase = atan2f4(qValue, iValue);
-    // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
-    // Compare to 2pi:
-    keepMask = _mm_cmpneq_ps(phase,testVector);
-    phase = _mm_and_ps(phase, keepMask);
-    mask = _mm_andnot_ps(keepMask, correctVector);
-    phase = _mm_or_ps(phase, mask);
-    // done with above correction.      
-    phase = _mm_mul_ps(phase, vNormalizeFactor);
-    _mm_store_ps((float*)outPtr, phase);
-    outPtr += 4;
-  }
-  number = quarterPoints * 4;
-#endif /* LV_HAVE_SIMDMATH_H */
-
-  for (; number < num_points; number++) {
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief performs the atan2 on the input vector and stores the results in the output vector.
-  \param outputVector The vector where the results will be stored.
-  \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin).
-  \param normalizeFactor The atan2 results will be divided by this normalization factor.
-  \param num_points The number of complex values in the input vector.
-*/
-static inline void volk_32fc_atan2_32f_aligned16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
-  float* outPtr = outputVector;
-  const float* inPtr = (float*)inputVector;
-  const float invNormalizeFactor = 1.0 / normalizeFactor;
-  unsigned int number;
-  for ( number = 0; number < num_points; number++) {
-    const float real = *inPtr++;
-    const float imag = *inPtr++;
-    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h
deleted file mode 100644
index 60103c1b5f..0000000000
--- a/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h
+++ /dev/null
@@ -1,344 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H
-
-#include<volk/volk_complex.h>
-#include<stdio.h>
-
-
-#if LV_HAVE_GENERIC
-
-
-static inline void volk_32fc_conjugate_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
-  unsigned int isodd = (num_bytes >> 3) &1;
-  
-  
-  
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  int i = 0;
-
-  
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-    
-
-    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
-    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
-    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
-    
-    
-    in += 4;
-    tp += 4;
-
-  }
- 
-  
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  
-  
-  
-  for(i = 0; i < isodd; ++i) {
-
-
-    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
-
-  }
-  /*
-  for(i = 0; i < num_bytes >> 3; ++i) {
-    *result += input[i] * conjf(taps[i]);
-  }
-  */
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE && LV_HAVE_64
-
-
-static inline void volk_32fc_conjugate_dot_prod_aligned16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-  
-
-
-
-  asm volatile 
-    (
-     "#  ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
-     "#                         const float *taps, unsigned num_bytes)\n\t"
-     "#    float sum0 = 0;\n\t"
-     "#    float sum1 = 0;\n\t"
-     "#    float sum2 = 0;\n\t"
-     "#    float sum3 = 0;\n\t"
-     "#    do {\n\t"
-     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
-     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
-     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
-     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
-     "#      input += 4;\n\t"
-     "#      taps += 4;  \n\t"
-     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
-     "#    result[0] = sum0 + sum2;\n\t"
-     "#    result[1] = sum1 + sum3;\n\t"
-     "# TODO: prefetch and better scheduling\n\t"
-     "  xor    %%r9,  %%r9\n\t"
-     "  xor    %%r10, %%r10\n\t"
-     "  movq   %[conjugator], %%r9\n\t"
-     "  movq   %%rcx, %%rax\n\t"
-     "  movaps 0(%%r9), %%xmm8\n\t"
-     "  movq   %%rcx, %%r8\n\t"
-     "  movq   %[rsi],  %%r9\n\t"
-     "  movq   %[rdx], %%r10\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%%r9), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movups	0(%%r10), %%xmm2\n\t"
-     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
-     "  shr     $4, %%r8\n\t"
-     "  xorps  %%xmm8, %%xmm2\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%%r9), %%xmmA\n\t"
-     "#	movaps	(%%r10), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%%r9), %%xmm1\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	16(%%r10), %%xmm3\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "  xorps   %%xmm8, %%xmm3\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%%r9), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	add	$32, %%r9\n\t"
-     "	movaps	32(%%r10), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "	add	$32, %%r10\n\t"
-     "  xorps   %%xmm8, %%xmm2\n\t"
-     ".%=L1_test:\n\t"
-     "	dec	%%rax\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	and	$1, %%r8\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "	mov	$0x80000000, %%r9\n\t"
-     "	movd	%%r9, %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
-     :
-     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
-     :"rax", "r8", "r9", "r10"
-     );
-  
-  
-  int getem = num_bytes % 16;
-  
-  
-  for(; getem > 0; getem -= 8) {
-  
-    
-    *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
-  
-  }
-
-  return;
-}  
-#endif
-
-#if LV_HAVE_SSE && LV_HAVE_32
-static inline void volk_32fc_conjugate_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-
-  int bound = num_bytes >> 4;
-  int leftovers = num_bytes % 16;
-
-  
-  asm volatile 
-    (
-     "	#pushl	%%ebp\n\t"
-     "	#movl	%%esp, %%ebp\n\t"
-     "	#movl	12(%%ebp), %%eax		# input\n\t"
-     "	#movl	16(%%ebp), %%edx		# taps\n\t"
-     "	#movl	20(%%ebp), %%ecx                # n_bytes\n\t"
-     "  movaps  0(%[conjugator]), %%xmm1\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%[eax]), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movaps	0(%[edx]), %%xmm2\n\t"
-     "  movl    %[ecx], (%[out])\n\t"
-     "	shrl	$5, %[ecx]		# ecx = n_2_ccomplex_blocks / 2\n\t"
-     
-     "  xorps   %%xmm1, %%xmm2\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%[eax]), %%xmmA\n\t"
-     "#	movaps	(%[edx]), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%[edx]), %%xmm3\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "  xorps   %%xmm1, %%xmm3\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	movaps	16(%[eax]), %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "  movaps  0(%[conjugator]), %%xmm1\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%[eax]), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	addl	$32, %[eax]\n\t"
-     "	movaps	32(%[edx]), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "  xorps   %%xmm1, %%xmm2\n\t"
-     "	addl	$32, %[edx]\n\t"
-     ".%=L1_test:\n\t"
-     "	decl	%[ecx]\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	movl	0(%[out]), %[ecx]		# n_2_ccomplex_blocks\n\t"
-     "  shrl    $4, %[ecx]\n\t"
-     "	andl	$1, %[ecx]\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "  #movl 8(%%ebp), %[eax] \n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "  movl	$0x80000000, (%[out])\n\t"
-     "	movss	(%[out]), %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	#movl	8(%%ebp), %[eax]		# @result\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%[out])		# store low 2x32 bits (complex) to memory\n\t"
-     "	#popl	%%ebp\n\t"
-     :
-     : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
-     );
-
-  
-  
-  
-  printf("%d, %d\n", leftovers, bound);
-  
-  for(; leftovers > 0; leftovers -= 8) {
-    
-    
-    *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
-    
-  }
-  
-  return;
-  
-  
-  
-
-  
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-
-
-#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h
new file mode 100644
index 0000000000..3ee579c2ef
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H
+#define INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 cplxValue1, cplxValue2, iValue, qValue;
+  for(;number < quarterPoints; number++){
+      
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    _mm_store_ps(iBufferPtr, iValue);
+    _mm_store_ps(qBufferPtr, qValue);
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h
deleted file mode 100644
index 02085cd1e8..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 cplxValue1, cplxValue2, iValue, qValue;
-  for(;number < quarterPoints; number++){
-      
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    _mm_store_ps(iBufferPtr, iValue);
-    _mm_store_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h
new file mode 100644
index 0000000000..404defc367
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h
@@ -0,0 +1,78 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H
+#define INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_64f_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int halfPoints = num_points / 2;    
+    __m128 cplxValue, fVal;
+    __m128d dVal;
+
+    for(;number < halfPoints; number++){
+      
+      cplxValue = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i1i2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+      dVal = _mm_cvtps_pd(fVal); 
+      _mm_store_pd(iBufferPtr, dVal);
+
+      // Arrange in q1q2q1q2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+      dVal = _mm_cvtps_pd(fVal); 
+      _mm_store_pd(qBufferPtr, dVal);
+
+      iBufferPtr += 2;
+      qBufferPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for(; number < num_points; number++){
+      *iBufferPtr++ = *complexVectorPtr++;
+      *qBufferPtr++ = *complexVectorPtr++;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_64f_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector;
+  double* iBufferPtr = iBuffer;
+  double* qBufferPtr = qBuffer;
+
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (double)*complexVectorPtr++;
+    *qBufferPtr++ = (double)*complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h
deleted file mode 100644
index 3d9ebccdd3..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_aligned16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    double* iBufferPtr = iBuffer;
-    double* qBufferPtr = qBuffer;
-
-    const unsigned int halfPoints = num_points / 2;    
-    __m128 cplxValue, fVal;
-    __m128d dVal;
-
-    for(;number < halfPoints; number++){
-      
-      cplxValue = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      // Arrange in i1i2i1i2 format
-      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
-      dVal = _mm_cvtps_pd(fVal); 
-      _mm_store_pd(iBufferPtr, dVal);
-
-      // Arrange in q1q2q1q2 format
-      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
-      dVal = _mm_cvtps_pd(fVal); 
-      _mm_store_pd(qBufferPtr, dVal);
-
-      iBufferPtr += 2;
-      qBufferPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(; number < num_points; number++){
-      *iBufferPtr++ = *complexVectorPtr++;
-      *qBufferPtr++ = *complexVectorPtr++;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_aligned16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const float* complexVectorPtr = (float*)complexVector;
-  double* iBufferPtr = iBuffer;
-  double* qBufferPtr = qBuffer;
-
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    *qBufferPtr++ = (double)*complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h
new file mode 100644
index 0000000000..53235e5f79
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h
@@ -0,0 +1,80 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_real_16s_a16_H
+#define INCLUDED_volk_32fc_deinterleave_real_16s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
+  \param complexVector The complex input vector
+  \param scalar The value to be multiply against each of the input values
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_16s_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 cplxValue1, cplxValue2, iValue;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+    iValue = _mm_mul_ps(iValue, vScalar);
+
+    _mm_store_ps(floatBuffer, iValue);
+    *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  iBufferPtr = &iBuffer[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
+  \param complexVector The complex input vector
+  \param scalar The value to be multiply against each of the input values
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+    complexVectorPtr++;
+  }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_16s_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h
deleted file mode 100644
index 3026b24220..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
-  \param complexVector The complex input vector
-  \param scalar The value to be multiply against each of the input values
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_16s_aligned16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-
-  __m128 cplxValue1, cplxValue2, iValue;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-
-    iValue = _mm_mul_ps(iValue, vScalar);
-
-    _mm_store_ps(floatBuffer, iValue);
-    *iBufferPtr++ = (int16_t)(floatBuffer[0]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[1]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[2]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  iBufferPtr = &iBuffer[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
-  \param complexVector The complex input vector
-  \param scalar The value to be multiply against each of the input values
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
-
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h
new file mode 100644
index 0000000000..9838ec88ba
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a16_H
+#define INCLUDED_volk_32fc_deinterleave_real_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  float* iBufferPtr = iBuffer;
+
+  __m128 cplxValue1, cplxValue2, iValue;
+  for(;number < quarterPoints; number++){
+      
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+    _mm_store_ps(iBufferPtr, iValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h
deleted file mode 100644
index 2af973bcc0..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (const float*)complexVector;
-  float* iBufferPtr = iBuffer;
-
-  __m128 cplxValue1, cplxValue2, iValue;
-  for(;number < quarterPoints; number++){
-      
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-
-    _mm_store_ps(iBufferPtr, iValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h
new file mode 100644
index 0000000000..af392d0748
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a16_H
+#define INCLUDED_volk_32fc_deinterleave_real_64f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const float* complexVectorPtr = (float*)complexVector;
+  double* iBufferPtr = iBuffer;
+
+  const unsigned int halfPoints = num_points / 2;    
+  __m128 cplxValue, fVal;
+  __m128d dVal;
+  for(;number < halfPoints; number++){
+      
+    cplxValue = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i1i2 format
+    fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+    dVal = _mm_cvtps_pd(fVal); 
+    _mm_store_pd(iBufferPtr, dVal);
+
+    iBufferPtr += 2;
+  }
+
+  number = halfPoints * 2;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (double)*complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector;
+  double* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (double)*complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h
deleted file mode 100644
index f408589c41..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the complex vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_64f_aligned16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const float* complexVectorPtr = (float*)complexVector;
-  double* iBufferPtr = iBuffer;
-
-  const unsigned int halfPoints = num_points / 2;    
-  __m128 cplxValue, fVal;
-  __m128d dVal;
-  for(;number < halfPoints; number++){
-      
-    cplxValue = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i1i2 format
-    fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
-    dVal = _mm_cvtps_pd(fVal); 
-    _mm_store_pd(iBufferPtr, dVal);
-
-    iBufferPtr += 2;
-  }
-
-  number = halfPoints * 2;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_64f_aligned16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const float* complexVectorPtr = (float*)complexVector;
-  double* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_dot_prod_aligned16.h
deleted file mode 100644
index 1a834dc252..0000000000
--- a/volk/include/volk/volk_32fc_dot_prod_aligned16.h
+++ /dev/null
@@ -1,468 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H
-
-#include <volk/volk_complex.h>
-#include <stdio.h>
-#include <string.h>
-
-
-#if LV_HAVE_GENERIC 
-
-
-static inline void volk_32fc_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
-  unsigned int isodd = (num_bytes >> 3) &1;
-  
-  
-  
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  int i = 0;
-
-  
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-    
-
-    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
-    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
-    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
-    
-    
-    in += 4;
-    tp += 4;
-
-  }
-
-  
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  
-  
-  
-  for(i = 0; i < isodd; ++i) {
-
-
-    *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
-
-  }
-
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE && LV_HAVE_64
-
-
-static inline void volk_32fc_dot_prod_aligned16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-
-  asm 
-    (
-     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
-     "#                         const float *taps, unsigned num_bytes)\n\t"
-     "#    float sum0 = 0;\n\t"
-     "#    float sum1 = 0;\n\t"
-     "#    float sum2 = 0;\n\t"
-     "#    float sum3 = 0;\n\t"
-     "#    do {\n\t"
-     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
-     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
-     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
-     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
-     "#      input += 4;\n\t"
-     "#      taps += 4;  \n\t"
-     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
-     "#    result[0] = sum0 + sum2;\n\t"
-     "#    result[1] = sum1 + sum3;\n\t"
-     "# TODO: prefetch and better scheduling\n\t"
-     "  xor    %%r9,  %%r9\n\t"
-     "  xor    %%r10, %%r10\n\t"
-     "  movq   %%rcx, %%rax\n\t"
-     "  movq   %%rcx, %%r8\n\t"
-     "  movq   %[rsi],  %%r9\n\t"
-     "  movq   %[rdx], %%r10\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%%r9), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movaps	0(%%r10), %%xmm2\n\t"
-     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
-     "  shr     $4, %%r8\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%%r9), %%xmmA\n\t"
-     "#	movaps	(%%r10), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%%r9), %%xmm1\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	16(%%r10), %%xmm3\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%%r9), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	add	$32, %%r9\n\t"
-     "	movaps	32(%%r10), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "	add	$32, %%r10\n\t"
-     ".%=L1_test:\n\t"
-     "	dec	%%rax\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	and	$1, %%r8\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "	mov	$0x80000000, %%r9\n\t"
-     "	movd	%%r9, %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
-     :
-     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
-     :"rax", "r8", "r9", "r10"
-     );
-  
-  
-  int getem = num_bytes % 16;
-  
-  
-  for(; getem > 0; getem -= 8) {
-  
-    
-    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-  
-  }
-
-  return;
-  
-}
-
-#endif
-
-#if LV_HAVE_SSE && LV_HAVE_32
-
-static inline void volk_32fc_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  asm volatile 
-    (
-     "	#pushl	%%ebp\n\t"
-     "	#movl	%%esp, %%ebp\n\t"
-     "	movl	12(%%ebp), %%eax		# input\n\t"
-     "	movl	16(%%ebp), %%edx		# taps\n\t"
-     "	movl	20(%%ebp), %%ecx                # n_bytes\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%%eax), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movaps	0(%%edx), %%xmm2\n\t"
-     "	shrl	$5, %%ecx		# ecx = n_2_ccomplex_blocks / 2\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%%eax), %%xmmA\n\t"
-     "#	movaps	(%%edx), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%%eax), %%xmm1\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	16(%%edx), %%xmm3\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%%eax), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	addl	$32, %%eax\n\t"
-     "	movaps	32(%%edx), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "	addl	$32, %%edx\n\t"
-     ".%=L1_test:\n\t"
-     "	decl	%%ecx\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	movl	20(%%ebp), %%ecx		# n_2_ccomplex_blocks\n\t"
-     "  shrl    $4, %%ecx\n\t"
-     "	andl	$1, %%ecx\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "  movl 8(%%ebp), %%eax \n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "  movl	$0x80000000, (%%eax)\n\t"
-     "	movss	(%%eax), %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	#movl	8(%%ebp), %%eax		# @result\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%%eax)		# store low 2x32 bits (complex) to memory\n\t"
-     "	#popl	%%ebp\n\t"
-     :
-     :
-     : "eax", "ecx", "edx"
-     );
-
-  
-  int getem = num_bytes % 16;
-  
-  for(; getem > 0; getem -= 8) {
-    
-    
-    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-    
-  }
-  
-  return;
-  
-  
-  
-
-  
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-#if LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32fc_dot_prod_aligned16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-
-  lv_32fc_t dotProduct;
-  memset(&dotProduct, 0x0, 2*sizeof(float));
-
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_bytes >> 4;
-
-  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
-
-  const lv_32fc_t* a = input;
-  const lv_32fc_t* b = taps;
-
-  dotProdVal = _mm_setzero_ps();
-
-  for(;number < halfPoints; number++){
-      
-    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-      
-    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-      
-    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-      
-    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-      
-    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-      
-    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
-    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
-
-    a += 2;
-    b += 2;
-  }
-
-  lv_32fc_t dotProductVector[2] __attribute__((aligned(16)));
-
-  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
-
-  if((num_bytes >> 2) != 0) {
-    dotProduct += (*a) * (*b);
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32fc_dot_prod_aligned16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  volk_32fc_dot_prod_aligned16_sse3(result, input, taps, num_bytes);
-  // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
-   /* 
-    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
-    float *p_input, *p_taps;
-    __m64 *p_result;
-
-    p_result = (__m64*)result;
-    p_input = (float*)input;
-    p_taps = (float*)taps;
-
-    static const __m128i neg = {0x000000000000000080000000};
-
-    int i = 0;
-  
-    int bound = (num_bytes >> 5);
-    int leftovers = (num_bytes & 24) >> 3;
-
-    real0 = _mm_sub_ps(real0, real0);
-    real1 = _mm_sub_ps(real1, real1);
-    im0 = _mm_sub_ps(im0, im0);
-    im1 = _mm_sub_ps(im1, im1);
-  
-    for(; i < bound; ++i) {
-  
-    
-    xmm0 = _mm_load_ps(p_input);
-    xmm1 = _mm_load_ps(p_taps);
-    
-    p_input += 4;
-    p_taps += 4;
-    
-    xmm2 = _mm_load_ps(p_input);
-    xmm3 = _mm_load_ps(p_taps);
-    
-    p_input += 4;
-    p_taps += 4;
-    
-    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
-    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
-    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
-    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
-    
-    //imaginary vector from input
-    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
-    //real vector from input
-    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
-    //imaginary vector from taps
-    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
-    //real vector from taps
-    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
-    
-    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
-    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
-    
-    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
-    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
-    
-    real0 = _mm_add_ps(xmm4, real0);
-    real1 = _mm_add_ps(xmm5, real1);
-    im0 = _mm_add_ps(xmm6, im0);
-    im1 = _mm_add_ps(xmm7, im1);
-    
-    }
-
-
-    
-    
-    real1 = _mm_xor_ps(real1, (__m128)neg);
-    
-  
-    im0 = _mm_add_ps(im0, im1);
-    real0 = _mm_add_ps(real0, real1);
-  
-    im0 = _mm_add_ps(im0, real0);
-  
-    _mm_storel_pi(p_result, im0);
-  
-    for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
-    
-    *result += input[i] * taps[i];
-    }
-  */
-}  
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32fc_index_max_16u_a16.h b/volk/include/volk/volk_32fc_index_max_16u_a16.h
new file mode 100644
index 0000000000..532ae4e7c8
--- /dev/null
+++ b/volk/include/volk/volk_32fc_index_max_16u_a16.h
@@ -0,0 +1,215 @@
+#ifndef INCLUDED_volk_32fc_index_max_16u_a16_H
+#define INCLUDED_volk_32fc_index_max_16u_a16_H
+
+#include <volk/volk_common.h>
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+
+static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
+  
+  
+  
+  union bit128 holderf;
+  union bit128 holderi;
+  float sq_dist = 0.0;
+
+
+
+  
+  union bit128 xmm5, xmm4;
+  __m128 xmm1, xmm2, xmm3;
+  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+  xmm5.int_vec = xmmfive = _mm_setzero_si128();
+  xmm4.int_vec = xmmfour = _mm_setzero_si128();
+  holderf.int_vec = holder0 = _mm_setzero_si128();
+  holderi.int_vec = holder1 = _mm_setzero_si128();
+ 
+  
+  int bound = num_bytes >> 5;
+  int leftovers0 = (num_bytes >> 4) & 1;
+  int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+  
+  
+  xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
+  xmm9 = xmm8 = _mm_setzero_si128();
+  xmm10 = _mm_set_epi32(4, 4, 4, 4);
+  xmm3 = _mm_setzero_ps();
+;
+  
+  //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
+  
+  for(; i < bound; ++i) {
+  
+    xmm1 = _mm_load_ps((float*)src0);
+    xmm2 = _mm_load_ps((float*)&src0[2]);
+    
+
+    src0 += 4;
+  
+  
+    xmm1 = _mm_mul_ps(xmm1, xmm1);
+    xmm2 = _mm_mul_ps(xmm2, xmm2);
+    
+    
+    xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+    xmm3 = _mm_max_ps(xmm1, xmm3);
+  
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+    
+        
+    
+    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+    
+    xmm9 = _mm_add_epi32(xmm11,  xmm12);
+
+    xmm8 = _mm_add_epi32(xmm8, xmm10);    
+
+    
+    //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
+
+  }
+  
+  
+  for(i = 0; i < leftovers0; ++i) {
+
+
+    xmm2 = _mm_load_ps((float*)src0);
+    
+    xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8);
+    xmm8 = (__m128i)xmm1;
+
+    xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+    src0 += 2;
+
+    xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+    xmm3 = _mm_max_ps(xmm1, xmm3);
+
+    xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
+    
+    
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+    
+        
+    
+    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+    
+    xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+    xmm8 = _mm_add_epi32(xmm8, xmm10);    
+    //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+  }
+    
+  
+    
+
+  for(i = 0; i < leftovers1; ++i) {
+    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+    
+
+    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+    
+    xmm2 = _mm_load1_ps(&sq_dist);
+
+    xmm1 = xmm3;
+    
+    xmm3 = _mm_max_ss(xmm3, xmm2);
+
+    
+        
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+    
+        
+    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);  
+    
+    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+    
+
+    xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+  }
+  
+  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+
+  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+  _mm_store_ps((float*)&(holderf.f), xmm3);
+  _mm_store_si128(&(holderi.int_vec), xmm9);
+  
+  target[0] = holderi.i[0];
+  sq_dist = holderf.f[0]; 
+  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+
+ 
+  
+  /*
+  float placeholder = 0.0;
+  uint32_t temp0, temp1; 
+  unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+  unsigned int l0 = g0 ^ 1;
+
+  unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+  unsigned int l1 = g1 ^ 1;
+  
+  temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+  temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+  sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; 
+  placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+  
+  g0 = (sq_dist > placeholder);
+  l0 = g0 ^ 1;
+  target[0] = g0 * temp0 + l0 * temp1;
+  */
+  
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
+  float sq_dist = 0.0;
+  float max = 0.0;
+  unsigned int index = 0;
+  
+  int i = 0; 
+  
+  for(; i < num_bytes >> 3; ++i) {
+
+    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+    
+    index = sq_dist > max ? i : index;
+    max = sq_dist > max ? sq_dist : max;
+    
+    
+  }
+  target[0] = index;
+  
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_index_max_16u_a16_H*/
diff --git a/volk/include/volk/volk_32fc_index_max_aligned16.h b/volk/include/volk/volk_32fc_index_max_aligned16.h
deleted file mode 100644
index d77a95f90d..0000000000
--- a/volk/include/volk/volk_32fc_index_max_aligned16.h
+++ /dev/null
@@ -1,215 +0,0 @@
-#ifndef INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H
-#define INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H
-
-#include <volk/volk_common.h>
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
-
-#if LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-
-static inline void volk_32fc_index_max_aligned16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
-  
-  
-  
-  union bit128 holderf;
-  union bit128 holderi;
-  float sq_dist = 0.0;
-
-
-
-  
-  union bit128 xmm5, xmm4;
-  __m128 xmm1, xmm2, xmm3;
-  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
-
-  xmm5.int_vec = xmmfive = _mm_setzero_si128();
-  xmm4.int_vec = xmmfour = _mm_setzero_si128();
-  holderf.int_vec = holder0 = _mm_setzero_si128();
-  holderi.int_vec = holder1 = _mm_setzero_si128();
- 
-  
-  int bound = num_bytes >> 5;
-  int leftovers0 = (num_bytes >> 4) & 1;
-  int leftovers1 = (num_bytes >> 3) & 1;
-  int i = 0;
-  
-  
-  xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
-  xmm9 = xmm8 = _mm_setzero_si128();
-  xmm10 = _mm_set_epi32(4, 4, 4, 4);
-  xmm3 = _mm_setzero_ps();
-;
-  
-  //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
-  
-  for(; i < bound; ++i) {
-  
-    xmm1 = _mm_load_ps((float*)src0);
-    xmm2 = _mm_load_ps((float*)&src0[2]);
-    
-
-    src0 += 4;
-  
-  
-    xmm1 = _mm_mul_ps(xmm1, xmm1);
-    xmm2 = _mm_mul_ps(xmm2, xmm2);
-    
-    
-    xmm1 = _mm_hadd_ps(xmm1, xmm2);
-
-    xmm3 = _mm_max_ps(xmm1, xmm3);
-  
-    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
-    
-        
-    
-    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
-    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
-    
-    xmm9 = _mm_add_epi32(xmm11,  xmm12);
-
-    xmm8 = _mm_add_epi32(xmm8, xmm10);    
-
-    
-    //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
-    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
-
-  }
-  
-  
-  for(i = 0; i < leftovers0; ++i) {
-
-
-    xmm2 = _mm_load_ps((float*)src0);
-    
-    xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8);
-    xmm8 = (__m128i)xmm1;
-
-    xmm2 = _mm_mul_ps(xmm2, xmm2);
-
-    src0 += 2;
-
-    xmm1 = _mm_hadd_ps(xmm2, xmm2);
-
-    xmm3 = _mm_max_ps(xmm1, xmm3);
-
-    xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
-    
-    
-    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
-    
-        
-    
-    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
-    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
-    
-    xmm9 = _mm_add_epi32(xmm11, xmm12);
-
-    xmm8 = _mm_add_epi32(xmm8, xmm10);    
-    //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-
-  }
-    
-  
-    
-
-  for(i = 0; i < leftovers1; ++i) {
-    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-    
-
-    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
-    
-    xmm2 = _mm_load1_ps(&sq_dist);
-
-    xmm1 = xmm3;
-    
-    xmm3 = _mm_max_ss(xmm3, xmm2);
-
-    
-        
-    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
-    
-        
-    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);  
-    
-    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
-    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
-    
-
-    xmm9 = _mm_add_epi32(xmm11, xmm12);
-
-  }
-  
-  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
-
-  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-
-  _mm_store_ps((float*)&(holderf.f), xmm3);
-  _mm_store_si128(&(holderi.int_vec), xmm9);
-  
-  target[0] = holderi.i[0];
-  sq_dist = holderf.f[0]; 
-  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-
- 
-  
-  /*
-  float placeholder = 0.0;
-  uint32_t temp0, temp1; 
-  unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
-  unsigned int l0 = g0 ^ 1;
-
-  unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
-  unsigned int l1 = g1 ^ 1;
-  
-  temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
-  temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
-  sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; 
-  placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
-  
-  g0 = (sq_dist > placeholder);
-  l0 = g0 ^ 1;
-  target[0] = g0 * temp0 + l0 * temp1;
-  */
-  
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_GENERIC
-static inline void volk_32fc_index_max_aligned16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
-  float sq_dist = 0.0;
-  float max = 0.0;
-  unsigned int index = 0;
-  
-  int i = 0; 
-  
-  for(; i < num_bytes >> 3; ++i) {
-
-    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
-    
-    index = sq_dist > max ? i : index;
-    max = sq_dist > max ? sq_dist : max;
-    
-    
-  }
-  target[0] = index;
-  
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
deleted file mode 100644
index 4e590e120e..0000000000
--- a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_32fc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (const float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-
-  __m128 cplxValue1, cplxValue2, result;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result);
-
-    result = _mm_mul_ps(result, vScalar);
-
-    _mm_store_ps(floatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_32fc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (const float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-
-  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
-    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result);
-
-    result = _mm_mul_ps(result, vScalar);
-
-    _mm_store_ps(floatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_32fc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_32fc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_32fc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-    volk_32fc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a16.h b/volk/include/volk/volk_32fc_magnitude_32f_a16.h
new file mode 100644
index 0000000000..be7216dce2
--- /dev/null
+++ b/volk/include/volk/volk_32fc_magnitude_32f_a16.h
@@ -0,0 +1,132 @@
+#ifndef INCLUDED_volk_32fc_magnitude_32f_a16_H
+#define INCLUDED_volk_32fc_magnitude_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+      result = _mm_sqrt_ps(result);
+
+      _mm_store_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+      result = _mm_sqrt_ps(result);
+
+      _mm_store_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+       float val1Real = *complexVectorPtr++;
+       float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+extern void volk_32fc_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points);
+static inline void volk_32fc_magnitude_32f_a16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    volk_32fc_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
deleted file mode 100644
index 3ea62da6a6..0000000000
--- a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
+++ /dev/null
@@ -1,132 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    float* magnitudeVectorPtr = magnitudeVector;
-
-    __m128 cplxValue1, cplxValue2, result;
-    for(;number < quarterPoints; number++){
-      cplxValue1 = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue2 = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-      result = _mm_sqrt_ps(result);
-
-      _mm_store_ps(magnitudeVectorPtr, result);
-      magnitudeVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      float val1Real = *complexVectorPtr++;
-      float val1Imag = *complexVectorPtr++;
-      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-    }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-    const float* complexVectorPtr = (float*)complexVector;
-    float* magnitudeVectorPtr = magnitudeVector;
-
-    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-    for(;number < quarterPoints; number++){
-      cplxValue1 = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      cplxValue2 = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      // Arrange in i1i2i3i4 format
-      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-      // Arrange in q1q2q3q4 format
-      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
-      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
-      result = _mm_sqrt_ps(result);
-
-      _mm_store_ps(magnitudeVectorPtr, result);
-      magnitudeVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-       float val1Real = *complexVectorPtr++;
-       float val1Imag = *complexVectorPtr++;
-      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-static inline void volk_32fc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-    \param complexVector The vector containing the complex input values
-    \param magnitudeVector The vector containing the real output values
-    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-  */
-extern void volk_32fc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points);
-static inline void volk_32fc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-    volk_32fc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h
deleted file mode 100644
index c8f2418c32..0000000000
--- a/volk/include/volk/volk_32fc_multiply_aligned16.h
+++ /dev/null
@@ -1,95 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_multiply_aligned16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    __m128 x, y, yl, yh, z, tmp1, tmp2;
-    lv_32fc_t* c = cVector;
-    const lv_32fc_t* a = aVector;
-    const lv_32fc_t* b = bVector;
-
-    for(;number < halfPoints; number++){
-      
-      x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-      y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-      
-      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-      
-      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-      
-      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-      
-      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-      
-      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-    
-      _mm_store_ps((float*)c,z); // Store the results back into the C container
-
-      a += 2;
-      b += 2;
-      c += 2;
-    }
-
-    if((num_points % 2) != 0) {
-      *c = (*a) * (*b);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-    lv_32fc_t* cPtr = cVector;
-    const lv_32fc_t* aPtr = aVector;
-    const lv_32fc_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-extern void volk_32fc_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points);
-static inline void volk_32fc_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-    static const float mask = -0.0;
-    volk_32fc_multiply_aligned16_orc_impl(cVector, aVector, bVector, mask, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-
-
-#endif /* INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h
deleted file mode 100644
index 52ec0f95b6..0000000000
--- a/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h
+++ /dev/null
@@ -1,134 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-
-#if LV_HAVE_LIB_SIMDMATH
-#include <simdmath.h>
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-/*!
-  \brief Calculates the log10 power value divided by the RBW for each input point
-  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
-  \param complexFFTInput The complex data output from the FFT point
-  \param normalizationFactor This value is divided against all the input values before the power is calculated
-  \param rbw The resolution bandwith of the fft spectrum
-  \param num_points The number of fft data points
-*/
-static inline void volk_32fc_power_spectral_density_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
-  const float* inputPtr = (const float*)complexFFTInput;
-  float* destPtr = logPowerOutput;
-  uint64_t number = 0;
-  const float iRBW = 1.0 / rbw;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
-
-#if LV_HAVE_LIB_SIMDMATH
-  __m128 magScalar = _mm_set_ps1(10.0);
-  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
-
-  __m128 invRBW = _mm_set_ps1(iRBW);
-  
-  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
-
-  __m128 power;
-  __m128 input1, input2;
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-    // Load the complex values 
-    input1 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
-    input2 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
-    
-    // Apply the normalization factor
-    input1 = _mm_mul_ps(input1, invNormalizationFactor);
-    input2 = _mm_mul_ps(input2, invNormalizationFactor);
-
-    // Multiply each value by itself
-    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
-    input1 = _mm_mul_ps(input1, input1);
-    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
-    input2 = _mm_mul_ps(input2, input2);
-    
-    // Horizontal add, to add (r*r) + (i*i) for each complex value
-    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
-    power = _mm_hadd_ps(input1, input2);
-
-    // Divide by the rbw
-    power = _mm_mul_ps(power, invRBW);
-
-    // Calculate the natural log power
-    power = logf4(power);
-    
-    // Convert to log10 and multiply by 10.0
-    power = _mm_mul_ps(power, magScalar);
-    
-    // Store the floating point results
-    _mm_store_ps(destPtr, power);
-    
-    destPtr += 4;
-  }
-  
-  number = quarterPoints*4;  
-#endif /* LV_HAVE_LIB_SIMDMATH */
-  // Calculate the FFT for any remaining points
-  for(; number < num_points; number++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-    
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
-    destPtr++;
-  }
-  
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the log10 power value divided by the RBW for each input point
-  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
-  \param complexFFTInput The complex data output from the FFT point
-  \param normalizationFactor This value is divided against all the input values before the power is calculated
-  \param rbw The resolution bandwith of the fft spectrum
-  \param num_points The number of fft data points
-*/
-static inline void volk_32fc_power_spectral_density_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
-  // Calculate the Power of the complex point
-  const float* inputPtr = (float*)complexFFTInput;
-  float* realFFTDataPointsPtr = logPowerOutput;
-  unsigned int point;
-  const float invRBW = 1.0 / rbw;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
-
-  for(point = 0; point < num_points; point++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
-    
-    realFFTDataPointsPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h
deleted file mode 100644
index 645629b9d4..0000000000
--- a/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h
+++ /dev/null
@@ -1,126 +0,0 @@
-#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H
-#define INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-
-#if LV_HAVE_LIB_SIMDMATH
-#include <simdmath.h>
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-/*!
-  \brief Calculates the log10 power value for each input point
-  \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point
-  \param complexFFTInput The complex data output from the FFT point
-  \param normalizationFactor This value is divided against all the input values before the power is calculated
-  \param num_points The number of fft data points
-*/
-static inline void volk_32fc_power_spectrum_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
-  const float* inputPtr = (const float*)complexFFTInput;
-  float* destPtr = logPowerOutput;
-  uint64_t number = 0;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
-#if LV_HAVE_LIB_SIMDMATH
-  __m128 magScalar = _mm_set_ps1(10.0);
-  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
-
-  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
-
-  __m128 power;
-  __m128 input1, input2;
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-    // Load the complex values 
-    input1 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
-    input2 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
-
-    // Apply the normalization factor
-    input1 = _mm_mul_ps(input1, invNormalizationFactor);
-    input2 = _mm_mul_ps(input2, invNormalizationFactor);
-    
-    // Multiply each value by itself
-    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
-    input1 = _mm_mul_ps(input1, input1);
-    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
-    input2 = _mm_mul_ps(input2, input2);
-    
-    // Horizontal add, to add (r*r) + (i*i) for each complex value
-    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
-    power = _mm_hadd_ps(input1, input2);
-    
-    // Calculate the natural log power
-    power = logf4(power);
-    
-    // Convert to log10 and multiply by 10.0
-    power = _mm_mul_ps(power, magScalar);
-    
-    // Store the floating point results
-    _mm_store_ps(destPtr, power);
-    
-    destPtr += 4;
-  }
-  
-  number = quarterPoints*4;  
-#endif /* LV_HAVE_LIB_SIMDMATH */
-  // Calculate the FFT for any remaining points
-
-  for(; number < num_points; number++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
-    
-    destPtr++;
-  }
-  
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the log10 power value for each input point
-  \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point
-  \param complexFFTInput The complex data output from the FFT point
-  \param normalizationFactor This value is divided agains all the input values before the power is calculated
-  \param num_points The number of fft data points
-*/
-static inline void volk_32fc_power_spectrum_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
-  // Calculate the Power of the complex point
-  const float* inputPtr = (float*)complexFFTInput;
-  float* realFFTDataPointsPtr = logPowerOutput;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
-  unsigned int point;
-  for(point = 0; point < num_points; point++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
-
-    
-    realFFTDataPointsPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h
new file mode 100644
index 0000000000..e9f74438d7
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a16_H
+#define INCLUDED_volk_32fc_s32f_atan2_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief performs the atan2 on the input vector and stores the results in the output vector.
+  \param outputVector The byte-aligned vector where the results will be stored.
+  \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
+  \param normalizeFactor The atan2 results will be divided by this normalization factor.
+  \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector,  const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* outPtr = outputVector;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 testVector = _mm_set_ps1(2*M_PI);
+  __m128 correctVector = _mm_set_ps1(M_PI);
+  __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+  __m128 phase;
+  __m128 complex1, complex2, iValue, qValue;
+  __m128 keepMask;
+    
+  for (; number < quarterPoints; number++) {
+    // Load IQ data:
+    complex1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+    complex2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+    // Deinterleave IQ data:
+    iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); 
+    qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); 
+    // Arctan to get phase:
+    phase = atan2f4(qValue, iValue);
+    // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+    // Compare to 2pi:
+    keepMask = _mm_cmpneq_ps(phase,testVector);
+    phase = _mm_blendv_ps(correctVector, phase, keepMask);
+    // done with above correction.      
+    phase = _mm_mul_ps(phase, vNormalizeFactor);
+    _mm_store_ps((float*)outPtr, phase);
+    outPtr += 4;
+  }
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_SIMDMATH_H */
+
+  for (; number < num_points; number++) {
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief performs the atan2 on the input vector and stores the results in the output vector.
+  \param outputVector The byte-aligned vector where the results will be stored.
+  \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
+  \param normalizeFactor The atan2 results will be divided by this normalization factor.
+  \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector,  const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* outPtr = outputVector;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 testVector = _mm_set_ps1(2*M_PI);
+  __m128 correctVector = _mm_set_ps1(M_PI);
+  __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+  __m128 phase;
+  __m128 complex1, complex2, iValue, qValue;
+  __m128 mask;
+  __m128 keepMask;
+    
+  for (; number < quarterPoints; number++) {
+    // Load IQ data:
+    complex1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+    complex2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+    // Deinterleave IQ data:
+    iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); 
+    qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); 
+    // Arctan to get phase:
+    phase = atan2f4(qValue, iValue);
+    // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+    // Compare to 2pi:
+    keepMask = _mm_cmpneq_ps(phase,testVector);
+    phase = _mm_and_ps(phase, keepMask);
+    mask = _mm_andnot_ps(keepMask, correctVector);
+    phase = _mm_or_ps(phase, mask);
+    // done with above correction.      
+    phase = _mm_mul_ps(phase, vNormalizeFactor);
+    _mm_store_ps((float*)outPtr, phase);
+    outPtr += 4;
+  }
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_SIMDMATH_H */
+
+  for (; number < num_points; number++) {
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief performs the atan2 on the input vector and stores the results in the output vector.
+  \param outputVector The vector where the results will be stored.
+  \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin).
+  \param normalizeFactor The atan2 results will be divided by this normalization factor.
+  \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
+  float* outPtr = outputVector;
+  const float* inPtr = (float*)inputVector;
+  const float invNormalizeFactor = 1.0 / normalizeFactor;
+  unsigned int number;
+  for ( number = 0; number < num_points; number++) {
+    const float real = *inPtr++;
+    const float imag = *inPtr++;
+    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h
new file mode 100644
index 0000000000..dc3c6741a0
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H
+#define INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_32fc_s32f_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result);
+
+    result = _mm_mul_ps(result, vScalar);
+
+    _mm_store_ps(floatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  for(; number < num_points; number++){
+    float val1Real = *complexVectorPtr++;
+    float val1Imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_32fc_s32f_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result);
+
+    result = _mm_mul_ps(result, vScalar);
+
+    _mm_store_ps(floatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  for(; number < num_points; number++){
+    float val1Real = *complexVectorPtr++;
+    float val1Imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_32fc_s32f_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_32fc_s32f_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_32fc_s32f_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_32fc_s32f_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h
new file mode 100644
index 0000000000..39d8f7aa2e
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H
+#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Calculates the log10 power value for each input point
+  \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided against all the input values before the power is calculated
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
+  const float* inputPtr = (const float*)complexFFTInput;
+  float* destPtr = logPowerOutput;
+  uint64_t number = 0;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 magScalar = _mm_set_ps1(10.0);
+  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+  __m128 power;
+  __m128 input1, input2;
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the complex values 
+    input1 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    input2 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+
+    // Apply the normalization factor
+    input1 = _mm_mul_ps(input1, invNormalizationFactor);
+    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+    
+    // Multiply each value by itself
+    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+    input1 = _mm_mul_ps(input1, input1);
+    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+    input2 = _mm_mul_ps(input2, input2);
+    
+    // Horizontal add, to add (r*r) + (i*i) for each complex value
+    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+    power = _mm_hadd_ps(input1, input2);
+    
+    // Calculate the natural log power
+    power = logf4(power);
+    
+    // Convert to log10 and multiply by 10.0
+    power = _mm_mul_ps(power, magScalar);
+    
+    // Store the floating point results
+    _mm_store_ps(destPtr, power);
+    
+    destPtr += 4;
+  }
+  
+  number = quarterPoints*4;  
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the FFT for any remaining points
+
+  for(; number < num_points; number++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+    
+    destPtr++;
+  }
+  
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the log10 power value for each input point
+  \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided agains all the input values before the power is calculated
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
+  // Calculate the Power of the complex point
+  const float* inputPtr = (float*)complexFFTInput;
+  float* realFFTDataPointsPtr = logPowerOutput;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+  unsigned int point;
+  for(point = 0; point < num_points; point++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+
+    
+    realFFTDataPointsPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h
new file mode 100644
index 0000000000..29ccdaef70
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H
+#define INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided against all the input values before the power is calculated
+  \param rbw The resolution bandwith of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  const float* inputPtr = (const float*)complexFFTInput;
+  float* destPtr = logPowerOutput;
+  uint64_t number = 0;
+  const float iRBW = 1.0 / rbw;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 magScalar = _mm_set_ps1(10.0);
+  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+  __m128 invRBW = _mm_set_ps1(iRBW);
+  
+  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+  __m128 power;
+  __m128 input1, input2;
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the complex values 
+    input1 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    input2 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    
+    // Apply the normalization factor
+    input1 = _mm_mul_ps(input1, invNormalizationFactor);
+    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+    // Multiply each value by itself
+    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+    input1 = _mm_mul_ps(input1, input1);
+    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+    input2 = _mm_mul_ps(input2, input2);
+    
+    // Horizontal add, to add (r*r) + (i*i) for each complex value
+    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+    power = _mm_hadd_ps(input1, input2);
+
+    // Divide by the rbw
+    power = _mm_mul_ps(power, invRBW);
+
+    // Calculate the natural log power
+    power = logf4(power);
+    
+    // Convert to log10 and multiply by 10.0
+    power = _mm_mul_ps(power, magScalar);
+    
+    // Store the floating point results
+    _mm_store_ps(destPtr, power);
+    
+    destPtr += 4;
+  }
+  
+  number = quarterPoints*4;  
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the FFT for any remaining points
+  for(; number < num_points; number++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+    
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+    destPtr++;
+  }
+  
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided against all the input values before the power is calculated
+  \param rbw The resolution bandwith of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  // Calculate the Power of the complex point
+  const float* inputPtr = (float*)complexFFTInput;
+  float* realFFTDataPointsPtr = logPowerOutput;
+  unsigned int point;
+  const float invRBW = 1.0 / rbw;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+  for(point = 0; point < num_points; point++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
+    
+    realFFTDataPointsPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_square_dist_aligned16.h b/volk/include/volk/volk_32fc_square_dist_aligned16.h
deleted file mode 100644
index 6458ea4dd4..0000000000
--- a/volk/include/volk/volk_32fc_square_dist_aligned16.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H
-#define INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H
-
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
-
-#if LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void volk_32fc_square_dist_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
-  
-
-  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
-  lv_32fc_t diff;
-  float sq_dist;
-  int bound = num_bytes >> 5;
-  int leftovers0 = (num_bytes >> 4) & 1;
-  int leftovers1 = (num_bytes >> 3) & 1;
-  int i = 0;
-
-  xmm1 = _mm_setzero_ps();
-  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
-  xmm2 = _mm_load_ps((float*)&points[0]);
-  xmm1 = _mm_movelh_ps(xmm1, xmm1);
-  xmm3 = _mm_load_ps((float*)&points[2]);
-  
-
-  for(; i < bound - 1; ++i) {
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    xmm5 = _mm_sub_ps(xmm1, xmm3);
-    points += 4;
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-    xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_hadd_ps(xmm6, xmm7);
-
-    xmm3 = _mm_load_ps((float*)&points[2]);
-
-    _mm_store_ps(target, xmm4);
-
-    target += 4;
-
-  }
-  
-  xmm4 = _mm_sub_ps(xmm1, xmm2);
-  xmm5 = _mm_sub_ps(xmm1, xmm3);
-  
-  
-
-  points += 4;
-  xmm6 = _mm_mul_ps(xmm4, xmm4);
-  xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-  xmm4 = _mm_hadd_ps(xmm6, xmm7);
-   
-  _mm_store_ps(target, xmm4);
-  
-  target += 4;
-
-  for(i = 0; i < leftovers0; ++i) {
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    
-    points += 2;
-    
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-
-    xmm4 = _mm_hadd_ps(xmm6, xmm6);
-    
-    _mm_storeh_pi((__m64*)target, xmm4);
-
-    target += 2;
-  }
-
-  for(i = 0; i < leftovers1; ++i) {
-    
-    diff = src0[0] - points[0];
-
-    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
-
-    target[0] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_GENERIC
-static inline void volk_32fc_square_dist_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
-  lv_32fc_t diff;
-  float sq_dist;
-  int i = 0; 
-  
-  for(; i < num_bytes >> 3; ++i) {
-    diff = src0[0] - points[i];
-
-    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
-    
-    target[i] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h b/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h
deleted file mode 100644
index 0fcc86f1ed..0000000000
--- a/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h
+++ /dev/null
@@ -1,126 +0,0 @@
-#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H
-#define INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H
-
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
-#include <string.h>
-
-#if LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void volk_32fc_square_dist_scalar_mult_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
-  
-
-  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-
-  lv_32fc_t diff;
-  memset(&diff, 0x0, 2*sizeof(float));
-
-  float sq_dist = 0.0;
-  int bound = num_bytes >> 5;
-  int leftovers0 = (num_bytes >> 4) & 1;
-  int leftovers1 = (num_bytes >> 3) & 1;
-  int i = 0;
-  
-  
-  
-  xmm1 = _mm_setzero_ps();
-  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
-  xmm2 = _mm_load_ps((float*)&points[0]);
-  xmm8 = _mm_load1_ps(&scalar);
-  xmm1 = _mm_movelh_ps(xmm1, xmm1);
-  xmm3 = _mm_load_ps((float*)&points[2]);
-  
-  
-  for(; i < bound - 1; ++i) {
-  
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    xmm5 = _mm_sub_ps(xmm1, xmm3);
-    points += 4;
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-    xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_hadd_ps(xmm6, xmm7);
-
-    xmm3 = _mm_load_ps((float*)&points[2]);
-    
-    xmm4 = _mm_mul_ps(xmm4, xmm8);
-
-    _mm_store_ps(target, xmm4);
-
-    target += 4;
-
-  }
-  
-  xmm4 = _mm_sub_ps(xmm1, xmm2);
-  xmm5 = _mm_sub_ps(xmm1, xmm3);
-  
-  
-
-  points += 4;
-  xmm6 = _mm_mul_ps(xmm4, xmm4);
-  xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-  xmm4 = _mm_hadd_ps(xmm6, xmm7);
-  
-  xmm4 = _mm_mul_ps(xmm4, xmm8);
-   
-  _mm_store_ps(target, xmm4);
-  
-  target += 4;
-  
-
-  for(i = 0; i < leftovers0; ++i) {
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    
-    points += 2;
-    
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-
-    xmm4 = _mm_hadd_ps(xmm6, xmm6);
-
-    xmm4 = _mm_mul_ps(xmm4, xmm8);
-    
-    _mm_storeh_pi((__m64*)target, xmm4);
-
-    target += 2;
-  }
-
-  for(i = 0; i < leftovers1; ++i) {
-    
-    diff = src0[0] - points[0];
-
-    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
-
-    target[0] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_GENERIC
-static inline void volk_32fc_square_dist_scalar_mult_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
-  lv_32fc_t diff;
-  float sq_dist;
-  int i = 0; 
-  
-  for(; i < num_bytes >> 3; ++i) {
-    diff = src0[0] - points[i];
-
-    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
-    
-    target[i] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32s_32s_and_32s_a16.h b/volk/include/volk/volk_32s_32s_and_32s_a16.h
new file mode 100644
index 0000000000..0e8380757e
--- /dev/null
+++ b/volk/include/volk/volk_32s_32s_and_32s_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32s_32s_and_32s_a16_H
+#define INCLUDED_volk_32s_32s_and_32s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32s_32s_and_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = (float*)cVector;
+    const float* aPtr = (float*)aVector;
+    const float* bPtr = (float*)bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_and_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      cVector[number] = aVector[number] & bVector[number];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32s_32s_and_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) & (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+extern void volk_32s_32s_and_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32s_32s_and_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    volk_32s_32s_and_32s_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32s_32s_and_32s_a16_H */
diff --git a/volk/include/volk/volk_32s_32s_or_32s_a16.h b/volk/include/volk/volk_32s_32s_or_32s_a16.h
new file mode 100644
index 0000000000..2dcf2e5515
--- /dev/null
+++ b/volk/include/volk/volk_32s_32s_or_32s_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32s_32s_or_32s_a16_H
+#define INCLUDED_volk_32s_32s_or_32s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Ors the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32s_32s_or_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = (float*)cVector;
+    const float* aPtr = (float*)aVector;
+    const float* bPtr = (float*)bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_or_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      cVector[number] = aVector[number] | bVector[number];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Ors the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32s_32s_or_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) | (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Ors the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+extern void volk_32s_32s_or_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32s_32s_or_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    volk_32s_32s_or_32s_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32s_32s_or_32s_a16_H */
diff --git a/volk/include/volk/volk_32s_and_aligned16.h b/volk/include/volk/volk_32s_and_aligned16.h
deleted file mode 100644
index 16c63fd482..0000000000
--- a/volk/include/volk/volk_32s_and_aligned16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_VOLK_32s_AND_ALIGNED16_H
-#define INCLUDED_VOLK_32s_AND_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Ands the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors
-  \param bVector One of the vectors
-  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
-*/
-static inline void volk_32s_and_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = (float*)cVector;
-    const float* aPtr = (float*)aVector;
-    const float* bPtr = (float*)bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_and_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      cVector[number] = aVector[number] & bVector[number];
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Ands the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors
-  \param bVector One of the vectors
-  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
-*/
-static inline void volk_32s_and_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    int32_t* cPtr = cVector;
-    const int32_t* aPtr = aVector;
-    const int32_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) & (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Ands the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors
-  \param bVector One of the vectors
-  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
-*/
-extern void volk_32s_and_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
-static inline void volk_32s_and_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    volk_32s_and_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_32s_AND_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32s_convert_32f_aligned16.h b/volk/include/volk/volk_32s_convert_32f_aligned16.h
deleted file mode 100644
index a407e68bda..0000000000
--- a/volk/include/volk/volk_32s_convert_32f_aligned16.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H
-#define INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32s_convert_32f_aligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-     float* outputVectorPtr = outputVector;
-     const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    int32_t* inputPtr = (int32_t*)inputVector;
-    __m128i inputVal;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-
-      // Load the 4 values
-      inputVal = _mm_load_si128((__m128i*)inputPtr);
-
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-
-      _mm_store_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-      inputPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32s_convert_32f_aligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int32_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32s_convert_32f_unaligned16.h b/volk/include/volk/volk_32s_convert_32f_unaligned16.h
deleted file mode 100644
index ad7d4eb173..0000000000
--- a/volk/include/volk/volk_32s_convert_32f_unaligned16.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H
-#define INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_32s_convert_32f_unaligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-     float* outputVectorPtr = outputVector;
-     const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    int32_t* inputPtr = (int32_t*)inputVector;
-    __m128i inputVal;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-
-      // Load the 4 values
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-      inputPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_32s_convert_32f_unaligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int32_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32s_or_aligned16.h b/volk/include/volk/volk_32s_or_aligned16.h
deleted file mode 100644
index 64748d5354..0000000000
--- a/volk/include/volk/volk_32s_or_aligned16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_VOLK_32s_OR_ALIGNED16_H
-#define INCLUDED_VOLK_32s_OR_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Ors the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be ored
-  \param bVector One of the vectors to be ored
-  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
-*/
-static inline void volk_32s_or_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = (float*)cVector;
-    const float* aPtr = (float*)aVector;
-    const float* bPtr = (float*)bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_or_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      cVector[number] = aVector[number] | bVector[number];
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Ors the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be ored
-  \param bVector One of the vectors to be ored
-  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
-*/
-static inline void volk_32s_or_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    int32_t* cPtr = cVector;
-    const int32_t* aPtr = aVector;
-    const int32_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) | (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Ors the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be ored
-  \param bVector One of the vectors to be ored
-  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
-*/
-extern void volk_32s_or_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
-static inline void volk_32s_or_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    volk_32s_or_aligned16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_VOLK_32s_OR_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h b/volk/include/volk/volk_32s_s32f_convert_32f_a16.h
new file mode 100644
index 0000000000..c16ecc9dde
--- /dev/null
+++ b/volk/include/volk/volk_32s_s32f_convert_32f_a16.h
@@ -0,0 +1,73 @@
+#ifndef INCLUDED_volk_32s_s32f_convert_32f_a16_H
+#define INCLUDED_volk_32s_s32f_convert_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32s_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+     float* outputVectorPtr = outputVector;
+     const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m128i inputVal;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+
+      // Load the 4 values
+      inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+
+      _mm_store_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+      inputPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32s_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int32_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32s_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h
new file mode 100644
index 0000000000..4eb5a5b850
--- /dev/null
+++ b/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32s_s32f_convert_32f_ua16_H
+#define INCLUDED_volk_32s_s32f_convert_32f_ua16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_32s_s32f_convert_32f_ua16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+     float* outputVectorPtr = outputVector;
+     const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m128i inputVal;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+
+      // Load the 4 values
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+      inputPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_32s_s32f_convert_32f_ua16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int32_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32s_s32f_convert_32f_ua16_H */
diff --git a/volk/include/volk/volk_32u_byteswap_a16.h b/volk/include/volk/volk_32u_byteswap_a16.h
new file mode 100644
index 0000000000..7556ec7b1d
--- /dev/null
+++ b/volk/include/volk/volk_32u_byteswap_a16.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_32u_byteswap_a16_H
+#define INCLUDED_volk_32u_byteswap_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int32_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int num_points){
+  unsigned int number = 0;
+
+  uint32_t* inputPtr = intsToSwap;
+  __m128i input, byte1, byte2, byte3, byte4, output;
+  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the 32t values, increment inputPtr later since we're doing it in-place.
+    input = _mm_load_si128((__m128i*)inputPtr);
+    // Do the four shifts
+    byte1 = _mm_slli_epi32(input, 24);
+    byte2 = _mm_slli_epi32(input, 8);
+    byte3 = _mm_srli_epi32(input, 8);
+    byte4 = _mm_srli_epi32(input, 24);
+    // Or bytes together
+    output = _mm_or_si128(byte1, byte4);
+    byte2 = _mm_and_si128(byte2, byte2mask);
+    output = _mm_or_si128(output, byte2);
+    byte3 = _mm_and_si128(byte3, byte3mask);
+    output = _mm_or_si128(output, byte3);
+    // Store the results
+    _mm_store_si128((__m128i*)inputPtr, output);
+    inputPtr += 4;
+  }
+  
+  // Byteswap any remaining points:
+  number = quarterPoints*4;  
+  for(; number < num_points; number++){
+    uint32_t outputVal = *inputPtr;
+    outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+    *inputPtr = outputVal;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int32_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = intsToSwap;
+
+  unsigned int point;
+  for(point = 0; point < num_points; point++){
+    uint32_t output = *inputPtr;
+    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+    
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32u_byteswap_a16_H */
diff --git a/volk/include/volk/volk_32u_byteswap_aligned16.h b/volk/include/volk/volk_32u_byteswap_aligned16.h
deleted file mode 100644
index 09173a9d58..0000000000
--- a/volk/include/volk/volk_32u_byteswap_aligned16.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H
-#define INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int32_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_32u_byteswap_aligned16_sse2(uint32_t* intsToSwap, unsigned int num_points){
-  unsigned int number = 0;
-
-  uint32_t* inputPtr = intsToSwap;
-  __m128i input, byte1, byte2, byte3, byte4, output;
-  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
-  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-    // Load the 32t values, increment inputPtr later since we're doing it in-place.
-    input = _mm_load_si128((__m128i*)inputPtr);
-    // Do the four shifts
-    byte1 = _mm_slli_epi32(input, 24);
-    byte2 = _mm_slli_epi32(input, 8);
-    byte3 = _mm_srli_epi32(input, 8);
-    byte4 = _mm_srli_epi32(input, 24);
-    // Or bytes together
-    output = _mm_or_si128(byte1, byte4);
-    byte2 = _mm_and_si128(byte2, byte2mask);
-    output = _mm_or_si128(output, byte2);
-    byte3 = _mm_and_si128(byte3, byte3mask);
-    output = _mm_or_si128(output, byte3);
-    // Store the results
-    _mm_store_si128((__m128i*)inputPtr, output);
-    inputPtr += 4;
-  }
-  
-  // Byteswap any remaining points:
-  number = quarterPoints*4;  
-  for(; number < num_points; number++){
-    uint32_t outputVal = *inputPtr;
-    outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
-    *inputPtr = outputVal;
-    inputPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int32_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_32u_byteswap_aligned16_generic(uint32_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = intsToSwap;
-
-  unsigned int point;
-  for(point = 0; point < num_points; point++){
-    uint32_t output = *inputPtr;
-    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
-    
-    *inputPtr = output;
-    inputPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32u_popcnt_a16.h b/volk/include/volk/volk_32u_popcnt_a16.h
new file mode 100644
index 0000000000..f6e25e4e81
--- /dev/null
+++ b/volk/include/volk/volk_32u_popcnt_a16.h
@@ -0,0 +1,36 @@
+#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
+#define INCLUDED_VOLK_32u_POPCNT_A16_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) {
+
+  // This is faster than a lookup table
+  uint32_t retVal = value;
+
+  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+  retVal = (retVal + (retVal >> 8));
+  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+
+  *ret = retVal;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#if LV_HAVE_SSE4_2
+
+#include <nmmintrin.h>
+
+static inline void volk_32u_popcnt_a16_sse4_2(uint32_t* ret, const uint32_t value) {
+  *ret = _mm_popcnt_u32(value);
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/
diff --git a/volk/include/volk/volk_32u_popcnt_aligned16.h b/volk/include/volk/volk_32u_popcnt_aligned16.h
deleted file mode 100644
index 37cfd112c0..0000000000
--- a/volk/include/volk/volk_32u_popcnt_aligned16.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H
-#define INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H
-
-#include <stdio.h>
-#include <inttypes.h>
-
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_32u_popcnt_aligned16_generic(uint32_t* ret, const uint32_t value) {
-
-  // This is faster than a lookup table
-  uint32_t retVal = value;
-
-  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
-  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
-  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
-  retVal = (retVal + (retVal >> 8));
-  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
-
-  *ret = retVal;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-#if LV_HAVE_SSE4_2
-
-#include <nmmintrin.h>
-
-static inline void volk_32u_popcnt_aligned16_sse4_2(uint32_t* ret, const uint32_t value) {
-  *ret = _mm_popcnt_u32(value);
-}
-
-#endif /*LV_HAVE_SSE4_2*/
-
-#endif /*INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_64f_64f_max_64f_a16.h b/volk/include/volk/volk_64f_64f_max_64f_a16.h
new file mode 100644
index 0000000000..7e091851f2
--- /dev/null
+++ b/volk/include/volk/volk_64f_64f_max_64f_a16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_64f_64f_max_64f_a16_H
+#define INCLUDED_volk_64f_64f_max_64f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_64f_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+
+    __m128d aVal, bVal, cVal;
+    for(;number < halfPoints; number++){
+      
+      aVal = _mm_load_pd(aPtr); 
+      bVal = _mm_load_pd(bPtr);
+      
+      cVal = _mm_max_pd(aVal, bVal); 
+      
+      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 2;
+      bPtr += 2;
+      cPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for(;number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_64f_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_64f_max_64f_a16_H */
diff --git a/volk/include/volk/volk_64f_64f_min_64f_a16.h b/volk/include/volk/volk_64f_64f_min_64f_a16.h
new file mode 100644
index 0000000000..f2bcbe83b6
--- /dev/null
+++ b/volk/include/volk/volk_64f_64f_min_64f_a16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_64f_64f_min_64f_a16_H
+#define INCLUDED_volk_64f_64f_min_64f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_64f_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+
+    __m128d aVal, bVal, cVal;
+    for(;number < halfPoints; number++){
+      
+      aVal = _mm_load_pd(aPtr); 
+      bVal = _mm_load_pd(bPtr);
+      
+      cVal = _mm_min_pd(aVal, bVal); 
+      
+      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 2;
+      bPtr += 2;
+      cPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for(;number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_64f_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_64f_min_64f_a16_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_a16.h b/volk/include/volk/volk_64f_convert_32f_a16.h
new file mode 100644
index 0000000000..7dca065f08
--- /dev/null
+++ b/volk/include/volk/volk_64f_convert_32f_a16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_volk_64f_convert_32f_a16_H
+#define INCLUDED_volk_64f_convert_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the double values into float values
+    \param dVector The converted float vector values
+    \param fVector The double vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const double* inputVectorPtr = (const double*)inputVector;
+  float* outputVectorPtr = outputVector;
+  __m128 ret, ret2;
+  __m128d inputVal1, inputVal2;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+    inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+ 
+    ret = _mm_cvtpd_ps(inputVal1);
+    ret2 = _mm_cvtpd_ps(inputVal2);
+
+    ret = _mm_movelh_ps(ret, ret2);
+
+    _mm_store_ps(outputVectorPtr, ret);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (float)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the double values into float values
+  \param dVector The converted float vector values
+  \param fVector The double vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const double* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_aligned16.h b/volk/include/volk/volk_64f_convert_32f_aligned16.h
deleted file mode 100644
index 44df661044..0000000000
--- a/volk/include/volk/volk_64f_convert_32f_aligned16.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H
-#define INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the double values into float values
-    \param dVector The converted float vector values
-    \param fVector The double vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_64f_convert_32f_aligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m128 ret, ret2;
-  __m128d inputVal1, inputVal2;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
-    inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
- 
-    ret = _mm_cvtpd_ps(inputVal1);
-    ret2 = _mm_cvtpd_ps(inputVal2);
-
-    ret = _mm_movelh_ps(ret, ret2);
-
-    _mm_store_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the double values into float values
-  \param dVector The converted float vector values
-  \param fVector The double vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_64f_convert_32f_aligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const double* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_ua16.h b/volk/include/volk/volk_64f_convert_32f_ua16.h
new file mode 100644
index 0000000000..7774db1b75
--- /dev/null
+++ b/volk/include/volk/volk_64f_convert_32f_ua16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_volk_64f_convert_32f_ua16_H
+#define INCLUDED_volk_64f_convert_32f_ua16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the double values into float values
+    \param dVector The converted float vector values
+    \param fVector The double vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_64f_convert_32f_ua16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const double* inputVectorPtr = (const double*)inputVector;
+  float* outputVectorPtr = outputVector;
+  __m128 ret, ret2;
+  __m128d inputVal1, inputVal2;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+    inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+ 
+    ret = _mm_cvtpd_ps(inputVal1);
+    ret2 = _mm_cvtpd_ps(inputVal2);
+
+    ret = _mm_movelh_ps(ret, ret2);
+
+    _mm_storeu_ps(outputVectorPtr, ret);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (float)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the double values into float values
+  \param dVector The converted float vector values
+  \param fVector The double vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_ua16_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const double* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_ua16_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_unaligned16.h b/volk/include/volk/volk_64f_convert_32f_unaligned16.h
deleted file mode 100644
index 08cfb6127b..0000000000
--- a/volk/include/volk/volk_64f_convert_32f_unaligned16.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H
-#define INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the double values into float values
-    \param dVector The converted float vector values
-    \param fVector The double vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_64f_convert_32f_unaligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m128 ret, ret2;
-  __m128d inputVal1, inputVal2;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
-    inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
- 
-    ret = _mm_cvtpd_ps(inputVal1);
-    ret2 = _mm_cvtpd_ps(inputVal2);
-
-    ret = _mm_movelh_ps(ret, ret2);
-
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the double values into float values
-  \param dVector The converted float vector values
-  \param fVector The double vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_64f_convert_32f_unaligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const double* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_64f_max_aligned16.h b/volk/include/volk/volk_64f_max_aligned16.h
deleted file mode 100644
index ce4907a8c4..0000000000
--- a/volk/include/volk/volk_64f_max_aligned16.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef INCLUDED_VOLK_64f_MAX_ALIGNED16_H
-#define INCLUDED_VOLK_64f_MAX_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_max_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-
-    __m128d aVal, bVal, cVal;
-    for(;number < halfPoints; number++){
-      
-      aVal = _mm_load_pd(aPtr); 
-      bVal = _mm_load_pd(bPtr);
-      
-      cVal = _mm_max_pd(aVal, bVal); 
-      
-      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 2;
-      bPtr += 2;
-      cPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(;number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_max_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_VOLK_64f_MAX_ALIGNED16_H */
diff --git a/volk/include/volk/volk_64f_min_aligned16.h b/volk/include/volk/volk_64f_min_aligned16.h
deleted file mode 100644
index acf4d6b2a4..0000000000
--- a/volk/include/volk/volk_64f_min_aligned16.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef INCLUDED_VOLK_64f_MIN_ALIGNED16_H
-#define INCLUDED_VOLK_64f_MIN_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_min_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-
-    __m128d aVal, bVal, cVal;
-    for(;number < halfPoints; number++){
-      
-      aVal = _mm_load_pd(aPtr); 
-      bVal = _mm_load_pd(bPtr);
-      
-      cVal = _mm_min_pd(aVal, bVal); 
-      
-      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 2;
-      bPtr += 2;
-      cPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(;number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_min_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_VOLK_64f_MIN_ALIGNED16_H */
diff --git a/volk/include/volk/volk_64u_byteswap_a16.h b/volk/include/volk/volk_64u_byteswap_a16.h
new file mode 100644
index 0000000000..0eefe01383
--- /dev/null
+++ b/volk/include/volk/volk_64u_byteswap_a16.h
@@ -0,0 +1,88 @@
+#ifndef INCLUDED_volk_64u_byteswap_a16_H
+#define INCLUDED_volk_64u_byteswap_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int64_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int num_points){
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
+    __m128i input, byte1, byte2, byte3, byte4, output;
+    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+    uint64_t number = 0;
+    const unsigned int halfPoints = num_points / 2;
+    for(;number < halfPoints; number++){
+      // Load the 32t values, increment inputPtr later since we're doing it in-place.
+      input = _mm_load_si128((__m128i*)inputPtr);
+
+      // Do the four shifts
+      byte1 = _mm_slli_epi32(input, 24);
+      byte2 = _mm_slli_epi32(input, 8);
+      byte3 = _mm_srli_epi32(input, 8);
+      byte4 = _mm_srli_epi32(input, 24);
+      // Or bytes together
+      output = _mm_or_si128(byte1, byte4);
+      byte2 = _mm_and_si128(byte2, byte2mask);
+      output = _mm_or_si128(output, byte2);
+      byte3 = _mm_and_si128(byte3, byte3mask);
+      output = _mm_or_si128(output, byte3);
+      
+      // Reorder the two words
+      output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+      // Store the results
+      _mm_store_si128((__m128i*)inputPtr, output);
+      inputPtr += 4;
+    }
+  
+    // Byteswap any remaining points:
+    number = halfPoints*2;  
+    for(; number < num_points; number++){
+      uint32_t output1 = *inputPtr;
+      uint32_t output2 = inputPtr[1];
+      
+      output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+      
+      output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+      
+      *inputPtr++ = output2;
+      *inputPtr++ = output1;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int64_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = (uint32_t*)intsToSwap;
+  unsigned int point;
+  for(point = 0; point < num_points; point++){
+    uint32_t output1 = *inputPtr;
+    uint32_t output2 = inputPtr[1];
+    
+    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+    
+    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+    
+    *inputPtr++ = output2;
+    *inputPtr++ = output1;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64u_byteswap_a16_H */
diff --git a/volk/include/volk/volk_64u_byteswap_aligned16.h b/volk/include/volk/volk_64u_byteswap_aligned16.h
deleted file mode 100644
index d5e1b6f307..0000000000
--- a/volk/include/volk/volk_64u_byteswap_aligned16.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H
-#define INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int64_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_64u_byteswap_aligned16_sse2(uint64_t* intsToSwap, unsigned int num_points){
-    uint32_t* inputPtr = (uint32_t*)intsToSwap;
-    __m128i input, byte1, byte2, byte3, byte4, output;
-    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
-    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-    uint64_t number = 0;
-    const unsigned int halfPoints = num_points / 2;
-    for(;number < halfPoints; number++){
-      // Load the 32t values, increment inputPtr later since we're doing it in-place.
-      input = _mm_load_si128((__m128i*)inputPtr);
-
-      // Do the four shifts
-      byte1 = _mm_slli_epi32(input, 24);
-      byte2 = _mm_slli_epi32(input, 8);
-      byte3 = _mm_srli_epi32(input, 8);
-      byte4 = _mm_srli_epi32(input, 24);
-      // Or bytes together
-      output = _mm_or_si128(byte1, byte4);
-      byte2 = _mm_and_si128(byte2, byte2mask);
-      output = _mm_or_si128(output, byte2);
-      byte3 = _mm_and_si128(byte3, byte3mask);
-      output = _mm_or_si128(output, byte3);
-      
-      // Reorder the two words
-      output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
-
-      // Store the results
-      _mm_store_si128((__m128i*)inputPtr, output);
-      inputPtr += 4;
-    }
-  
-    // Byteswap any remaining points:
-    number = halfPoints*2;  
-    for(; number < num_points; number++){
-      uint32_t output1 = *inputPtr;
-      uint32_t output2 = inputPtr[1];
-      
-      output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
-      
-      output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
-      
-      *inputPtr++ = output2;
-      *inputPtr++ = output1;
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Byteswaps (in-place) an aligned vector of int64_t's.
-  \param intsToSwap The vector of data to byte swap
-  \param numDataPoints The number of data points
-*/
-static inline void volk_64u_byteswap_aligned16_generic(uint64_t* intsToSwap, unsigned int num_points){
-  uint32_t* inputPtr = (uint32_t*)intsToSwap;
-  unsigned int point;
-  for(point = 0; point < num_points; point++){
-    uint32_t output1 = *inputPtr;
-    uint32_t output2 = inputPtr[1];
-    
-    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
-    
-    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
-    
-    *inputPtr++ = output2;
-    *inputPtr++ = output1;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H */
diff --git a/volk/include/volk/volk_64u_popcnt_a16.h b/volk/include/volk/volk_64u_popcnt_a16.h
new file mode 100644
index 0000000000..59511dc29d
--- /dev/null
+++ b/volk/include/volk/volk_64u_popcnt_a16.h
@@ -0,0 +1,50 @@
+#ifndef INCLUDED_volk_64u_popcnt_a16_H
+#define INCLUDED_volk_64u_popcnt_a16_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) {
+
+  const uint32_t* valueVector = (const uint32_t*)&value;
+  
+  // This is faster than a lookup table
+  uint32_t retVal = valueVector[0];
+
+  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+  retVal = (retVal + (retVal >> 8));
+  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+  uint64_t retVal64  = retVal;
+
+  retVal = valueVector[1];
+  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+  retVal = (retVal + (retVal >> 8));
+  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+  retVal64 += retVal;
+
+  *ret = retVal64;
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#if LV_HAVE_SSE4_2 && LV_HAVE_64
+
+#include <nmmintrin.h>
+
+static inline void volk_64u_popcnt_a16_sse4_2(uint64_t* ret, const uint64_t value) {
+  *ret = _mm_popcnt_u64(value);
+
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+#endif /*INCLUDED_volk_64u_popcnt_a16_H*/
diff --git a/volk/include/volk/volk_64u_popcnt_aligned16.h b/volk/include/volk/volk_64u_popcnt_aligned16.h
deleted file mode 100644
index 4d62f93754..0000000000
--- a/volk/include/volk/volk_64u_popcnt_aligned16.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H
-#define INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H
-
-#include <stdio.h>
-#include <inttypes.h>
-
-
-#if LV_HAVE_GENERIC
-
-
-static inline void volk_64u_popcnt_aligned16_generic(uint64_t* ret, const uint64_t value) {
-
-  const uint32_t* valueVector = (const uint32_t*)&value;
-  
-  // This is faster than a lookup table
-  uint32_t retVal = valueVector[0];
-
-  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
-  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
-  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
-  retVal = (retVal + (retVal >> 8));
-  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
-  uint64_t retVal64  = retVal;
-
-  retVal = valueVector[1];
-  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
-  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
-  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
-  retVal = (retVal + (retVal >> 8));
-  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
-  retVal64 += retVal;
-
-  *ret = retVal64;
-
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-#if LV_HAVE_SSE4_2 && LV_HAVE_64
-
-#include <nmmintrin.h>
-
-static inline void volk_64u_popcnt_aligned16_sse4_2(uint64_t* ret, const uint64_t value) {
-  *ret = _mm_popcnt_u64(value);
-
-}
-
-#endif /*LV_HAVE_SSE4_2*/
-
-#endif /*INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_8s_convert_16s_a16.h b/volk/include/volk/volk_8s_convert_16s_a16.h
new file mode 100644
index 0000000000..38efdb6a35
--- /dev/null
+++ b/volk/include/volk/volk_8s_convert_16s_a16.h
@@ -0,0 +1,83 @@
+#ifndef INCLUDED_volk_8s_convert_16s_a16_H
+#define INCLUDED_volk_8s_convert_16s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8s_convert_16s_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m128i* outputVectorPtr = (__m128i*)outputVector;
+    __m128i inputVal;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_load_si128(inputVectorPtr);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_store_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVal = _mm_srli_si128(inputVal, 8);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_store_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVectorPtr++;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (int16_t)(inputVector[number])*256;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8s_convert_16s_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+  */
+extern void volk_8s_convert_16s_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
+static inline void volk_8s_convert_16s_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    volk_8s_convert_16s_a16_orc_impl(outputVector, inputVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_convert_16s_aligned16.h b/volk/include/volk/volk_8s_convert_16s_aligned16.h
deleted file mode 100644
index c52c64eaef..0000000000
--- a/volk/include/volk/volk_8s_convert_16s_aligned16.h
+++ /dev/null
@@ -1,83 +0,0 @@
-#ifndef INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H
-#define INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_convert_16s_aligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-    __m128i* outputVectorPtr = (__m128i*)outputVector;
-    __m128i inputVal;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_load_si128(inputVectorPtr);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_store_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVal = _mm_srli_si128(inputVal, 8);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_store_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVectorPtr++;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (int16_t)(inputVector[number])*256;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_convert_16s_aligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-  */
-extern void volk_8s_convert_16s_aligned16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
-static inline void volk_8s_convert_16s_aligned16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-    volk_8s_convert_16s_aligned16_orc_impl(outputVector, inputVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_convert_16s_ua16.h b/volk/include/volk/volk_8s_convert_16s_ua16.h
new file mode 100644
index 0000000000..a726bfb5e1
--- /dev/null
+++ b/volk/include/volk/volk_8s_convert_16s_ua16.h
@@ -0,0 +1,73 @@
+#ifndef INCLUDED_volk_8s_convert_16s_ua16_H
+#define INCLUDED_volk_8s_convert_16s_ua16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8s_convert_16s_ua16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m128i* outputVectorPtr = (__m128i*)outputVector;
+    __m128i inputVal;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_loadu_si128(inputVectorPtr);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_storeu_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVal = _mm_srli_si128(inputVal, 8);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_storeu_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVectorPtr++;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (int16_t)(inputVector[number])*256;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8s_convert_16s_ua16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_convert_16s_unaligned16.h b/volk/include/volk/volk_8s_convert_16s_unaligned16.h
deleted file mode 100644
index 05b916cea4..0000000000
--- a/volk/include/volk/volk_8s_convert_16s_unaligned16.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H
-#define INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-    \note Input and output buffers do NOT need to be properly aligned
-  */
-static inline void volk_8s_convert_16s_unaligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-    __m128i* outputVectorPtr = (__m128i*)outputVector;
-    __m128i inputVal;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_loadu_si128(inputVectorPtr);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_storeu_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVal = _mm_srli_si128(inputVal, 8);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_storeu_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVectorPtr++;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (int16_t)(inputVector[number])*256;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-    \note Input and output buffers do NOT need to be properly aligned
-  */
-static inline void volk_8s_convert_16s_unaligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_convert_32f_aligned16.h b/volk/include/volk/volk_8s_convert_32f_aligned16.h
deleted file mode 100644
index 700a0fa422..0000000000
--- a/volk/include/volk/volk_8s_convert_32f_aligned16.h
+++ /dev/null
@@ -1,105 +0,0 @@
-#ifndef INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H
-#define INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_convert_32f_aligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-    float* outputVectorPtr = outputVector;
-    const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    const int8_t* inputVectorPtr = inputVector;
-    __m128 ret;
-    __m128i inputVal;
-    __m128i interimVal;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
-
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_convert_32f_aligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-extern void volk_8s_convert_32f_aligned16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
-static inline void volk_8s_convert_32f_aligned16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    volk_8s_convert_32f_aligned16_orc_impl(outputVector, inputVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_convert_32f_unaligned16.h b/volk/include/volk/volk_8s_convert_32f_unaligned16.h
deleted file mode 100644
index 8019aac9a2..0000000000
--- a/volk/include/volk/volk_8s_convert_32f_unaligned16.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H
-#define INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_8s_convert_32f_unaligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-    float* outputVectorPtr = outputVector;
-    const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1( iScalar );
-    const int8_t* inputVectorPtr = inputVector;
-    __m128 ret;
-    __m128i inputVal;
-    __m128i interimVal;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
-
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_8s_convert_32f_unaligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h b/volk/include/volk/volk_8s_s32f_convert_32f_a16.h
new file mode 100644
index 0000000000..45185ac2ee
--- /dev/null
+++ b/volk/include/volk/volk_8s_s32f_convert_32f_a16.h
@@ -0,0 +1,105 @@
+#ifndef INCLUDED_volk_8s_s32f_convert_32f_a16_H
+#define INCLUDED_volk_8s_s32f_convert_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
+
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8s_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+extern void volk_8s_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
+static inline void volk_8s_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    volk_8s_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h
new file mode 100644
index 0000000000..310824580c
--- /dev/null
+++ b/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_volk_8s_s32f_convert_32f_ua16_H
+#define INCLUDED_volk_8s_s32f_convert_32f_ua16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_8s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1( iScalar );
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_8s_s32f_convert_32f_ua16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h b/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h
new file mode 100644
index 0000000000..eae1185ec7
--- /dev/null
+++ b/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h
@@ -0,0 +1,102 @@
+#ifndef INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H
+#define INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128i x, y, realz, imagz;
+  lv_16sc_t* c = cVector;
+  const lv_8sc_t* a = aVector;
+  const lv_8sc_t* b = bVector;
+  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
+    
+  for(;number < quarterPoints; number++){
+    // Convert into 8 bit values into 16 bit values
+    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
+    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
+      
+    // Calculate the ar*cr - ai*(-ci) portions
+    realz = _mm_madd_epi16(x,y);
+      
+    // Calculate the complex conjugate of the cr + ci j values
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Shift the order of the cr and ci values
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
+
+    // Calculate the ar*(-ci) + cr*(ai)
+    imagz = _mm_madd_epi16(x,y);
+
+    _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
+
+    a += 4;
+    b += 4;
+    c += 4;
+  }
+    
+  number = quarterPoints * 4;
+  int16_t* c16Ptr = (int16_t*)&cVector[number];
+  int8_t* a8Ptr = (int8_t*)&aVector[number];
+  int8_t* b8Ptr = (int8_t*)&bVector[number];
+  for(; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+
+    *c16Ptr++ = (int16_t)lv_creal(temp);
+    *c16Ptr++ = (int16_t)lv_cimag(temp);
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  int16_t* c16Ptr = (int16_t*)cVector;
+  int8_t* a8Ptr = (int8_t*)aVector;
+  int8_t* b8Ptr = (int8_t*)bVector;
+  for(number =0; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+
+    *c16Ptr++ = (int16_t)lv_creal(temp);
+    *c16Ptr++ = (int16_t)lv_cimag(temp);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H */
diff --git a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h
new file mode 100644
index 0000000000..621276b089
--- /dev/null
+++ b/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H
+#define INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128i x, y, realz, imagz;
+  __m128 ret;
+  lv_32fc_t* c = cVector;
+  const lv_8sc_t* a = aVector;
+  const lv_8sc_t* b = bVector;
+  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+  for(;number < quarterPoints; number++){
+    // Convert into 8 bit values into 16 bit values
+    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
+    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
+
+    // Calculate the ar*cr - ai*(-ci) portions
+    realz = _mm_madd_epi16(x,y);
+
+    // Calculate the complex conjugate of the cr + ci j values
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Shift the order of the cr and ci values
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
+
+    // Calculate the ar*(-ci) + cr*(ai)
+    imagz = _mm_madd_epi16(x,y);
+
+    // Interleave real and imaginary and then convert to float values
+    ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
+
+    // Normalize the floating point values
+    ret = _mm_mul_ps(ret, invScalar);
+
+    // Store the floating point values
+    _mm_store_ps((float*)c, ret);
+    c += 2;
+
+    // Interleave real and imaginary and then convert to float values
+    ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
+
+    // Normalize the floating point values
+    ret = _mm_mul_ps(ret, invScalar);
+
+    // Store the floating point values
+    _mm_store_ps((float*)c, ret);
+    c += 2;
+
+    a += 4;
+    b += 4;
+  }
+
+  number = quarterPoints * 4;
+  float* cFloatPtr = (float*)&cVector[number];
+  int8_t* a8Ptr = (int8_t*)&aVector[number];
+  int8_t* b8Ptr = (int8_t*)&bVector[number];
+  for(; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+    
+    *cFloatPtr++ = lv_creal(temp) / scalar;
+    *cFloatPtr++ = lv_cimag(temp) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  float* cPtr = (float*)cVector;
+  const float invScalar = 1.0 / scalar;
+  int8_t* a8Ptr = (int8_t*)aVector;
+  int8_t* b8Ptr = (int8_t*)bVector;
+  for(number = 0; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+    
+    *cPtr++ = (lv_creal(temp) * invScalar);
+    *cPtr++ = (lv_cimag(temp) * invScalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h
new file mode 100644
index 0000000000..6a35e969d1
--- /dev/null
+++ b/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H
+#define INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_16s_16s_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+  __m128i complexVal, iOutputVal, qOutputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+    qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);
+    iOutputVal = _mm_slli_epi16(iOutputVal, 8);
+
+    qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
+    qOutputVal = _mm_slli_epi16(qOutputVal, 8);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
+    *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h
deleted file mode 100644
index 38eaa49eab..0000000000
--- a/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H
-#define INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_16s_aligned16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
-  __m128i complexVal, iOutputVal, qOutputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
-    qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
-
-    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);
-    iOutputVal = _mm_slli_epi16(iOutputVal, 8);
-
-    qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
-    qOutputVal = _mm_slli_epi16(qOutputVal, 8);
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
-    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
-    *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h
deleted file mode 100644
index d0c118965f..0000000000
--- a/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h
+++ /dev/null
@@ -1,164 +0,0 @@
-#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H
-#define INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_32f_aligned16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;    
-  __m128 iFloatValue, qFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
-
-  for(;number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
-    qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
-
-    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-    _mm_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 4;
-
-    iComplexVal = _mm_srli_si128(iComplexVal, 4);
-
-    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-    _mm_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 4;
-
-    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
-    qFloatValue = _mm_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
-    _mm_store_ps(qBufferPtr, qFloatValue);
-    qBufferPtr += 4;
-
-    qComplexVal = _mm_srli_si128(qComplexVal, 4);
-
-    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
-    qFloatValue = _mm_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
-    _mm_store_ps(qBufferPtr, qFloatValue);
-
-    qBufferPtr += 4;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-  }
-    
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 cplxValue1, cplxValue2, iValue, qValue;
-
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  float floatBuffer[8] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(complexVectorPtr[0]);
-    floatBuffer[1] = (float)(complexVectorPtr[1]);
-    floatBuffer[2] = (float)(complexVectorPtr[2]);
-    floatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    floatBuffer[4] = (float)(complexVectorPtr[4]);
-    floatBuffer[5] = (float)(complexVectorPtr[5]);
-    floatBuffer[6] = (float)(complexVectorPtr[6]);
-    floatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&floatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&floatBuffer[4]);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    _mm_store_ps(iBufferPtr, iValue);
-    _mm_store_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  complexVectorPtr = (int8_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h
new file mode 100644
index 0000000000..67ffebd992
--- /dev/null
+++ b/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_8sc_deinterleave_real_16s_a16_H
+#define INCLUDED_volk_8sc_deinterleave_real_16s_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_16s_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i complexVal, outputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+    outputVal = _mm_cvtepi8_epi16(complexVal);
+    outputVal = _mm_slli_epi16(outputVal, 7);
+
+    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+    iBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8sc_deinterleave_real_16s_a16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h
deleted file mode 100644
index d0cb494946..0000000000
--- a/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
-#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_16s_aligned16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i complexVal, outputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
-
-    outputVal = _mm_cvtepi8_epi16(complexVal);
-    outputVal = _mm_slli_epi16(outputVal, 7);
-
-    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h
deleted file mode 100644
index c849448eae..0000000000
--- a/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h
+++ /dev/null
@@ -1,133 +0,0 @@
-#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
-#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;    
-  __m128 iFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-
-  for(;number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
-
-    iIntVal = _mm_cvtepi8_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 4;
-
-    complexVal = _mm_srli_si128(complexVal, 4);
-    iIntVal = _mm_cvtepi8_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    complexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 iValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
-
-    iValue = _mm_load_ps(floatBuffer);
-
-    iValue = _mm_mul_ps(iValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    complexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h
new file mode 100644
index 0000000000..ecffc092e1
--- /dev/null
+++ b/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  __m128i complexVal1, complexVal2, outputVal;
+
+  unsigned int sixteenthPoints = num_points / 16;
+
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
+
+    outputVal = _mm_or_si128(complexVal1, complexVal2);
+
+    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+    iBufferPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h
deleted file mode 100644
index d84d645682..0000000000
--- a/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
-#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m128i complexVal1, complexVal2, outputVal;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
-
-    outputVal = _mm_or_si128(complexVal1, complexVal2);
-
-    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h
deleted file mode 100644
index 470a67539f..0000000000
--- a/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H
-#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_multiply_conjugate_16sc_aligned16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128i x, y, realz, imagz;
-  lv_16sc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
-    
-  for(;number < quarterPoints; number++){
-    // Convert into 8 bit values into 16 bit values
-    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
-    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
-      
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm_madd_epi16(x,y);
-      
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm_madd_epi16(x,y);
-
-    _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
-
-    a += 4;
-    b += 4;
-    c += 4;
-  }
-    
-  number = quarterPoints * 4;
-  int16_t* c16Ptr = (int16_t*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *c16Ptr++ = (int16_t)lv_creal(temp);
-    *c16Ptr++ = (int16_t)lv_cimag(temp);
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_multiply_conjugate_16sc_aligned16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  int16_t* c16Ptr = (int16_t*)cVector;
-  int8_t* a8Ptr = (int8_t*)aVector;
-  int8_t* b8Ptr = (int8_t*)bVector;
-  for(number =0; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *c16Ptr++ = (int16_t)lv_creal(temp);
-    *c16Ptr++ = (int16_t)lv_cimag(temp);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h
deleted file mode 100644
index 52b444cf7e..0000000000
--- a/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H
-#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_multiply_conjugate_32fc_aligned16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128i x, y, realz, imagz;
-  __m128 ret;
-  lv_32fc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-
-  for(;number < quarterPoints; number++){
-    // Convert into 8 bit values into 16 bit values
-    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
-    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
-
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm_madd_epi16(x,y);
-
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm_madd_epi16(x,y);
-
-    // Interleave real and imaginary and then convert to float values
-    ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    ret = _mm_mul_ps(ret, invScalar);
-
-    // Store the floating point values
-    _mm_store_ps((float*)c, ret);
-    c += 2;
-
-    // Interleave real and imaginary and then convert to float values
-    ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    ret = _mm_mul_ps(ret, invScalar);
-
-    // Store the floating point values
-    _mm_store_ps((float*)c, ret);
-    c += 2;
-
-    a += 4;
-    b += 4;
-  }
-
-  number = quarterPoints * 4;
-  float* cFloatPtr = (float*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-    
-    *cFloatPtr++ = lv_creal(temp) / scalar;
-    *cFloatPtr++ = lv_cimag(temp) / scalar;
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_multiply_conjugate_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  float* cPtr = (float*)cVector;
-  const float invScalar = 1.0 / scalar;
-  int8_t* a8Ptr = (int8_t*)aVector;
-  int8_t* b8Ptr = (int8_t*)bVector;
-  for(number = 0; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-    
-    *cPtr++ = (lv_creal(temp) * invScalar);
-    *cPtr++ = (lv_cimag(temp) * invScalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h
new file mode 100644
index 0000000000..cedbf202cc
--- /dev/null
+++ b/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h
@@ -0,0 +1,164 @@
+#ifndef INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H
+#define INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;    
+  __m128 iFloatValue, qFloatValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+
+  for(;number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+    qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+    _mm_store_ps(iBufferPtr, iFloatValue);
+    iBufferPtr += 4;
+
+    iComplexVal = _mm_srli_si128(iComplexVal, 4);
+
+    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+    _mm_store_ps(iBufferPtr, iFloatValue);
+    iBufferPtr += 4;
+
+    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+    qFloatValue = _mm_cvtepi32_ps(qIntVal);
+    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+    _mm_store_ps(qBufferPtr, qFloatValue);
+    qBufferPtr += 4;
+
+    qComplexVal = _mm_srli_si128(qComplexVal, 4);
+
+    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+    qFloatValue = _mm_cvtepi32_ps(qIntVal);
+    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+    _mm_store_ps(qBufferPtr, qFloatValue);
+
+    qBufferPtr += 4;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+  }
+    
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  float floatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(complexVectorPtr[0]);
+    floatBuffer[1] = (float)(complexVectorPtr[1]);
+    floatBuffer[2] = (float)(complexVectorPtr[2]);
+    floatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    floatBuffer[4] = (float)(complexVectorPtr[4]);
+    floatBuffer[5] = (float)(complexVectorPtr[5]);
+    floatBuffer[6] = (float)(complexVectorPtr[6]);
+    floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    _mm_store_ps(iBufferPtr, iValue);
+    _mm_store_ps(qBufferPtr, qValue);
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  complexVectorPtr = (int8_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  unsigned int number;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h
new file mode 100644
index 0000000000..902795131c
--- /dev/null
+++ b/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h
@@ -0,0 +1,133 @@
+#ifndef INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H
+#define INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;    
+  __m128 iFloatValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+
+  for(;number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+    iIntVal = _mm_cvtepi8_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+
+    complexVal = _mm_srli_si128(complexVal, 4);
+    iIntVal = _mm_cvtepi8_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    complexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 iValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
+
+    iValue = _mm_load_ps(floatBuffer);
+
+    iValue = _mm_mul_ps(iValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    complexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py
index 9fded9a3ec..fc1ec10ef6 100755
--- a/volk/include/volk/volk_register.py
+++ b/volk/include/volk/volk_register.py
@@ -55,7 +55,7 @@ functions = [];
 
 
 for line in mfile:
-    subline = re.search(".*(aligned).*", line);
+    subline = re.search(".*(a16).*", line);
     if subline:
         subsubline = re.search("(?<=volk_).*", subline.group(0));
         if subsubline:
@@ -70,11 +70,10 @@ datatypes = set(datatypes);
 for line in mfile:
     for dt in datatypes:
         if dt in line:
-            subline = re.search("(volk_" + dt +"_.*(aligned).*\.h)", line);
+            subline = re.search("(volk_" + dt +"_.*(a16).*\.h)", line);
             if subline:
                 
                 subsubline = re.search(".+(?=\.h)", subline.group(0));
-                
                 functions.append(subsubline.group(0));
 
 archs = [];
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am
index 5c995148a2..f609f5bf94 100644
--- a/volk/lib/Makefile.am
+++ b/volk/lib/Makefile.am
@@ -184,7 +184,8 @@ testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN
 testqa_LDFLAGS = -lboost_unit_test_framework
 testqa_LDADD  = \
 	libvolk.la \
-	libvolk_runtime.la
+	libvolk_runtime.la \
+	../orc/libvolk_orc.la
 
 
 distclean-local: 
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index fa21db4870..a8c00c143c 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -110,7 +110,11 @@ static std::vector<std::string> get_arch_list(const int archs[]) {
 }
 
 static bool is_valid_type(std::string type) {
-    std::vector<std::string> valid_types = boost::assign::list_of("64f")("64u")("32fc")("32f")("32s")("32u")("16sc")("16s")("16u")("8s")("8sc")("8u");
+    std::vector<std::string> valid_types = boost::assign::list_of("64f")("64u")("32fc")("32f")
+                                                                 ("32s")("32u")("16sc")("16s")
+                                                                 ("16u")("8s")("8sc")("8u")
+                                                                 ("s32f")("s16u")("s16s")("s8u")
+                                                                 ("s8s");
     
     BOOST_FOREACH(std::string this_type, valid_types) {
         if(type == this_type) return true;
@@ -148,17 +152,11 @@ static void get_function_signature(std::vector<std::string> &inputsig,
         pos++;
     }
     
-    //if there's no output sig and only one input sig, assume there are 2 inputs
-    //this handles conversion fn's (which have a specified output sig) and most of the rest
-    if(outputsig.size() == 0 && inputsig.size() == 1) {
-        outputsig.push_back(inputsig[0]);
-        inputsig.push_back(inputsig[0]);
-    }//if there's no explicit output sig then assume the output is the same as the first input
-    else if(outputsig.size() == 0) outputsig.push_back(inputsig[0]);
-    
-    
     assert(inputsig.size() != 0);
-    assert(outputsig.size() != 0);
+}
+
+inline void run_cast_test1(volk_fn_1arg func, void *buff, unsigned int vlen, unsigned int iter, std::string arch) {
+    while(iter--) func(buff, vlen, arch.c_str());
 }
 
 inline void run_cast_test2(volk_fn_2arg func, void *outbuff, std::vector<void *> &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) {
@@ -190,26 +188,42 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
     for(int i=0; i<inputsig.size(); i++) std::cout << "Input: " << inputsig[i] << std::endl;
     for(int i=0; i<outputsig.size(); i++) std::cout << "Output: " << outputsig[i] << std::endl;
     
-    //now that we have that, we'll set up input and output buffers based on the function signature
+    //now that we have that, we'll set up input buffers based on the function signature
     std::vector<void *> inbuffs;
     make_buffer_for_signature(inbuffs, inputsig, vlen);
     
+    //allocate output buffers -- one for each output for each arch
+    std::vector<void *> outbuffs;
+    BOOST_FOREACH(std::string arch, arch_list) {
+        make_buffer_for_signature(outbuffs, outputsig, vlen);
+    }
+
     //and set the input buffers to something random
     for(int i=0; i<inputsig.size(); i++) {
         load_random_data(inbuffs[i], inputsig[i], vlen);        
     }
     
-    //allocate output buffers -- one for each output for each arch
-    std::vector<void *> outbuffs;
-    BOOST_FOREACH(std::string arch, arch_list) {
-        make_buffer_for_signature(outbuffs, outputsig, vlen);
+    //so let's see here. if the operation has no output sig, it operates in place,
+    //and we want the output buffers to be the input buffers; we want to copy the input buffer to allllll the output buffers.
+    if(outputsig.size() == 0) {
+        //make a set of output buffers according to the input signature
+        BOOST_FOREACH(std::string arch, arch_list) {
+            make_buffer_for_signature(outbuffs, inputsig, vlen);
+        }
+        //copy input buffer[0] to all the output buffers so it has something to operate on
+        //output buffer element size is the same as input buffer[0]
+        if(
     }
+        
     
     //now run the test
     clock_t start, end;
     for(int i = 0; i < arch_list.size(); i++) {
         start = clock();
         switch(outputsig.size()+inputsig.size()) {
+            case 1:
+                run_cast_test1((volk_fn_1arg)(manual_func), outbuffs[i], vlen, iter, arch_list[i]); 
+                break;
             case 2:
                 run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
                 break;
@@ -262,6 +276,13 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
                         return 1;
                     }
                 }
+            } else if(outputsig[0] == "8s" || outputsig[0] == "8u") {
+                for(int j=0; j<vlen; j++) {
+                    if(((uint8_t *)(outbuffs[generic_offset]))[j] != ((uint8_t *)(outbuffs[i]))[j]) {
+                        std::cout << "Generic: " << ((uint8_t *)(outbuffs[generic_offset]))[j] << " " << arch_list[i] << ": " << ((uint8_t *)(outbuffs[i]))[j] << std::endl;
+                        return 1;
+                    }
+                }
             } else { 
                 std::cout << "Error: invalid type " << outputsig[0] << std::endl;
                 return 1;
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
index f81d652fbe..00883bf8ef 100644
--- a/volk/lib/qa_utils.h
+++ b/volk/lib/qa_utils.h
@@ -3,7 +3,6 @@
 
 #include <stdlib.h>
 #include <string>
-#include <volk/volk.h>
 
 float uniform(void);
 void random_floats(float *buf, unsigned n);
@@ -12,6 +11,7 @@ bool run_volk_tests(const int[], void(*)(), std::string, float, int, int);
 
 #define VOLK_RUN_TESTS(func, tol, len, iter) BOOST_CHECK_EQUAL(run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, len, iter), 0)
 
+typedef void (*volk_fn_1arg)(void *, unsigned int, const char*);
 typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
 typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
 typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am
index 066050a7c4..43f38543cd 100644
--- a/volk/orc/Makefile.am
+++ b/volk/orc/Makefile.am
@@ -25,27 +25,27 @@ lib_LTLIBRARIES = libvolk_orc.la
 libvolk_orc_la_LDFLAGS = $(ORC_LDFLAGS)
 
 libvolk_orc_la_SOURCES = \
-volk_8s_convert_16s_aligned16_orc_impl.orc \
-volk_8s_convert_32f_aligned16_orc_impl.orc \
-volk_16u_byteswap_aligned16_orc_impl.orc \
-volk_32s_and_aligned16_orc_impl.orc \
-volk_32s_or_aligned16_orc_impl.orc \
-volk_32f_add_aligned16_orc_impl.orc \
-volk_32f_subtract_aligned16_orc_impl.orc \
-volk_32f_divide_aligned16_orc_impl.orc \
-volk_32f_multiply_aligned16_orc_impl.orc \
-volk_32fc_multiply_aligned16_orc_impl.orc \
-volk_32fc_32f_multiply_aligned16_orc_impl.orc \
-volk_32f_sqrt_aligned16_orc_impl.orc \
-volk_32f_max_aligned16_orc_impl.orc \
-volk_32f_min_aligned16_orc_impl.orc \
-volk_32f_normalize_aligned16_orc_impl.orc \
-volk_32fc_magnitude_32f_aligned16_orc_impl.orc \
-volk_32fc_magnitude_16s_aligned16_orc_impl.orc \
-volk_16sc_magnitude_16s_aligned16_orc_impl.orc \
-volk_16sc_deinterleave_16s_aligned16_orc_impl.orc \
-volk_16sc_deinterleave_32f_aligned16_orc_impl.orc \
-volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc
+volk_8s_convert_16s_a16_orc_impl.orc \
+volk_8s_s32f_convert_32f_a16_orc_impl.orc \
+volk_16u_byteswap_a16_orc_impl.orc \
+volk_32s_32s_and_32s_a16_orc_impl.orc \
+volk_32s_32s_or_32s_a16_orc_impl.orc \
+volk_32f_32f_add_32f_a16_orc_impl.orc \
+volk_32f_32f_subtract_32f_a16_orc_impl.orc \
+volk_32f_32f_divide_32f_a16_orc_impl.orc \
+volk_32f_32f_multiply_32f_a16_orc_impl.orc \
+volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc \
+volk_32fc_32f_multiply_32fc_a16_orc_impl.orc \
+volk_32f_sqrt_32f_a16_orc_impl.orc \
+volk_32f_32f_max_32f_a16_orc_impl.orc \
+volk_32f_32f_min_32f_a16_orc_impl.orc \
+volk_32f_s32f_normalize_a16_orc_impl.orc \
+volk_32fc_magnitude_32f_a16_orc_impl.orc \
+volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc \
+volk_16sc_magnitude_16s_a16_orc_impl.orc \
+volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc \
+volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc \
+volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
 
 
 
diff --git a/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc
new file mode 100644
index 0000000000..d396a00527
--- /dev/null
+++ b/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_16sc_deinterleave_16s_16s_a16_orc_impl
+.dest 2 idst
+.dest 2 qdst
+.source 4 src
+splitlw qdst, idst, src
diff --git a/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc
deleted file mode 100644
index d226064a73..0000000000
--- a/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_16sc_deinterleave_16s_aligned16_orc_impl
-.dest 2 idst
-.dest 2 qdst
-.source 4 src
-splitlw qdst, idst, src
diff --git a/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc
deleted file mode 100644
index dddf682ca0..0000000000
--- a/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,12 +0,0 @@
-.function volk_16sc_deinterleave_32f_aligned16_orc_impl
-.dest 4 idst
-.dest 4 qdst
-.source 4 src
-.floatparam 4 scalar
-.temp 8 iql
-.temp 8 iqf
-
-x2 convswl iql, src
-x2 convlf iqf, iql
-x2 divf iqf, iqf, scalar
-splitql qdst, idst, iqf
diff --git a/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
new file mode 100644
index 0000000000..5954c406f0
--- /dev/null
+++ b/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
@@ -0,0 +1,6 @@
+.function volk_16sc_deinterleave_real_8s_a16_orc_impl
+.dest 1 dst
+.source 4 src
+.temp 2 iw
+select0lw iw, src
+convhwb dst, iw
diff --git a/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc
deleted file mode 100644
index 609750096a..0000000000
--- a/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,6 +0,0 @@
-.function volk_16sc_deinterleave_real_8s_aligned16_orc_impl
-.dest 1 dst
-.source 4 src
-.temp 2 iw
-select0lw iw, src
-convhwb dst, iw
diff --git a/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc
new file mode 100644
index 0000000000..2a49d4ecb4
--- /dev/null
+++ b/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc
@@ -0,0 +1,23 @@
+.function volk_16sc_magnitude_16s_a16_orc_impl
+.source 4 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+x2 mulf prodiqf, iqf, iqf
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+mulf rootf, rootf, scalar
+convfl rootl, rootf
+convlw dst, rootl
diff --git a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc
deleted file mode 100644
index 088f56312e..0000000000
--- a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,23 +0,0 @@
-.function volk_16sc_magnitude_16s_aligned16_orc_impl
-.source 4 src
-.dest 2 dst
-.floatparam 4 scalar
-.temp 8 iql
-.temp 8 iqf
-.temp 8 prodiqf
-.temp 4 qf
-.temp 4 if
-.temp 4 sumf
-.temp 4 rootf
-.temp 4 rootl
-
-x2 convswl iql, src
-x2 convlf iqf, iql
-x2 divf iqf, iqf, scalar
-x2 mulf prodiqf, iqf, iqf
-splitql qf, if, prodiqf
-addf sumf, if, qf
-sqrtf rootf, sumf
-mulf rootf, rootf, scalar
-convfl rootl, rootf
-convlw dst, rootl
diff --git a/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc b/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..47c3d28a9c
--- /dev/null
+++ b/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc
@@ -0,0 +1,12 @@
+.function volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl
+.dest 4 idst
+.dest 4 qdst
+.source 4 src
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+splitql qdst, idst, iqf
diff --git a/volk/orc/volk_16u_byteswap_a16_orc_impl.orc b/volk/orc/volk_16u_byteswap_a16_orc_impl.orc
new file mode 100644
index 0000000000..c1c8ee59ec
--- /dev/null
+++ b/volk/orc/volk_16u_byteswap_a16_orc_impl.orc
@@ -0,0 +1,3 @@
+.function volk_16u_byteswap_a16_orc_impl
+.dest 2 dst
+swapw dst, dst
diff --git a/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc b/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc
deleted file mode 100644
index 3ffd12ec0e..0000000000
--- a/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,3 +0,0 @@
-.function volk_16u_byteswap_aligned16_orc_impl
-.dest 2 dst
-swapw dst, dst
diff --git a/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..e6a30cf01c
--- /dev/null
+++ b/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_32f_add_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+addf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..0bdcd0010d
--- /dev/null
+++ b/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_32f_divide_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+divf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..9584e6634f
--- /dev/null
+++ b/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_32f_max_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+maxf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..47b9c05dbd
--- /dev/null
+++ b/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_32f_min_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+minf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..e5a049c161
--- /dev/null
+++ b/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_32f_multiply_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+mulf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..2ab42d5f6e
--- /dev/null
+++ b/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_32f_subtract_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+subf dst, src1, src2
diff --git a/volk/orc/volk_32f_add_aligned16_orc_impl.orc b/volk/orc/volk_32f_add_aligned16_orc_impl.orc
deleted file mode 100644
index 20e000f688..0000000000
--- a/volk/orc/volk_32f_add_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_add_aligned16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-addf dst, src1, src2
diff --git a/volk/orc/volk_32f_divide_aligned16_orc_impl.orc b/volk/orc/volk_32f_divide_aligned16_orc_impl.orc
deleted file mode 100644
index 870843f2af..0000000000
--- a/volk/orc/volk_32f_divide_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_divide_aligned16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-divf dst, src1, src2
diff --git a/volk/orc/volk_32f_max_aligned16_orc_impl.orc b/volk/orc/volk_32f_max_aligned16_orc_impl.orc
deleted file mode 100644
index 97f48ba4ad..0000000000
--- a/volk/orc/volk_32f_max_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_max_aligned16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-maxf dst, src1, src2
diff --git a/volk/orc/volk_32f_min_aligned16_orc_impl.orc b/volk/orc/volk_32f_min_aligned16_orc_impl.orc
deleted file mode 100644
index a597933deb..0000000000
--- a/volk/orc/volk_32f_min_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_min_aligned16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-minf dst, src1, src2
diff --git a/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc b/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc
deleted file mode 100644
index 23619af4ed..0000000000
--- a/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_multiply_aligned16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-mulf dst, src1, src2
diff --git a/volk/orc/volk_32f_normalize_aligned16_orc_impl.orc b/volk/orc/volk_32f_normalize_aligned16_orc_impl.orc
deleted file mode 100644
index 84d965e7fd..0000000000
--- a/volk/orc/volk_32f_normalize_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_normalize_aligned16_orc_impl
-.source 4 src1
-.floatparam 4 invscalar
-.dest 4 dst
-mulf dst, src1, invscalar
diff --git a/volk/orc/volk_32f_s32f_normalize_a16_orc_impl.orc b/volk/orc/volk_32f_s32f_normalize_a16_orc_impl.orc
new file mode 100644
index 0000000000..acd319b166
--- /dev/null
+++ b/volk/orc/volk_32f_s32f_normalize_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_s32f_normalize_a16_orc_impl
+.source 4 src1
+.floatparam 4 invscalar
+.dest 4 dst
+mulf dst, src1, invscalar
diff --git a/volk/orc/volk_32f_sqrt_32f_a16_orc_impl.orc b/volk/orc/volk_32f_sqrt_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..ae5680f154
--- /dev/null
+++ b/volk/orc/volk_32f_sqrt_32f_a16_orc_impl.orc
@@ -0,0 +1,4 @@
+.function volk_32f_sqrt_32f_a16_orc_impl
+.source 4 src
+.dest 4 dst
+sqrtf dst, src
diff --git a/volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc b/volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc
deleted file mode 100644
index 0983271db5..0000000000
--- a/volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,4 +0,0 @@
-.function volk_32f_sqrt_aligned16_orc_impl
-.source 4 src
-.dest 4 dst
-sqrtf dst, src
diff --git a/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc b/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc
deleted file mode 100644
index 17dbcad465..0000000000
--- a/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_subtract_aligned16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-subf dst, src1, src2
diff --git a/volk/orc/volk_32fc_32f_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_32f_multiply_32fc_a16_orc_impl.orc
new file mode 100644
index 0000000000..455293cffd
--- /dev/null
+++ b/volk/orc/volk_32fc_32f_multiply_32fc_a16_orc_impl.orc
@@ -0,0 +1,7 @@
+.function volk_32fc_32f_multiply_32fc_a16_orc_impl
+.source 8 src1
+.source 4 src2
+.dest 8 dst
+.temp 8 tmp
+mergelq tmp, src2, src2
+x2 mulf dst, src1, tmp
diff --git a/volk/orc/volk_32fc_32f_multiply_aligned16_orc_impl.orc b/volk/orc/volk_32fc_32f_multiply_aligned16_orc_impl.orc
deleted file mode 100644
index 27c0598525..0000000000
--- a/volk/orc/volk_32fc_32f_multiply_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,7 +0,0 @@
-.function volk_32fc_32f_multiply_aligned16_orc_impl
-.source 8 src1
-.source 4 src2
-.dest 8 dst
-.temp 8 tmp
-mergelq tmp, src2, src2
-x2 mulf dst, src1, tmp
diff --git a/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc
new file mode 100644
index 0000000000..ed928b90f4
--- /dev/null
+++ b/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc
@@ -0,0 +1,6 @@
+.function volk_32fc_32fc_multiply_32fc_a16_orc_impl
+.source 8 src1
+.source 8 src2
+.dest 8 dst
+.temp 8 tmp
+x2 mulf dst, src1, src2
diff --git a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc
deleted file mode 100644
index 15f8fdff0e..0000000000
--- a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,23 +0,0 @@
-.function volk_32fc_magnitude_16s_aligned16_orc_impl
-.source 8 src
-.dest 2 dst
-.floatparam 4 scalar
-.temp 8 iqf
-.temp 8 prodiqf
-.temp 4 qf
-.temp 4 if
-.temp 4 sumf
-.temp 4 rootf
-.temp 4 rootl
-.temp 4 maskl
-
-x2 mulf prodiqf, src, src
-splitql qf, if, prodiqf
-addf sumf, if, qf
-sqrtf rootf, sumf
-mulf rootf, rootf, scalar
-cmpltf maskl, scalar, rootf
-andl maskl, maskl, 0x80000000
-orl rootf, rootf, maskl
-convfl rootl, rootf
-convssslw dst, rootl
diff --git a/volk/orc/volk_32fc_magnitude_32f_a16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..c5e2e57f14
--- /dev/null
+++ b/volk/orc/volk_32fc_magnitude_32f_a16_orc_impl.orc
@@ -0,0 +1,13 @@
+.function volk_32fc_magnitude_32f_a16_orc_impl
+.source 8 src
+.dest 4 dst
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf dst, sumf
diff --git a/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc
deleted file mode 100644
index 47a10531d7..0000000000
--- a/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,13 +0,0 @@
-.function volk_32fc_magnitude_32f_aligned16_orc_impl
-.source 8 src
-.dest 4 dst
-.temp 8 iqf
-.temp 8 prodiqf
-.temp 4 qf
-.temp 4 if
-.temp 4 sumf
-
-x2 mulf prodiqf, src, src
-splitql qf, if, prodiqf
-addf sumf, if, qf
-sqrtf dst, sumf
diff --git a/volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc b/volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc
deleted file mode 100644
index ffe9cc3ef4..0000000000
--- a/volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,6 +0,0 @@
-.function volk_32fc_multiply_aligned16_orc_impl
-.source 8 src1
-.source 8 src2
-.dest 8 dst
-.temp 8 tmp
-x2 mulf dst, src1, src2
diff --git a/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc b/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc
new file mode 100644
index 0000000000..cccda8a0f9
--- /dev/null
+++ b/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc
@@ -0,0 +1,23 @@
+.function volk_32fc_s32f_magnitude_16s_a16_orc_impl
+.source 8 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+.temp 4 maskl
+
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+mulf rootf, rootf, scalar
+cmpltf maskl, scalar, rootf
+andl maskl, maskl, 0x80000000
+orl rootf, rootf, maskl
+convfl rootl, rootf
+convssslw dst, rootl
diff --git a/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc b/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc
new file mode 100644
index 0000000000..bff3af875f
--- /dev/null
+++ b/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32s_32s_and_32s_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+andl dst, src1, src2
diff --git a/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc b/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc
new file mode 100644
index 0000000000..b6961f79ea
--- /dev/null
+++ b/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32s_32s_or_32s_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+orl dst, src1, src2
diff --git a/volk/orc/volk_32s_and_aligned16_orc_impl.orc b/volk/orc/volk_32s_and_aligned16_orc_impl.orc
deleted file mode 100644
index 9d3c7b733c..0000000000
--- a/volk/orc/volk_32s_and_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32s_and_aligned16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-andl dst, src1, src2
diff --git a/volk/orc/volk_32s_or_aligned16_orc_impl.orc b/volk/orc/volk_32s_or_aligned16_orc_impl.orc
deleted file mode 100644
index 6d2a3859a3..0000000000
--- a/volk/orc/volk_32s_or_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32s_or_aligned16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-orl dst, src1, src2
diff --git a/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc b/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc
new file mode 100644
index 0000000000..a55c7f7232
--- /dev/null
+++ b/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_8s_convert_16s_a16_orc_impl
+.source 1 src
+.dest 2 dst
+convsbw dst, src
+shlw dst, dst, 8
diff --git a/volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc b/volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc
deleted file mode 100644
index a089877d1c..0000000000
--- a/volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_8s_convert_16s_aligned16_orc_impl
-.source 1 src
-.dest 2 dst
-convsbw dst, src
-shlw dst, dst, 8
diff --git a/volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc b/volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc
deleted file mode 100644
index 91a0084d71..0000000000
--- a/volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc
+++ /dev/null
@@ -1,9 +0,0 @@
-.function volk_8s_convert_32f_aligned16_orc_impl
-.source 2 src
-.dest 4 dst
-.floatparam 4 scalar
-.temp 4 flsrc
-.temp 4 lsrc
-convswl lsrc, src
-convlf flsrc, lsrc
-mulf dst, flsrc, scalar
diff --git a/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc b/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..3274ab9d6e
--- /dev/null
+++ b/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc
@@ -0,0 +1,9 @@
+.function volk_8s_s32f_convert_32f_a16_orc_impl
+.source 2 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 flsrc
+.temp 4 lsrc
+convswl lsrc, src
+convlf flsrc, lsrc
+mulf dst, flsrc, scalar
-- 
cgit v1.2.3


From e3600f59e76c3dc08aedfd77629b7c5c48df86af Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Thu, 20 Jan 2011 16:30:09 -0800
Subject: volk: renamed all files. added all tests. some test things are still
 broken.

---
 volk/include/volk/Makefile.am                      | 128 +++---
 volk/include/volk/make_c.py                        |   2 +-
 volk/include/volk/volk_16i_branch_4_state_8_a16.h  | 194 +++++++++
 volk/include/volk/volk_16i_convert_8i_a16.h        |  69 +++
 volk/include/volk/volk_16i_convert_8i_u.h          |  71 ++++
 volk/include/volk/volk_16i_max_star_16i_a16.h      | 108 +++++
 .../volk/volk_16i_max_star_horizontal_16i_a16.h    | 130 ++++++
 .../volk/volk_16i_permute_and_scalar_add_a16.h     | 139 ++++++
 volk/include/volk/volk_16i_s32f_convert_32f_a16.h  | 119 ++++++
 volk/include/volk/volk_16i_s32f_convert_32f_u.h    | 122 ++++++
 .../volk/volk_16i_x4_quad_max_star_16i_a16.h       | 191 +++++++++
 .../include/volk/volk_16i_x5_add_quad_16i_x4_a16.h | 136 ++++++
 .../volk/volk_16ic_deinterleave_16i_x2_a16.h       | 158 +++++++
 .../volk/volk_16ic_deinterleave_real_16i_a16.h     | 120 ++++++
 .../volk/volk_16ic_deinterleave_real_8i_a16.h      |  94 +++++
 volk/include/volk/volk_16ic_magnitude_16i_a16.h    | 190 +++++++++
 .../volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h  | 108 +++++
 .../volk_16ic_s32f_deinterleave_real_32f_a16.h     | 125 ++++++
 .../volk/volk_16ic_s32f_magnitude_32f_a16.h        | 179 ++++++++
 volk/include/volk/volk_16s_add_quad_a16.h          | 136 ------
 volk/include/volk/volk_16s_branch_4_state_8_a16.h  | 194 ---------
 volk/include/volk/volk_16s_convert_8s_a16.h        |  69 ---
 volk/include/volk/volk_16s_convert_8s_ua16.h       |  71 ----
 volk/include/volk/volk_16s_max_star_16s_a16.h      | 108 -----
 .../volk/volk_16s_max_star_horizontal_16s_a16.h    | 130 ------
 .../volk/volk_16s_permute_and_scalar_add_a16.h     | 139 ------
 volk/include/volk/volk_16s_quad_max_star_16s_a16.h | 191 ---------
 volk/include/volk/volk_16s_s32f_convert_32f_a16.h  | 119 ------
 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h | 122 ------
 .../volk/volk_16sc_deinterleave_16s_16s_a16.h      | 158 -------
 .../volk/volk_16sc_deinterleave_real_16s_a16.h     | 120 ------
 .../volk/volk_16sc_deinterleave_real_8s_a16.h      |  94 -----
 volk/include/volk/volk_16sc_magnitude_16s_a16.h    | 190 ---------
 .../volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h | 108 -----
 .../volk_16sc_s32f_deinterleave_real_32f_a16.h     | 125 ------
 .../volk/volk_16sc_s32f_magnitude_32f_a16.h        | 179 --------
 .../volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h    | 151 -------
 volk/include/volk/volk_32f_32f_add_32f_a16.h       |  81 ----
 volk/include/volk/volk_32f_32f_divide_32f_a16.h    |  82 ----
 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h  | 184 --------
 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h | 184 --------
 .../volk/volk_32f_32f_interleave_32fc_a16.h        |  75 ----
 volk/include/volk/volk_32f_32f_max_32f_a16.h       |  85 ----
 volk/include/volk/volk_32f_32f_min_32f_a16.h       |  85 ----
 volk/include/volk/volk_32f_32f_multiply_32f_a16.h  |  81 ----
 .../volk/volk_32f_32f_s32f_interleave_16sc_a16.h   | 155 -------
 volk/include/volk/volk_32f_32f_subtract_32f_a16.h  |  81 ----
 volk/include/volk/volk_32f_convert_64f_u.h         |  70 +++
 volk/include/volk/volk_32f_convert_64f_ua16.h      |  70 ---
 volk/include/volk/volk_32f_s32f_convert_16i_a16.h  | 110 +++++
 volk/include/volk/volk_32f_s32f_convert_16i_u.h    | 113 +++++
 volk/include/volk/volk_32f_s32f_convert_16s_a16.h  | 110 -----
 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h | 113 -----
 volk/include/volk/volk_32f_s32f_convert_32i_a16.h  | 106 +++++
 volk/include/volk/volk_32f_s32f_convert_32i_u.h    | 109 +++++
 volk/include/volk/volk_32f_s32f_convert_32s_a16.h  | 106 -----
 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h | 109 -----
 volk/include/volk/volk_32f_s32f_convert_8i_a16.h   | 117 ++++++
 volk/include/volk/volk_32f_s32f_convert_8i_u.h     | 120 ++++++
 volk/include/volk/volk_32f_s32f_convert_8s_a16.h   | 117 ------
 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h  | 120 ------
 .../volk/volk_32f_stddev_and_mean_32f_32f_a16.h    | 169 --------
 .../volk/volk_32f_stddev_and_mean_32f_x2_a16.h     | 169 ++++++++
 volk/include/volk/volk_32f_x2_add_32f_a16.h        |  81 ++++
 volk/include/volk/volk_32f_x2_divide_32f_a16.h     |  82 ++++
 volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h   | 184 ++++++++
 volk/include/volk/volk_32f_x2_dot_prod_32f_u.h     | 184 ++++++++
 .../include/volk/volk_32f_x2_interleave_32fc_a16.h |  75 ++++
 volk/include/volk/volk_32f_x2_max_32f_a16.h        |  85 ++++
 volk/include/volk/volk_32f_x2_min_32f_a16.h        |  85 ++++
 volk/include/volk/volk_32f_x2_multiply_32f_a16.h   |  81 ++++
 .../volk/volk_32f_x2_s32f_interleave_16ic_a16.h    | 155 +++++++
 volk/include/volk/volk_32f_x2_subtract_32f_a16.h   |  81 ++++
 .../include/volk/volk_32f_x3_sum_of_poly_32f_a16.h | 151 +++++++
 .../volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h   | 344 ---------------
 .../volk/volk_32fc_32fc_dot_prod_32fc_a16.h        | 468 ---------------------
 .../volk/volk_32fc_32fc_multiply_32fc_a16.h        |  95 -----
 ...2fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h | 126 ------
 .../volk/volk_32fc_32fc_square_dist_32f_a16.h      | 112 -----
 .../volk/volk_32fc_deinterleave_32f_32f_a16.h      |  75 ----
 .../volk/volk_32fc_deinterleave_32f_x2_a16.h       |  75 ++++
 .../volk/volk_32fc_deinterleave_64f_64f_a16.h      |  78 ----
 .../volk/volk_32fc_deinterleave_64f_x2_a16.h       |  78 ++++
 .../volk/volk_32fc_deinterleave_real_16i_a16.h     |  80 ++++
 .../volk/volk_32fc_deinterleave_real_16s_a16.h     |  80 ----
 .../volk/volk_32fc_s32f_magnitude_16i_a16.h        | 158 +++++++
 .../volk/volk_32fc_s32f_magnitude_16s_a16.h        | 158 -------
 ...32fc_s32f_s32f_power_spectral_density_32f_a16.h | 134 ------
 ...k_32fc_s32f_x2_power_spectral_density_32f_a16.h | 134 ++++++
 .../volk_32fc_x2_conjugate_dot_prod_32fc_a16.h     | 344 +++++++++++++++
 volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 468 +++++++++++++++++++++
 volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h |  95 +++++
 ..._32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h | 126 ++++++
 .../volk/volk_32fc_x2_square_dist_32f_a16.h        | 112 +++++
 volk/include/volk/volk_32i_s32f_convert_32f_a16.h  |  73 ++++
 volk/include/volk/volk_32i_s32f_convert_32f_u.h    |  75 ++++
 volk/include/volk/volk_32i_x2_and_32i_a16.h        |  81 ++++
 volk/include/volk/volk_32i_x2_or_32i_a16.h         |  81 ++++
 volk/include/volk/volk_32s_32s_and_32s_a16.h       |  81 ----
 volk/include/volk/volk_32s_32s_or_32s_a16.h        |  81 ----
 volk/include/volk/volk_32s_s32f_convert_32f_a16.h  |  73 ----
 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h |  75 ----
 volk/include/volk/volk_64f_64f_max_64f_a16.h       |  71 ----
 volk/include/volk/volk_64f_64f_min_64f_a16.h       |  71 ----
 volk/include/volk/volk_64f_convert_32f_u.h         |  67 +++
 volk/include/volk/volk_64f_convert_32f_ua16.h      |  67 ---
 volk/include/volk/volk_64f_x2_max_64f_a16.h        |  71 ++++
 volk/include/volk/volk_64f_x2_min_64f_a16.h        |  71 ++++
 volk/include/volk/volk_8i_convert_16i_a16.h        |  83 ++++
 volk/include/volk/volk_8i_convert_16i_u.h          |  73 ++++
 volk/include/volk/volk_8i_s32f_convert_32f_a16.h   | 105 +++++
 volk/include/volk/volk_8i_s32f_convert_32f_u.h     |  94 +++++
 .../volk/volk_8ic_deinterleave_16i_x2_a16.h        |  77 ++++
 .../volk/volk_8ic_deinterleave_real_16i_a16.h      |  66 +++
 .../volk/volk_8ic_deinterleave_real_8i_a16.h       |  67 +++
 .../volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h   | 164 ++++++++
 .../volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 133 ++++++
 .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 102 +++++
 .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 122 ++++++
 volk/include/volk/volk_8s_convert_16s_a16.h        |  83 ----
 volk/include/volk/volk_8s_convert_16s_ua16.h       |  73 ----
 volk/include/volk/volk_8s_s32f_convert_32f_a16.h   | 105 -----
 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h  |  94 -----
 .../volk_8sc_8sc_multiply_conjugate_16sc_a16.h     | 102 -----
 ...volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h | 122 ------
 .../volk/volk_8sc_deinterleave_16s_16s_a16.h       |  77 ----
 .../volk/volk_8sc_deinterleave_real_16s_a16.h      |  66 ---
 .../volk/volk_8sc_deinterleave_real_8s_a16.h       |  67 ---
 .../volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h  | 164 --------
 .../volk/volk_8sc_s32f_deinterleave_real_32f_a16.h | 133 ------
 volk/include/volk/volk_register.py                 |   4 +-
 volk/lib/qa_utils.cc                               | 101 +++--
 volk/lib/qa_utils.h                                |   1 +
 volk/orc/Makefile.am                               |  33 +-
 ...k_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc |  12 +
 .../volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc |   5 +
 ...volk_16ic_deinterleave_real_8i_a16_orc_impl.orc |   6 +
 volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc  |  23 +
 ...volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc |   5 -
 ...volk_16sc_deinterleave_real_8s_a16_orc_impl.orc |   6 -
 volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc  |  23 -
 .../volk_16sc_magnitude_32f_aligned16_orc_impl.orc |   2 +-
 ...16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc |  12 -
 volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc     |   5 -
 volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc  |   5 -
 volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc     |   5 -
 volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc     |   5 -
 .../orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc |   5 -
 .../orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc |   5 -
 volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc      |   5 +
 volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc   |   5 +
 volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc      |   5 +
 volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc      |   5 +
 volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc |   5 +
 volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc |   5 +
 .../volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc  |   6 -
 .../volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc  |  23 +
 .../volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc  |  23 -
 .../volk_32fc_x2_multiply_32fc_a16_orc_impl.orc    |   6 +
 volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc      |   5 +
 volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc       |   5 +
 volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc     |   5 -
 volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc      |   5 -
 volk/orc/volk_8i_convert_16i_a16_orc_impl.orc      |   5 +
 volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc |   9 +
 volk/orc/volk_8s_convert_16s_a16_orc_impl.orc      |   5 -
 volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc |   9 -
 167 files changed, 7962 insertions(+), 7939 deletions(-)
 create mode 100644 volk/include/volk/volk_16i_branch_4_state_8_a16.h
 create mode 100644 volk/include/volk/volk_16i_convert_8i_a16.h
 create mode 100644 volk/include/volk/volk_16i_convert_8i_u.h
 create mode 100644 volk/include/volk/volk_16i_max_star_16i_a16.h
 create mode 100644 volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h
 create mode 100644 volk/include/volk/volk_16i_permute_and_scalar_add_a16.h
 create mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_a16.h
 create mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_u.h
 create mode 100644 volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h
 create mode 100644 volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h
 create mode 100644 volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h
 create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h
 create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
 create mode 100644 volk/include/volk/volk_16ic_magnitude_16i_a16.h
 create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h
 create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h
 create mode 100644 volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h
 delete mode 100644 volk/include/volk/volk_16s_add_quad_a16.h
 delete mode 100644 volk/include/volk/volk_16s_branch_4_state_8_a16.h
 delete mode 100644 volk/include/volk/volk_16s_convert_8s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_convert_8s_ua16.h
 delete mode 100644 volk/include/volk/volk_16s_max_star_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
 delete mode 100644 volk/include/volk/volk_16s_quad_max_star_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_a16.h
 delete mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_magnitude_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_add_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_divide_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_max_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_min_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_multiply_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_subtract_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_convert_64f_u.h
 delete mode 100644 volk/include/volk/volk_32f_convert_64f_ua16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_u.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_a16.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_u.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_a16.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_u.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_a16.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h
 delete mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_add_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_divide_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
 create mode 100644 volk/include/volk/volk_32f_x2_interleave_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_max_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_min_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_subtract_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h
 create mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h
 create mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_a16.h
 create mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_u.h
 create mode 100644 volk/include/volk/volk_32i_x2_and_32i_a16.h
 create mode 100644 volk/include/volk/volk_32i_x2_or_32i_a16.h
 delete mode 100644 volk/include/volk/volk_32s_32s_and_32s_a16.h
 delete mode 100644 volk/include/volk/volk_32s_32s_or_32s_a16.h
 delete mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h
 delete mode 100644 volk/include/volk/volk_64f_64f_max_64f_a16.h
 delete mode 100644 volk/include/volk/volk_64f_64f_min_64f_a16.h
 create mode 100644 volk/include/volk/volk_64f_convert_32f_u.h
 delete mode 100644 volk/include/volk/volk_64f_convert_32f_ua16.h
 create mode 100644 volk/include/volk/volk_64f_x2_max_64f_a16.h
 create mode 100644 volk/include/volk/volk_64f_x2_min_64f_a16.h
 create mode 100644 volk/include/volk/volk_8i_convert_16i_a16.h
 create mode 100644 volk/include/volk/volk_8i_convert_16i_u.h
 create mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_a16.h
 create mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_u.h
 create mode 100644 volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h
 create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h
 create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h
 create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h
 create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h
 create mode 100644 volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h
 create mode 100644 volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_8s_convert_16s_a16.h
 delete mode 100644 volk/include/volk/volk_8s_convert_16s_ua16.h
 delete mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_a16.h
 delete mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h
 delete mode 100644 volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h
 create mode 100644 volk/orc/volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_16ic_deinterleave_real_8i_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_8i_convert_16i_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_8s_convert_16s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index aef1d7ba8c..43c8ae9df3 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -41,93 +41,93 @@ volkinclude_HEADERS = \
 	volk.h \
 	volk_cpu.h \
 	volk_environment_init.h \
-	volk_16s_add_quad_a16.h \
-	volk_16s_branch_4_state_8_a16.h \
-	volk_16sc_deinterleave_16s_16s_a16.h \
-	volk_16sc_s32f_deinterleave_32f_32f_a16.h \
-	volk_16sc_deinterleave_real_16s_a16.h \
-	volk_16sc_s32f_deinterleave_real_32f_a16.h \
-	volk_16sc_deinterleave_real_8s_a16.h \
-	volk_16sc_magnitude_16s_a16.h \
-	volk_16sc_s32f_magnitude_32f_a16.h \
-	volk_16s_s32f_convert_32f_a16.h \
-	volk_16s_s32f_convert_32f_ua16.h \
-	volk_16s_convert_8s_a16.h \
-	volk_16s_convert_8s_ua16.h \
-	volk_16s_max_star_16s_a16.h \
-	volk_16s_max_star_horizontal_16s_a16.h \
-	volk_16s_permute_and_scalar_add_a16.h \
-	volk_16s_quad_max_star_16s_a16.h \
+	volk_16i_x5_add_quad_16i_x4_a16.h \
+	volk_16i_branch_4_state_8_a16.h \
+	volk_16ic_deinterleave_16i_x2_a16.h \
+	volk_16ic_s32f_deinterleave_32f_x2_a16.h \
+	volk_16ic_deinterleave_real_16i_a16.h \
+	volk_16ic_s32f_deinterleave_real_32f_a16.h \
+	volk_16ic_deinterleave_real_8i_a16.h \
+	volk_16ic_magnitude_16i_a16.h \
+	volk_16ic_s32f_magnitude_32f_a16.h \
+	volk_16i_s32f_convert_32f_a16.h \
+	volk_16i_s32f_convert_32f_u.h \
+	volk_16i_convert_8i_a16.h \
+	volk_16i_convert_8i_u.h \
+	volk_16i_max_star_16i_a16.h \
+	volk_16i_max_star_horizontal_16i_a16.h \
+	volk_16i_permute_and_scalar_add_a16.h \
+	volk_16i_x4_quad_max_star_16i_a16.h \
 	volk_16u_byteswap_a16.h \
 	volk_32f_accumulator_s32f_a16.h \
-	volk_32f_32f_add_32f_a16.h \
+	volk_32f_x2_add_32f_a16.h \
 	volk_32fc_32f_multiply_32fc_a16.h \
 	volk_32fc_32f_power_32fc_a16.h \
 	volk_32f_calc_spectral_noise_floor_a16.h \
 	volk_32fc_s32f_atan2_32f_a16.h \
-	volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h \
-	volk_32fc_deinterleave_32f_32f_a16.h \
-	volk_32fc_deinterleave_64f_64f_a16.h \
-	volk_32fc_deinterleave_real_16s_a16.h \
+	volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \
+	volk_32fc_deinterleave_32f_x2_a16.h \
+	volk_32fc_deinterleave_64f_x2_a16.h \
+	volk_32fc_deinterleave_real_16i_a16.h \
 	volk_32fc_deinterleave_real_32f_a16.h \
 	volk_32fc_deinterleave_real_64f_a16.h \
-	volk_32fc_32fc_dot_prod_32fc_a16.h \
+	volk_32fc_x2_dot_prod_32fc_a16.h \
 	volk_32fc_index_max_16u_a16.h \
-	volk_32fc_s32f_magnitude_16s_a16.h \
+	volk_32fc_s32f_magnitude_16i_a16.h \
 	volk_32fc_magnitude_32f_a16.h \
-	volk_32fc_32fc_multiply_32fc_a16.h \
-	volk_32f_s32f_convert_16s_a16.h \
-	volk_32f_s32f_convert_16s_ua16.h \
-	volk_32f_s32f_convert_32s_a16.h \
-	volk_32f_s32f_convert_32s_ua16.h \
+	volk_32fc_x2_multiply_32fc_a16.h \
+	volk_32f_s32f_convert_16i_a16.h \
+	volk_32f_s32f_convert_16i_u.h \
+	volk_32f_s32f_convert_32i_a16.h \
+	volk_32f_s32f_convert_32i_u.h \
 	volk_32f_convert_64f_a16.h \
-	volk_32f_convert_64f_ua16.h \
-	volk_32f_s32f_convert_8s_a16.h \
-	volk_32f_s32f_convert_8s_ua16.h \
-	volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h \
+	volk_32f_convert_64f_u.h \
+	volk_32f_s32f_convert_8i_a16.h \
+	volk_32f_s32f_convert_8i_u.h \
+	volk_32fc_s32f_x2_power_spectral_density_32f_a16.h \
 	volk_32fc_s32f_power_spectrum_32f_a16.h \
-	volk_32fc_32fc_square_dist_32f_a16.h \
-	volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h \
-	volk_32f_32f_divide_32f_a16.h \
-	volk_32f_32f_dot_prod_32f_a16.h \
-	volk_32f_32f_dot_prod_32f_ua16.h \
+	volk_32fc_x2_square_dist_32f_a16.h \
+	volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h \
+	volk_32f_x2_divide_32f_a16.h \
+	volk_32f_x2_dot_prod_32f_a16.h \
+	volk_32f_x2_dot_prod_32f_u.h \
 	volk_32f_s32f_32f_fm_detect_32f_a16.h \
 	volk_32f_index_max_16u_a16.h \
-	volk_32f_32f_s32f_interleave_16sc_a16.h \
-	volk_32f_32f_interleave_32fc_a16.h \
-	volk_32f_32f_max_32f_a16.h \
-	volk_32f_32f_min_32f_a16.h \
-	volk_32f_32f_multiply_32f_a16.h \
+	volk_32f_x2_s32f_interleave_16ic_a16.h \
+	volk_32f_x2_interleave_32fc_a16.h \
+	volk_32f_x2_max_32f_a16.h \
+	volk_32f_x2_min_32f_a16.h \
+	volk_32f_x2_multiply_32f_a16.h \
 	volk_32f_s32f_normalize_a16.h \
 	volk_32f_s32f_power_32f_a16.h \
 	volk_32f_sqrt_32f_a16.h \
 	volk_32f_s32f_stddev_32f_a16.h \
-	volk_32f_stddev_and_mean_32f_32f_a16.h \
-	volk_32f_32f_subtract_32f_a16.h \
-	volk_32f_32f_32f_sum_of_poly_32f_a16.h \
-	volk_32s_32s_and_32s_a16.h \
-	volk_32s_s32f_convert_32f_a16.h \
-	volk_32s_s32f_convert_32f_ua16.h \
-	volk_32s_32s_or_32s_a16.h \
+	volk_32f_stddev_and_mean_32f_x2_a16.h \
+	volk_32f_x2_subtract_32f_a16.h \
+	volk_32f_x3_sum_of_poly_32f_a16.h \
+	volk_32i_x2_and_32i_a16.h \
+	volk_32i_s32f_convert_32f_a16.h \
+	volk_32i_s32f_convert_32f_u.h \
+	volk_32i_x2_or_32i_a16.h \
 	volk_32u_byteswap_a16.h \
 	volk_32u_popcnt_a16.h \
 	volk_64f_convert_32f_a16.h \
-	volk_64f_convert_32f_ua16.h \
-	volk_64f_64f_max_64f_a16.h \
-	volk_64f_64f_min_64f_a16.h \
+	volk_64f_convert_32f_u.h \
+	volk_64f_x2_max_64f_a16.h \
+	volk_64f_x2_min_64f_a16.h \
 	volk_64u_byteswap_a16.h \
 	volk_64u_popcnt_a16.h \
-	volk_8sc_deinterleave_16s_16s_a16.h \
-	volk_8sc_s32f_deinterleave_32f_32f_a16.h \
-	volk_8sc_deinterleave_real_16s_a16.h \
-	volk_8sc_s32f_deinterleave_real_32f_a16.h \
-	volk_8sc_deinterleave_real_8s_a16.h \
-	volk_8sc_8sc_multiply_conjugate_16sc_a16.h \
-	volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h \
-	volk_8s_convert_16s_a16.h \
-	volk_8s_convert_16s_ua16.h \
-	volk_8s_s32f_convert_32f_a16.h \
-	volk_8s_s32f_convert_32f_ua16.h 
+	volk_8ic_deinterleave_16i_x2_a16.h \
+	volk_8ic_s32f_deinterleave_32f_x2_a16.h \
+	volk_8ic_deinterleave_real_16i_a16.h \
+	volk_8ic_s32f_deinterleave_real_32f_a16.h \
+	volk_8ic_deinterleave_real_8i_a16.h \
+	volk_8ic_x2_multiply_conjugate_16ic_a16.h \
+	volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h \
+	volk_8i_convert_16i_a16.h \
+	volk_8i_convert_16i_u.h \
+	volk_8i_s32f_convert_32f_a16.h \
+	volk_8i_s32f_convert_32f_u.h 
 
 VOLK_MKTABLES_SOURCES = \
 	$(top_srcdir)/lib/volk_rank_archs.c \
diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py
index f708ba7d07..6e75067d06 100644
--- a/volk/include/volk/make_c.py
+++ b/volk/include/volk/make_c.py
@@ -24,7 +24,7 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) :
     tempstring = tempstring + "    }\n"
     tempstring = tempstring + "    return 0;\n"
     tempstring = tempstring + "}\n"
-    
+
     for i in range(len(funclist)): 
         tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n";
         
diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a16.h b/volk/include/volk/volk_16i_branch_4_state_8_a16.h
new file mode 100644
index 0000000000..3437c1a6b3
--- /dev/null
+++ b/volk/include/volk/volk_16i_branch_4_state_8_a16.h
@@ -0,0 +1,194 @@
+#ifndef INCLUDED_volk_16i_branch_4_state_8_a16_H
+#define INCLUDED_volk_16i_branch_4_state_8_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline  void volk_16i_branch_4_state_8_a16_ssse3(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+	
+  
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+
+  __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+  
+  
+  p_target = (__m128i*)target;
+  p_src0 = (__m128i*)src0;
+  p_cntl2 = (__m128i*)cntl2;
+  p_cntl3 = (__m128i*)cntl3;
+  p_scalars = (__m128i*)scalars;
+  
+  int i = 0;
+  
+  int bound = 1;
+  
+  
+  xmm0 = _mm_load_si128(p_scalars);
+  
+  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+  
+  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+  xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+  xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+  xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+  xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+  for(; i < bound; ++i) {
+    
+    xmm5 = _mm_load_si128(p_src0);
+    
+    
+    
+    
+    
+    
+    
+
+
+    xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+    xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+    xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+    xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+    
+    p_src0 += 4;
+   
+    
+    xmm5 = _mm_add_epi16(xmm1, xmm2);
+    
+    xmm6 = _mm_add_epi16(xmm2, xmm6);
+    xmm8 = _mm_add_epi16(xmm1, xmm8);
+   
+     
+    xmm7 = _mm_load_si128(p_cntl2);
+    xmm9 = _mm_load_si128(p_cntl3);
+    
+    xmm0 = _mm_add_epi16(xmm5, xmm0);
+    
+    
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    xmm9 = _mm_and_si128(xmm9, xmm4);
+    
+    xmm5 = _mm_load_si128(&p_cntl2[1]);
+    xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+    xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+    xmm5 = _mm_and_si128(xmm5, xmm3);
+    xmm11 = _mm_and_si128(xmm11, xmm4);
+
+    xmm0 = _mm_add_epi16(xmm0, xmm7);
+   
+   
+ 
+    xmm7 = _mm_load_si128(&p_cntl2[2]);
+    xmm9 = _mm_load_si128(&p_cntl3[2]);
+    
+    xmm5 = _mm_add_epi16(xmm5, xmm11);
+    
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    xmm9 = _mm_and_si128(xmm9, xmm4);
+    
+    xmm6 = _mm_add_epi16(xmm6, xmm5);
+   
+    
+    xmm5 = _mm_load_si128(&p_cntl2[3]);
+    xmm11 = _mm_load_si128(&p_cntl3[3]);
+    
+    xmm7 = _mm_add_epi16(xmm7, xmm9);
+    
+    xmm5 = _mm_and_si128(xmm5, xmm3);
+    xmm11 = _mm_and_si128(xmm11, xmm4);
+    
+    xmm8 = _mm_add_epi16(xmm8, xmm7);
+    
+    xmm5 = _mm_add_epi16(xmm5, xmm11);
+    
+    _mm_store_si128(p_target, xmm0);
+    _mm_store_si128(&p_target[1], xmm6);
+
+    xmm10 = _mm_add_epi16(xmm5, xmm10);
+    
+    _mm_store_si128(&p_target[2], xmm8);
+    
+    _mm_store_si128(&p_target[3], xmm10);
+    
+    p_target += 3;   
+  }
+}
+	
+	
+#endif /*LV_HAVE_SSEs*/
+
+#if LV_HAVE_GENERIC
+static inline  void volk_16i_branch_4_state_8_a16_generic(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+	int i = 0;
+	
+	int bound = 4;
+	
+	for(; i < bound; ++i) {
+	  target[i* 8] = src0[((char)permuters[i][0])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8] & scalars[2])
+	    + (cntl3[i * 8] & scalars[3]);
+	  target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 1] & scalars[2])
+	    + (cntl3[i * 8 + 1] & scalars[3]);
+	  target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 2] & scalars[2])
+	    + (cntl3[i * 8 + 2] & scalars[3]);
+	  target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 3] & scalars[2])
+	    + (cntl3[i * 8 + 3] & scalars[3]);
+	  target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 4] & scalars[2])
+	    + (cntl3[i * 8 + 4] & scalars[3]);
+	  target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 5] & scalars[2])
+	    + (cntl3[i * 8 + 5] & scalars[3]);
+	  target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 6] & scalars[2])
+	    + (cntl3[i * 8 + 6] & scalars[3]);
+	  target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 7] & scalars[2])
+	    + (cntl3[i * 8 + 7] & scalars[3]);
+	  
+	}
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_branch_4_state_8_a16_H*/
diff --git a/volk/include/volk/volk_16i_convert_8i_a16.h b/volk/include/volk/volk_16i_convert_8i_a16.h
new file mode 100644
index 0000000000..73e45ad636
--- /dev/null
+++ b/volk/include/volk/volk_16i_convert_8i_a16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_volk_16i_convert_8i_a16_H
+#define INCLUDED_volk_16i_convert_8i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+     int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal1;
+    __m128i inputVal2;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+
+      // Load the 16 values
+      inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+      inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+
+      inputVal1 = _mm_srai_epi16(inputVal1, 8);
+      inputVal2 = _mm_srai_epi16(inputVal2, 8);
+      
+      ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+      _mm_store_si128((__m128i*)outputVectorPtr, ret);
+
+      outputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] =(int8_t)(inputVector[number] >> 8);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_a16_H */
diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/include/volk/volk_16i_convert_8i_u.h
new file mode 100644
index 0000000000..5fc792b566
--- /dev/null
+++ b/volk/include/volk/volk_16i_convert_8i_u.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_16i_convert_8i_u_H
+#define INCLUDED_volk_16i_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+     int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal1;
+    __m128i inputVal2;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+
+      // Load the 16 values
+      inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+      inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+
+      inputVal1 = _mm_srai_epi16(inputVal1, 8);
+      inputVal2 = _mm_srai_epi16(inputVal2, 8);
+      
+      ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+      _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
+
+      outputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] =(int8_t)(inputVector[number] >> 8);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  >> 8));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_u_H */
diff --git a/volk/include/volk/volk_16i_max_star_16i_a16.h b/volk/include/volk/volk_16i_max_star_16i_a16.h
new file mode 100644
index 0000000000..ff57bd2a11
--- /dev/null
+++ b/volk/include/volk/volk_16i_max_star_16i_a16.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_volk_16i_max_star_16i_a16_H
+#define INCLUDED_volk_16i_max_star_16i_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline  void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, unsigned int num_bytes) {
+
+
+  
+  short candidate = src0[0];
+  short cands[8];
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+  
+
+  __m128i *p_src0;
+  
+  p_src0 = (__m128i*)src0;
+
+  int bound = num_bytes >> 4;
+  int leftovers = (num_bytes >> 1) & 7;
+  
+  int i = 0;
+  
+  
+  xmm1 = _mm_setzero_si128();
+  xmm0 = _mm_setzero_si128();
+  //_mm_insert_epi16(xmm0, candidate, 0);
+  
+  xmm0 = _mm_shuffle_epi8(xmm0, xmm1); 
+
+  
+  for(i = 0; i < bound; ++i) {
+    xmm1 = _mm_load_si128(p_src0);
+    p_src0 += 1;
+    xmm2 = _mm_sub_epi16(xmm1, xmm0);
+  
+
+    
+  
+  
+  
+    xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
+    xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+    xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+
+    xmm6 = _mm_xor_si128(xmm4, xmm5);
+    
+    xmm3 = _mm_and_si128(xmm3, xmm0);
+    xmm4 = _mm_and_si128(xmm6, xmm1);
+    
+    xmm0 = _mm_add_epi16(xmm3, xmm4);
+    
+  
+  }
+  
+  _mm_store_si128((__m128i*)cands, xmm0);
+  
+  for(i = 0; i < 8; ++i) {
+    candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+  }
+  
+ 
+  
+  for(i = 0; i < leftovers; ++i) {
+  
+    candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
+  }
+
+  target[0] = candidate;
+  
+    
+    
+ 
+
+}   
+ 
+#endif /*LV_HAVE_SSSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+	short candidate = src0[0];
+	for(i = 1; i < bound; ++i) {
+	  candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
+	}
+	target[0] = candidate;
+	  
+}
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_max_star_16i_a16_H*/
diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h
new file mode 100644
index 0000000000..695e08dbf1
--- /dev/null
+++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h
@@ -0,0 +1,130 @@
+#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H
+#define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline  void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+
+  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
+  const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+  const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+
+  
+  
+  volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; 
+  __m128i  xmm5, xmm6, xmm7, xmm8;
+  
+  xmm4 = _mm_load_si128((__m128i*)shufmask0);
+  xmm5 = _mm_load_si128((__m128i*)shufmask1);
+  xmm6 = _mm_load_si128((__m128i*)andmask0);
+  xmm7 = _mm_load_si128((__m128i*)andmask1);
+  
+  __m128i *p_target, *p_src0;
+  
+  p_target = (__m128i*)target;
+  p_src0 = (__m128i*)src0;
+
+  int bound = num_bytes >> 5;
+  int intermediate = (num_bytes >> 4) & 1;
+  int leftovers = (num_bytes >> 1) & 7;
+  
+  int i = 0;
+  
+  
+  for(i = 0; i < bound; ++i) {
+     
+    xmm0 = _mm_load_si128(p_src0);
+    xmm1 = _mm_load_si128(&p_src0[1]);
+    
+    
+
+    xmm2 = _mm_xor_si128(xmm2, xmm2);
+    p_src0 += 2;
+    
+    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+    
+    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);   
+
+    xmm8 = _mm_and_si128(xmm2, xmm6);
+    xmm3 = _mm_and_si128(xmm2, xmm7);
+    
+
+    xmm8 = _mm_add_epi8(xmm8, xmm4);
+    xmm3 = _mm_add_epi8(xmm3, xmm5);
+
+    xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+    xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+    
+    
+    xmm3 = _mm_add_epi16(xmm0, xmm1);
+
+    
+    _mm_store_si128(p_target, xmm3);
+    
+    p_target += 1;
+  
+  }
+
+  for(i = 0; i < intermediate; ++i) {
+    
+    xmm0 = _mm_load_si128(p_src0);
+    
+    
+    xmm2 = _mm_xor_si128(xmm2, xmm2);
+    p_src0 += 1;
+    
+    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+    xmm8 = _mm_and_si128(xmm2, xmm6);
+    
+    xmm3 = _mm_add_epi8(xmm8, xmm4);
+    
+    xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+    
+
+    _mm_storel_pd((double*)p_target, (__m128d)xmm0);
+    
+    p_target = (__m128i*)((int8_t*)p_target + 8);
+
+  }
+    
+  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { 
+    target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+  }
+  
+
+}   
+ 
+#endif /*LV_HAVE_SSSE3*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+      
+	for(i = 0; i < bound; i += 2) {
+	  target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
+	}
+		
+}
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/
diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h
new file mode 100644
index 0000000000..e52a949fba
--- /dev/null
+++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h
@@ -0,0 +1,139 @@
+#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a16_H
+#define INCLUDED_volk_16i_permute_and_scalar_add_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+static inline  void volk_16i_permute_and_scalar_add_a16_sse2(short* target,  short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+	
+
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
+
+  short* p_permute_indexes = permute_indexes;
+  
+  p_target = (__m128i*)target;
+  p_cntl0 = (__m128i*)cntl0;
+  p_cntl1 = (__m128i*)cntl1;
+  p_cntl2 = (__m128i*)cntl2;
+  p_cntl3 = (__m128i*)cntl3;
+  p_scalars = (__m128i*)scalars;
+  
+  int i = 0;
+  
+  int bound = (num_bytes >> 4);
+  int leftovers = (num_bytes >> 1) & 7;
+  
+  xmm0 = _mm_load_si128(p_scalars);
+  
+  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+  
+  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+
+  for(; i < bound; ++i) {
+    xmm0 = _mm_setzero_si128();
+    xmm5 = _mm_setzero_si128();
+    xmm6 = _mm_setzero_si128();
+    xmm7 = _mm_setzero_si128();
+
+    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+
+    xmm0 = _mm_add_epi16(xmm0, xmm5);
+    xmm6 = _mm_add_epi16(xmm6, xmm7);
+    
+    p_permute_indexes += 8;
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm6);
+    
+    xmm5 = _mm_load_si128(p_cntl0);
+    xmm6 = _mm_load_si128(p_cntl1);
+    xmm7 = _mm_load_si128(p_cntl2);
+    
+    xmm5 = _mm_and_si128(xmm5, xmm1);
+    xmm6 = _mm_and_si128(xmm6, xmm2);
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm5);
+    
+    xmm5 = _mm_load_si128(p_cntl3);
+    
+    xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+    p_cntl0 += 1;
+    
+    xmm5 = _mm_and_si128(xmm5, xmm4);
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm6);
+    
+    p_cntl1 += 1;
+    p_cntl2 += 1;
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm5); 
+    
+    p_cntl3 += 1;
+
+    _mm_store_si128(p_target, xmm0);
+    
+    p_target += 1;
+  }
+	
+	
+	
+	
+
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target[i] = src0[permute_indexes[i]] 
+      + (cntl0[i] & scalars[0])
+      + (cntl1[i] & scalars[1])
+      + (cntl2[i] & scalars[2])
+      + (cntl3[i] & scalars[3]);
+  }
+}
+#endif /*LV_HAVE_SSEs*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+	for(i = 0; i < bound; ++i) {
+		target[i] = src0[permute_indexes[i]] 
+			+ (cntl0[i] & scalars[0])
+			+ (cntl1[i] & scalars[1])
+			+ (cntl2[i] & scalars[2])
+			+ (cntl3[i] & scalars[3]);
+		
+	}
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a16_H*/
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h
new file mode 100644
index 0000000000..83fd26ff9a
--- /dev/null
+++ b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h
@@ -0,0 +1,119 @@
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_a16_H
+#define INCLUDED_volk_16i_s32f_convert_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    
+     float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m128i inputVal2;
+    __m128 ret;
+
+    for(;number < eighthPoints; number++){
+
+      // Load the 8 values
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      // Shift the input data to the right by 64 bits ( 8 bytes )
+      inputVal2 = _mm_srli_si128(inputVal, 8);
+
+      // Convert the lower 4 values into 32 bit words
+      inputVal = _mm_cvtepi16_epi32(inputVal);
+      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+      
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      ret = _mm_cvtepi32_ps(inputVal2);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+
+      inputPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+      
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      inputPtr += 4;
+      outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/include/volk/volk_16i_s32f_convert_32f_u.h
new file mode 100644
index 0000000000..8f0dd0083d
--- /dev/null
+++ b/volk/include/volk/volk_16i_s32f_convert_32f_u.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
+#define INCLUDED_volk_16i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    
+     float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m128i inputVal2;
+    __m128 ret;
+
+    for(;number < eighthPoints; number++){
+
+      // Load the 8 values
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      // Shift the input data to the right by 64 bits ( 8 bytes )
+      inputVal2 = _mm_srli_si128(inputVal, 8);
+
+      // Convert the lower 4 values into 32 bit words
+      inputVal = _mm_cvtepi16_epi32(inputVal);
+      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+      
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      ret = _mm_cvtepi32_ps(inputVal2);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+
+      inputPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+      
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      inputPtr += 4;
+      outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h
new file mode 100644
index 0000000000..e4ec5ab4ed
--- /dev/null
+++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h
@@ -0,0 +1,191 @@
+#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H
+#define INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include<emmintrin.h>
+
+static inline  void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+	
+
+
+
+	int i = 0;
+
+	int bound = (num_bytes >> 4);
+	int bound_copy = bound;
+	int leftovers = (num_bytes >> 1) & 7;
+	
+	__m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+	p_target = (__m128i*) target;
+	p_src0 =  (__m128i*)src0;
+	p_src1 =  (__m128i*)src1;
+	p_src2 =  (__m128i*)src2;
+	p_src3 =  (__m128i*)src3;
+	
+	
+
+	__m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+	while(bound_copy > 0) {
+	 
+	  xmm1 = _mm_load_si128(p_src0);
+	  xmm2 = _mm_load_si128(p_src1);
+	  xmm3 = _mm_load_si128(p_src2);
+	  xmm4 = _mm_load_si128(p_src3);
+	  
+	  xmm5 = _mm_setzero_si128();
+	  xmm6 = _mm_setzero_si128();
+	  xmm7 = xmm1;
+	  xmm8 = xmm3;
+	  
+	  
+	  xmm1 = _mm_sub_epi16(xmm2, xmm1);
+
+	  
+
+	  xmm3 = _mm_sub_epi16(xmm4, xmm3);
+
+	  xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+	  xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+
+	  
+
+	  xmm2 = _mm_and_si128(xmm5, xmm2);
+	  xmm4 = _mm_and_si128(xmm6, xmm4);
+	  xmm5 = _mm_andnot_si128(xmm5, xmm7);
+	  xmm6 = _mm_andnot_si128(xmm6, xmm8);
+
+	  xmm5 = _mm_add_epi16(xmm2, xmm5);
+	  xmm6 = _mm_add_epi16(xmm4, xmm6);
+
+	  
+	  xmm1 = _mm_xor_si128(xmm1, xmm1);
+	  xmm2 = xmm5;
+	  xmm5 = _mm_sub_epi16(xmm6, xmm5);
+	  p_src0 += 1;
+	  bound_copy -= 1;
+
+	  xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+	  p_src1 += 1;
+	  
+	  xmm6 = _mm_and_si128(xmm1, xmm6);
+	  	  
+	  xmm1 = _mm_andnot_si128(xmm1, xmm2);
+	  p_src2 += 1;
+
+
+	  
+	  xmm1 = _mm_add_epi16(xmm6, xmm1);
+	  p_src3 += 1;
+
+	  
+	  _mm_store_si128(p_target, xmm1);
+	  p_target += 1;
+      
+	}
+	
+
+	/*asm volatile
+		(
+		 "volk_16i_x4_quad_max_star_16i_a16_sse2_L1:\n\t"
+		 "cmp $0, %[bound]\n\t"
+		 "je volk_16i_x4_quad_max_star_16i_a16_sse2_END\n\t"
+
+		 "movaps (%[src0]), %%xmm1\n\t"
+		 "movaps (%[src1]), %%xmm2\n\t"
+		 "movaps (%[src2]), %%xmm3\n\t"
+		 "movaps (%[src3]), %%xmm4\n\t"
+
+		 "pxor %%xmm5, %%xmm5\n\t"
+		 "pxor %%xmm6, %%xmm6\n\t"
+		 "movaps %%xmm1, %%xmm7\n\t"
+		 "movaps %%xmm3, %%xmm8\n\t"
+		 "psubw %%xmm2, %%xmm1\n\t"
+		 "psubw %%xmm4, %%xmm3\n\t"
+		 
+		 "pcmpgtw %%xmm1, %%xmm5\n\t"
+		 "pcmpgtw %%xmm3, %%xmm6\n\t"
+		 
+		 "pand %%xmm5, %%xmm2\n\t"
+		 "pand %%xmm6, %%xmm4\n\t"
+		 "pandn %%xmm7, %%xmm5\n\t"
+		 "pandn %%xmm8, %%xmm6\n\t"
+		 
+		 "paddw %%xmm2, %%xmm5\n\t"
+		 "paddw %%xmm4, %%xmm6\n\t"
+
+		 "pxor %%xmm1, %%xmm1\n\t"
+		 "movaps %%xmm5, %%xmm2\n\t"
+		 
+		 "psubw %%xmm6, %%xmm5\n\t"
+		 "add $16, %[src0]\n\t"
+		 "add $-1, %[bound]\n\t"
+	  
+		 "pcmpgtw %%xmm5, %%xmm1\n\t"
+		 "add $16, %[src1]\n\t"
+
+		 "pand %%xmm1, %%xmm6\n\t"
+
+		 "pandn %%xmm2, %%xmm1\n\t"
+		 "add $16, %[src2]\n\t"
+
+		 "paddw %%xmm6, %%xmm1\n\t"
+		 "add $16, %[src3]\n\t"
+
+		 "movaps %%xmm1, (%[target])\n\t"
+		 "addw $16, %[target]\n\t"
+		 "jmp volk_16i_x4_quad_max_star_16i_a16_sse2_L1\n\t"
+		 
+		 "volk_16i_x4_quad_max_star_16i_a16_sse2_END:\n\t"
+		 :
+		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
+		 :
+		 );
+	*/ 
+
+	short temp0 = 0;
+	short temp1 = 0;
+	for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+	}
+	return;
+
+
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+	
+	short temp0 = 0;
+	short temp1 = 0;
+	for(i = 0; i < bound; ++i) {
+	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+	}
+}
+
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H*/
diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h
new file mode 100644
index 0000000000..5744ca3a6b
--- /dev/null
+++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h
@@ -0,0 +1,136 @@
+#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H
+#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+
+#if LV_HAVE_SSE2
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+static inline  void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+  
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+  __m128i *p_target0, *p_target1, *p_target2, *p_target3,  *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
+  p_target0 = (__m128i*)target0;
+  p_target1 = (__m128i*)target1;
+  p_target2 = (__m128i*)target2;
+  p_target3 = (__m128i*)target3;
+
+  p_src0 = (__m128i*)src0;
+  p_src1 = (__m128i*)src1;
+  p_src2 = (__m128i*)src2;
+  p_src3 = (__m128i*)src3;
+  p_src4 = (__m128i*)src4;
+
+  int i = 0;
+
+  int bound = (num_bytes >> 4);
+  int leftovers = (num_bytes >> 1) & 7;
+
+  for(; i < bound; ++i) {
+    xmm0 = _mm_load_si128(p_src0);
+    xmm1 = _mm_load_si128(p_src1);
+    xmm2 = _mm_load_si128(p_src2);
+    xmm3 = _mm_load_si128(p_src3);
+    xmm4 = _mm_load_si128(p_src4);
+    
+    p_src0 += 1;
+    p_src1 += 1;
+    
+    xmm1 = _mm_add_epi16(xmm0, xmm1);
+    xmm2 = _mm_add_epi16(xmm0, xmm2);
+    xmm3 = _mm_add_epi16(xmm0, xmm3);
+    xmm4 = _mm_add_epi16(xmm0, xmm4);
+    
+    
+    p_src2 += 1;
+    p_src3 += 1;
+    p_src4 += 1;
+
+    _mm_store_si128(p_target0, xmm1);
+    _mm_store_si128(p_target1, xmm2);
+    _mm_store_si128(p_target2, xmm3);
+    _mm_store_si128(p_target3, xmm4);
+    
+    p_target0 += 1;
+    p_target1 += 1;
+    p_target2 += 1;
+    p_target3 += 1;
+  }
+    /*asm volatile
+		(
+		 ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1:\n\t"
+		 "cmp $0, %[bound]\n\t"
+		 "je .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END\n\t"
+		 "movaps (%[src0]), %%xmm1\n\t"
+		 "movaps (%[src1]), %%xmm2\n\t"
+		 "movaps (%[src2]), %%xmm3\n\t"
+		 "movaps (%[src3]), %%xmm4\n\t"
+		 "movaps (%[src4]), %%xmm5\n\t"
+		 "add $16, %[src0]\n\t"
+		 "add $16, %[src1]\n\t"
+		 "add $16, %[src2]\n\t"
+		 "add $16, %[src3]\n\t"
+		 "add $16, %[src4]\n\t"
+		 "paddw %%xmm1, %%xmm2\n\t"
+		 "paddw %%xmm1, %%xmm3\n\t"
+		 "paddw %%xmm1, %%xmm4\n\t"
+		 "paddw %%xmm1, %%xmm5\n\t"
+		 "add $-1, %[bound]\n\t"
+		 "movaps %%xmm2, (%[target0])\n\t"
+		 "movaps %%xmm3, (%[target1])\n\t"
+		 "movaps %%xmm4, (%[target2])\n\t"
+		 "movaps %%xmm5, (%[target3])\n\t"
+		 "add $16, %[target0]\n\t"
+		 "add $16, %[target1]\n\t"
+		 "add $16, %[target2]\n\t"
+		 "add $16, %[target3]\n\t"
+		 "jmp .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1\n\t"
+		 ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END:\n\t"
+		 :
+		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
+		 :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+		 );
+		 
+    */
+	 
+
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target0[i] = src0[i] + src1[i];
+    target1[i] = src0[i] + src2[i];
+    target2[i] = src0[i] + src3[i];
+    target3[i] = src0[i] + src4[i];
+  }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+	for(i = 0; i < bound; ++i) {
+		target0[i] = src0[i] + src1[i];
+		target1[i] = src0[i] + src2[i];
+		target2[i] = src0[i] + src3[i];
+		target3[i] = src0[i] + src4[i];
+	}
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H*/
diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h
new file mode 100644
index 0000000000..7e08bf182c
--- /dev/null
+++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H
+#define INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+  __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
+    qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *int16ComplexVectorPtr++;
+    *qBufferPtr++ = *int16ComplexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
+  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+  unsigned int eighthPoints = num_points / 8;
+ 
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+
+    iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16ic_deinterleave_16i_x2_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16ic_deinterleave_16i_x2_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16ic_deinterleave_16i_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H */
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h
new file mode 100644
index 0000000000..388c00592b
--- /dev/null
+++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a16_H
+#define INCLUDED_volk_16ic_deinterleave_real_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i complexVal1, complexVal2, iOutputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+
+    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+    iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  __m128i complexVal1, complexVal2, iOutputVal;
+  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+  unsigned int eighthPoints = num_points / 8;
+ 
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+
+    complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a16_H */
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
new file mode 100644
index 0000000000..437d5ab6bf
--- /dev/null
+++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a16_H
+#define INCLUDED_volk_16ic_deinterleave_real_8i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+  unsigned int sixteenthPoints = num_points / 16;
+
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+    complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+
+    complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+    complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+    complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+
+    complexVal1 = _mm_srai_epi16(complexVal1, 8);
+    complexVal3 = _mm_srai_epi16(complexVal3, 8);
+
+    iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
+    int16ComplexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256);
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16ic_deinterleave_real_8i_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16ic_deinterleave_real_8i_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16ic_deinterleave_real_8i_a16_orc_impl(iBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a16_H */
diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h
new file mode 100644
index 0000000000..bdcace750a
--- /dev/null
+++ b/volk/include/volk/volk_16ic_magnitude_16i_a16.h
@@ -0,0 +1,190 @@
+#ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H
+#define INCLUDED_volk_16ic_magnitude_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+   
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(32768.0);
+  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    result = _mm_mul_ps(result, vScalar); // Scale the results
+
+    _mm_store_ps(outputFloatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+    *magnitudeVectorPtr++ = (int16_t)(val1Result);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(32768.0);
+  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+  float inputFloatBuffer[4] __attribute__((aligned(128)));
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    cplxValue1 = _mm_load_ps(inputFloatBuffer);
+    complexVectorPtr += 4;
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+    cplxValue2 = _mm_load_ps(inputFloatBuffer);
+    complexVectorPtr += 4;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    result = _mm_mul_ps(result, vScalar); // Scale the results
+
+    _mm_store_ps(outputFloatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+    *magnitudeVectorPtr++ = (int16_t)(val1Result);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  const float scalar = 32768.0;
+  for(number = 0; number < num_points; number++){
+    float real = ((float)(*complexVectorPtr++)) / scalar;
+    float imag = ((float)(*complexVectorPtr++)) / scalar;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC_DISABLED
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_16ic_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
+static inline void volk_16ic_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16ic_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_magnitude_16i_a16_H */
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h
new file mode 100644
index 0000000000..606de2fc56
--- /dev/null
+++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+    \param complexVector The complex input vector
+    \param iBuffer The I buffer output data
+    \param qBuffer The Q buffer output data
+    \param scalar The data value to be divided against each input data value of the input complex vector
+    \param num_points The number of complex data values to be deinterleaved
+  */
+static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    uint64_t number = 0;
+    const uint64_t quarterPoints = num_points / 4;    
+    __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+    float floatBuffer[8] __attribute__((aligned(128)));
+
+    for(;number < quarterPoints; number++){
+      
+      floatBuffer[0] = (float)(complexVectorPtr[0]);
+      floatBuffer[1] = (float)(complexVectorPtr[1]);
+      floatBuffer[2] = (float)(complexVectorPtr[2]);
+      floatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+      floatBuffer[4] = (float)(complexVectorPtr[4]);
+      floatBuffer[5] = (float)(complexVectorPtr[5]);
+      floatBuffer[6] = (float)(complexVectorPtr[6]);
+      floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+      cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+      cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+      complexVectorPtr += 8;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+      cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      _mm_store_ps(iBufferPtr, iValue);
+      _mm_store_ps(qBufferPtr, qValue);
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)&complexVector[number];
+    for(; number < num_points; number++){
+      *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+      *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+    \param complexVector The complex input vector
+    \param iBuffer The I buffer output data
+    \param qBuffer The Q buffer output data
+    \param scalar The data value to be divided against each input data value of the input complex vector
+    \param num_points The number of complex data values to be deinterleaved
+  */
+static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+    \param complexVector The complex input vector
+    \param iBuffer The I buffer output data
+    \param qBuffer The Q buffer output data
+    \param scalar The data value to be divided against each input data value of the input complex vector
+    \param num_points The number of complex data values to be deinterleaved
+  */
+extern void volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H */
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h
new file mode 100644
index 0000000000..62331e4964
--- /dev/null
+++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h
@@ -0,0 +1,125 @@
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+
+  __m128 iFloatValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+  for(;number < quarterPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+    iIntVal = _mm_cvtepi16_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+    sixteenTComplexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 iValue;
+
+  const float iScalar = 1.0/scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
+    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+
+    iValue = _mm_load_ps(floatBuffer);
+
+    iValue = _mm_mul_ps(iValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  complexVectorPtr = (int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
+    complexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h
new file mode 100644
index 0000000000..ae64efbeba
--- /dev/null
+++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h
@@ -0,0 +1,179 @@
+#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H
+#define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    _mm_store_ps(magnitudeVectorPtr, result);
+      
+    magnitudeVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    float val1Real = (float)(*complexVectorPtr++) / scalar;
+    float val1Imag = (float)(*complexVectorPtr++) / scalar;
+    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+
+  const float iScalar = 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+
+  __m128 cplxValue1, cplxValue2, result, re, im;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+    
+    re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
+    im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(re, invScalar);
+    cplxValue2 = _mm_mul_ps(im, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    _mm_store_ps(magnitudeVectorPtr, result);
+      
+    magnitudeVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    float val1Real = (float)(*complexVectorPtr++) * iScalar;
+    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+  }
+}
+
+ 
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    float real = ( (float) (*complexVectorPtr++)) * invScalar;
+    float imag = ( (float) (*complexVectorPtr++)) * invScalar;
+    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC_DISABLED
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_16ic_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16ic_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_16ic_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H */
diff --git a/volk/include/volk/volk_16s_add_quad_a16.h b/volk/include/volk/volk_16s_add_quad_a16.h
deleted file mode 100644
index 67d0c55a3d..0000000000
--- a/volk/include/volk/volk_16s_add_quad_a16.h
+++ /dev/null
@@ -1,136 +0,0 @@
-#ifndef INCLUDED_volk_16s_add_quad_a16_H
-#define INCLUDED_volk_16s_add_quad_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-
-#if LV_HAVE_SSE2
-#include<xmmintrin.h>
-#include<emmintrin.h>
-
-static inline  void volk_16s_add_quad_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
-  
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
-  __m128i *p_target0, *p_target1, *p_target2, *p_target3,  *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
-  p_target0 = (__m128i*)target0;
-  p_target1 = (__m128i*)target1;
-  p_target2 = (__m128i*)target2;
-  p_target3 = (__m128i*)target3;
-
-  p_src0 = (__m128i*)src0;
-  p_src1 = (__m128i*)src1;
-  p_src2 = (__m128i*)src2;
-  p_src3 = (__m128i*)src3;
-  p_src4 = (__m128i*)src4;
-
-  int i = 0;
-
-  int bound = (num_bytes >> 4);
-  int leftovers = (num_bytes >> 1) & 7;
-
-  for(; i < bound; ++i) {
-    xmm0 = _mm_load_si128(p_src0);
-    xmm1 = _mm_load_si128(p_src1);
-    xmm2 = _mm_load_si128(p_src2);
-    xmm3 = _mm_load_si128(p_src3);
-    xmm4 = _mm_load_si128(p_src4);
-    
-    p_src0 += 1;
-    p_src1 += 1;
-    
-    xmm1 = _mm_add_epi16(xmm0, xmm1);
-    xmm2 = _mm_add_epi16(xmm0, xmm2);
-    xmm3 = _mm_add_epi16(xmm0, xmm3);
-    xmm4 = _mm_add_epi16(xmm0, xmm4);
-    
-    
-    p_src2 += 1;
-    p_src3 += 1;
-    p_src4 += 1;
-
-    _mm_store_si128(p_target0, xmm1);
-    _mm_store_si128(p_target1, xmm2);
-    _mm_store_si128(p_target2, xmm3);
-    _mm_store_si128(p_target3, xmm4);
-    
-    p_target0 += 1;
-    p_target1 += 1;
-    p_target2 += 1;
-    p_target3 += 1;
-  }
-    /*asm volatile
-		(
-		 ".%=volk_16s_add_quad_a16_sse2_L1:\n\t"
-		 "cmp $0, %[bound]\n\t"
-		 "je .%=volk_16s_add_quad_a16_sse2_END\n\t"
-		 "movaps (%[src0]), %%xmm1\n\t"
-		 "movaps (%[src1]), %%xmm2\n\t"
-		 "movaps (%[src2]), %%xmm3\n\t"
-		 "movaps (%[src3]), %%xmm4\n\t"
-		 "movaps (%[src4]), %%xmm5\n\t"
-		 "add $16, %[src0]\n\t"
-		 "add $16, %[src1]\n\t"
-		 "add $16, %[src2]\n\t"
-		 "add $16, %[src3]\n\t"
-		 "add $16, %[src4]\n\t"
-		 "paddw %%xmm1, %%xmm2\n\t"
-		 "paddw %%xmm1, %%xmm3\n\t"
-		 "paddw %%xmm1, %%xmm4\n\t"
-		 "paddw %%xmm1, %%xmm5\n\t"
-		 "add $-1, %[bound]\n\t"
-		 "movaps %%xmm2, (%[target0])\n\t"
-		 "movaps %%xmm3, (%[target1])\n\t"
-		 "movaps %%xmm4, (%[target2])\n\t"
-		 "movaps %%xmm5, (%[target3])\n\t"
-		 "add $16, %[target0]\n\t"
-		 "add $16, %[target1]\n\t"
-		 "add $16, %[target2]\n\t"
-		 "add $16, %[target3]\n\t"
-		 "jmp .%=volk_16s_add_quad_a16_sse2_L1\n\t"
-		 ".%=volk_16s_add_quad_a16_sse2_END:\n\t"
-		 :
-		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
-		 :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-		 );
-		 
-    */
-	 
-
-  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-    target0[i] = src0[i] + src1[i];
-    target1[i] = src0[i] + src2[i];
-    target2[i] = src0[i] + src3[i];
-    target3[i] = src0[i] + src4[i];
-  }
-}
-#endif /*LV_HAVE_SSE2*/
-
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_16s_add_quad_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-	for(i = 0; i < bound; ++i) {
-		target0[i] = src0[i] + src1[i];
-		target1[i] = src0[i] + src2[i];
-		target2[i] = src0[i] + src3[i];
-		target3[i] = src0[i] + src4[i];
-	}
-}
-
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-
-#endif /*INCLUDED_volk_16s_add_quad_a16_H*/
diff --git a/volk/include/volk/volk_16s_branch_4_state_8_a16.h b/volk/include/volk/volk_16s_branch_4_state_8_a16.h
deleted file mode 100644
index 4c1af87297..0000000000
--- a/volk/include/volk/volk_16s_branch_4_state_8_a16.h
+++ /dev/null
@@ -1,194 +0,0 @@
-#ifndef INCLUDED_volk_16s_branch_4_state_8_a16_H
-#define INCLUDED_volk_16s_branch_4_state_8_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-#if LV_HAVE_SSSE3
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
-
-static inline  void volk_16s_branch_4_state_8_a16_ssse3(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
-	
-  
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
-
-  __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
-
-  
-  
-  p_target = (__m128i*)target;
-  p_src0 = (__m128i*)src0;
-  p_cntl2 = (__m128i*)cntl2;
-  p_cntl3 = (__m128i*)cntl3;
-  p_scalars = (__m128i*)scalars;
-  
-  int i = 0;
-  
-  int bound = 1;
-  
-  
-  xmm0 = _mm_load_si128(p_scalars);
-  
-  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
-  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
-  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
-  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
-  
-  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
-  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
-  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
-  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
-
-  xmm0 = _mm_load_si128((__m128i*)permuters[0]);
-  xmm6 = _mm_load_si128((__m128i*)permuters[1]);
-  xmm8 = _mm_load_si128((__m128i*)permuters[2]);
-  xmm10 = _mm_load_si128((__m128i*)permuters[3]);
-
-  for(; i < bound; ++i) {
-    
-    xmm5 = _mm_load_si128(p_src0);
-    
-    
-    
-    
-    
-    
-    
-
-
-    xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
-    xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
-    xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
-    xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
-    
-    p_src0 += 4;
-   
-    
-    xmm5 = _mm_add_epi16(xmm1, xmm2);
-    
-    xmm6 = _mm_add_epi16(xmm2, xmm6);
-    xmm8 = _mm_add_epi16(xmm1, xmm8);
-   
-     
-    xmm7 = _mm_load_si128(p_cntl2);
-    xmm9 = _mm_load_si128(p_cntl3);
-    
-    xmm0 = _mm_add_epi16(xmm5, xmm0);
-    
-    
-    xmm7 = _mm_and_si128(xmm7, xmm3);
-    xmm9 = _mm_and_si128(xmm9, xmm4);
-    
-    xmm5 = _mm_load_si128(&p_cntl2[1]);
-    xmm11 = _mm_load_si128(&p_cntl3[1]);
-
-    xmm7 = _mm_add_epi16(xmm7, xmm9);
-
-    xmm5 = _mm_and_si128(xmm5, xmm3);
-    xmm11 = _mm_and_si128(xmm11, xmm4);
-
-    xmm0 = _mm_add_epi16(xmm0, xmm7);
-   
-   
- 
-    xmm7 = _mm_load_si128(&p_cntl2[2]);
-    xmm9 = _mm_load_si128(&p_cntl3[2]);
-    
-    xmm5 = _mm_add_epi16(xmm5, xmm11);
-    
-    xmm7 = _mm_and_si128(xmm7, xmm3);
-    xmm9 = _mm_and_si128(xmm9, xmm4);
-    
-    xmm6 = _mm_add_epi16(xmm6, xmm5);
-   
-    
-    xmm5 = _mm_load_si128(&p_cntl2[3]);
-    xmm11 = _mm_load_si128(&p_cntl3[3]);
-    
-    xmm7 = _mm_add_epi16(xmm7, xmm9);
-    
-    xmm5 = _mm_and_si128(xmm5, xmm3);
-    xmm11 = _mm_and_si128(xmm11, xmm4);
-    
-    xmm8 = _mm_add_epi16(xmm8, xmm7);
-    
-    xmm5 = _mm_add_epi16(xmm5, xmm11);
-    
-    _mm_store_si128(p_target, xmm0);
-    _mm_store_si128(&p_target[1], xmm6);
-
-    xmm10 = _mm_add_epi16(xmm5, xmm10);
-    
-    _mm_store_si128(&p_target[2], xmm8);
-    
-    _mm_store_si128(&p_target[3], xmm10);
-    
-    p_target += 3;   
-  }
-}
-	
-	
-#endif /*LV_HAVE_SSEs*/
-
-#if LV_HAVE_GENERIC
-static inline  void volk_16s_branch_4_state_8_a16_generic(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
-	int i = 0;
-	
-	int bound = 4;
-	
-	for(; i < bound; ++i) {
-	  target[i* 8] = src0[((char)permuters[i][0])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8] & scalars[2])
-	    + (cntl3[i * 8] & scalars[3]);
-	  target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 1] & scalars[2])
-	    + (cntl3[i * 8 + 1] & scalars[3]);
-	  target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 2] & scalars[2])
-	    + (cntl3[i * 8 + 2] & scalars[3]);
-	  target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 3] & scalars[2])
-	    + (cntl3[i * 8 + 3] & scalars[3]);
-	  target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 4] & scalars[2])
-	    + (cntl3[i * 8 + 4] & scalars[3]);
-	  target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 5] & scalars[2])
-	    + (cntl3[i * 8 + 5] & scalars[3]);
-	  target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 6] & scalars[2])
-	    + (cntl3[i * 8 + 6] & scalars[3]);
-	  target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 7] & scalars[2])
-	    + (cntl3[i * 8 + 7] & scalars[3]);
-	  
-	}
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_16s_branch_4_state_8_a16_H*/
diff --git a/volk/include/volk/volk_16s_convert_8s_a16.h b/volk/include/volk/volk_16s_convert_8s_a16.h
deleted file mode 100644
index 13db435def..0000000000
--- a/volk/include/volk/volk_16s_convert_8s_a16.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef INCLUDED_volk_16s_convert_8s_a16_H
-#define INCLUDED_volk_16s_convert_8s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-*/
-static inline void volk_16s_convert_8s_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-     int8_t* outputVectorPtr = outputVector;
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal1;
-    __m128i inputVal2;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-
-      // Load the 16 values
-      inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-      inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-
-      inputVal1 = _mm_srai_epi16(inputVal1, 8);
-      inputVal2 = _mm_srai_epi16(inputVal2, 8);
-      
-      ret = _mm_packs_epi16(inputVal1, inputVal2);
-
-      _mm_store_si128((__m128i*)outputVectorPtr, ret);
-
-      outputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] =(int8_t)(inputVector[number] >> 8);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-*/
-static inline void volk_16s_convert_8s_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16s_convert_8s_a16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_ua16.h b/volk/include/volk/volk_16s_convert_8s_ua16.h
deleted file mode 100644
index 9941118ae6..0000000000
--- a/volk/include/volk/volk_16s_convert_8s_ua16.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef INCLUDED_volk_16s_convert_8s_ua16_H
-#define INCLUDED_volk_16s_convert_8s_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-  \note Input and output buffers do NOT need to be properly aligned
-*/
-static inline void volk_16s_convert_8s_ua16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-     int8_t* outputVectorPtr = outputVector;
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal1;
-    __m128i inputVal2;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-
-      // Load the 16 values
-      inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
-      inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
-
-      inputVal1 = _mm_srai_epi16(inputVal1, 8);
-      inputVal2 = _mm_srai_epi16(inputVal2, 8);
-      
-      ret = _mm_packs_epi16(inputVal1, inputVal2);
-
-      _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
-
-      outputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] =(int8_t)(inputVector[number] >> 8);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-  \note Input and output buffers do NOT need to be properly aligned
-*/
-static inline void volk_16s_convert_8s_ua16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  >> 8));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16s_convert_8s_ua16_H */
diff --git a/volk/include/volk/volk_16s_max_star_16s_a16.h b/volk/include/volk/volk_16s_max_star_16s_a16.h
deleted file mode 100644
index b2ec90552c..0000000000
--- a/volk/include/volk/volk_16s_max_star_16s_a16.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef INCLUDED_volk_16s_max_star_16s_a16_H
-#define INCLUDED_volk_16s_max_star_16s_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-#if LV_HAVE_SSSE3
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
-
-static inline  void volk_16s_max_star_16s_a16_ssse3(short* target, short* src0, unsigned int num_bytes) {
-
-
-  
-  short candidate = src0[0];
-  short cands[8];
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
-  
-
-  __m128i *p_src0;
-  
-  p_src0 = (__m128i*)src0;
-
-  int bound = num_bytes >> 4;
-  int leftovers = (num_bytes >> 1) & 7;
-  
-  int i = 0;
-  
-  
-  xmm1 = _mm_setzero_si128();
-  xmm0 = _mm_setzero_si128();
-  //_mm_insert_epi16(xmm0, candidate, 0);
-  
-  xmm0 = _mm_shuffle_epi8(xmm0, xmm1); 
-
-  
-  for(i = 0; i < bound; ++i) {
-    xmm1 = _mm_load_si128(p_src0);
-    p_src0 += 1;
-    xmm2 = _mm_sub_epi16(xmm1, xmm0);
-  
-
-    
-  
-  
-  
-    xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
-    xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
-    xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
-
-    xmm6 = _mm_xor_si128(xmm4, xmm5);
-    
-    xmm3 = _mm_and_si128(xmm3, xmm0);
-    xmm4 = _mm_and_si128(xmm6, xmm1);
-    
-    xmm0 = _mm_add_epi16(xmm3, xmm4);
-    
-  
-  }
-  
-  _mm_store_si128((__m128i*)cands, xmm0);
-  
-  for(i = 0; i < 8; ++i) {
-    candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
-  }
-  
- 
-  
-  for(i = 0; i < leftovers; ++i) {
-  
-    candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
-  }
-
-  target[0] = candidate;
-  
-    
-    
- 
-
-}   
- 
-#endif /*LV_HAVE_SSSE3*/
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_16s_max_star_16s_a16_generic(short* target, short* src0, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-	short candidate = src0[0];
-	for(i = 1; i < bound; ++i) {
-	  candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
-	}
-	target[0] = candidate;
-	  
-}
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_16s_max_star_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
deleted file mode 100644
index 68994593bc..0000000000
--- a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifndef INCLUDED_volk_16s_max_star_horizontal_16s_a16_H
-#define INCLUDED_volk_16s_max_star_horizontal_16s_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-#if LV_HAVE_SSSE3
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
-
-static inline  void volk_16s_max_star_horizontal_16s_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
-
-  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
-  const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
-  const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
-
-  
-  
-  volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; 
-  __m128i  xmm5, xmm6, xmm7, xmm8;
-  
-  xmm4 = _mm_load_si128((__m128i*)shufmask0);
-  xmm5 = _mm_load_si128((__m128i*)shufmask1);
-  xmm6 = _mm_load_si128((__m128i*)andmask0);
-  xmm7 = _mm_load_si128((__m128i*)andmask1);
-  
-  __m128i *p_target, *p_src0;
-  
-  p_target = (__m128i*)target;
-  p_src0 = (__m128i*)src0;
-
-  int bound = num_bytes >> 5;
-  int intermediate = (num_bytes >> 4) & 1;
-  int leftovers = (num_bytes >> 1) & 7;
-  
-  int i = 0;
-  
-  
-  for(i = 0; i < bound; ++i) {
-     
-    xmm0 = _mm_load_si128(p_src0);
-    xmm1 = _mm_load_si128(&p_src0[1]);
-    
-    
-
-    xmm2 = _mm_xor_si128(xmm2, xmm2);
-    p_src0 += 2;
-    
-    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
-    
-    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);   
-
-    xmm8 = _mm_and_si128(xmm2, xmm6);
-    xmm3 = _mm_and_si128(xmm2, xmm7);
-    
-
-    xmm8 = _mm_add_epi8(xmm8, xmm4);
-    xmm3 = _mm_add_epi8(xmm3, xmm5);
-
-    xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
-    xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
-    
-    
-    xmm3 = _mm_add_epi16(xmm0, xmm1);
-
-    
-    _mm_store_si128(p_target, xmm3);
-    
-    p_target += 1;
-  
-  }
-
-  for(i = 0; i < intermediate; ++i) {
-    
-    xmm0 = _mm_load_si128(p_src0);
-    
-    
-    xmm2 = _mm_xor_si128(xmm2, xmm2);
-    p_src0 += 1;
-    
-    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
-    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
-
-    xmm8 = _mm_and_si128(xmm2, xmm6);
-    
-    xmm3 = _mm_add_epi8(xmm8, xmm4);
-    
-    xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
-    
-
-    _mm_storel_pd((double*)p_target, (__m128d)xmm0);
-    
-    p_target = (__m128i*)((int8_t*)p_target + 8);
-
-  }
-    
-  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { 
-    target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
-  }
-  
-
-}   
- 
-#endif /*LV_HAVE_SSSE3*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_max_star_horizontal_16s_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-      
-	for(i = 0; i < bound; i += 2) {
-	  target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
-	}
-		
-}
-
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-#endif /*INCLUDED_volk_16s_max_star_horizontal_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
deleted file mode 100644
index 2e7586b575..0000000000
--- a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
+++ /dev/null
@@ -1,139 +0,0 @@
-#ifndef INCLUDED_volk_16s_permute_and_scalar_add_a16_H
-#define INCLUDED_volk_16s_permute_and_scalar_add_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-#if LV_HAVE_SSE2
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-
-static inline  void volk_16s_permute_and_scalar_add_a16_sse2(short* target,  short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
-	
-
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
-  __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
-
-  short* p_permute_indexes = permute_indexes;
-  
-  p_target = (__m128i*)target;
-  p_cntl0 = (__m128i*)cntl0;
-  p_cntl1 = (__m128i*)cntl1;
-  p_cntl2 = (__m128i*)cntl2;
-  p_cntl3 = (__m128i*)cntl3;
-  p_scalars = (__m128i*)scalars;
-  
-  int i = 0;
-  
-  int bound = (num_bytes >> 4);
-  int leftovers = (num_bytes >> 1) & 7;
-  
-  xmm0 = _mm_load_si128(p_scalars);
-  
-  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
-  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
-  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
-  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
-  
-  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
-  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
-  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
-  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
-
-
-  for(; i < bound; ++i) {
-    xmm0 = _mm_setzero_si128();
-    xmm5 = _mm_setzero_si128();
-    xmm6 = _mm_setzero_si128();
-    xmm7 = _mm_setzero_si128();
-
-    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
-    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
-    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
-    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
-    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
-    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
-    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
-    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
-
-    xmm0 = _mm_add_epi16(xmm0, xmm5);
-    xmm6 = _mm_add_epi16(xmm6, xmm7);
-    
-    p_permute_indexes += 8;
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm6);
-    
-    xmm5 = _mm_load_si128(p_cntl0);
-    xmm6 = _mm_load_si128(p_cntl1);
-    xmm7 = _mm_load_si128(p_cntl2);
-    
-    xmm5 = _mm_and_si128(xmm5, xmm1);
-    xmm6 = _mm_and_si128(xmm6, xmm2);
-    xmm7 = _mm_and_si128(xmm7, xmm3);
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm5);
-    
-    xmm5 = _mm_load_si128(p_cntl3);
-    
-    xmm6 = _mm_add_epi16(xmm6, xmm7);
-
-    p_cntl0 += 1;
-    
-    xmm5 = _mm_and_si128(xmm5, xmm4);
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm6);
-    
-    p_cntl1 += 1;
-    p_cntl2 += 1;
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm5); 
-    
-    p_cntl3 += 1;
-
-    _mm_store_si128(p_target, xmm0);
-    
-    p_target += 1;
-  }
-	
-	
-	
-	
-
-  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-    target[i] = src0[permute_indexes[i]] 
-      + (cntl0[i] & scalars[0])
-      + (cntl1[i] & scalars[1])
-      + (cntl2[i] & scalars[2])
-      + (cntl3[i] & scalars[3]);
-  }
-}
-#endif /*LV_HAVE_SSEs*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-	for(i = 0; i < bound; ++i) {
-		target[i] = src0[permute_indexes[i]] 
-			+ (cntl0[i] & scalars[0])
-			+ (cntl1[i] & scalars[1])
-			+ (cntl2[i] & scalars[2])
-			+ (cntl3[i] & scalars[3]);
-		
-	}
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_16s_permute_and_scalar_add_a16_H*/
diff --git a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h
deleted file mode 100644
index 3e89ff963d..0000000000
--- a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h
+++ /dev/null
@@ -1,191 +0,0 @@
-#ifndef INCLUDED_volk_16s_quad_max_star_16s_a16_H
-#define INCLUDED_volk_16s_quad_max_star_16s_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-
-#if LV_HAVE_SSE2
-
-#include<emmintrin.h>
-
-static inline  void volk_16s_quad_max_star_16s_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-	
-
-
-
-	int i = 0;
-
-	int bound = (num_bytes >> 4);
-	int bound_copy = bound;
-	int leftovers = (num_bytes >> 1) & 7;
-	
-	__m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
-	p_target = (__m128i*) target;
-	p_src0 =  (__m128i*)src0;
-	p_src1 =  (__m128i*)src1;
-	p_src2 =  (__m128i*)src2;
-	p_src3 =  (__m128i*)src3;
-	
-	
-
-	__m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-
-	while(bound_copy > 0) {
-	 
-	  xmm1 = _mm_load_si128(p_src0);
-	  xmm2 = _mm_load_si128(p_src1);
-	  xmm3 = _mm_load_si128(p_src2);
-	  xmm4 = _mm_load_si128(p_src3);
-	  
-	  xmm5 = _mm_setzero_si128();
-	  xmm6 = _mm_setzero_si128();
-	  xmm7 = xmm1;
-	  xmm8 = xmm3;
-	  
-	  
-	  xmm1 = _mm_sub_epi16(xmm2, xmm1);
-
-	  
-
-	  xmm3 = _mm_sub_epi16(xmm4, xmm3);
-
-	  xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
-	  xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
-
-	  
-
-	  xmm2 = _mm_and_si128(xmm5, xmm2);
-	  xmm4 = _mm_and_si128(xmm6, xmm4);
-	  xmm5 = _mm_andnot_si128(xmm5, xmm7);
-	  xmm6 = _mm_andnot_si128(xmm6, xmm8);
-
-	  xmm5 = _mm_add_epi16(xmm2, xmm5);
-	  xmm6 = _mm_add_epi16(xmm4, xmm6);
-
-	  
-	  xmm1 = _mm_xor_si128(xmm1, xmm1);
-	  xmm2 = xmm5;
-	  xmm5 = _mm_sub_epi16(xmm6, xmm5);
-	  p_src0 += 1;
-	  bound_copy -= 1;
-
-	  xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
-	  p_src1 += 1;
-	  
-	  xmm6 = _mm_and_si128(xmm1, xmm6);
-	  	  
-	  xmm1 = _mm_andnot_si128(xmm1, xmm2);
-	  p_src2 += 1;
-
-
-	  
-	  xmm1 = _mm_add_epi16(xmm6, xmm1);
-	  p_src3 += 1;
-
-	  
-	  _mm_store_si128(p_target, xmm1);
-	  p_target += 1;
-      
-	}
-	
-
-	/*asm volatile
-		(
-		 "volk_16s_quad_max_star_16s_a16_sse2_L1:\n\t"
-		 "cmp $0, %[bound]\n\t"
-		 "je volk_16s_quad_max_star_16s_a16_sse2_END\n\t"
-
-		 "movaps (%[src0]), %%xmm1\n\t"
-		 "movaps (%[src1]), %%xmm2\n\t"
-		 "movaps (%[src2]), %%xmm3\n\t"
-		 "movaps (%[src3]), %%xmm4\n\t"
-
-		 "pxor %%xmm5, %%xmm5\n\t"
-		 "pxor %%xmm6, %%xmm6\n\t"
-		 "movaps %%xmm1, %%xmm7\n\t"
-		 "movaps %%xmm3, %%xmm8\n\t"
-		 "psubw %%xmm2, %%xmm1\n\t"
-		 "psubw %%xmm4, %%xmm3\n\t"
-		 
-		 "pcmpgtw %%xmm1, %%xmm5\n\t"
-		 "pcmpgtw %%xmm3, %%xmm6\n\t"
-		 
-		 "pand %%xmm5, %%xmm2\n\t"
-		 "pand %%xmm6, %%xmm4\n\t"
-		 "pandn %%xmm7, %%xmm5\n\t"
-		 "pandn %%xmm8, %%xmm6\n\t"
-		 
-		 "paddw %%xmm2, %%xmm5\n\t"
-		 "paddw %%xmm4, %%xmm6\n\t"
-
-		 "pxor %%xmm1, %%xmm1\n\t"
-		 "movaps %%xmm5, %%xmm2\n\t"
-		 
-		 "psubw %%xmm6, %%xmm5\n\t"
-		 "add $16, %[src0]\n\t"
-		 "add $-1, %[bound]\n\t"
-	  
-		 "pcmpgtw %%xmm5, %%xmm1\n\t"
-		 "add $16, %[src1]\n\t"
-
-		 "pand %%xmm1, %%xmm6\n\t"
-
-		 "pandn %%xmm2, %%xmm1\n\t"
-		 "add $16, %[src2]\n\t"
-
-		 "paddw %%xmm6, %%xmm1\n\t"
-		 "add $16, %[src3]\n\t"
-
-		 "movaps %%xmm1, (%[target])\n\t"
-		 "addw $16, %[target]\n\t"
-		 "jmp volk_16s_quad_max_star_16s_a16_sse2_L1\n\t"
-		 
-		 "volk_16s_quad_max_star_16s_a16_sse2_END:\n\t"
-		 :
-		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
-		 :
-		 );
-	*/ 
-
-	short temp0 = 0;
-	short temp1 = 0;
-	for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
-	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
-	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
-	}
-	return;
-
-
-}
-
-#endif /*LV_HAVE_SSE2*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_quad_max_star_16s_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-	
-	short temp0 = 0;
-	short temp1 = 0;
-	for(i = 0; i < bound; ++i) {
-	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
-	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
-	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
-	}
-}
-
-
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-#endif /*INCLUDED_volk_16s_quad_max_star_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h
deleted file mode 100644
index 8f9b44478d..0000000000
--- a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef INCLUDED_volk_16s_s32f_convert_32f_a16_H
-#define INCLUDED_volk_16s_s32f_convert_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int eighthPoints = num_points / 8;
-    
-     float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal;
-    __m128i inputVal2;
-    __m128 ret;
-
-    for(;number < eighthPoints; number++){
-
-      // Load the 8 values
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      // Shift the input data to the right by 64 bits ( 8 bytes )
-      inputVal2 = _mm_srli_si128(inputVal, 8);
-
-      // Convert the lower 4 values into 32 bit words
-      inputVal = _mm_cvtepi16_epi32(inputVal);
-      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-      
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      ret = _mm_cvtepi32_ps(inputVal2);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-
-      inputPtr += 8;
-    }
-
-    number = eighthPoints * 8;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16s_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-    float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-      
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      inputPtr += 4;
-      outputVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16s_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16s_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
deleted file mode 100644
index ad52aea1a5..0000000000
--- a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef INCLUDED_volk_16s_s32f_convert_32f_ua16_H
-#define INCLUDED_volk_16s_s32f_convert_32f_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_16s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int eighthPoints = num_points / 8;
-    
-     float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal;
-    __m128i inputVal2;
-    __m128 ret;
-
-    for(;number < eighthPoints; number++){
-
-      // Load the 8 values
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      // Shift the input data to the right by 64 bits ( 8 bytes )
-      inputVal2 = _mm_srli_si128(inputVal, 8);
-
-      // Convert the lower 4 values into 32 bit words
-      inputVal = _mm_cvtepi16_epi32(inputVal);
-      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-      
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      ret = _mm_cvtepi32_ps(inputVal2);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-
-      inputPtr += 8;
-    }
-
-    number = eighthPoints * 8;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_16s_s32f_convert_32f_ua16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-    float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
-      
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      inputPtr += 4;
-      outputVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 16 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_16s_s32f_convert_32f_ua16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16s_s32f_convert_32f_ua16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
deleted file mode 100644
index 8e5da24ec6..0000000000
--- a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H
-#define INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_16s_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
-  __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
-  __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
-  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
-    qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *int16ComplexVectorPtr++;
-    *qBufferPtr++ = *int16ComplexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_16s_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
-  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
-  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
-
-  unsigned int eighthPoints = num_points / 8;
- 
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-
-    iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
-    qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
-
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-extern void volk_16sc_deinterleave_16s_16s_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void volk_16sc_deinterleave_16s_16s_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-    volk_16sc_deinterleave_16s_16s_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
deleted file mode 100644
index 068c1350c2..0000000000
--- a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef INCLUDED_volk_16sc_deinterleave_real_16s_a16_H
-#define INCLUDED_volk_16sc_deinterleave_real_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-
-  __m128i complexVal1, complexVal2, iOutputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-
-    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
-
-    iOutputVal = _mm_or_si128(complexVal1, complexVal2);
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m128i complexVal1, complexVal2, iOutputVal;
-  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
-  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
-
-  unsigned int eighthPoints = num_points / 8;
- 
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
-
-    complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16sc_deinterleave_real_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
deleted file mode 100644
index afa21ebc43..0000000000
--- a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef INCLUDED_volk_16sc_deinterleave_real_8s_a16_H
-#define INCLUDED_volk_16sc_deinterleave_real_8s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
-
-    complexVal1 = _mm_or_si128(complexVal1, complexVal2);
-
-    complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
-    complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
-
-    complexVal3 = _mm_or_si128(complexVal3, complexVal4);
-
-
-    complexVal1 = _mm_srai_epi16(complexVal1, 8);
-    complexVal3 = _mm_srai_epi16(complexVal3, 8);
-
-    iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
-    int16ComplexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256);
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-extern void volk_16sc_deinterleave_real_8s_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
-static inline void volk_16sc_deinterleave_real_8s_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-    volk_16sc_deinterleave_real_8s_a16_orc_impl(iBuffer, complexVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_16sc_deinterleave_real_8s_a16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_a16.h b/volk/include/volk/volk_16sc_magnitude_16s_a16.h
deleted file mode 100644
index d832de5fe7..0000000000
--- a/volk/include/volk/volk_16sc_magnitude_16s_a16.h
+++ /dev/null
@@ -1,190 +0,0 @@
-#ifndef INCLUDED_volk_16sc_magnitude_16s_a16_H
-#define INCLUDED_volk_16sc_magnitude_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-   
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(32768.0);
-  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
-
-  __m128 cplxValue1, cplxValue2, result;
-
-  float inputFloatBuffer[8] __attribute__((aligned(128)));
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    result = _mm_mul_ps(result, vScalar); // Scale the results
-
-    _mm_store_ps(outputFloatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
-    *magnitudeVectorPtr++ = (int16_t)(val1Result);
-  }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(32768.0);
-  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
-
-  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-
-  float inputFloatBuffer[4] __attribute__((aligned(128)));
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    cplxValue1 = _mm_load_ps(inputFloatBuffer);
-    complexVectorPtr += 4;
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
-    cplxValue2 = _mm_load_ps(inputFloatBuffer);
-    complexVectorPtr += 4;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
-    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    result = _mm_mul_ps(result, vScalar); // Scale the results
-
-    _mm_store_ps(outputFloatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
-    *magnitudeVectorPtr++ = (int16_t)(val1Result);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  const float scalar = 32768.0;
-  for(number = 0; number < num_points; number++){
-    float real = ((float)(*complexVectorPtr++)) / scalar;
-    float imag = ((float)(*complexVectorPtr++)) / scalar;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC_DISABLED
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_16sc_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
-static inline void volk_16sc_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-    volk_16sc_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_16sc_magnitude_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
deleted file mode 100644
index 53e4253c44..0000000000
--- a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H
-#define INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
-    \param complexVector The complex input vector
-    \param iBuffer The I buffer output data
-    \param qBuffer The Q buffer output data
-    \param scalar The data value to be divided against each input data value of the input complex vector
-    \param num_points The number of complex data values to be deinterleaved
-  */
-static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-    float* iBufferPtr = iBuffer;
-    float* qBufferPtr = qBuffer;
-
-    uint64_t number = 0;
-    const uint64_t quarterPoints = num_points / 4;    
-    __m128 cplxValue1, cplxValue2, iValue, qValue;
-
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-    float floatBuffer[8] __attribute__((aligned(128)));
-
-    for(;number < quarterPoints; number++){
-      
-      floatBuffer[0] = (float)(complexVectorPtr[0]);
-      floatBuffer[1] = (float)(complexVectorPtr[1]);
-      floatBuffer[2] = (float)(complexVectorPtr[2]);
-      floatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-      floatBuffer[4] = (float)(complexVectorPtr[4]);
-      floatBuffer[5] = (float)(complexVectorPtr[5]);
-      floatBuffer[6] = (float)(complexVectorPtr[6]);
-      floatBuffer[7] = (float)(complexVectorPtr[7]);
-
-      cplxValue1 = _mm_load_ps(&floatBuffer[0]);
-      cplxValue2 = _mm_load_ps(&floatBuffer[4]);
-
-      complexVectorPtr += 8;
-
-      cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-      cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-      // Arrange in i1i2i3i4 format
-      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-      // Arrange in q1q2q3q4 format
-      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-      _mm_store_ps(iBufferPtr, iValue);
-      _mm_store_ps(qBufferPtr, qValue);
-
-      iBufferPtr += 4;
-      qBufferPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    complexVectorPtr = (int16_t*)&complexVector[number];
-    for(; number < num_points; number++){
-      *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-      *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
-    \param complexVector The complex input vector
-    \param iBuffer The I buffer output data
-    \param qBuffer The Q buffer output data
-    \param scalar The data value to be divided against each input data value of the input complex vector
-    \param num_points The number of complex data values to be deinterleaved
-  */
-static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
-    \param complexVector The complex input vector
-    \param iBuffer The I buffer output data
-    \param qBuffer The Q buffer output data
-    \param scalar The data value to be divided against each input data value of the input complex vector
-    \param num_points The number of complex data values to be deinterleaved
-  */
-extern void volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-    volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
deleted file mode 100644
index 7320db3688..0000000000
--- a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
+++ /dev/null
@@ -1,125 +0,0 @@
-#ifndef INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H
-#define INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-
-  __m128 iFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-
-  for(;number < quarterPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
-
-    iIntVal = _mm_cvtepi16_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
-    sixteenTComplexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 iValue;
-
-  const float iScalar = 1.0/scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
-    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-
-    iValue = _mm_load_ps(floatBuffer);
-
-    iValue = _mm_mul_ps(iValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  complexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
-    complexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
deleted file mode 100644
index 649b5cc96c..0000000000
--- a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
+++ /dev/null
@@ -1,179 +0,0 @@
-#ifndef INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H
-#define INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-
-  __m128 cplxValue1, cplxValue2, result;
-
-  float inputFloatBuffer[8] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    _mm_store_ps(magnitudeVectorPtr, result);
-      
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    float val1Real = (float)(*complexVectorPtr++) / scalar;
-    float val1Imag = (float)(*complexVectorPtr++) / scalar;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  const float iScalar = 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-
-  __m128 cplxValue1, cplxValue2, result, re, im;
-
-  float inputFloatBuffer[8] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-    
-    re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
-    im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(re, invScalar);
-    cplxValue2 = _mm_mul_ps(im, invScalar);
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    _mm_store_ps(magnitudeVectorPtr, result);
-      
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    float val1Real = (float)(*complexVectorPtr++) * iScalar;
-    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
-}
-
- 
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    float real = ( (float) (*complexVectorPtr++)) * invScalar;
-    float imag = ( (float) (*complexVectorPtr++)) * invScalar;
-    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC_DISABLED
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_16sc_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16sc_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-    volk_16sc_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
deleted file mode 100644
index a0f97f94e2..0000000000
--- a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
+++ /dev/null
@@ -1,151 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H
-#define INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H
-
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
-
-#ifndef MAX
-#define MAX(X,Y) ((X) > (Y)?(X):(Y))
-#endif
-
-#if LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
-  
-  
-  float result = 0.0;
-  float fst = 0.0;
-  float sq = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-  //float fith = 0.0;
-  
-  
-  
-  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
-
-  xmm9 = _mm_setzero_ps();
-  xmm1 = _mm_setzero_ps();
-  
-  xmm0 = _mm_load1_ps(&center_point_array[0]);
-  xmm6 = _mm_load1_ps(&center_point_array[1]);
-  xmm7 = _mm_load1_ps(&center_point_array[2]);
-  xmm8 = _mm_load1_ps(&center_point_array[3]);
-  //xmm11 = _mm_load1_ps(&center_point_array[4]);
-  xmm10 = _mm_load1_ps(cutoff);
-  
-  int bound = num_bytes >> 4;
-  int leftovers = (num_bytes >> 2) & 3;
-  int i = 0;
-  
-  for(; i < bound; ++i) {
-    xmm2 = _mm_load_ps(src0);
-    xmm2 = _mm_max_ps(xmm10, xmm2);
-    xmm3 = _mm_mul_ps(xmm2, xmm2);
-    xmm4 = _mm_mul_ps(xmm2, xmm3);
-    xmm5 = _mm_mul_ps(xmm3, xmm3);
-    //xmm12 = _mm_mul_ps(xmm3, xmm4);
-
-    xmm2 = _mm_mul_ps(xmm2, xmm0);
-    xmm3 = _mm_mul_ps(xmm3, xmm6);
-    xmm4 = _mm_mul_ps(xmm4, xmm7);
-    xmm5 = _mm_mul_ps(xmm5, xmm8);
-    //xmm12 = _mm_mul_ps(xmm12, xmm11);
-
-    xmm2 = _mm_add_ps(xmm2, xmm3);
-    xmm3 = _mm_add_ps(xmm4, xmm5);
-    
-    src0 += 4;
-    
-    xmm9 = _mm_add_ps(xmm2, xmm9);
-    
-    xmm1 = _mm_add_ps(xmm3, xmm1);
-
-    //xmm9 = _mm_add_ps(xmm12, xmm9);
-  }
-  
-  xmm2 = _mm_hadd_ps(xmm9, xmm1);
-  xmm3 = _mm_hadd_ps(xmm2, xmm2);
-  xmm4 = _mm_hadd_ps(xmm3, xmm3);
-
-  _mm_store_ss(&result, xmm4);
-    
-  
-
-  for(i = 0; i < leftovers; ++i) {
-    fst = src0[i];
-    fst = MAX(fst, *cutoff);
-    sq = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    //fith = sq * thrd;
-    
-    result += (center_point_array[0] * fst + 
-	       center_point_array[1] * sq + 
-	       center_point_array[2] * thrd + 
-	       center_point_array[3] * frth);// + 
-	       //center_point_array[4] * fith);
-  }
-
-  result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
-
-  target[0] = result;
-}
- 
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
-
-
-    
-  float result = 0.0;
-  float fst = 0.0;
-  float sq = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-  //float fith = 0.0;
-  
-
-
-  int i = 0; 
-  
-  for(; i < num_bytes >> 2; ++i) {
-    fst = src0[i];
-    fst = MAX(fst, *cutoff);
-    
-    sq = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    //fith = sq * thrd;
-    
-    result += (center_point_array[0] * fst + 
-	       center_point_array[1] * sq + 
-	       center_point_array[2] * thrd + 
-	       center_point_array[3] * frth); //+
-	       //center_point_array[4] * fith);
-    /*printf("%f12...%d\n", (center_point_array[0] * fst + 
-		  center_point_array[1] * sq + 
-		  center_point_array[2] * thrd + 
-			 center_point_array[3] * frth) +
-	   //center_point_array[4] * fith) + 
-	   (center_point_array[4]), i);
-    */
-  }
-
-  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
-
-  
-  
-  *target = result;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H*/
diff --git a/volk/include/volk/volk_32f_32f_add_32f_a16.h b/volk/include/volk/volk_32f_32f_add_32f_a16.h
deleted file mode 100644
index ba38c310f7..0000000000
--- a/volk/include/volk/volk_32f_32f_add_32f_a16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_add_32f_a16_H
-#define INCLUDED_volk_32f_32f_add_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_32f_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_add_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) + (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_32f_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) + (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-extern void volk_32f_32f_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_32f_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32f_32f_add_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_divide_32f_a16.h b/volk/include/volk/volk_32f_32f_divide_32f_a16.h
deleted file mode 100644
index a0995e631b..0000000000
--- a/volk/include/volk/volk_32f_32f_divide_32f_a16.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_divide_32f_a16_H
-#define INCLUDED_volk_32f_32f_divide_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Divides the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be divideed
-  \param bVector The divisor vector
-  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
-*/
-static inline void volk_32f_32f_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_div_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) / (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Divides the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be divideed
-  \param bVector The divisor vector
-  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
-*/
-static inline void volk_32f_32f_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) / (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Divides the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be divideed
-  \param bVector The divisor vector
-  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
-*/
-extern void volk_32f_32f_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_32f_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_volk_32f_32f_divide_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
deleted file mode 100644
index 63f5221d32..0000000000
--- a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
+++ /dev/null
@@ -1,184 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_a16_H
-#define INCLUDED_volk_32f_32f_dot_prod_32f_a16_H
-
-#include<stdio.h>
-
-
-#if LV_HAVE_GENERIC
-
-
-static inline void volk_32f_32f_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-  
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE
-
-
-static inline void volk_32f_32f_dot_prod_32f_a16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_load_ps(aPtr); 
-    bVal = _mm_load_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-
-    dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-#if LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32f_32f_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_load_ps(aPtr); 
-    bVal = _mm_load_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32f_32f_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal1, bVal1, cVal1;
-  __m128 aVal2, bVal2, cVal2;
-  __m128 aVal3, bVal3, cVal3;
-  __m128 aVal4, bVal4, cVal4;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){      
-
-    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-
-    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
-    
-    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
-    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
-    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
-    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-
-    cVal1 = _mm_or_ps(cVal1, cVal2);
-    cVal3 = _mm_or_ps(cVal3, cVal4);
-    cVal1 = _mm_or_ps(cVal1, cVal3);
-
-    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_a16_H*/
diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
deleted file mode 100644
index b5fa7d7a44..0000000000
--- a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
+++ /dev/null
@@ -1,184 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H
-#define INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H
-
-#include<stdio.h>
-
-
-#if LV_HAVE_GENERIC
-
-
-static inline void volk_32f_32f_dot_prod_32f_ua16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-  
-  *result = dotProduct;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE
-
-
-static inline void volk_32f_32f_dot_prod_32f_ua16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_loadu_ps(aPtr); 
-    bVal = _mm_loadu_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-
-    dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-#if LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32f_32f_dot_prod_32f_ua16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_loadu_ps(aPtr); 
-    bVal = _mm_loadu_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32f_32f_dot_prod_32f_ua16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal1, bVal1, cVal1;
-  __m128 aVal2, bVal2, cVal2;
-  __m128 aVal3, bVal3, cVal3;
-  __m128 aVal4, bVal4, cVal4;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-      
-    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
-
-    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
-    
-    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
-    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
-    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
-    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-
-    cVal1 = _mm_or_ps(cVal1, cVal2);
-    cVal3 = _mm_or_ps(cVal3, cVal4);
-    cVal1 = _mm_or_ps(cVal1, cVal3);
-
-    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H*/
diff --git a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
deleted file mode 100644
index 34ea933491..0000000000
--- a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_interleave_32fc_a16_H
-#define INCLUDED_volk_32f_32f_interleave_32fc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Interleaves the I & Q vector data into the complex vector
-  \param iBuffer The I buffer data to be interleaved
-  \param qBuffer The Q buffer data to be interleaved
-  \param complexVector The complex output vector
-  \param num_points The number of complex data values to be interleaved
-*/
-static inline void volk_32f_32f_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
-  unsigned int number = 0;
-  float* complexVectorPtr = (float*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-
-  const uint64_t quarterPoints = num_points / 4;
-    
-  __m128 iValue, qValue, cplxValue;
-  for(;number < quarterPoints; number++){
-    iValue = _mm_load_ps(iBufferPtr);
-    qValue = _mm_load_ps(qBufferPtr);
-
-    // Interleaves the lower two values in the i and q variables into one buffer
-    cplxValue = _mm_unpacklo_ps(iValue, qValue);
-    _mm_store_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 4;
-
-    // Interleaves the upper two values in the i and q variables into one buffer
-    cplxValue = _mm_unpackhi_ps(iValue, qValue);
-    _mm_store_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 4;
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *complexVectorPtr++ = *iBufferPtr++;
-    *complexVectorPtr++ = *qBufferPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Interleaves the I & Q vector data into the complex vector.
-  \param iBuffer The I buffer data to be interleaved
-  \param qBuffer The Q buffer data to be interleaved
-  \param complexVector The complex output vector
-  \param num_points The number of complex data values to be interleaved
-*/
-static inline void volk_32f_32f_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
-  float* complexVectorPtr = (float*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-  unsigned int number;
-
-  for(number = 0; number < num_points; number++){
-    *complexVectorPtr++ = *iBufferPtr++;
-    *complexVectorPtr++ = *qBufferPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_32f_interleave_32fc_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_max_32f_a16.h b/volk/include/volk/volk_32f_32f_max_32f_a16.h
deleted file mode 100644
index 8ca7a5ba8b..0000000000
--- a/volk/include/volk/volk_32f_32f_max_32f_a16.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_max_32f_a16_H
-#define INCLUDED_volk_32f_32f_max_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_32f_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_max_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_32f_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-extern void volk_32f_32f_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_32f_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32f_32f_max_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_min_32f_a16.h b/volk/include/volk/volk_32f_32f_min_32f_a16.h
deleted file mode 100644
index dd05988bea..0000000000
--- a/volk/include/volk/volk_32f_32f_min_32f_a16.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_min_32f_a16_H
-#define INCLUDED_volk_32f_32f_min_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_32f_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_min_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_32f_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-extern void volk_32f_32f_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_32f_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32f_32f_min_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h
deleted file mode 100644
index 2d004db105..0000000000
--- a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_multiply_32f_a16_H
-#define INCLUDED_volk_32f_32f_multiply_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Multiplys the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_32f_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_mul_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Multiplys the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_32f_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Multiplys the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-extern void volk_32f_32f_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_32f_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32f_32f_multiply_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
deleted file mode 100644
index 207382a199..0000000000
--- a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H
-#define INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
-    \param iBuffer The I buffer data to be interleaved
-    \param qBuffer The Q buffer data to be interleaved
-    \param complexVector The complex output vector
-    \param scalar The scaling value being multiplied against each data point
-    \param num_points The number of complex data values to be interleaved
-  */
-static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const float* iBufferPtr = iBuffer;
-    const float* qBufferPtr = qBuffer;
-
-    __m128 vScalar = _mm_set_ps1(scalar);
-
-    const unsigned int quarterPoints = num_points / 4;
-    
-    __m128 iValue, qValue, cplxValue1, cplxValue2;
-    __m128i intValue1, intValue2;
-
-    int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-    for(;number < quarterPoints; number++){
-      iValue = _mm_load_ps(iBufferPtr);
-      qValue = _mm_load_ps(qBufferPtr);
-
-      // Interleaves the lower two values in the i and q variables into one buffer
-      cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
-      cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
-
-      // Interleaves the upper two values in the i and q variables into one buffer
-      cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
-      cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
-
-      intValue1 = _mm_cvtps_epi32(cplxValue1);
-      intValue2 = _mm_cvtps_epi32(cplxValue2);
-
-      intValue1 = _mm_packs_epi32(intValue1, intValue2);
-
-      _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
-      complexVectorPtr += 8;
-
-      iBufferPtr += 4;
-      qBufferPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    complexVectorPtr = (int16_t*)(&complexVector[number]);
-    for(; number < num_points; number++){
-      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
-      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
-    }
-    
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
-    \param iBuffer The I buffer data to be interleaved
-    \param qBuffer The Q buffer data to be interleaved
-    \param complexVector The complex output vector
-    \param scalar The scaling value being multiplied against each data point
-    \param num_points The number of complex data values to be interleaved
-  */
-static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const float* iBufferPtr = iBuffer;
-    const float* qBufferPtr = qBuffer;
-
-    __m128 vScalar = _mm_set_ps1(scalar);
-
-    const unsigned int quarterPoints = num_points / 4;
-    
-    __m128 iValue, qValue, cplxValue;
-
-    int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-    float floatBuffer[4] __attribute__((aligned(128)));
-
-    for(;number < quarterPoints; number++){
-      iValue = _mm_load_ps(iBufferPtr);
-      qValue = _mm_load_ps(qBufferPtr);
-
-      // Interleaves the lower two values in the i and q variables into one buffer
-      cplxValue = _mm_unpacklo_ps(iValue, qValue);
-      cplxValue = _mm_mul_ps(cplxValue, vScalar);
-
-      _mm_store_ps(floatBuffer, cplxValue);
-
-      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
-
-      // Interleaves the upper two values in the i and q variables into one buffer
-      cplxValue = _mm_unpackhi_ps(iValue, qValue);
-      cplxValue = _mm_mul_ps(cplxValue, vScalar);
- 
-      _mm_store_ps(floatBuffer, cplxValue);
-      
-      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
-
-      iBufferPtr += 4;
-      qBufferPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    complexVectorPtr = (int16_t*)(&complexVector[number]);
-    for(; number < num_points; number++){
-      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
-      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
-    }
-    
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
-    \param iBuffer The I buffer data to be interleaved
-    \param qBuffer The Q buffer data to be interleaved
-    \param complexVector The complex output vector
-    \param scalar The scaling value being multiplied against each data point
-    \param num_points The number of complex data values to be interleaved
-  */
-static inline void volk_32f_32f_s32f_interleave_16sc_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
-    *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h
deleted file mode 100644
index 9fea6aa27e..0000000000
--- a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_subtract_32f_a16_H
-#define INCLUDED_volk_32f_32f_subtract_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Subtracts bVector form aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The initial vector
-  \param bVector The vector to be subtracted
-  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
-*/
-static inline void volk_32f_32f_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_sub_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) - (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Subtracts bVector form aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The initial vector
-  \param bVector The vector to be subtracted
-  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
-*/
-static inline void volk_32f_32f_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) - (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Subtracts bVector form aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The initial vector
-  \param bVector The vector to be subtracted
-  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
-*/
-extern void volk_32f_32f_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_32f_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32f_32f_subtract_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h
new file mode 100644
index 0000000000..a825767dea
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_u.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_volk_32f_convert_64f_u_H
+#define INCLUDED_volk_32f_convert_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the float values into double values
+    \param dVector The converted double vector values
+    \param fVector The float vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  double* outputVectorPtr = outputVector;
+  __m128d ret;
+  __m128 inputVal;
+
+  for(;number < quarterPoints; number++){
+    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ 
+    ret = _mm_cvtps_pd(inputVal);
+
+    _mm_storeu_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+
+    inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+    ret = _mm_cvtps_pd(inputVal);
+
+    _mm_storeu_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (double)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values
+  \param dVector The converted double vector values
+  \param fVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_u_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  double* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_u_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_ua16.h b/volk/include/volk/volk_32f_convert_64f_ua16.h
deleted file mode 100644
index c8de768dc9..0000000000
--- a/volk/include/volk/volk_32f_convert_64f_ua16.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INCLUDED_volk_32f_convert_64f_ua16_H
-#define INCLUDED_volk_32f_convert_64f_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the float values into double values
-    \param dVector The converted double vector values
-    \param fVector The float vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_32f_convert_64f_ua16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  double* outputVectorPtr = outputVector;
-  __m128d ret;
-  __m128 inputVal;
-
-  for(;number < quarterPoints; number++){
-    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- 
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_storeu_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-
-    inputVal = _mm_movehl_ps(inputVal, inputVal);
-
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_storeu_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (double)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the float values into double values
-  \param dVector The converted double vector values
-  \param fVector The float vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_32f_convert_64f_ua16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
-  double* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_convert_64f_ua16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h
new file mode 100644
index 0000000000..d6b16e3363
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h
@@ -0,0 +1,110 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H
+#define INCLUDED_volk_32f_s32f_convert_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2;
+  __m128i intInputVal1, intInputVal2;
+
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;    
+  for(; number < num_points; number++){
+    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
new file mode 100644
index 0000000000..4d306e53c5
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
@@ -0,0 +1,113 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
+#define INCLUDED_volk_32f_s32f_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2;
+  __m128i intInputVal1, intInputVal2;
+
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int16_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int16_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h b/volk/include/volk/volk_32f_s32f_convert_16s_a16.h
deleted file mode 100644
index cf51cf9c53..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_16s_a16_H
-#define INCLUDED_volk_32f_s32f_convert_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_16s_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;    
-  for(; number < num_points; number++){
-    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_16s_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_16s_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_16s_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h
deleted file mode 100644
index 53d159f82a..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_16s_ua16_H
-#define INCLUDED_volk_32f_s32f_convert_16s_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_16s_ua16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_16s_ua16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_16s_ua16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_16s_ua16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h
new file mode 100644
index 0000000000..ae874fd7ba
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h
@@ -0,0 +1,106 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H
+#define INCLUDED_volk_32f_s32f_convert_32i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1;
+  __m128i intInputVal1;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
new file mode 100644
index 0000000000..561fcd800c
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
+#define INCLUDED_volk_32f_s32f_convert_32i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1;
+  __m128i intInputVal1;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h b/volk/include/volk/volk_32f_s32f_convert_32s_a16.h
deleted file mode 100644
index 0be649418a..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_32s_a16_H
-#define INCLUDED_volk_32f_s32f_convert_32s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_32s_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1;
-  __m128i intInputVal1;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_32s_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_32s_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int32_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_32s_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h
deleted file mode 100644
index efb2c3a20d..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_32s_ua16_H
-#define INCLUDED_volk_32f_s32f_convert_32s_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_32s_ua16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1;
-  __m128i intInputVal1;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_32s_ua16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_32s_ua16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int32_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_32s_ua16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h
new file mode 100644
index 0000000000..c91448951e
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h
@@ -0,0 +1,117 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H
+#define INCLUDED_volk_32f_s32f_convert_8i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int sixteenthPoints = num_points / 16;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
+    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h
new file mode 100644
index 0000000000..4206935710
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
+#define INCLUDED_volk_32f_s32f_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int sixteenthPoints = num_points / 16;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
+    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h b/volk/include/volk/volk_32f_s32f_convert_8s_a16.h
deleted file mode 100644
index 69ccec5c63..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h
+++ /dev/null
@@ -1,117 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_8s_a16_H
-#define INCLUDED_volk_32f_s32f_convert_8s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_8s_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
-    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
-    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_8s_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_8s_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_8s_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h
deleted file mode 100644
index af1652b194..0000000000
--- a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_8s_ua16_H
-#define INCLUDED_volk_32f_s32f_convert_8s_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_8s_ua16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
-    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
-    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_8s_ua16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_8s_ua16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_8s_ua16_H */
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h
deleted file mode 100644
index 2ba8098454..0000000000
--- a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h
+++ /dev/null
@@ -1,169 +0,0 @@
-#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H
-#define INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Calculates the standard deviation and mean of the input buffer
-  \param stddev The calculated standard deviation
-  \param mean The mean of the input buffer
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param num_points The number of values in input buffer to used in the stddev and mean calculations
-*/
-static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const float* aPtr = inputBuffer;
-    float meanBuffer[4] __attribute__((aligned(128)));
-    float squareBuffer[4] __attribute__((aligned(128)));
-
-    __m128 accumulator = _mm_setzero_ps();
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal1, aVal2, aVal3, aVal4;
-    __m128 cVal1, cVal2, cVal3, cVal4;
-    for(;number < sixteenthPoints; number++) {
-      aVal1 = _mm_load_ps(aPtr); aPtr += 4;   
-      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
-      accumulator = _mm_add_ps(accumulator, aVal1);  // accumulator += x
-
-      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
-      accumulator = _mm_add_ps(accumulator, aVal2);  // accumulator += x
-
-      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
-      accumulator = _mm_add_ps(accumulator, aVal3);  // accumulator += x
-
-      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
-      accumulator = _mm_add_ps(accumulator, aVal4);  // accumulator += x
-
-      cVal1 = _mm_or_ps(cVal1, cVal2);
-      cVal3 = _mm_or_ps(cVal3, cVal4);
-      cVal1 = _mm_or_ps(cVal1, cVal3);
-
-      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
-    }
-    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
-    newMean = meanBuffer[0];
-    newMean += meanBuffer[1];
-    newMean += meanBuffer[2];
-    newMean += meanBuffer[3];
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-  
-    number = sixteenthPoints * 16;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
-    }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the standard deviation and mean of the input buffer
-  \param stddev The calculated standard deviation
-  \param mean The mean of the input buffer
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param num_points The number of values in input buffer to used in the stddev and mean calculations
-*/
-static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* aPtr = inputBuffer;
-    float meanBuffer[4] __attribute__((aligned(128)));
-    float squareBuffer[4] __attribute__((aligned(128)));
-
-    __m128 accumulator = _mm_setzero_ps();
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal = _mm_setzero_ps();
-    for(;number < quarterPoints; number++) {
-      aVal = _mm_load_ps(aPtr);                     // aVal = x
-      accumulator = _mm_add_ps(accumulator, aVal);  // accumulator += x
-      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
-      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
-      aPtr += 4;
-    }
-    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
-    newMean = meanBuffer[0];
-    newMean += meanBuffer[1];
-    newMean += meanBuffer[2];
-    newMean += meanBuffer[3];
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-  
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
-    }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the standard deviation and mean of the input buffer
-  \param stddev The calculated standard deviation
-  \param mean The mean of the input buffer
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param num_points The number of values in input buffer to used in the stddev and mean calculations
-*/
-static inline void volk_32f_stddev_and_mean_32f_32f_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    const float* aPtr = inputBuffer;
-    unsigned int number = 0;
-    
-    for(number = 0; number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
-    }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h
new file mode 100644
index 0000000000..2780898412
--- /dev/null
+++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h
@@ -0,0 +1,169 @@
+#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H
+#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation
+  \param mean The mean of the input buffer
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* aPtr = inputBuffer;
+    float meanBuffer[4] __attribute__((aligned(128)));
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal1, aVal2, aVal3, aVal4;
+    __m128 cVal1, cVal2, cVal3, cVal4;
+    for(;number < sixteenthPoints; number++) {
+      aVal1 = _mm_load_ps(aPtr); aPtr += 4;   
+      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+      accumulator = _mm_add_ps(accumulator, aVal1);  // accumulator += x
+
+      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+      accumulator = _mm_add_ps(accumulator, aVal2);  // accumulator += x
+
+      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+      accumulator = _mm_add_ps(accumulator, aVal3);  // accumulator += x
+
+      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+      accumulator = _mm_add_ps(accumulator, aVal4);  // accumulator += x
+
+      cVal1 = _mm_or_ps(cVal1, cVal2);
+      cVal3 = _mm_or_ps(cVal3, cVal4);
+      cVal1 = _mm_or_ps(cVal1, cVal3);
+
+      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
+    newMean = meanBuffer[0];
+    newMean += meanBuffer[1];
+    newMean += meanBuffer[2];
+    newMean += meanBuffer[3];
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+  
+    number = sixteenthPoints * 16;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation
+  \param mean The mean of the input buffer
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* aPtr = inputBuffer;
+    float meanBuffer[4] __attribute__((aligned(128)));
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal = _mm_setzero_ps();
+    for(;number < quarterPoints; number++) {
+      aVal = _mm_load_ps(aPtr);                     // aVal = x
+      accumulator = _mm_add_ps(accumulator, aVal);  // accumulator += x
+      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
+      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+      aPtr += 4;
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
+    newMean = meanBuffer[0];
+    newMean += meanBuffer[1];
+    newMean += meanBuffer[2];
+    newMean += meanBuffer[3];
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+  
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation
+  \param mean The mean of the input buffer
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    const float* aPtr = inputBuffer;
+    unsigned int number = 0;
+    
+    for(number = 0; number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_add_32f_a16.h b/volk/include/volk/volk_32f_x2_add_32f_a16.h
new file mode 100644
index 0000000000..d0d0e0a0e0
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_add_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_x2_add_32f_a16_H
+#define INCLUDED_volk_32f_x2_add_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_add_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+extern void volk_32f_x2_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a16.h b/volk/include/volk/volk_32f_x2_divide_32f_a16.h
new file mode 100644
index 0000000000..d844e25b0f
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_divide_32f_a16.h
@@ -0,0 +1,82 @@
+#ifndef INCLUDED_volk_32f_x2_divide_32f_a16_H
+#define INCLUDED_volk_32f_x2_divide_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divideed
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_div_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divideed
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divideed
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+extern void volk_32f_x2_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32f_x2_divide_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h
new file mode 100644
index 0000000000..61aa56815a
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H
+
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+  
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+  
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_load_ps(aPtr); 
+    bVal = _mm_load_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_add_ps(cVal, dotProdVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_load_ps(aPtr); 
+    bVal = _mm_load_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal1, bVal1, cVal1;
+  __m128 aVal2, bVal2, cVal2;
+  __m128 aVal3, bVal3, cVal3;
+  __m128 aVal4, bVal4, cVal4;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){      
+
+    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+
+    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+    
+    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+    cVal1 = _mm_or_ps(cVal1, cVal2);
+    cVal3 = _mm_or_ps(cVal3, cVal4);
+    cVal1 = _mm_or_ps(cVal1, cVal3);
+
+    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints * 16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a16_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
new file mode 100644
index 0000000000..8469a3ceac
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+  
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+  
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_loadu_ps(aPtr); 
+    bVal = _mm_loadu_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_add_ps(cVal, dotProdVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_loadu_ps(aPtr); 
+    bVal = _mm_loadu_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal1, bVal1, cVal1;
+  __m128 aVal2, bVal2, cVal2;
+  __m128 aVal3, bVal3, cVal3;
+  __m128 aVal4, bVal4, cVal4;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+      
+    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+
+    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+    
+    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+    cVal1 = _mm_or_ps(cVal1, cVal2);
+    cVal3 = _mm_or_ps(cVal3, cVal4);
+    cVal1 = _mm_or_ps(cVal1, cVal3);
+
+    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints * 16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h
new file mode 100644
index 0000000000..29c9392dfb
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a16_H
+#define INCLUDED_volk_32f_x2_interleave_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Interleaves the I & Q vector data into the complex vector
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param complexVector The complex output vector
+  \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  unsigned int number = 0;
+  float* complexVectorPtr = (float*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+
+  const uint64_t quarterPoints = num_points / 4;
+    
+  __m128 iValue, qValue, cplxValue;
+  for(;number < quarterPoints; number++){
+    iValue = _mm_load_ps(iBufferPtr);
+    qValue = _mm_load_ps(qBufferPtr);
+
+    // Interleaves the lower two values in the i and q variables into one buffer
+    cplxValue = _mm_unpacklo_ps(iValue, qValue);
+    _mm_store_ps(complexVectorPtr, cplxValue);
+    complexVectorPtr += 4;
+
+    // Interleaves the upper two values in the i and q variables into one buffer
+    cplxValue = _mm_unpackhi_ps(iValue, qValue);
+    _mm_store_ps(complexVectorPtr, cplxValue);
+    complexVectorPtr += 4;
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *complexVectorPtr++ = *iBufferPtr++;
+    *complexVectorPtr++ = *qBufferPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Interleaves the I & Q vector data into the complex vector.
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param complexVector The complex output vector
+  \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  float* complexVectorPtr = (float*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+  unsigned int number;
+
+  for(number = 0; number < num_points; number++){
+    *complexVectorPtr++ = *iBufferPtr++;
+    *complexVectorPtr++ = *qBufferPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_max_32f_a16.h b/volk/include/volk/volk_32f_x2_max_32f_a16.h
new file mode 100644
index 0000000000..26e7f1246f
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_max_32f_a16.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_x2_max_32f_a16_H
+#define INCLUDED_volk_32f_x2_max_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_max_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_x2_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_max_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_min_32f_a16.h b/volk/include/volk/volk_32f_x2_min_32f_a16.h
new file mode 100644
index 0000000000..23bae044c7
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_min_32f_a16.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_x2_min_32f_a16_H
+#define INCLUDED_volk_32f_x2_min_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_min_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+extern void volk_32f_x2_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_min_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h
new file mode 100644
index 0000000000..a0dcfa86e8
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_a16_H
+#define INCLUDED_volk_32f_x2_multiply_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_mul_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+extern void volk_32f_x2_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h
new file mode 100644
index 0000000000..30306774da
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h
@@ -0,0 +1,155 @@
+#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H
+#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+    \param iBuffer The I buffer data to be interleaved
+    \param qBuffer The Q buffer data to be interleaved
+    \param complexVector The complex output vector
+    \param scalar The scaling value being multiplied against each data point
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+
+    const unsigned int quarterPoints = num_points / 4;
+    
+    __m128 iValue, qValue, cplxValue1, cplxValue2;
+    __m128i intValue1, intValue2;
+
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+    for(;number < quarterPoints; number++){
+      iValue = _mm_load_ps(iBufferPtr);
+      qValue = _mm_load_ps(qBufferPtr);
+
+      // Interleaves the lower two values in the i and q variables into one buffer
+      cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+      cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+      // Interleaves the upper two values in the i and q variables into one buffer
+      cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+      cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+
+      intValue1 = _mm_cvtps_epi32(cplxValue1);
+      intValue2 = _mm_cvtps_epi32(cplxValue2);
+
+      intValue1 = _mm_packs_epi32(intValue1, intValue2);
+
+      _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+      complexVectorPtr += 8;
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for(; number < num_points; number++){
+      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+    }
+    
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+    \param iBuffer The I buffer data to be interleaved
+    \param qBuffer The Q buffer data to be interleaved
+    \param complexVector The complex output vector
+    \param scalar The scaling value being multiplied against each data point
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+
+    const unsigned int quarterPoints = num_points / 4;
+    
+    __m128 iValue, qValue, cplxValue;
+
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+    float floatBuffer[4] __attribute__((aligned(128)));
+
+    for(;number < quarterPoints; number++){
+      iValue = _mm_load_ps(iBufferPtr);
+      qValue = _mm_load_ps(qBufferPtr);
+
+      // Interleaves the lower two values in the i and q variables into one buffer
+      cplxValue = _mm_unpacklo_ps(iValue, qValue);
+      cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+      _mm_store_ps(floatBuffer, cplxValue);
+
+      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+      // Interleaves the upper two values in the i and q variables into one buffer
+      cplxValue = _mm_unpackhi_ps(iValue, qValue);
+      cplxValue = _mm_mul_ps(cplxValue, vScalar);
+ 
+      _mm_store_ps(floatBuffer, cplxValue);
+      
+      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for(; number < num_points; number++){
+      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+    }
+    
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+    \param iBuffer The I buffer data to be interleaved
+    \param qBuffer The Q buffer data to be interleaved
+    \param complexVector The complex output vector
+    \param scalar The scaling value being multiplied against each data point
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+  int16_t* complexVectorPtr = (int16_t*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+    *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h
new file mode 100644
index 0000000000..7404bfe799
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_x2_subtract_32f_a16_H
+#define INCLUDED_volk_32f_x2_subtract_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Subtracts bVector form aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted
+  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_sub_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Subtracts bVector form aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted
+  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Subtracts bVector form aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted
+  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+extern void volk_32f_x2_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_subtract_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h
new file mode 100644
index 0000000000..af9e39537d
--- /dev/null
+++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h
@@ -0,0 +1,151 @@
+#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H
+#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifndef MAX
+#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#endif
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+  
+  
+  float result = 0.0;
+  float fst = 0.0;
+  float sq = 0.0;
+  float thrd = 0.0;
+  float frth = 0.0;
+  //float fith = 0.0;
+  
+  
+  
+  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
+
+  xmm9 = _mm_setzero_ps();
+  xmm1 = _mm_setzero_ps();
+  
+  xmm0 = _mm_load1_ps(&center_point_array[0]);
+  xmm6 = _mm_load1_ps(&center_point_array[1]);
+  xmm7 = _mm_load1_ps(&center_point_array[2]);
+  xmm8 = _mm_load1_ps(&center_point_array[3]);
+  //xmm11 = _mm_load1_ps(&center_point_array[4]);
+  xmm10 = _mm_load1_ps(cutoff);
+  
+  int bound = num_bytes >> 4;
+  int leftovers = (num_bytes >> 2) & 3;
+  int i = 0;
+  
+  for(; i < bound; ++i) {
+    xmm2 = _mm_load_ps(src0);
+    xmm2 = _mm_max_ps(xmm10, xmm2);
+    xmm3 = _mm_mul_ps(xmm2, xmm2);
+    xmm4 = _mm_mul_ps(xmm2, xmm3);
+    xmm5 = _mm_mul_ps(xmm3, xmm3);
+    //xmm12 = _mm_mul_ps(xmm3, xmm4);
+
+    xmm2 = _mm_mul_ps(xmm2, xmm0);
+    xmm3 = _mm_mul_ps(xmm3, xmm6);
+    xmm4 = _mm_mul_ps(xmm4, xmm7);
+    xmm5 = _mm_mul_ps(xmm5, xmm8);
+    //xmm12 = _mm_mul_ps(xmm12, xmm11);
+
+    xmm2 = _mm_add_ps(xmm2, xmm3);
+    xmm3 = _mm_add_ps(xmm4, xmm5);
+    
+    src0 += 4;
+    
+    xmm9 = _mm_add_ps(xmm2, xmm9);
+    
+    xmm1 = _mm_add_ps(xmm3, xmm1);
+
+    //xmm9 = _mm_add_ps(xmm12, xmm9);
+  }
+  
+  xmm2 = _mm_hadd_ps(xmm9, xmm1);
+  xmm3 = _mm_hadd_ps(xmm2, xmm2);
+  xmm4 = _mm_hadd_ps(xmm3, xmm3);
+
+  _mm_store_ss(&result, xmm4);
+    
+  
+
+  for(i = 0; i < leftovers; ++i) {
+    fst = src0[i];
+    fst = MAX(fst, *cutoff);
+    sq = fst * fst;
+    thrd = fst * sq;
+    frth = sq * sq;
+    //fith = sq * thrd;
+    
+    result += (center_point_array[0] * fst + 
+	       center_point_array[1] * sq + 
+	       center_point_array[2] * thrd + 
+	       center_point_array[3] * frth);// + 
+	       //center_point_array[4] * fith);
+  }
+
+  result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
+
+  target[0] = result;
+}
+ 
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+
+
+    
+  float result = 0.0;
+  float fst = 0.0;
+  float sq = 0.0;
+  float thrd = 0.0;
+  float frth = 0.0;
+  //float fith = 0.0;
+  
+
+
+  int i = 0; 
+  
+  for(; i < num_bytes >> 2; ++i) {
+    fst = src0[i];
+    fst = MAX(fst, *cutoff);
+    
+    sq = fst * fst;
+    thrd = fst * sq;
+    frth = sq * sq;
+    //fith = sq * thrd;
+    
+    result += (center_point_array[0] * fst + 
+	       center_point_array[1] * sq + 
+	       center_point_array[2] * thrd + 
+	       center_point_array[3] * frth); //+
+	       //center_point_array[4] * fith);
+    /*printf("%f12...%d\n", (center_point_array[0] * fst + 
+		  center_point_array[1] * sq + 
+		  center_point_array[2] * thrd + 
+			 center_point_array[3] * frth) +
+	   //center_point_array[4] * fith) + 
+	   (center_point_array[4]), i);
+    */
+  }
+
+  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
+
+  
+  
+  *target = result;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h
deleted file mode 100644
index cd9cc81609..0000000000
--- a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h
+++ /dev/null
@@ -1,344 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H
-#define INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H
-
-#include<volk/volk_complex.h>
-#include<stdio.h>
-
-
-#if LV_HAVE_GENERIC
-
-
-static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
-  unsigned int isodd = (num_bytes >> 3) &1;
-  
-  
-  
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  int i = 0;
-
-  
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-    
-
-    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
-    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
-    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
-    
-    
-    in += 4;
-    tp += 4;
-
-  }
- 
-  
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  
-  
-  
-  for(i = 0; i < isodd; ++i) {
-
-
-    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
-
-  }
-  /*
-  for(i = 0; i < num_bytes >> 3; ++i) {
-    *result += input[i] * conjf(taps[i]);
-  }
-  */
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE && LV_HAVE_64
-
-
-static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-  
-
-
-
-  asm volatile 
-    (
-     "#  ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
-     "#                         const float *taps, unsigned num_bytes)\n\t"
-     "#    float sum0 = 0;\n\t"
-     "#    float sum1 = 0;\n\t"
-     "#    float sum2 = 0;\n\t"
-     "#    float sum3 = 0;\n\t"
-     "#    do {\n\t"
-     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
-     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
-     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
-     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
-     "#      input += 4;\n\t"
-     "#      taps += 4;  \n\t"
-     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
-     "#    result[0] = sum0 + sum2;\n\t"
-     "#    result[1] = sum1 + sum3;\n\t"
-     "# TODO: prefetch and better scheduling\n\t"
-     "  xor    %%r9,  %%r9\n\t"
-     "  xor    %%r10, %%r10\n\t"
-     "  movq   %[conjugator], %%r9\n\t"
-     "  movq   %%rcx, %%rax\n\t"
-     "  movaps 0(%%r9), %%xmm8\n\t"
-     "  movq   %%rcx, %%r8\n\t"
-     "  movq   %[rsi],  %%r9\n\t"
-     "  movq   %[rdx], %%r10\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%%r9), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movups	0(%%r10), %%xmm2\n\t"
-     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
-     "  shr     $4, %%r8\n\t"
-     "  xorps  %%xmm8, %%xmm2\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%%r9), %%xmmA\n\t"
-     "#	movaps	(%%r10), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%%r9), %%xmm1\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	16(%%r10), %%xmm3\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "  xorps   %%xmm8, %%xmm3\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%%r9), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	add	$32, %%r9\n\t"
-     "	movaps	32(%%r10), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "	add	$32, %%r10\n\t"
-     "  xorps   %%xmm8, %%xmm2\n\t"
-     ".%=L1_test:\n\t"
-     "	dec	%%rax\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	and	$1, %%r8\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "	mov	$0x80000000, %%r9\n\t"
-     "	movd	%%r9, %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
-     :
-     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
-     :"rax", "r8", "r9", "r10"
-     );
-  
-  
-  int getem = num_bytes % 16;
-  
-  
-  for(; getem > 0; getem -= 8) {
-  
-    
-    *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
-  
-  }
-
-  return;
-}  
-#endif
-
-#if LV_HAVE_SSE && LV_HAVE_32
-static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-
-  int bound = num_bytes >> 4;
-  int leftovers = num_bytes % 16;
-
-  
-  asm volatile 
-    (
-     "	#pushl	%%ebp\n\t"
-     "	#movl	%%esp, %%ebp\n\t"
-     "	#movl	12(%%ebp), %%eax		# input\n\t"
-     "	#movl	16(%%ebp), %%edx		# taps\n\t"
-     "	#movl	20(%%ebp), %%ecx                # n_bytes\n\t"
-     "  movaps  0(%[conjugator]), %%xmm1\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%[eax]), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movaps	0(%[edx]), %%xmm2\n\t"
-     "  movl    %[ecx], (%[out])\n\t"
-     "	shrl	$5, %[ecx]		# ecx = n_2_ccomplex_blocks / 2\n\t"
-     
-     "  xorps   %%xmm1, %%xmm2\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%[eax]), %%xmmA\n\t"
-     "#	movaps	(%[edx]), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%[edx]), %%xmm3\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "  xorps   %%xmm1, %%xmm3\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	movaps	16(%[eax]), %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "  movaps  0(%[conjugator]), %%xmm1\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%[eax]), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	addl	$32, %[eax]\n\t"
-     "	movaps	32(%[edx]), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "  xorps   %%xmm1, %%xmm2\n\t"
-     "	addl	$32, %[edx]\n\t"
-     ".%=L1_test:\n\t"
-     "	decl	%[ecx]\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	movl	0(%[out]), %[ecx]		# n_2_ccomplex_blocks\n\t"
-     "  shrl    $4, %[ecx]\n\t"
-     "	andl	$1, %[ecx]\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "  #movl 8(%%ebp), %[eax] \n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "  movl	$0x80000000, (%[out])\n\t"
-     "	movss	(%[out]), %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	#movl	8(%%ebp), %[eax]		# @result\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%[out])		# store low 2x32 bits (complex) to memory\n\t"
-     "	#popl	%%ebp\n\t"
-     :
-     : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
-     );
-
-  
-  
-  
-  printf("%d, %d\n", leftovers, bound);
-  
-  for(; leftovers > 0; leftovers -= 8) {
-    
-    
-    *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
-    
-  }
-  
-  return;
-  
-  
-  
-
-  
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-
-
-#endif /*INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h
deleted file mode 100644
index 2ccfcf2f27..0000000000
--- a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h
+++ /dev/null
@@ -1,468 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H
-#define INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H
-
-#include <volk/volk_complex.h>
-#include <stdio.h>
-#include <string.h>
-
-
-#if LV_HAVE_GENERIC 
-
-
-static inline void volk_32fc_32fc_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
-  unsigned int isodd = (num_bytes >> 3) &1;
-  
-  
-  
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  int i = 0;
-
-  
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-    
-
-    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
-    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
-    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
-    
-    
-    in += 4;
-    tp += 4;
-
-  }
-
-  
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  
-  
-  
-  for(i = 0; i < isodd; ++i) {
-
-
-    *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
-
-  }
-
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE && LV_HAVE_64
-
-
-static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-
-  asm 
-    (
-     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
-     "#                         const float *taps, unsigned num_bytes)\n\t"
-     "#    float sum0 = 0;\n\t"
-     "#    float sum1 = 0;\n\t"
-     "#    float sum2 = 0;\n\t"
-     "#    float sum3 = 0;\n\t"
-     "#    do {\n\t"
-     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
-     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
-     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
-     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
-     "#      input += 4;\n\t"
-     "#      taps += 4;  \n\t"
-     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
-     "#    result[0] = sum0 + sum2;\n\t"
-     "#    result[1] = sum1 + sum3;\n\t"
-     "# TODO: prefetch and better scheduling\n\t"
-     "  xor    %%r9,  %%r9\n\t"
-     "  xor    %%r10, %%r10\n\t"
-     "  movq   %%rcx, %%rax\n\t"
-     "  movq   %%rcx, %%r8\n\t"
-     "  movq   %[rsi],  %%r9\n\t"
-     "  movq   %[rdx], %%r10\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%%r9), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movaps	0(%%r10), %%xmm2\n\t"
-     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
-     "  shr     $4, %%r8\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%%r9), %%xmmA\n\t"
-     "#	movaps	(%%r10), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%%r9), %%xmm1\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	16(%%r10), %%xmm3\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%%r9), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	add	$32, %%r9\n\t"
-     "	movaps	32(%%r10), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "	add	$32, %%r10\n\t"
-     ".%=L1_test:\n\t"
-     "	dec	%%rax\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	and	$1, %%r8\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "	mov	$0x80000000, %%r9\n\t"
-     "	movd	%%r9, %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
-     :
-     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
-     :"rax", "r8", "r9", "r10"
-     );
-  
-  
-  int getem = num_bytes % 16;
-  
-  
-  for(; getem > 0; getem -= 8) {
-  
-    
-    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-  
-  }
-
-  return;
-  
-}
-
-#endif
-
-#if LV_HAVE_SSE && LV_HAVE_32
-
-static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  asm volatile 
-    (
-     "	#pushl	%%ebp\n\t"
-     "	#movl	%%esp, %%ebp\n\t"
-     "	movl	12(%%ebp), %%eax		# input\n\t"
-     "	movl	16(%%ebp), %%edx		# taps\n\t"
-     "	movl	20(%%ebp), %%ecx                # n_bytes\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%%eax), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movaps	0(%%edx), %%xmm2\n\t"
-     "	shrl	$5, %%ecx		# ecx = n_2_ccomplex_blocks / 2\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%%eax), %%xmmA\n\t"
-     "#	movaps	(%%edx), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%%eax), %%xmm1\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	16(%%edx), %%xmm3\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%%eax), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	addl	$32, %%eax\n\t"
-     "	movaps	32(%%edx), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "	addl	$32, %%edx\n\t"
-     ".%=L1_test:\n\t"
-     "	decl	%%ecx\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	movl	20(%%ebp), %%ecx		# n_2_ccomplex_blocks\n\t"
-     "  shrl    $4, %%ecx\n\t"
-     "	andl	$1, %%ecx\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "  movl 8(%%ebp), %%eax \n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "  movl	$0x80000000, (%%eax)\n\t"
-     "	movss	(%%eax), %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	#movl	8(%%ebp), %%eax		# @result\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%%eax)		# store low 2x32 bits (complex) to memory\n\t"
-     "	#popl	%%ebp\n\t"
-     :
-     :
-     : "eax", "ecx", "edx"
-     );
-
-  
-  int getem = num_bytes % 16;
-  
-  for(; getem > 0; getem -= 8) {
-    
-    
-    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-    
-  }
-  
-  return;
-  
-  
-  
-
-  
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-#if LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-
-  lv_32fc_t dotProduct;
-  memset(&dotProduct, 0x0, 2*sizeof(float));
-
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_bytes >> 4;
-
-  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
-
-  const lv_32fc_t* a = input;
-  const lv_32fc_t* b = taps;
-
-  dotProdVal = _mm_setzero_ps();
-
-  for(;number < halfPoints; number++){
-      
-    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-      
-    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-      
-    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-      
-    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-      
-    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-      
-    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
-    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
-
-    a += 2;
-    b += 2;
-  }
-
-  lv_32fc_t dotProductVector[2] __attribute__((aligned(16)));
-
-  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
-
-  if((num_bytes >> 2) != 0) {
-    dotProduct += (*a) * (*b);
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  volk_32fc_32fc_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes);
-  // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
-   /* 
-    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
-    float *p_input, *p_taps;
-    __m64 *p_result;
-
-    p_result = (__m64*)result;
-    p_input = (float*)input;
-    p_taps = (float*)taps;
-
-    static const __m128i neg = {0x000000000000000080000000};
-
-    int i = 0;
-  
-    int bound = (num_bytes >> 5);
-    int leftovers = (num_bytes & 24) >> 3;
-
-    real0 = _mm_sub_ps(real0, real0);
-    real1 = _mm_sub_ps(real1, real1);
-    im0 = _mm_sub_ps(im0, im0);
-    im1 = _mm_sub_ps(im1, im1);
-  
-    for(; i < bound; ++i) {
-  
-    
-    xmm0 = _mm_load_ps(p_input);
-    xmm1 = _mm_load_ps(p_taps);
-    
-    p_input += 4;
-    p_taps += 4;
-    
-    xmm2 = _mm_load_ps(p_input);
-    xmm3 = _mm_load_ps(p_taps);
-    
-    p_input += 4;
-    p_taps += 4;
-    
-    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
-    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
-    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
-    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
-    
-    //imaginary vector from input
-    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
-    //real vector from input
-    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
-    //imaginary vector from taps
-    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
-    //real vector from taps
-    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
-    
-    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
-    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
-    
-    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
-    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
-    
-    real0 = _mm_add_ps(xmm4, real0);
-    real1 = _mm_add_ps(xmm5, real1);
-    im0 = _mm_add_ps(xmm6, im0);
-    im1 = _mm_add_ps(xmm7, im1);
-    
-    }
-
-
-    
-    
-    real1 = _mm_xor_ps(real1, (__m128)neg);
-    
-  
-    im0 = _mm_add_ps(im0, im1);
-    real0 = _mm_add_ps(real0, real1);
-  
-    im0 = _mm_add_ps(im0, real0);
-  
-    _mm_storel_pi(p_result, im0);
-  
-    for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
-    
-    *result += input[i] * taps[i];
-    }
-  */
-}  
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h
deleted file mode 100644
index 59259882ca..0000000000
--- a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h
+++ /dev/null
@@ -1,95 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H
-#define INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_32fc_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    __m128 x, y, yl, yh, z, tmp1, tmp2;
-    lv_32fc_t* c = cVector;
-    const lv_32fc_t* a = aVector;
-    const lv_32fc_t* b = bVector;
-
-    for(;number < halfPoints; number++){
-      
-      x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-      y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-      
-      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-      
-      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-      
-      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-      
-      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-      
-      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-    
-      _mm_store_ps((float*)c,z); // Store the results back into the C container
-
-      a += 2;
-      b += 2;
-      c += 2;
-    }
-
-    if((num_points % 2) != 0) {
-      *c = (*a) * (*b);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_32fc_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-    lv_32fc_t* cPtr = cVector;
-    const lv_32fc_t* aPtr = aVector;
-    const lv_32fc_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-extern void volk_32fc_32fc_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points);
-static inline void volk_32fc_32fc_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-    static const float mask = -0.0;
-    volk_32fc_32fc_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-
-
-#endif /* INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H */
diff --git a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h
deleted file mode 100644
index 14f5116970..0000000000
--- a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h
+++ /dev/null
@@ -1,126 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H
-#define INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H
-
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
-#include <string.h>
-
-#if LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
-  
-
-  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-
-  lv_32fc_t diff;
-  memset(&diff, 0x0, 2*sizeof(float));
-
-  float sq_dist = 0.0;
-  int bound = num_bytes >> 5;
-  int leftovers0 = (num_bytes >> 4) & 1;
-  int leftovers1 = (num_bytes >> 3) & 1;
-  int i = 0;
-  
-  
-  
-  xmm1 = _mm_setzero_ps();
-  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
-  xmm2 = _mm_load_ps((float*)&points[0]);
-  xmm8 = _mm_load1_ps(&scalar);
-  xmm1 = _mm_movelh_ps(xmm1, xmm1);
-  xmm3 = _mm_load_ps((float*)&points[2]);
-  
-  
-  for(; i < bound - 1; ++i) {
-  
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    xmm5 = _mm_sub_ps(xmm1, xmm3);
-    points += 4;
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-    xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_hadd_ps(xmm6, xmm7);
-
-    xmm3 = _mm_load_ps((float*)&points[2]);
-    
-    xmm4 = _mm_mul_ps(xmm4, xmm8);
-
-    _mm_store_ps(target, xmm4);
-
-    target += 4;
-
-  }
-  
-  xmm4 = _mm_sub_ps(xmm1, xmm2);
-  xmm5 = _mm_sub_ps(xmm1, xmm3);
-  
-  
-
-  points += 4;
-  xmm6 = _mm_mul_ps(xmm4, xmm4);
-  xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-  xmm4 = _mm_hadd_ps(xmm6, xmm7);
-  
-  xmm4 = _mm_mul_ps(xmm4, xmm8);
-   
-  _mm_store_ps(target, xmm4);
-  
-  target += 4;
-  
-
-  for(i = 0; i < leftovers0; ++i) {
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    
-    points += 2;
-    
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-
-    xmm4 = _mm_hadd_ps(xmm6, xmm6);
-
-    xmm4 = _mm_mul_ps(xmm4, xmm8);
-    
-    _mm_storeh_pi((__m64*)target, xmm4);
-
-    target += 2;
-  }
-
-  for(i = 0; i < leftovers1; ++i) {
-    
-    diff = src0[0] - points[0];
-
-    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
-
-    target[0] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_GENERIC
-static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
-  lv_32fc_t diff;
-  float sq_dist;
-  int i = 0; 
-  
-  for(; i < num_bytes >> 3; ++i) {
-    diff = src0[0] - points[i];
-
-    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
-    
-    target[i] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h
deleted file mode 100644
index b6c72adbf2..0000000000
--- a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H
-#define INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H
-
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
-
-#if LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void volk_32fc_32fc_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
-  
-
-  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
-  lv_32fc_t diff;
-  float sq_dist;
-  int bound = num_bytes >> 5;
-  int leftovers0 = (num_bytes >> 4) & 1;
-  int leftovers1 = (num_bytes >> 3) & 1;
-  int i = 0;
-
-  xmm1 = _mm_setzero_ps();
-  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
-  xmm2 = _mm_load_ps((float*)&points[0]);
-  xmm1 = _mm_movelh_ps(xmm1, xmm1);
-  xmm3 = _mm_load_ps((float*)&points[2]);
-  
-
-  for(; i < bound - 1; ++i) {
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    xmm5 = _mm_sub_ps(xmm1, xmm3);
-    points += 4;
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-    xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_hadd_ps(xmm6, xmm7);
-
-    xmm3 = _mm_load_ps((float*)&points[2]);
-
-    _mm_store_ps(target, xmm4);
-
-    target += 4;
-
-  }
-  
-  xmm4 = _mm_sub_ps(xmm1, xmm2);
-  xmm5 = _mm_sub_ps(xmm1, xmm3);
-  
-  
-
-  points += 4;
-  xmm6 = _mm_mul_ps(xmm4, xmm4);
-  xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-  xmm4 = _mm_hadd_ps(xmm6, xmm7);
-   
-  _mm_store_ps(target, xmm4);
-  
-  target += 4;
-
-  for(i = 0; i < leftovers0; ++i) {
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    
-    points += 2;
-    
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-
-    xmm4 = _mm_hadd_ps(xmm6, xmm6);
-    
-    _mm_storeh_pi((__m64*)target, xmm4);
-
-    target += 2;
-  }
-
-  for(i = 0; i < leftovers1; ++i) {
-    
-    diff = src0[0] - points[0];
-
-    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
-
-    target[0] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_GENERIC
-static inline void volk_32fc_32fc_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
-  lv_32fc_t diff;
-  float sq_dist;
-  int i = 0; 
-  
-  for(; i < num_bytes >> 3; ++i) {
-    diff = src0[0] - points[i];
-
-    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
-    
-    target[i] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H*/
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h
deleted file mode 100644
index 3ee579c2ef..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H
-#define INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 cplxValue1, cplxValue2, iValue, qValue;
-  for(;number < quarterPoints; number++){
-      
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    _mm_store_ps(iBufferPtr, iValue);
-    _mm_store_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h
new file mode 100644
index 0000000000..84d2576edc
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H
+#define INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 cplxValue1, cplxValue2, iValue, qValue;
+  for(;number < quarterPoints; number++){
+      
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    _mm_store_ps(iBufferPtr, iValue);
+    _mm_store_ps(qBufferPtr, qValue);
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h
deleted file mode 100644
index 404defc367..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H
-#define INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_64f_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    double* iBufferPtr = iBuffer;
-    double* qBufferPtr = qBuffer;
-
-    const unsigned int halfPoints = num_points / 2;    
-    __m128 cplxValue, fVal;
-    __m128d dVal;
-
-    for(;number < halfPoints; number++){
-      
-      cplxValue = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      // Arrange in i1i2i1i2 format
-      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
-      dVal = _mm_cvtps_pd(fVal); 
-      _mm_store_pd(iBufferPtr, dVal);
-
-      // Arrange in q1q2q1q2 format
-      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
-      dVal = _mm_cvtps_pd(fVal); 
-      _mm_store_pd(qBufferPtr, dVal);
-
-      iBufferPtr += 2;
-      qBufferPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(; number < num_points; number++){
-      *iBufferPtr++ = *complexVectorPtr++;
-      *qBufferPtr++ = *complexVectorPtr++;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_64f_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const float* complexVectorPtr = (float*)complexVector;
-  double* iBufferPtr = iBuffer;
-  double* qBufferPtr = qBuffer;
-
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    *qBufferPtr++ = (double)*complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h
new file mode 100644
index 0000000000..34262a7af7
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h
@@ -0,0 +1,78 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int halfPoints = num_points / 2;    
+    __m128 cplxValue, fVal;
+    __m128d dVal;
+
+    for(;number < halfPoints; number++){
+      
+      cplxValue = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i1i2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+      dVal = _mm_cvtps_pd(fVal); 
+      _mm_store_pd(iBufferPtr, dVal);
+
+      // Arrange in q1q2q1q2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+      dVal = _mm_cvtps_pd(fVal); 
+      _mm_store_pd(qBufferPtr, dVal);
+
+      iBufferPtr += 2;
+      qBufferPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for(; number < num_points; number++){
+      *iBufferPtr++ = *complexVectorPtr++;
+      *qBufferPtr++ = *complexVectorPtr++;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector;
+  double* iBufferPtr = iBuffer;
+  double* qBufferPtr = qBuffer;
+
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (double)*complexVectorPtr++;
+    *qBufferPtr++ = (double)*complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h
new file mode 100644
index 0000000000..6042e6d62d
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h
@@ -0,0 +1,80 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_real_16i_a16_H
+#define INCLUDED_volk_32fc_deinterleave_real_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
+  \param complexVector The complex input vector
+  \param scalar The value to be multiply against each of the input values
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 cplxValue1, cplxValue2, iValue;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+    iValue = _mm_mul_ps(iValue, vScalar);
+
+    _mm_store_ps(floatBuffer, iValue);
+    *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  iBufferPtr = &iBuffer[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
+  \param complexVector The complex input vector
+  \param scalar The value to be multiply against each of the input values
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+    complexVectorPtr++;
+  }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_16i_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h
deleted file mode 100644
index 53235e5f79..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef INCLUDED_volk_32fc_deinterleave_real_16s_a16_H
-#define INCLUDED_volk_32fc_deinterleave_real_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
-  \param complexVector The complex input vector
-  \param scalar The value to be multiply against each of the input values
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_16s_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-
-  __m128 cplxValue1, cplxValue2, iValue;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-
-    iValue = _mm_mul_ps(iValue, vScalar);
-
-    _mm_store_ps(floatBuffer, iValue);
-    *iBufferPtr++ = (int16_t)(floatBuffer[0]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[1]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[2]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  iBufferPtr = &iBuffer[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
-  \param complexVector The complex input vector
-  \param scalar The value to be multiply against each of the input values
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
-
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_deinterleave_real_16s_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h
new file mode 100644
index 0000000000..5303596003
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H
+#define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result);
+
+    result = _mm_mul_ps(result, vScalar);
+
+    _mm_store_ps(floatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  for(; number < num_points; number++){
+    float val1Real = *complexVectorPtr++;
+    float val1Imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result);
+
+    result = _mm_mul_ps(result, vScalar);
+
+    _mm_store_ps(floatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  for(; number < num_points; number++){
+    float val1Real = *complexVectorPtr++;
+    float val1Imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+extern void volk_32fc_s32f_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_32fc_s32f_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_32fc_s32f_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h
deleted file mode 100644
index dc3c6741a0..0000000000
--- a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H
-#define INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_32fc_s32f_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (const float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-
-  __m128 cplxValue1, cplxValue2, result;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result);
-
-    result = _mm_mul_ps(result, vScalar);
-
-    _mm_store_ps(floatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_32fc_s32f_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (const float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-
-  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
-    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result);
-
-    result = _mm_mul_ps(result, vScalar);
-
-    _mm_store_ps(floatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_32fc_s32f_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_32fc_s32f_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_32fc_s32f_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-    volk_32fc_s32f_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h
deleted file mode 100644
index 29ccdaef70..0000000000
--- a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h
+++ /dev/null
@@ -1,134 +0,0 @@
-#ifndef INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H
-#define INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-
-#if LV_HAVE_LIB_SIMDMATH
-#include <simdmath.h>
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-/*!
-  \brief Calculates the log10 power value divided by the RBW for each input point
-  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
-  \param complexFFTInput The complex data output from the FFT point
-  \param normalizationFactor This value is divided against all the input values before the power is calculated
-  \param rbw The resolution bandwith of the fft spectrum
-  \param num_points The number of fft data points
-*/
-static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
-  const float* inputPtr = (const float*)complexFFTInput;
-  float* destPtr = logPowerOutput;
-  uint64_t number = 0;
-  const float iRBW = 1.0 / rbw;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
-
-#if LV_HAVE_LIB_SIMDMATH
-  __m128 magScalar = _mm_set_ps1(10.0);
-  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
-
-  __m128 invRBW = _mm_set_ps1(iRBW);
-  
-  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
-
-  __m128 power;
-  __m128 input1, input2;
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-    // Load the complex values 
-    input1 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
-    input2 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
-    
-    // Apply the normalization factor
-    input1 = _mm_mul_ps(input1, invNormalizationFactor);
-    input2 = _mm_mul_ps(input2, invNormalizationFactor);
-
-    // Multiply each value by itself
-    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
-    input1 = _mm_mul_ps(input1, input1);
-    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
-    input2 = _mm_mul_ps(input2, input2);
-    
-    // Horizontal add, to add (r*r) + (i*i) for each complex value
-    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
-    power = _mm_hadd_ps(input1, input2);
-
-    // Divide by the rbw
-    power = _mm_mul_ps(power, invRBW);
-
-    // Calculate the natural log power
-    power = logf4(power);
-    
-    // Convert to log10 and multiply by 10.0
-    power = _mm_mul_ps(power, magScalar);
-    
-    // Store the floating point results
-    _mm_store_ps(destPtr, power);
-    
-    destPtr += 4;
-  }
-  
-  number = quarterPoints*4;  
-#endif /* LV_HAVE_LIB_SIMDMATH */
-  // Calculate the FFT for any remaining points
-  for(; number < num_points; number++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-    
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
-    destPtr++;
-  }
-  
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the log10 power value divided by the RBW for each input point
-  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
-  \param complexFFTInput The complex data output from the FFT point
-  \param normalizationFactor This value is divided against all the input values before the power is calculated
-  \param rbw The resolution bandwith of the fft spectrum
-  \param num_points The number of fft data points
-*/
-static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
-  // Calculate the Power of the complex point
-  const float* inputPtr = (float*)complexFFTInput;
-  float* realFFTDataPointsPtr = logPowerOutput;
-  unsigned int point;
-  const float invRBW = 1.0 / rbw;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
-
-  for(point = 0; point < num_points; point++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
-    
-    realFFTDataPointsPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h
new file mode 100644
index 0000000000..0120b53076
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H
+#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided against all the input values before the power is calculated
+  \param rbw The resolution bandwith of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  const float* inputPtr = (const float*)complexFFTInput;
+  float* destPtr = logPowerOutput;
+  uint64_t number = 0;
+  const float iRBW = 1.0 / rbw;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 magScalar = _mm_set_ps1(10.0);
+  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+  __m128 invRBW = _mm_set_ps1(iRBW);
+  
+  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+  __m128 power;
+  __m128 input1, input2;
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the complex values 
+    input1 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    input2 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    
+    // Apply the normalization factor
+    input1 = _mm_mul_ps(input1, invNormalizationFactor);
+    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+    // Multiply each value by itself
+    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+    input1 = _mm_mul_ps(input1, input1);
+    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+    input2 = _mm_mul_ps(input2, input2);
+    
+    // Horizontal add, to add (r*r) + (i*i) for each complex value
+    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+    power = _mm_hadd_ps(input1, input2);
+
+    // Divide by the rbw
+    power = _mm_mul_ps(power, invRBW);
+
+    // Calculate the natural log power
+    power = logf4(power);
+    
+    // Convert to log10 and multiply by 10.0
+    power = _mm_mul_ps(power, magScalar);
+    
+    // Store the floating point results
+    _mm_store_ps(destPtr, power);
+    
+    destPtr += 4;
+  }
+  
+  number = quarterPoints*4;  
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the FFT for any remaining points
+  for(; number < num_points; number++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+    
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+    destPtr++;
+  }
+  
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided against all the input values before the power is calculated
+  \param rbw The resolution bandwith of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  // Calculate the Power of the complex point
+  const float* inputPtr = (float*)complexFFTInput;
+  float* realFFTDataPointsPtr = logPowerOutput;
+  unsigned int point;
+  const float invRBW = 1.0 / rbw;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+  for(point = 0; point < num_points; point++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
+    
+    realFFTDataPointsPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h
new file mode 100644
index 0000000000..a01971df34
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h
@@ -0,0 +1,344 @@
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H
+
+#include<volk/volk_complex.h>
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  float * res = (float*) result;
+  float * in = (float*) input;
+  float * tp = (float*) taps;
+  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+  unsigned int isodd = (num_bytes >> 3) &1;
+  
+  
+  
+  float sum0[2] = {0,0};
+  float sum1[2] = {0,0};
+  int i = 0;
+
+  
+  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+    
+
+    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+    
+    
+    in += 4;
+    tp += 4;
+
+  }
+ 
+  
+  res[0] = sum0[0] + sum1[0];
+  res[1] = sum0[1] + sum1[1];
+  
+  
+  
+  for(i = 0; i < isodd; ++i) {
+
+
+    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+
+  }
+  /*
+  for(i = 0; i < num_bytes >> 3; ++i) {
+    *result += input[i] * conjf(taps[i]);
+  }
+  */
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+  
+
+
+
+  asm volatile 
+    (
+     "#  ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+     "#                         const float *taps, unsigned num_bytes)\n\t"
+     "#    float sum0 = 0;\n\t"
+     "#    float sum1 = 0;\n\t"
+     "#    float sum2 = 0;\n\t"
+     "#    float sum3 = 0;\n\t"
+     "#    do {\n\t"
+     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+     "#      input += 4;\n\t"
+     "#      taps += 4;  \n\t"
+     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+     "#    result[0] = sum0 + sum2;\n\t"
+     "#    result[1] = sum1 + sum3;\n\t"
+     "# TODO: prefetch and better scheduling\n\t"
+     "  xor    %%r9,  %%r9\n\t"
+     "  xor    %%r10, %%r10\n\t"
+     "  movq   %[conjugator], %%r9\n\t"
+     "  movq   %%rcx, %%rax\n\t"
+     "  movaps 0(%%r9), %%xmm8\n\t"
+     "  movq   %%rcx, %%r8\n\t"
+     "  movq   %[rsi],  %%r9\n\t"
+     "  movq   %[rdx], %%r10\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%r9), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movups	0(%%r10), %%xmm2\n\t"
+     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
+     "  shr     $4, %%r8\n\t"
+     "  xorps  %%xmm8, %%xmm2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%r9), %%xmmA\n\t"
+     "#	movaps	(%%r10), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%r9), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%r10), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "  xorps   %%xmm8, %%xmm3\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%r9), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	add	$32, %%r9\n\t"
+     "	movaps	32(%%r10), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	add	$32, %%r10\n\t"
+     "  xorps   %%xmm8, %%xmm2\n\t"
+     ".%=L1_test:\n\t"
+     "	dec	%%rax\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	and	$1, %%r8\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "	mov	$0x80000000, %%r9\n\t"
+     "	movd	%%r9, %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
+     :
+     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
+     :"rax", "r8", "r9", "r10"
+     );
+  
+  
+  int getem = num_bytes % 16;
+  
+  
+  for(; getem > 0; getem -= 8) {
+  
+    
+    *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+  
+  }
+
+  return;
+}  
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+  int bound = num_bytes >> 4;
+  int leftovers = num_bytes % 16;
+
+  
+  asm volatile 
+    (
+     "	#pushl	%%ebp\n\t"
+     "	#movl	%%esp, %%ebp\n\t"
+     "	#movl	12(%%ebp), %%eax		# input\n\t"
+     "	#movl	16(%%ebp), %%edx		# taps\n\t"
+     "	#movl	20(%%ebp), %%ecx                # n_bytes\n\t"
+     "  movaps  0(%[conjugator]), %%xmm1\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%[eax]), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%[edx]), %%xmm2\n\t"
+     "  movl    %[ecx], (%[out])\n\t"
+     "	shrl	$5, %[ecx]		# ecx = n_2_ccomplex_blocks / 2\n\t"
+     
+     "  xorps   %%xmm1, %%xmm2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%[eax]), %%xmmA\n\t"
+     "#	movaps	(%[edx]), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%[edx]), %%xmm3\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "  xorps   %%xmm1, %%xmm3\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	movaps	16(%[eax]), %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "  movaps  0(%[conjugator]), %%xmm1\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%[eax]), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	addl	$32, %[eax]\n\t"
+     "	movaps	32(%[edx]), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "  xorps   %%xmm1, %%xmm2\n\t"
+     "	addl	$32, %[edx]\n\t"
+     ".%=L1_test:\n\t"
+     "	decl	%[ecx]\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	movl	0(%[out]), %[ecx]		# n_2_ccomplex_blocks\n\t"
+     "  shrl    $4, %[ecx]\n\t"
+     "	andl	$1, %[ecx]\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "  #movl 8(%%ebp), %[eax] \n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "  movl	$0x80000000, (%[out])\n\t"
+     "	movss	(%[out]), %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	#movl	8(%%ebp), %[eax]		# @result\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[out])		# store low 2x32 bits (complex) to memory\n\t"
+     "	#popl	%%ebp\n\t"
+     :
+     : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
+     );
+
+  
+  
+  
+  printf("%d, %d\n", leftovers, bound);
+  
+  for(; leftovers > 0; leftovers -= 8) {
+    
+    
+    *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+    
+  }
+  
+  return;
+  
+  
+  
+
+  
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H*/
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h
new file mode 100644
index 0000000000..9a7b65ab4e
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h
@@ -0,0 +1,468 @@
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#if LV_HAVE_GENERIC 
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  float * res = (float*) result;
+  float * in = (float*) input;
+  float * tp = (float*) taps;
+  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+  unsigned int isodd = (num_bytes >> 3) &1;
+  
+  
+  
+  float sum0[2] = {0,0};
+  float sum1[2] = {0,0};
+  int i = 0;
+
+  
+  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+    
+
+    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+    
+    
+    in += 4;
+    tp += 4;
+
+  }
+
+  
+  res[0] = sum0[0] + sum1[0];
+  res[1] = sum0[1] + sum1[1];
+  
+  
+  
+  for(i = 0; i < isodd; ++i) {
+
+
+    *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
+
+  }
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+
+  asm 
+    (
+     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+     "#                         const float *taps, unsigned num_bytes)\n\t"
+     "#    float sum0 = 0;\n\t"
+     "#    float sum1 = 0;\n\t"
+     "#    float sum2 = 0;\n\t"
+     "#    float sum3 = 0;\n\t"
+     "#    do {\n\t"
+     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+     "#      input += 4;\n\t"
+     "#      taps += 4;  \n\t"
+     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+     "#    result[0] = sum0 + sum2;\n\t"
+     "#    result[1] = sum1 + sum3;\n\t"
+     "# TODO: prefetch and better scheduling\n\t"
+     "  xor    %%r9,  %%r9\n\t"
+     "  xor    %%r10, %%r10\n\t"
+     "  movq   %%rcx, %%rax\n\t"
+     "  movq   %%rcx, %%r8\n\t"
+     "  movq   %[rsi],  %%r9\n\t"
+     "  movq   %[rdx], %%r10\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%r9), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%%r10), %%xmm2\n\t"
+     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
+     "  shr     $4, %%r8\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%r9), %%xmmA\n\t"
+     "#	movaps	(%%r10), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%r9), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%r10), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%r9), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	add	$32, %%r9\n\t"
+     "	movaps	32(%%r10), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	add	$32, %%r10\n\t"
+     ".%=L1_test:\n\t"
+     "	dec	%%rax\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	and	$1, %%r8\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "	mov	$0x80000000, %%r9\n\t"
+     "	movd	%%r9, %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
+     :
+     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+     :"rax", "r8", "r9", "r10"
+     );
+  
+  
+  int getem = num_bytes % 16;
+  
+  
+  for(; getem > 0; getem -= 8) {
+  
+    
+    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+  
+  }
+
+  return;
+  
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+
+static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  asm volatile 
+    (
+     "	#pushl	%%ebp\n\t"
+     "	#movl	%%esp, %%ebp\n\t"
+     "	movl	12(%%ebp), %%eax		# input\n\t"
+     "	movl	16(%%ebp), %%edx		# taps\n\t"
+     "	movl	20(%%ebp), %%ecx                # n_bytes\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%eax), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%%edx), %%xmm2\n\t"
+     "	shrl	$5, %%ecx		# ecx = n_2_ccomplex_blocks / 2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%eax), %%xmmA\n\t"
+     "#	movaps	(%%edx), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%eax), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%edx), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%eax), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	addl	$32, %%eax\n\t"
+     "	movaps	32(%%edx), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	addl	$32, %%edx\n\t"
+     ".%=L1_test:\n\t"
+     "	decl	%%ecx\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	movl	20(%%ebp), %%ecx		# n_2_ccomplex_blocks\n\t"
+     "  shrl    $4, %%ecx\n\t"
+     "	andl	$1, %%ecx\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "  movl 8(%%ebp), %%eax \n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "  movl	$0x80000000, (%%eax)\n\t"
+     "	movss	(%%eax), %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	#movl	8(%%ebp), %%eax		# @result\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%%eax)		# store low 2x32 bits (complex) to memory\n\t"
+     "	#popl	%%ebp\n\t"
+     :
+     :
+     : "eax", "ecx", "edx"
+     );
+
+  
+  int getem = num_bytes % 16;
+  
+  for(; getem > 0; getem -= 8) {
+    
+    
+    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+    
+  }
+  
+  return;
+  
+  
+  
+
+  
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+
+  lv_32fc_t dotProduct;
+  memset(&dotProduct, 0x0, 2*sizeof(float));
+
+  unsigned int number = 0;
+  const unsigned int halfPoints = num_bytes >> 4;
+
+  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+  const lv_32fc_t* a = input;
+  const lv_32fc_t* b = taps;
+
+  dotProdVal = _mm_setzero_ps();
+
+  for(;number < halfPoints; number++){
+      
+    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+      
+    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+      
+    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+    a += 2;
+    b += 2;
+  }
+
+  lv_32fc_t dotProductVector[2] __attribute__((aligned(16)));
+
+  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+
+  if((num_bytes >> 2) != 0) {
+    dotProduct += (*a) * (*b);
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  volk_32fc_x2_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes);
+  // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
+   /* 
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+    float *p_input, *p_taps;
+    __m64 *p_result;
+
+    p_result = (__m64*)result;
+    p_input = (float*)input;
+    p_taps = (float*)taps;
+
+    static const __m128i neg = {0x000000000000000080000000};
+
+    int i = 0;
+  
+    int bound = (num_bytes >> 5);
+    int leftovers = (num_bytes & 24) >> 3;
+
+    real0 = _mm_sub_ps(real0, real0);
+    real1 = _mm_sub_ps(real1, real1);
+    im0 = _mm_sub_ps(im0, im0);
+    im1 = _mm_sub_ps(im1, im1);
+  
+    for(; i < bound; ++i) {
+  
+    
+    xmm0 = _mm_load_ps(p_input);
+    xmm1 = _mm_load_ps(p_taps);
+    
+    p_input += 4;
+    p_taps += 4;
+    
+    xmm2 = _mm_load_ps(p_input);
+    xmm3 = _mm_load_ps(p_taps);
+    
+    p_input += 4;
+    p_taps += 4;
+    
+    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+    
+    //imaginary vector from input
+    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+    //real vector from input
+    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+    //imaginary vector from taps
+    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+    //real vector from taps
+    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+    
+    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+    
+    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+    
+    real0 = _mm_add_ps(xmm4, real0);
+    real1 = _mm_add_ps(xmm5, real1);
+    im0 = _mm_add_ps(xmm6, im0);
+    im1 = _mm_add_ps(xmm7, im1);
+    
+    }
+
+
+    
+    
+    real1 = _mm_xor_ps(real1, (__m128)neg);
+    
+  
+    im0 = _mm_add_ps(im0, im1);
+    real0 = _mm_add_ps(real0, real1);
+  
+    im0 = _mm_add_ps(im0, real0);
+  
+    _mm_storel_pi(p_result, im0);
+  
+    for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
+    
+    *result += input[i] * taps[i];
+    }
+  */
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H*/
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h
new file mode 100644
index 0000000000..224ab19c8e
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h
@@ -0,0 +1,95 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a16_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for(;number < halfPoints; number++){
+      
+      x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+      y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+      
+      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+      
+      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    
+      _mm_store_ps((float*)c,z); // Store the results back into the C container
+
+      a += 2;
+      b += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) {
+      *c = (*a) * (*b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points);
+static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    static const float mask = -0.0;
+    volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a16_H */
diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h
new file mode 100644
index 0000000000..6a863b16d2
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H
+#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+#include <string.h>
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+  
+
+  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+  lv_32fc_t diff;
+  memset(&diff, 0x0, 2*sizeof(float));
+
+  float sq_dist = 0.0;
+  int bound = num_bytes >> 5;
+  int leftovers0 = (num_bytes >> 4) & 1;
+  int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+  
+  
+  
+  xmm1 = _mm_setzero_ps();
+  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
+  xmm2 = _mm_load_ps((float*)&points[0]);
+  xmm8 = _mm_load1_ps(&scalar);
+  xmm1 = _mm_movelh_ps(xmm1, xmm1);
+  xmm3 = _mm_load_ps((float*)&points[2]);
+  
+  
+  for(; i < bound - 1; ++i) {
+  
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    xmm5 = _mm_sub_ps(xmm1, xmm3);
+    points += 4;
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+    xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+    xmm3 = _mm_load_ps((float*)&points[2]);
+    
+    xmm4 = _mm_mul_ps(xmm4, xmm8);
+
+    _mm_store_ps(target, xmm4);
+
+    target += 4;
+
+  }
+  
+  xmm4 = _mm_sub_ps(xmm1, xmm2);
+  xmm5 = _mm_sub_ps(xmm1, xmm3);
+  
+  
+
+  points += 4;
+  xmm6 = _mm_mul_ps(xmm4, xmm4);
+  xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+  xmm4 = _mm_hadd_ps(xmm6, xmm7);
+  
+  xmm4 = _mm_mul_ps(xmm4, xmm8);
+   
+  _mm_store_ps(target, xmm4);
+  
+  target += 4;
+  
+
+  for(i = 0; i < leftovers0; ++i) {
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    
+    points += 2;
+    
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+    xmm4 = _mm_mul_ps(xmm4, xmm8);
+    
+    _mm_storeh_pi((__m64*)target, xmm4);
+
+    target += 2;
+  }
+
+  for(i = 0; i < leftovers1; ++i) {
+    
+    diff = src0[0] - points[0];
+
+    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+
+    target[0] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+  lv_32fc_t diff;
+  float sq_dist;
+  int i = 0; 
+  
+  for(; i < num_bytes >> 3; ++i) {
+    diff = src0[0] - points[i];
+
+    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+    
+    target[i] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H*/
diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h
new file mode 100644
index 0000000000..406097fc88
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h
@@ -0,0 +1,112 @@
+#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a16_H
+#define INCLUDED_volk_32fc_x2_square_dist_32f_a16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+  
+
+  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  lv_32fc_t diff;
+  float sq_dist;
+  int bound = num_bytes >> 5;
+  int leftovers0 = (num_bytes >> 4) & 1;
+  int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+
+  xmm1 = _mm_setzero_ps();
+  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
+  xmm2 = _mm_load_ps((float*)&points[0]);
+  xmm1 = _mm_movelh_ps(xmm1, xmm1);
+  xmm3 = _mm_load_ps((float*)&points[2]);
+  
+
+  for(; i < bound - 1; ++i) {
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    xmm5 = _mm_sub_ps(xmm1, xmm3);
+    points += 4;
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+    xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+    xmm3 = _mm_load_ps((float*)&points[2]);
+
+    _mm_store_ps(target, xmm4);
+
+    target += 4;
+
+  }
+  
+  xmm4 = _mm_sub_ps(xmm1, xmm2);
+  xmm5 = _mm_sub_ps(xmm1, xmm3);
+  
+  
+
+  points += 4;
+  xmm6 = _mm_mul_ps(xmm4, xmm4);
+  xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+  xmm4 = _mm_hadd_ps(xmm6, xmm7);
+   
+  _mm_store_ps(target, xmm4);
+  
+  target += 4;
+
+  for(i = 0; i < leftovers0; ++i) {
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    
+    points += 2;
+    
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+    
+    _mm_storeh_pi((__m64*)target, xmm4);
+
+    target += 2;
+  }
+
+  for(i = 0; i < leftovers1; ++i) {
+    
+    diff = src0[0] - points[0];
+
+    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+    target[0] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+  lv_32fc_t diff;
+  float sq_dist;
+  int i = 0; 
+  
+  for(; i < num_bytes >> 3; ++i) {
+    diff = src0[0] - points[i];
+
+    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+    
+    target[i] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a16_H*/
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h
new file mode 100644
index 0000000000..0fcadd9cbf
--- /dev/null
+++ b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h
@@ -0,0 +1,73 @@
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_a16_H
+#define INCLUDED_volk_32i_s32f_convert_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+     float* outputVectorPtr = outputVector;
+     const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m128i inputVal;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+
+      // Load the 4 values
+      inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+
+      _mm_store_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+      inputPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int32_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h
new file mode 100644
index 0000000000..1dd6422f8f
--- /dev/null
+++ b/volk/include/volk/volk_32i_s32f_convert_32f_u.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
+#define INCLUDED_volk_32i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+     float* outputVectorPtr = outputVector;
+     const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m128i inputVal;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+
+      // Load the 4 values
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+      inputPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_32i_s32f_convert_32f_u_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int32_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_32i_x2_and_32i_a16.h b/volk/include/volk/volk_32i_x2_and_32i_a16.h
new file mode 100644
index 0000000000..3baa1d856d
--- /dev/null
+++ b/volk/include/volk/volk_32i_x2_and_32i_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32i_x2_and_32i_a16_H
+#define INCLUDED_volk_32i_x2_and_32i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = (float*)cVector;
+    const float* aPtr = (float*)aVector;
+    const float* bPtr = (float*)bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_and_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      cVector[number] = aVector[number] & bVector[number];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) & (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+extern void volk_32i_x2_and_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32i_x2_and_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    volk_32i_x2_and_32i_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_and_32i_a16_H */
diff --git a/volk/include/volk/volk_32i_x2_or_32i_a16.h b/volk/include/volk/volk_32i_x2_or_32i_a16.h
new file mode 100644
index 0000000000..0be22f00ad
--- /dev/null
+++ b/volk/include/volk/volk_32i_x2_or_32i_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32i_x2_or_32i_a16_H
+#define INCLUDED_volk_32i_x2_or_32i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Ors the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = (float*)cVector;
+    const float* aPtr = (float*)aVector;
+    const float* bPtr = (float*)bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_or_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      cVector[number] = aVector[number] | bVector[number];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Ors the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) | (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Ors the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+extern void volk_32i_x2_or_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32i_x2_or_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    volk_32i_x2_or_32i_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_or_32i_a16_H */
diff --git a/volk/include/volk/volk_32s_32s_and_32s_a16.h b/volk/include/volk/volk_32s_32s_and_32s_a16.h
deleted file mode 100644
index 0e8380757e..0000000000
--- a/volk/include/volk/volk_32s_32s_and_32s_a16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32s_32s_and_32s_a16_H
-#define INCLUDED_volk_32s_32s_and_32s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Ands the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors
-  \param bVector One of the vectors
-  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
-*/
-static inline void volk_32s_32s_and_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = (float*)cVector;
-    const float* aPtr = (float*)aVector;
-    const float* bPtr = (float*)bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_and_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      cVector[number] = aVector[number] & bVector[number];
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Ands the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors
-  \param bVector One of the vectors
-  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
-*/
-static inline void volk_32s_32s_and_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    int32_t* cPtr = cVector;
-    const int32_t* aPtr = aVector;
-    const int32_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) & (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Ands the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors
-  \param bVector One of the vectors
-  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
-*/
-extern void volk_32s_32s_and_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
-static inline void volk_32s_32s_and_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    volk_32s_32s_and_32s_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32s_32s_and_32s_a16_H */
diff --git a/volk/include/volk/volk_32s_32s_or_32s_a16.h b/volk/include/volk/volk_32s_32s_or_32s_a16.h
deleted file mode 100644
index 2dcf2e5515..0000000000
--- a/volk/include/volk/volk_32s_32s_or_32s_a16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32s_32s_or_32s_a16_H
-#define INCLUDED_volk_32s_32s_or_32s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Ors the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be ored
-  \param bVector One of the vectors to be ored
-  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
-*/
-static inline void volk_32s_32s_or_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = (float*)cVector;
-    const float* aPtr = (float*)aVector;
-    const float* bPtr = (float*)bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_or_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      cVector[number] = aVector[number] | bVector[number];
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Ors the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be ored
-  \param bVector One of the vectors to be ored
-  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
-*/
-static inline void volk_32s_32s_or_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    int32_t* cPtr = cVector;
-    const int32_t* aPtr = aVector;
-    const int32_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) | (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Ors the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be ored
-  \param bVector One of the vectors to be ored
-  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
-*/
-extern void volk_32s_32s_or_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
-static inline void volk_32s_32s_or_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    volk_32s_32s_or_32s_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32s_32s_or_32s_a16_H */
diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h b/volk/include/volk/volk_32s_s32f_convert_32f_a16.h
deleted file mode 100644
index c16ecc9dde..0000000000
--- a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef INCLUDED_volk_32s_s32f_convert_32f_a16_H
-#define INCLUDED_volk_32s_s32f_convert_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32s_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-     float* outputVectorPtr = outputVector;
-     const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    int32_t* inputPtr = (int32_t*)inputVector;
-    __m128i inputVal;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-
-      // Load the 4 values
-      inputVal = _mm_load_si128((__m128i*)inputPtr);
-
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-
-      _mm_store_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-      inputPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32s_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int32_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32s_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h
deleted file mode 100644
index 4eb5a5b850..0000000000
--- a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_volk_32s_s32f_convert_32f_ua16_H
-#define INCLUDED_volk_32s_s32f_convert_32f_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_32s_s32f_convert_32f_ua16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-     float* outputVectorPtr = outputVector;
-     const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    int32_t* inputPtr = (int32_t*)inputVector;
-    __m128i inputVal;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-
-      // Load the 4 values
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-      inputPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_32s_s32f_convert_32f_ua16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int32_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32s_s32f_convert_32f_ua16_H */
diff --git a/volk/include/volk/volk_64f_64f_max_64f_a16.h b/volk/include/volk/volk_64f_64f_max_64f_a16.h
deleted file mode 100644
index 7e091851f2..0000000000
--- a/volk/include/volk/volk_64f_64f_max_64f_a16.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef INCLUDED_volk_64f_64f_max_64f_a16_H
-#define INCLUDED_volk_64f_64f_max_64f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_64f_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-
-    __m128d aVal, bVal, cVal;
-    for(;number < halfPoints; number++){
-      
-      aVal = _mm_load_pd(aPtr); 
-      bVal = _mm_load_pd(bPtr);
-      
-      cVal = _mm_max_pd(aVal, bVal); 
-      
-      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 2;
-      bPtr += 2;
-      cPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(;number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_64f_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_64f_64f_max_64f_a16_H */
diff --git a/volk/include/volk/volk_64f_64f_min_64f_a16.h b/volk/include/volk/volk_64f_64f_min_64f_a16.h
deleted file mode 100644
index f2bcbe83b6..0000000000
--- a/volk/include/volk/volk_64f_64f_min_64f_a16.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef INCLUDED_volk_64f_64f_min_64f_a16_H
-#define INCLUDED_volk_64f_64f_min_64f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_64f_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-
-    __m128d aVal, bVal, cVal;
-    for(;number < halfPoints; number++){
-      
-      aVal = _mm_load_pd(aPtr); 
-      bVal = _mm_load_pd(bPtr);
-      
-      cVal = _mm_min_pd(aVal, bVal); 
-      
-      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 2;
-      bPtr += 2;
-      cPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(;number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_64f_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_64f_64f_min_64f_a16_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h
new file mode 100644
index 0000000000..6338c14335
--- /dev/null
+++ b/volk/include/volk/volk_64f_convert_32f_u.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_volk_64f_convert_32f_u_H
+#define INCLUDED_volk_64f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the double values into float values
+    \param dVector The converted float vector values
+    \param fVector The double vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const double* inputVectorPtr = (const double*)inputVector;
+  float* outputVectorPtr = outputVector;
+  __m128 ret, ret2;
+  __m128d inputVal1, inputVal2;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+    inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+ 
+    ret = _mm_cvtpd_ps(inputVal1);
+    ret2 = _mm_cvtpd_ps(inputVal2);
+
+    ret = _mm_movelh_ps(ret, ret2);
+
+    _mm_storeu_ps(outputVectorPtr, ret);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (float)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the double values into float values
+  \param dVector The converted float vector values
+  \param fVector The double vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_u_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const double* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_ua16.h b/volk/include/volk/volk_64f_convert_32f_ua16.h
deleted file mode 100644
index 7774db1b75..0000000000
--- a/volk/include/volk/volk_64f_convert_32f_ua16.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_volk_64f_convert_32f_ua16_H
-#define INCLUDED_volk_64f_convert_32f_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the double values into float values
-    \param dVector The converted float vector values
-    \param fVector The double vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_64f_convert_32f_ua16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m128 ret, ret2;
-  __m128d inputVal1, inputVal2;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
-    inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
- 
-    ret = _mm_cvtpd_ps(inputVal1);
-    ret2 = _mm_cvtpd_ps(inputVal2);
-
-    ret = _mm_movelh_ps(ret, ret2);
-
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the double values into float values
-  \param dVector The converted float vector values
-  \param fVector The double vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_64f_convert_32f_ua16_generic(float* outputVector, const double* inputVector, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const double* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_64f_convert_32f_ua16_H */
diff --git a/volk/include/volk/volk_64f_x2_max_64f_a16.h b/volk/include/volk/volk_64f_x2_max_64f_a16.h
new file mode 100644
index 0000000000..4b0c1f5f15
--- /dev/null
+++ b/volk/include/volk/volk_64f_x2_max_64f_a16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_64f_x2_max_64f_a16_H
+#define INCLUDED_volk_64f_x2_max_64f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+
+    __m128d aVal, bVal, cVal;
+    for(;number < halfPoints; number++){
+      
+      aVal = _mm_load_pd(aPtr); 
+      bVal = _mm_load_pd(bPtr);
+      
+      cVal = _mm_max_pd(aVal, bVal); 
+      
+      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 2;
+      bPtr += 2;
+      cPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for(;number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_max_64f_a16_H */
diff --git a/volk/include/volk/volk_64f_x2_min_64f_a16.h b/volk/include/volk/volk_64f_x2_min_64f_a16.h
new file mode 100644
index 0000000000..aa961e3846
--- /dev/null
+++ b/volk/include/volk/volk_64f_x2_min_64f_a16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_64f_x2_min_64f_a16_H
+#define INCLUDED_volk_64f_x2_min_64f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+
+    __m128d aVal, bVal, cVal;
+    for(;number < halfPoints; number++){
+      
+      aVal = _mm_load_pd(aPtr); 
+      bVal = _mm_load_pd(bPtr);
+      
+      cVal = _mm_min_pd(aVal, bVal); 
+      
+      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 2;
+      bPtr += 2;
+      cPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for(;number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_min_64f_a16_H */
diff --git a/volk/include/volk/volk_8i_convert_16i_a16.h b/volk/include/volk/volk_8i_convert_16i_a16.h
new file mode 100644
index 0000000000..3d70457539
--- /dev/null
+++ b/volk/include/volk/volk_8i_convert_16i_a16.h
@@ -0,0 +1,83 @@
+#ifndef INCLUDED_volk_8i_convert_16i_a16_H
+#define INCLUDED_volk_8i_convert_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m128i* outputVectorPtr = (__m128i*)outputVector;
+    __m128i inputVal;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_load_si128(inputVectorPtr);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_store_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVal = _mm_srli_si128(inputVal, 8);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_store_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVectorPtr++;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (int16_t)(inputVector[number])*256;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+  */
+extern void volk_8i_convert_16i_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
+static inline void volk_8i_convert_16i_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    volk_8i_convert_16i_a16_orc_impl(outputVector, inputVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h
new file mode 100644
index 0000000000..bcff13406b
--- /dev/null
+++ b/volk/include/volk/volk_8i_convert_16i_u.h
@@ -0,0 +1,73 @@
+#ifndef INCLUDED_volk_8i_convert_16i_u_H
+#define INCLUDED_volk_8i_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m128i* outputVectorPtr = (__m128i*)outputVector;
+    __m128i inputVal;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_loadu_si128(inputVectorPtr);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_storeu_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVal = _mm_srli_si128(inputVal, 8);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_storeu_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVectorPtr++;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (int16_t)(inputVector[number])*256;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8i_convert_16i_u_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h
new file mode 100644
index 0000000000..d5c8eeb51c
--- /dev/null
+++ b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h
@@ -0,0 +1,105 @@
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_a16_H
+#define INCLUDED_volk_8i_s32f_convert_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
+
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
+static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h
new file mode 100644
index 0000000000..1e30957e84
--- /dev/null
+++ b/volk/include/volk/volk_8i_s32f_convert_32f_u.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
+#define INCLUDED_volk_8i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1( iScalar );
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_8i_s32f_convert_32f_u_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h
new file mode 100644
index 0000000000..91c9b2c584
--- /dev/null
+++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H
+#define INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+  __m128i complexVal, iOutputVal, qOutputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+    qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);
+    iOutputVal = _mm_slli_epi16(iOutputVal, 8);
+
+    qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
+    qOutputVal = _mm_slli_epi16(qOutputVal, 8);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
+    *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H */
diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h
new file mode 100644
index 0000000000..bf3dc20dd9
--- /dev/null
+++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a16_H
+#define INCLUDED_volk_8ic_deinterleave_real_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i complexVal, outputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+    outputVal = _mm_cvtepi8_epi16(complexVal);
+    outputVal = _mm_slli_epi16(outputVal, 7);
+
+    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+    iBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a16_H */
diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h
new file mode 100644
index 0000000000..13de79423d
--- /dev/null
+++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  __m128i complexVal1, complexVal2, outputVal;
+
+  unsigned int sixteenthPoints = num_points / 16;
+
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
+
+    outputVal = _mm_or_si128(complexVal1, complexVal2);
+
+    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+    iBufferPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h
new file mode 100644
index 0000000000..22c3ebb23c
--- /dev/null
+++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h
@@ -0,0 +1,164 @@
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;    
+  __m128 iFloatValue, qFloatValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+
+  for(;number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+    qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+    _mm_store_ps(iBufferPtr, iFloatValue);
+    iBufferPtr += 4;
+
+    iComplexVal = _mm_srli_si128(iComplexVal, 4);
+
+    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+    _mm_store_ps(iBufferPtr, iFloatValue);
+    iBufferPtr += 4;
+
+    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+    qFloatValue = _mm_cvtepi32_ps(qIntVal);
+    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+    _mm_store_ps(qBufferPtr, qFloatValue);
+    qBufferPtr += 4;
+
+    qComplexVal = _mm_srli_si128(qComplexVal, 4);
+
+    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+    qFloatValue = _mm_cvtepi32_ps(qIntVal);
+    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+    _mm_store_ps(qBufferPtr, qFloatValue);
+
+    qBufferPtr += 4;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+  }
+    
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  float floatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(complexVectorPtr[0]);
+    floatBuffer[1] = (float)(complexVectorPtr[1]);
+    floatBuffer[2] = (float)(complexVectorPtr[2]);
+    floatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    floatBuffer[4] = (float)(complexVectorPtr[4]);
+    floatBuffer[5] = (float)(complexVectorPtr[5]);
+    floatBuffer[6] = (float)(complexVectorPtr[6]);
+    floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    _mm_store_ps(iBufferPtr, iValue);
+    _mm_store_ps(qBufferPtr, qValue);
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  complexVectorPtr = (int8_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  unsigned int number;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H */
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h
new file mode 100644
index 0000000000..5f1430394b
--- /dev/null
+++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h
@@ -0,0 +1,133 @@
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;    
+  __m128 iFloatValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+
+  for(;number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+    iIntVal = _mm_cvtepi8_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+
+    complexVal = _mm_srli_si128(complexVal, 4);
+    iIntVal = _mm_cvtepi8_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    complexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 iValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
+
+    iValue = _mm_load_ps(floatBuffer);
+
+    iValue = _mm_mul_ps(iValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    complexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h
new file mode 100644
index 0000000000..d9cacbf465
--- /dev/null
+++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h
@@ -0,0 +1,102 @@
+#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H
+#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128i x, y, realz, imagz;
+  lv_16sc_t* c = cVector;
+  const lv_8sc_t* a = aVector;
+  const lv_8sc_t* b = bVector;
+  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
+    
+  for(;number < quarterPoints; number++){
+    // Convert into 8 bit values into 16 bit values
+    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
+    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
+      
+    // Calculate the ar*cr - ai*(-ci) portions
+    realz = _mm_madd_epi16(x,y);
+      
+    // Calculate the complex conjugate of the cr + ci j values
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Shift the order of the cr and ci values
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
+
+    // Calculate the ar*(-ci) + cr*(ai)
+    imagz = _mm_madd_epi16(x,y);
+
+    _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
+
+    a += 4;
+    b += 4;
+    c += 4;
+  }
+    
+  number = quarterPoints * 4;
+  int16_t* c16Ptr = (int16_t*)&cVector[number];
+  int8_t* a8Ptr = (int8_t*)&aVector[number];
+  int8_t* b8Ptr = (int8_t*)&bVector[number];
+  for(; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+
+    *c16Ptr++ = (int16_t)lv_creal(temp);
+    *c16Ptr++ = (int16_t)lv_cimag(temp);
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  int16_t* c16Ptr = (int16_t*)cVector;
+  int8_t* a8Ptr = (int8_t*)aVector;
+  int8_t* b8Ptr = (int8_t*)bVector;
+  for(number =0; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+
+    *c16Ptr++ = (int16_t)lv_creal(temp);
+    *c16Ptr++ = (int16_t)lv_cimag(temp);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H */
diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h
new file mode 100644
index 0000000000..6ec923a4fa
--- /dev/null
+++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H
+#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128i x, y, realz, imagz;
+  __m128 ret;
+  lv_32fc_t* c = cVector;
+  const lv_8sc_t* a = aVector;
+  const lv_8sc_t* b = bVector;
+  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+  for(;number < quarterPoints; number++){
+    // Convert into 8 bit values into 16 bit values
+    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
+    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
+
+    // Calculate the ar*cr - ai*(-ci) portions
+    realz = _mm_madd_epi16(x,y);
+
+    // Calculate the complex conjugate of the cr + ci j values
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Shift the order of the cr and ci values
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
+
+    // Calculate the ar*(-ci) + cr*(ai)
+    imagz = _mm_madd_epi16(x,y);
+
+    // Interleave real and imaginary and then convert to float values
+    ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
+
+    // Normalize the floating point values
+    ret = _mm_mul_ps(ret, invScalar);
+
+    // Store the floating point values
+    _mm_store_ps((float*)c, ret);
+    c += 2;
+
+    // Interleave real and imaginary and then convert to float values
+    ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
+
+    // Normalize the floating point values
+    ret = _mm_mul_ps(ret, invScalar);
+
+    // Store the floating point values
+    _mm_store_ps((float*)c, ret);
+    c += 2;
+
+    a += 4;
+    b += 4;
+  }
+
+  number = quarterPoints * 4;
+  float* cFloatPtr = (float*)&cVector[number];
+  int8_t* a8Ptr = (int8_t*)&aVector[number];
+  int8_t* b8Ptr = (int8_t*)&bVector[number];
+  for(; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+    
+    *cFloatPtr++ = lv_creal(temp) / scalar;
+    *cFloatPtr++ = lv_cimag(temp) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  float* cPtr = (float*)cVector;
+  const float invScalar = 1.0 / scalar;
+  int8_t* a8Ptr = (int8_t*)aVector;
+  int8_t* b8Ptr = (int8_t*)bVector;
+  for(number = 0; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+    
+    *cPtr++ = (lv_creal(temp) * invScalar);
+    *cPtr++ = (lv_cimag(temp) * invScalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H */
diff --git a/volk/include/volk/volk_8s_convert_16s_a16.h b/volk/include/volk/volk_8s_convert_16s_a16.h
deleted file mode 100644
index 38efdb6a35..0000000000
--- a/volk/include/volk/volk_8s_convert_16s_a16.h
+++ /dev/null
@@ -1,83 +0,0 @@
-#ifndef INCLUDED_volk_8s_convert_16s_a16_H
-#define INCLUDED_volk_8s_convert_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_convert_16s_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-    __m128i* outputVectorPtr = (__m128i*)outputVector;
-    __m128i inputVal;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_load_si128(inputVectorPtr);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_store_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVal = _mm_srli_si128(inputVal, 8);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_store_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVectorPtr++;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (int16_t)(inputVector[number])*256;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_convert_16s_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-  */
-extern void volk_8s_convert_16s_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
-static inline void volk_8s_convert_16s_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-    volk_8s_convert_16s_a16_orc_impl(outputVector, inputVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_convert_16s_ua16.h b/volk/include/volk/volk_8s_convert_16s_ua16.h
deleted file mode 100644
index a726bfb5e1..0000000000
--- a/volk/include/volk/volk_8s_convert_16s_ua16.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef INCLUDED_volk_8s_convert_16s_ua16_H
-#define INCLUDED_volk_8s_convert_16s_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-    \note Input and output buffers do NOT need to be properly aligned
-  */
-static inline void volk_8s_convert_16s_ua16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-    __m128i* outputVectorPtr = (__m128i*)outputVector;
-    __m128i inputVal;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_loadu_si128(inputVectorPtr);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_storeu_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVal = _mm_srli_si128(inputVal, 8);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_storeu_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVectorPtr++;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (int16_t)(inputVector[number])*256;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-    \note Input and output buffers do NOT need to be properly aligned
-  */
-static inline void volk_8s_convert_16s_ua16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h b/volk/include/volk/volk_8s_s32f_convert_32f_a16.h
deleted file mode 100644
index 45185ac2ee..0000000000
--- a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h
+++ /dev/null
@@ -1,105 +0,0 @@
-#ifndef INCLUDED_volk_8s_s32f_convert_32f_a16_H
-#define INCLUDED_volk_8s_s32f_convert_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-    float* outputVectorPtr = outputVector;
-    const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    const int8_t* inputVectorPtr = inputVector;
-    __m128 ret;
-    __m128i inputVal;
-    __m128i interimVal;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
-
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-extern void volk_8s_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
-static inline void volk_8s_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    volk_8s_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h
deleted file mode 100644
index 310824580c..0000000000
--- a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef INCLUDED_volk_8s_s32f_convert_32f_ua16_H
-#define INCLUDED_volk_8s_s32f_convert_32f_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_8s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-    float* outputVectorPtr = outputVector;
-    const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1( iScalar );
-    const int8_t* inputVectorPtr = inputVector;
-    __m128 ret;
-    __m128i inputVal;
-    __m128i interimVal;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
-
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_8s_s32f_convert_32f_ua16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h b/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h
deleted file mode 100644
index eae1185ec7..0000000000
--- a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H
-#define INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128i x, y, realz, imagz;
-  lv_16sc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
-    
-  for(;number < quarterPoints; number++){
-    // Convert into 8 bit values into 16 bit values
-    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
-    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
-      
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm_madd_epi16(x,y);
-      
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm_madd_epi16(x,y);
-
-    _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
-
-    a += 4;
-    b += 4;
-    c += 4;
-  }
-    
-  number = quarterPoints * 4;
-  int16_t* c16Ptr = (int16_t*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *c16Ptr++ = (int16_t)lv_creal(temp);
-    *c16Ptr++ = (int16_t)lv_cimag(temp);
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  int16_t* c16Ptr = (int16_t*)cVector;
-  int8_t* a8Ptr = (int8_t*)aVector;
-  int8_t* b8Ptr = (int8_t*)bVector;
-  for(number =0; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *c16Ptr++ = (int16_t)lv_creal(temp);
-    *c16Ptr++ = (int16_t)lv_cimag(temp);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H */
diff --git a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h
deleted file mode 100644
index 621276b089..0000000000
--- a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H
-#define INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128i x, y, realz, imagz;
-  __m128 ret;
-  lv_32fc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-
-  for(;number < quarterPoints; number++){
-    // Convert into 8 bit values into 16 bit values
-    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
-    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
-
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm_madd_epi16(x,y);
-
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm_madd_epi16(x,y);
-
-    // Interleave real and imaginary and then convert to float values
-    ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    ret = _mm_mul_ps(ret, invScalar);
-
-    // Store the floating point values
-    _mm_store_ps((float*)c, ret);
-    c += 2;
-
-    // Interleave real and imaginary and then convert to float values
-    ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    ret = _mm_mul_ps(ret, invScalar);
-
-    // Store the floating point values
-    _mm_store_ps((float*)c, ret);
-    c += 2;
-
-    a += 4;
-    b += 4;
-  }
-
-  number = quarterPoints * 4;
-  float* cFloatPtr = (float*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-    
-    *cFloatPtr++ = lv_creal(temp) / scalar;
-    *cFloatPtr++ = lv_cimag(temp) / scalar;
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  float* cPtr = (float*)cVector;
-  const float invScalar = 1.0 / scalar;
-  int8_t* a8Ptr = (int8_t*)aVector;
-  int8_t* b8Ptr = (int8_t*)bVector;
-  for(number = 0; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-    
-    *cPtr++ = (lv_creal(temp) * invScalar);
-    *cPtr++ = (lv_cimag(temp) * invScalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h
deleted file mode 100644
index 6a35e969d1..0000000000
--- a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H
-#define INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_16s_16s_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
-  __m128i complexVal, iOutputVal, qOutputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
-    qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
-
-    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);
-    iOutputVal = _mm_slli_epi16(iOutputVal, 8);
-
-    qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
-    qOutputVal = _mm_slli_epi16(qOutputVal, 8);
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
-    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
-    *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h
deleted file mode 100644
index 67ffebd992..0000000000
--- a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef INCLUDED_volk_8sc_deinterleave_real_16s_a16_H
-#define INCLUDED_volk_8sc_deinterleave_real_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_16s_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i complexVal, outputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
-
-    outputVal = _mm_cvtepi8_epi16(complexVal);
-    outputVal = _mm_slli_epi16(outputVal, 7);
-
-    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_deinterleave_real_16s_a16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h
deleted file mode 100644
index ecffc092e1..0000000000
--- a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
-#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m128i complexVal1, complexVal2, outputVal;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
-
-    outputVal = _mm_or_si128(complexVal1, complexVal2);
-
-    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h
deleted file mode 100644
index cedbf202cc..0000000000
--- a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h
+++ /dev/null
@@ -1,164 +0,0 @@
-#ifndef INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H
-#define INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;    
-  __m128 iFloatValue, qFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
-
-  for(;number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
-    qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
-
-    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-    _mm_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 4;
-
-    iComplexVal = _mm_srli_si128(iComplexVal, 4);
-
-    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-    _mm_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 4;
-
-    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
-    qFloatValue = _mm_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
-    _mm_store_ps(qBufferPtr, qFloatValue);
-    qBufferPtr += 4;
-
-    qComplexVal = _mm_srli_si128(qComplexVal, 4);
-
-    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
-    qFloatValue = _mm_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
-    _mm_store_ps(qBufferPtr, qFloatValue);
-
-    qBufferPtr += 4;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-  }
-    
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 cplxValue1, cplxValue2, iValue, qValue;
-
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  float floatBuffer[8] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(complexVectorPtr[0]);
-    floatBuffer[1] = (float)(complexVectorPtr[1]);
-    floatBuffer[2] = (float)(complexVectorPtr[2]);
-    floatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    floatBuffer[4] = (float)(complexVectorPtr[4]);
-    floatBuffer[5] = (float)(complexVectorPtr[5]);
-    floatBuffer[6] = (float)(complexVectorPtr[6]);
-    floatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&floatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&floatBuffer[4]);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    _mm_store_ps(iBufferPtr, iValue);
-    _mm_store_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  complexVectorPtr = (int8_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h
deleted file mode 100644
index 902795131c..0000000000
--- a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h
+++ /dev/null
@@ -1,133 +0,0 @@
-#ifndef INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H
-#define INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;    
-  __m128 iFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-
-  for(;number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
-
-    iIntVal = _mm_cvtepi8_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 4;
-
-    complexVal = _mm_srli_si128(complexVal, 4);
-    iIntVal = _mm_cvtepi8_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    complexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 iValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
-
-    iValue = _mm_load_ps(floatBuffer);
-
-    iValue = _mm_mul_ps(iValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    complexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py
index fc1ec10ef6..bc8f959af6 100755
--- a/volk/include/volk/volk_register.py
+++ b/volk/include/volk/volk_register.py
@@ -55,7 +55,7 @@ functions = [];
 
 
 for line in mfile:
-    subline = re.search(".*(a16).*", line);
+    subline = re.search(".*_(a16|u)\.h.*", line);
     if subline:
         subsubline = re.search("(?<=volk_).*", subline.group(0));
         if subsubline:
@@ -70,7 +70,7 @@ datatypes = set(datatypes);
 for line in mfile:
     for dt in datatypes:
         if dt in line:
-            subline = re.search("(volk_" + dt +"_.*(a16).*\.h)", line);
+            subline = re.search("(volk_" + dt +"_.*(a16|u).*\.h)", line);
             if subline:
                 
                 subsubline = re.search(".+(?=\.h)", subline.group(0));
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index e73b709854..4c151bd6fa 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -19,7 +19,8 @@ float uniform() {
   return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
 }
 
-void random_floats (float *buf, unsigned n)
+template <class t>
+void random_floats (t *buf, unsigned n)
 {
   for (unsigned i = 0; i < n; i++)
     buf[i] = uniform ();
@@ -28,8 +29,8 @@ void random_floats (float *buf, unsigned n)
 void load_random_data(void *data, volk_type_t type, unsigned int n) {
     if(type.is_complex) n *= 2;
     if(type.is_float) {
-        assert(type.size == 4); //TODO: double support
-        random_floats((float *)data, n);
+        if(type.size == 8) random_floats<double>((double *)data, n);
+        else random_floats<float>((float *)data, n);
     } else {
         float int_max = pow(2, type.size*8);
         if(type.is_signed) int_max /= 2.0;
@@ -54,7 +55,7 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) {
                 else ((uint8_t *)data)[i] = (uint8_t) scaled_rand;
             break;
             default:
-                throw; //no shenanigans here
+                throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
             }
         }
     }
@@ -94,6 +95,9 @@ static std::vector<std::string> get_arch_list(const int archs[]) {
         case (1<<LV_SSE2):
             archlist.push_back("sse2");
             break;
+        case (1<<LV_SSE3):
+            archlist.push_back("sse3");
+            break;
         case (1<<LV_SSSE3):
             archlist.push_back("ssse3");
             break;
@@ -128,7 +132,7 @@ volk_type_t volk_type_from_string(std::string name) {
     type.size = 0;
     type.str = name;
     
-    assert(name.size() > 1);
+    if(name.size() < 2) throw std::string("name too short to be a datatype");
     
     //is it a scalar?
     if(name[0] == 's') { 
@@ -138,7 +142,7 @@ volk_type_t volk_type_from_string(std::string name) {
     
     //get the data size
     int last_size_pos = name.find_last_of("0123456789");
-    if(last_size_pos < 0) throw 0;
+    if(last_size_pos < 0) throw std::string("no size spec in type ").append(name);
     //will throw if malformed
     int size = boost::lexical_cast<int>(name.substr(0, last_size_pos+1));
 
@@ -182,12 +186,14 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
     //ok. we're assuming a string in the form
     //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
 
-    enum { SIDE_INPUT, SIDE_OUTPUT } side = SIDE_INPUT;
+    enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
     std::string fn_name;
     volk_type_t type;
     BOOST_FOREACH(std::string token, toked) {
         try {
             type = volk_type_from_string(token);
+            if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
+            
             if(side == SIDE_INPUT) inputsig.push_back(type);
             else outputsig.push_back(type);
         } catch (...){
@@ -201,9 +207,11 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
                 }
             }
             else if(side == SIDE_INPUT) { //it's the function name, at least it better be
-                side = SIDE_OUTPUT;
-                fn_name = token;
-            } else {
+                side = SIDE_NAME;
+                fn_name.append("_");
+                fn_name.append(token);
+            } 
+            else if(side == SIDE_OUTPUT) {
                 if(token != toked.back()) throw; //the last token in the name is the alignment
             }
         }
@@ -236,20 +244,40 @@ inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, void *outbuff, std::vect
     while(iter--) func(outbuff, inbuffs[0], scalar, vlen, arch.c_str());
 }
 
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, void *outbuff, std::vector<void *> &inbuffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+    while(iter--) func(outbuff, inbuffs[0], inbuffs[1], scalar, vlen, arch.c_str());
+}
+
 template <class t>
 bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
+    bool fail = false;
+    int print_max_errs = 10;
     for(int i=0; i<vlen; i++) {
-        if(fabs(((t *)(in1))[i] - ((t *)(in2))[i]) > tol) return 1;
+        if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) {
+            fail=true;
+            if(print_max_errs-- > 0) {
+                std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl;
+            }
+        }
     }
-    return 0;
+    
+    return fail;
 }
 
 template <class t>
-bool icompare(t *in1, t *in2, unsigned int vlen) {
+bool icompare(t *in1, t *in2, unsigned int vlen, float tol) {
+    bool fail = false;
+    int print_max_errs = 10;
     for(int i=0; i<vlen; i++) {
-        if(((t *)(in1))[i] != ((t *)(in2))[i]) return 1;
+        if(((t *)(in1))[i] != ((t *)(in2))[i]) {
+            fail=true;
+            if(print_max_errs-- > 0) {
+                std::cout << "offset " << i << " in1: " << int(((t *)(in1))[i]) << " in2: " << int(((t *)(in2))[i]) << std::endl;
+            }
+        }
     }
-    return 0;
+    
+    return fail;
 }
 
 bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, int vlen, int iter) {
@@ -300,7 +328,7 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
             load_random_data(inbuffs[i], inputsig[i], vlen);        
         }
     }
-    
+
     //now run the test
     clock_t start, end;
     for(int i = 0; i < arch_list.size(); i++) {
@@ -311,18 +339,22 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
                 if(inputsc.size() == 0) {
                     run_cast_test1((volk_fn_1arg)(manual_func), outbuffs[i], vlen, iter, arch_list[i]); 
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 1000.0, vlen, iter, arch_list[i]);
+                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 255.0, vlen, iter, arch_list[i]);
                 } else throw "unsupported 1 arg function >1 scalars";
                 break;
             case 2:
                 if(inputsc.size() == 0) {
                     run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 1000.0, vlen, iter, arch_list[i]);
+                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 255.0, vlen, iter, arch_list[i]);
                 } else throw "unsupported 2 arg function >1 scalars";
                 break;
             case 3:
-                run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
+                if(inputsc.size() == 0) {
+                    run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), outbuffs[i], inbuffs, 255.0, vlen, iter, arch_list[i]);
+                } else throw "unsupported 3 arg function >1 scalars";
                 break;
             case 4:
                 run_cast_test4((volk_fn_4arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
@@ -337,29 +369,24 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
     }
     //and now compare each output to the generic output
     //first we have to know which output is the generic one, they aren't in order...
-    int generic_offset;
+    int generic_offset=0;
     for(int i=0; i<arch_list.size(); i++) 
         if(arch_list[i] == "generic") generic_offset=i;
-        
+
     //now compare
     if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know
 
     bool fail = false;
     for(int i=0; i<arch_list.size(); i++) {
         if(i != generic_offset) {
-            if(outputsig[0].str == "32fc") {
-                fail = fcompare((float *) outbuffs[generic_offset], (float *) outbuffs[i], vlen*2, tol);
-            } else if(outputsig[0].str == "32f") {
-                fail = fcompare((float *) outbuffs[generic_offset], (float *) outbuffs[i], vlen, tol);
-            } else if(outputsig[0].str == "32u" || outputsig[0].str == "32s" || outputsig[0].str == "16sc") {
-                fail = icompare((uint32_t *) outbuffs[generic_offset], (uint32_t *) outbuffs[i], vlen);
-            } else if(outputsig[0].size == 2) {
-                fail = icompare((uint16_t *) outbuffs[generic_offset], (uint16_t *) outbuffs[i], vlen);
-            } else if(outputsig[0].size == 1) {
-                fail = icompare((uint8_t *) outbuffs[generic_offset], (uint8_t *) outbuffs[i], vlen);
-            } else { 
-                std::cout << "Error: invalid type " << outputsig[0].str << std::endl;
-                fail = true;
+            if(outputsig[0].is_float) {
+                if(outputsig[0].size == 8) {
+                    fail = fcompare((double *) outbuffs[generic_offset], (double *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                } else {
+                    fail = fcompare((float *) outbuffs[generic_offset], (float *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                }
+            } else {
+                fail = memcmp(outbuffs[generic_offset], outbuffs[i], outputsig[0].size * vlen * (outputsig[0].is_complex ? 2:1));
             }
             if(fail) {
                 std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
@@ -367,12 +394,6 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
         }
     }
 
-//    BOOST_FOREACH(void *buf, inbuffs) {
-//        free(buf);
-//    }
-//    BOOST_FOREACH(void *buf, outbuffs) {
-//        free(buf);
-//    }
     return fail;
 }
 
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
index 79c5d77787..79fc8f0067 100644
--- a/volk/lib/qa_utils.h
+++ b/volk/lib/qa_utils.h
@@ -28,5 +28,6 @@ typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
 typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
 typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
 typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
 
 #endif //VOLK_QA_UTILS_H
diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am
index 43f38543cd..6b5e4f8b68 100644
--- a/volk/orc/Makefile.am
+++ b/volk/orc/Makefile.am
@@ -25,27 +25,28 @@ lib_LTLIBRARIES = libvolk_orc.la
 libvolk_orc_la_LDFLAGS = $(ORC_LDFLAGS)
 
 libvolk_orc_la_SOURCES = \
-volk_8s_convert_16s_a16_orc_impl.orc \
-volk_8s_s32f_convert_32f_a16_orc_impl.orc \
+volk_8i_convert_16i_a16_orc_impl.orc \
+volk_8i_s32f_convert_32f_a16_orc_impl.orc \
 volk_16u_byteswap_a16_orc_impl.orc \
-volk_32s_32s_and_32s_a16_orc_impl.orc \
-volk_32s_32s_or_32s_a16_orc_impl.orc \
-volk_32f_32f_add_32f_a16_orc_impl.orc \
-volk_32f_32f_subtract_32f_a16_orc_impl.orc \
-volk_32f_32f_divide_32f_a16_orc_impl.orc \
-volk_32f_32f_multiply_32f_a16_orc_impl.orc \
-volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc \
+volk_32i_x2_and_32i_a16_orc_impl.orc \
+volk_32i_x2_or_32i_a16_orc_impl.orc \
+volk_32f_x2_add_32f_a16_orc_impl.orc \
+volk_32f_x2_subtract_32f_a16_orc_impl.orc \
+volk_32f_x2_divide_32f_a16_orc_impl.orc \
+volk_32f_x2_multiply_32f_a16_orc_impl.orc \
+volk_32fc_x2_multiply_32fc_a16_orc_impl.orc \
 volk_32fc_32f_multiply_32fc_a16_orc_impl.orc \
 volk_32f_sqrt_32f_a16_orc_impl.orc \
-volk_32f_32f_max_32f_a16_orc_impl.orc \
-volk_32f_32f_min_32f_a16_orc_impl.orc \
+volk_32f_x2_max_32f_a16_orc_impl.orc \
+volk_32f_x2_min_32f_a16_orc_impl.orc \
 volk_32f_s32f_normalize_a16_orc_impl.orc \
 volk_32fc_magnitude_32f_a16_orc_impl.orc \
-volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc \
-volk_16sc_magnitude_16s_a16_orc_impl.orc \
-volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc \
-volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc \
-volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
+volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc \
+volk_16ic_magnitude_16i_a16_orc_impl.orc \
+volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc \
+volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc \
+volk_16ic_deinterleave_real_8i_a16_orc_impl.orc
+
 
 
 
diff --git a/volk/orc/volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc b/volk/orc/volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc
new file mode 100644
index 0000000000..0189fbf5d4
--- /dev/null
+++ b/volk/orc/volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc
@@ -0,0 +1,12 @@
+.function volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl
+.dest 4 idst
+.dest 4 qdst
+.source 4 src
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+splitql qdst, idst, iqf
diff --git a/volk/orc/volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc b/volk/orc/volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc
new file mode 100644
index 0000000000..56018edda2
--- /dev/null
+++ b/volk/orc/volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_16ic_deinterleave_16i_x2_a16_orc_impl
+.dest 2 idst
+.dest 2 qdst
+.source 4 src
+splitlw qdst, idst, src
diff --git a/volk/orc/volk_16ic_deinterleave_real_8i_a16_orc_impl.orc b/volk/orc/volk_16ic_deinterleave_real_8i_a16_orc_impl.orc
new file mode 100644
index 0000000000..dba9a4c8e7
--- /dev/null
+++ b/volk/orc/volk_16ic_deinterleave_real_8i_a16_orc_impl.orc
@@ -0,0 +1,6 @@
+.function volk_16ic_deinterleave_real_8i_a16_orc_impl
+.dest 1 dst
+.source 4 src
+.temp 2 iw
+select0lw iw, src
+convhwb dst, iw
diff --git a/volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc b/volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc
new file mode 100644
index 0000000000..37225e9b85
--- /dev/null
+++ b/volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc
@@ -0,0 +1,23 @@
+.function volk_16ic_magnitude_16i_a16_orc_impl
+.source 4 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+x2 mulf prodiqf, iqf, iqf
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+mulf rootf, rootf, scalar
+convfl rootl, rootf
+convlw dst, rootl
diff --git a/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc
deleted file mode 100644
index d396a00527..0000000000
--- a/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_16sc_deinterleave_16s_16s_a16_orc_impl
-.dest 2 idst
-.dest 2 qdst
-.source 4 src
-splitlw qdst, idst, src
diff --git a/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
deleted file mode 100644
index 5954c406f0..0000000000
--- a/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
+++ /dev/null
@@ -1,6 +0,0 @@
-.function volk_16sc_deinterleave_real_8s_a16_orc_impl
-.dest 1 dst
-.source 4 src
-.temp 2 iw
-select0lw iw, src
-convhwb dst, iw
diff --git a/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc
deleted file mode 100644
index 2a49d4ecb4..0000000000
--- a/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc
+++ /dev/null
@@ -1,23 +0,0 @@
-.function volk_16sc_magnitude_16s_a16_orc_impl
-.source 4 src
-.dest 2 dst
-.floatparam 4 scalar
-.temp 8 iql
-.temp 8 iqf
-.temp 8 prodiqf
-.temp 4 qf
-.temp 4 if
-.temp 4 sumf
-.temp 4 rootf
-.temp 4 rootl
-
-x2 convswl iql, src
-x2 convlf iqf, iql
-x2 divf iqf, iqf, scalar
-x2 mulf prodiqf, iqf, iqf
-splitql qf, if, prodiqf
-addf sumf, if, qf
-sqrtf rootf, sumf
-mulf rootf, rootf, scalar
-convfl rootl, rootf
-convlw dst, rootl
diff --git a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
index 6d2ed81978..1e23808371 100644
--- a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
+++ b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
@@ -1,4 +1,4 @@
-.function volk_16sc_magnitude_32f_aligned16_orc_impl
+.function volk_16ic_magnitude_32f_a16_orc_impl
 .source 4 src
 .dest 4 dst
 .floatparam 4 scalar
diff --git a/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc b/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc
deleted file mode 100644
index 47c3d28a9c..0000000000
--- a/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,12 +0,0 @@
-.function volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl
-.dest 4 idst
-.dest 4 qdst
-.source 4 src
-.floatparam 4 scalar
-.temp 8 iql
-.temp 8 iqf
-
-x2 convswl iql, src
-x2 convlf iqf, iql
-x2 divf iqf, iqf, scalar
-splitql qdst, idst, iqf
diff --git a/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc
deleted file mode 100644
index e6a30cf01c..0000000000
--- a/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_add_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-addf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc
deleted file mode 100644
index 0bdcd0010d..0000000000
--- a/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_divide_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-divf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc
deleted file mode 100644
index 9584e6634f..0000000000
--- a/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_max_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-maxf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc
deleted file mode 100644
index 47b9c05dbd..0000000000
--- a/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_min_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-minf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc
deleted file mode 100644
index e5a049c161..0000000000
--- a/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_multiply_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-mulf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc
deleted file mode 100644
index 2ab42d5f6e..0000000000
--- a/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_subtract_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-subf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..8d095a052f
--- /dev/null
+++ b/volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_add_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+addf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..0097646cb6
--- /dev/null
+++ b/volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_divide_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+divf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..b7f0087378
--- /dev/null
+++ b/volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_max_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+maxf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..78328b576b
--- /dev/null
+++ b/volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_min_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+minf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..e8fadff190
--- /dev/null
+++ b/volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_multiply_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+mulf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..13fbe8c83c
--- /dev/null
+++ b/volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_subtract_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+subf dst, src1, src2
diff --git a/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc
deleted file mode 100644
index ed928b90f4..0000000000
--- a/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc
+++ /dev/null
@@ -1,6 +0,0 @@
-.function volk_32fc_32fc_multiply_32fc_a16_orc_impl
-.source 8 src1
-.source 8 src2
-.dest 8 dst
-.temp 8 tmp
-x2 mulf dst, src1, src2
diff --git a/volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc b/volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc
new file mode 100644
index 0000000000..9e25990848
--- /dev/null
+++ b/volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc
@@ -0,0 +1,23 @@
+.function volk_32fc_s32f_magnitude_16i_a16_orc_impl
+.source 8 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+.temp 4 maskl
+
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+mulf rootf, rootf, scalar
+cmpltf maskl, scalar, rootf
+andl maskl, maskl, 0x80000000
+orl rootf, rootf, maskl
+convfl rootl, rootf
+convssslw dst, rootl
diff --git a/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc b/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc
deleted file mode 100644
index cccda8a0f9..0000000000
--- a/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc
+++ /dev/null
@@ -1,23 +0,0 @@
-.function volk_32fc_s32f_magnitude_16s_a16_orc_impl
-.source 8 src
-.dest 2 dst
-.floatparam 4 scalar
-.temp 8 iqf
-.temp 8 prodiqf
-.temp 4 qf
-.temp 4 if
-.temp 4 sumf
-.temp 4 rootf
-.temp 4 rootl
-.temp 4 maskl
-
-x2 mulf prodiqf, src, src
-splitql qf, if, prodiqf
-addf sumf, if, qf
-sqrtf rootf, sumf
-mulf rootf, rootf, scalar
-cmpltf maskl, scalar, rootf
-andl maskl, maskl, 0x80000000
-orl rootf, rootf, maskl
-convfl rootl, rootf
-convssslw dst, rootl
diff --git a/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc
new file mode 100644
index 0000000000..d238928806
--- /dev/null
+++ b/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc
@@ -0,0 +1,6 @@
+.function volk_32fc_x2_multiply_32fc_a16_orc_impl
+.source 8 src1
+.source 8 src2
+.dest 8 dst
+.temp 8 tmp
+x2 mulf dst, src1, src2
diff --git a/volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc b/volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc
new file mode 100644
index 0000000000..7b331f8ed3
--- /dev/null
+++ b/volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32i_x2_and_32i_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+andl dst, src1, src2
diff --git a/volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc b/volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc
new file mode 100644
index 0000000000..4984a9ced6
--- /dev/null
+++ b/volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32i_x2_or_32i_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+orl dst, src1, src2
diff --git a/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc b/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc
deleted file mode 100644
index bff3af875f..0000000000
--- a/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32s_32s_and_32s_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-andl dst, src1, src2
diff --git a/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc b/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc
deleted file mode 100644
index b6961f79ea..0000000000
--- a/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32s_32s_or_32s_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-orl dst, src1, src2
diff --git a/volk/orc/volk_8i_convert_16i_a16_orc_impl.orc b/volk/orc/volk_8i_convert_16i_a16_orc_impl.orc
new file mode 100644
index 0000000000..f44845c88c
--- /dev/null
+++ b/volk/orc/volk_8i_convert_16i_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_8i_convert_16i_a16_orc_impl
+.source 1 src
+.dest 2 dst
+convsbw dst, src
+shlw dst, dst, 8
diff --git a/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc b/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc
new file mode 100644
index 0000000000..4e33f7b3b6
--- /dev/null
+++ b/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc
@@ -0,0 +1,9 @@
+.function volk_8i_s32f_convert_32f_a16_orc_impl
+.source 2 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 flsrc
+.temp 4 lsrc
+convswl lsrc, src
+convlf flsrc, lsrc
+mulf dst, flsrc, scalar
diff --git a/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc b/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc
deleted file mode 100644
index a55c7f7232..0000000000
--- a/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_8s_convert_16s_a16_orc_impl
-.source 1 src
-.dest 2 dst
-convsbw dst, src
-shlw dst, dst, 8
diff --git a/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc b/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc
deleted file mode 100644
index 3274ab9d6e..0000000000
--- a/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,9 +0,0 @@
-.function volk_8s_s32f_convert_32f_a16_orc_impl
-.source 2 src
-.dest 4 dst
-.floatparam 4 scalar
-.temp 4 flsrc
-.temp 4 lsrc
-convswl lsrc, src
-convlf flsrc, lsrc
-mulf dst, flsrc, scalar
-- 
cgit v1.2.3


From 82cafc4381e48ccc9423d2dc88720e5c1347d940 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Fri, 21 Jan 2011 12:26:52 -0800
Subject: Volk: fixed naming error. test coverage @ 75%, still need to add
 support for multiple outputs in the checker. some errors in the library were
 exposed by the new test suite, and a couple of bad Orc functions. need to
 investigate.

---
 volk/include/volk/Makefile.am                      |  2 +-
 .../volk/volk_16ic_deinterleave_real_8i_a16.h      |  6 +-
 .../volk/volk_32fc_deinterleave_real_16i_a16.h     | 80 ----------------------
 .../volk_32fc_s32f_deinterleave_real_16i_a16.h     | 80 ++++++++++++++++++++++
 volk/lib/qa_utils.cc                               | 51 +++++++++++---
 volk/lib/testqa.cc                                 | 34 +++++----
 6 files changed, 143 insertions(+), 110 deletions(-)
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h
 create mode 100644 volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 83f386c6cd..79da0effbe 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -68,7 +68,7 @@ volkinclude_HEADERS = \
 	volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \
 	volk_32fc_deinterleave_32f_x2_a16.h \
 	volk_32fc_deinterleave_64f_x2_a16.h \
-	volk_32fc_deinterleave_real_16i_a16.h \
+	volk_32fc_s32f_deinterleave_real_16i_a16.h \
 	volk_32fc_deinterleave_real_32f_a16.h \
 	volk_32fc_deinterleave_real_64f_a16.h \
 	volk_32fc_x2_dot_prod_32fc_a16.h \
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
index 437d5ab6bf..55a25702e2 100644
--- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
+++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
@@ -53,7 +53,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con
   number = sixteenthPoints * 16;
   int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
   for(; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
+    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
     int16ComplexVectorPtr++;
   }
 }
@@ -68,10 +68,10 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con
 */
 static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
   unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* complexVectorPtr = (int16_t*)complexVector;
   int8_t* iBufferPtr = iBuffer;
   for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256);
+    *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
     complexVectorPtr++;
   }
 }
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h
deleted file mode 100644
index 6042e6d62d..0000000000
--- a/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef INCLUDED_volk_32fc_deinterleave_real_16i_a16_H
-#define INCLUDED_volk_32fc_deinterleave_real_16i_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
-  \param complexVector The complex input vector
-  \param scalar The value to be multiply against each of the input values
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-
-  __m128 cplxValue1, cplxValue2, iValue;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-
-    iValue = _mm_mul_ps(iValue, vScalar);
-
-    _mm_store_ps(floatBuffer, iValue);
-    *iBufferPtr++ = (int16_t)(floatBuffer[0]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[1]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[2]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  iBufferPtr = &iBuffer[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
-  \param complexVector The complex input vector
-  \param scalar The value to be multiply against each of the input values
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
-
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_deinterleave_real_16i_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h
new file mode 100644
index 0000000000..31465bff96
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h
@@ -0,0 +1,80 @@
+#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H
+#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
+  \param complexVector The complex input vector
+  \param scalar The value to be multiply against each of the input values
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 cplxValue1, cplxValue2, iValue;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+    iValue = _mm_mul_ps(iValue, vScalar);
+
+    _mm_store_ps(floatBuffer, iValue);
+    *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  iBufferPtr = &iBuffer[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
+  \param complexVector The complex input vector
+  \param scalar The value to be multiply against each of the input values
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+    complexVectorPtr++;
+  }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H */
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index 4c151bd6fa..8f57a9b90d 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -253,6 +253,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
     bool fail = false;
     int print_max_errs = 10;
     for(int i=0; i<vlen; i++) {
+        if(((t *)(in1))[i] < 1e-30) continue; //below around here we'll start to get roundoff errors due to float precision
         if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) {
             fail=true;
             if(print_max_errs-- > 0) {
@@ -265,14 +266,14 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
 }
 
 template <class t>
-bool icompare(t *in1, t *in2, unsigned int vlen, float tol) {
+bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) {
     bool fail = false;
     int print_max_errs = 10;
     for(int i=0; i<vlen; i++) {
-        if(((t *)(in1))[i] != ((t *)(in2))[i]) {
+        if(abs(((t *)(in1))[i] - ((t *)(in2))[i]) > tol) {
             fail=true;
             if(print_max_errs-- > 0) {
-                std::cout << "offset " << i << " in1: " << int(((t *)(in1))[i]) << " in2: " << int(((t *)(in2))[i]) << std::endl;
+                std::cout << "offset " << i << " in1: " << static_cast<int>(t(((t *)(in1))[i])) << " in2: " << static_cast<int>(t(((t *)(in2))[i])) << std::endl;
             }
         }
     }
@@ -339,21 +340,21 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
                 if(inputsc.size() == 0) {
                     run_cast_test1((volk_fn_1arg)(manual_func), outbuffs[i], vlen, iter, arch_list[i]); 
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 255.0, vlen, iter, arch_list[i]);
+                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 127.0, vlen, iter, arch_list[i]);
                 } else throw "unsupported 1 arg function >1 scalars";
                 break;
             case 2:
                 if(inputsc.size() == 0) {
                     run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 255.0, vlen, iter, arch_list[i]);
+                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 127.0, vlen, iter, arch_list[i]);
                 } else throw "unsupported 2 arg function >1 scalars";
                 break;
             case 3:
                 if(inputsc.size() == 0) {
                     run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), outbuffs[i], inbuffs, 255.0, vlen, iter, arch_list[i]);
+                    run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), outbuffs[i], inbuffs, 127.0, vlen, iter, arch_list[i]);
                 } else throw "unsupported 3 arg function >1 scalars";
                 break;
             case 4:
@@ -375,7 +376,7 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
 
     //now compare
     if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know
-
+    //TODO: loop over the output signature as well
     bool fail = false;
     for(int i=0; i<arch_list.size(); i++) {
         if(i != generic_offset) {
@@ -386,7 +387,41 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
                     fail = fcompare((float *) outbuffs[generic_offset], (float *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
                 }
             } else {
-                fail = memcmp(outbuffs[generic_offset], outbuffs[i], outputsig[0].size * vlen * (outputsig[0].is_complex ? 2:1));
+                //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
+                switch(outputsig[0].size) {
+                case 8:
+                    if(outputsig[0].is_signed) {
+                        fail = icompare((int64_t *) outbuffs[generic_offset], (int64_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                    } else {
+                        fail = icompare((uint64_t *) outbuffs[generic_offset], (uint64_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                    }
+                    break;
+                case 4:
+                    if(outputsig[0].is_signed) {
+                        fail = icompare((int32_t *) outbuffs[generic_offset], (int32_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                    } else {
+                        fail = icompare((uint32_t *) outbuffs[generic_offset], (uint32_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                    }
+                    break;
+                case 2:
+                    if(outputsig[0].is_signed) {
+                        fail = icompare((int16_t *) outbuffs[generic_offset], (int16_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                    } else {
+                        fail = icompare((uint16_t *) outbuffs[generic_offset], (uint16_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                    }
+                    break;
+                case 1:
+                    if(outputsig[0].is_signed) {
+                        fail = icompare((int8_t *) outbuffs[generic_offset], (int8_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                    } else {
+                        fail = icompare((uint8_t *) outbuffs[generic_offset], (uint8_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                    }
+                    break;
+                default:
+                    fail=1;
+                }
+                    
+                //fail = memcmp(outbuffs[generic_offset], outbuffs[i], outputsig[0].size * vlen * (outputsig[0].is_complex ? 2:1));
             }
             if(fail) {
                 std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 1ee264fb44..f813e843f2 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -11,18 +11,16 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
 //    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 0, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a16, 1, 2046, 10000);
     VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a16, 1e-5, 2046, 10000);
     VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16i_convert_8i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16i_convert_8i_u, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_convert_8i_a16, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_16i_max_star_16i_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a16, 1e-4, 2046, 10000);
@@ -37,7 +35,7 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 2046, 10000);
@@ -45,25 +43,25 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a16, 1, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 1, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 1e-4, 2046, 10000);
+    //VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000);
+    //VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 1, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 1, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a16, 1e-4, 2046, 10000);
-- 
cgit v1.2.3


From 7a5a751073cc1583533b84c90ecc985b3669a696 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Fri, 21 Jan 2011 15:14:26 -0800
Subject: Volk: added QA support for multiple outputs, scalar outputs. 92% test
 coverage within the framework.

---
 volk/include/volk/Makefile.am                      |   2 +-
 .../volk/volk_32f_calc_spectral_noise_floor_a16.h  | 167 -------------------
 ...lk_32f_s32f_calc_spectral_noise_floor_32f_a16.h | 167 +++++++++++++++++++
 volk/lib/qa_utils.cc                               | 183 ++++++++++-----------
 volk/lib/testqa.cc                                 |  42 ++---
 5 files changed, 280 insertions(+), 281 deletions(-)
 delete mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 79da0effbe..1eb46b602d 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -63,7 +63,7 @@ volkinclude_HEADERS = \
 	volk_32f_x2_add_32f_a16.h \
 	volk_32fc_32f_multiply_32fc_a16.h \
 	volk_32fc_32f_power_32fc_a16.h \
-	volk_32f_calc_spectral_noise_floor_a16.h \
+	volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h \
 	volk_32fc_s32f_atan2_32f_a16.h \
 	volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \
 	volk_32fc_deinterleave_32f_x2_a16.h \
diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h
deleted file mode 100644
index fce77cd045..0000000000
--- a/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h
+++ /dev/null
@@ -1,167 +0,0 @@
-#ifndef INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H
-#define INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the spectral noise floor of an input power spectrum
-
-  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB).  Provides a rough estimation of the signal noise floor.
-
-  \param realDataPoints The input power spectrum
-  \param num_points The number of data points in the input power spectrum vector
-  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
-  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
-*/
-static inline void volk_32f_calc_spectral_noise_floor_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* dataPointsPtr = realDataPoints;
-  float avgPointsVector[4] __attribute__((aligned(128)));
-    
-  __m128 dataPointsVal;
-  __m128 avgPointsVal = _mm_setzero_ps();
-  // Calculate the sum (for mean) for all points
-  for(; number < quarterPoints; number++){
-
-    dataPointsVal = _mm_load_ps(dataPointsPtr);
-
-    dataPointsPtr += 4;
-
-    avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
-  }
-
-  _mm_store_ps(avgPointsVector, avgPointsVal);
-
-  float sumMean = 0.0;
-  sumMean += avgPointsVector[0];
-  sumMean += avgPointsVector[1];
-  sumMean += avgPointsVector[2];
-  sumMean += avgPointsVector[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    sumMean += realDataPoints[number];
-  }
-
-  // calculate the spectral mean
-  // +20 because for the comparison below we only want to throw out bins
-  // that are significantly higher (and would, thus, affect the mean more
-  const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
-
-  dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
-  __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
-  __m128 vOnesVector = _mm_set_ps1(1.0);
-  __m128 vValidBinCount = _mm_setzero_ps();
-  avgPointsVal = _mm_setzero_ps();
-  __m128 compareMask;
-  number = 0;
-  // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
-  for(; number < quarterPoints; number++){
-
-    dataPointsVal = _mm_load_ps(dataPointsPtr);
-
-    dataPointsPtr += 4;
-
-    // Identify which items do not exceed the mean amplitude
-    compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
-
-    // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
-    avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
-      
-    // Count the number of bins which do not exceed the mean amplitude
-    vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
-  }
-    
-  // Calculate the mean from the remaining data points
-  _mm_store_ps(avgPointsVector, avgPointsVal);
-
-  sumMean = 0.0;
-  sumMean += avgPointsVector[0];
-  sumMean += avgPointsVector[1];
-  sumMean += avgPointsVector[2];
-  sumMean += avgPointsVector[3];
-
-  // Calculate the number of valid bins from the remaning count
-  float validBinCountVector[4] __attribute__((aligned(128)));
-  _mm_store_ps(validBinCountVector, vValidBinCount);
-
-  float validBinCount = 0;
-  validBinCount += validBinCountVector[0];
-  validBinCount += validBinCountVector[1];
-  validBinCount += validBinCountVector[2];
-  validBinCount += validBinCountVector[3];
-
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
-    if(realDataPoints[number] <= meanAmplitude){
-      sumMean += realDataPoints[number];
-      validBinCount += 1.0;
-    }
-  }
-    
-  float localNoiseFloorAmplitude = 0;
-  if(validBinCount > 0.0){
-    localNoiseFloorAmplitude = sumMean / validBinCount;
-  }
-  else{
-    localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
-  }
-
-  *noiseFloorAmplitude = localNoiseFloorAmplitude;
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the spectral noise floor of an input power spectrum
-
-  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB).  Provides a rough estimation of the signal noise floor.
-
-  \param realDataPoints The input power spectrum
-  \param num_points The number of data points in the input power spectrum vector
-  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
-  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
-*/
-static inline void volk_32f_calc_spectral_noise_floor_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
-  float sumMean = 0.0;
-  unsigned int number;
-  // find the sum (for mean), etc
-  for(number = 0; number < num_points; number++){
-    // sum (for mean)
-    sumMean += realDataPoints[number];
-  }
-
-  // calculate the spectral mean
-  // +20 because for the comparison below we only want to throw out bins
-  // that are significantly higher (and would, thus, affect the mean more)
-  const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
-
-  // now throw out any bins higher than the mean
-  sumMean = 0.0;
-  unsigned int newNumDataPoints = num_points;
-  for(number = 0; number < num_points; number++){
-    if (realDataPoints[number] <= meanAmplitude)
-      sumMean += realDataPoints[number];
-    else
-      newNumDataPoints--;
-  }
-
-  float localNoiseFloorAmplitude = 0.0;
-  if (newNumDataPoints == 0)             // in the odd case that all
-    localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
-  else
-    localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
-
-  *noiseFloorAmplitude = localNoiseFloorAmplitude;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h
new file mode 100644
index 0000000000..168245d657
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h
@@ -0,0 +1,167 @@
+#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H
+#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the spectral noise floor of an input power spectrum
+
+  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB).  Provides a rough estimation of the signal noise floor.
+
+  \param realDataPoints The input power spectrum
+  \param num_points The number of data points in the input power spectrum vector
+  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* dataPointsPtr = realDataPoints;
+  float avgPointsVector[4] __attribute__((aligned(128)));
+    
+  __m128 dataPointsVal;
+  __m128 avgPointsVal = _mm_setzero_ps();
+  // Calculate the sum (for mean) for all points
+  for(; number < quarterPoints; number++){
+
+    dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+    dataPointsPtr += 4;
+
+    avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+  }
+
+  _mm_store_ps(avgPointsVector, avgPointsVal);
+
+  float sumMean = 0.0;
+  sumMean += avgPointsVector[0];
+  sumMean += avgPointsVector[1];
+  sumMean += avgPointsVector[2];
+  sumMean += avgPointsVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    sumMean += realDataPoints[number];
+  }
+
+  // calculate the spectral mean
+  // +20 because for the comparison below we only want to throw out bins
+  // that are significantly higher (and would, thus, affect the mean more
+  const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+  dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+  __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+  __m128 vOnesVector = _mm_set_ps1(1.0);
+  __m128 vValidBinCount = _mm_setzero_ps();
+  avgPointsVal = _mm_setzero_ps();
+  __m128 compareMask;
+  number = 0;
+  // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+  for(; number < quarterPoints; number++){
+
+    dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+    dataPointsPtr += 4;
+
+    // Identify which items do not exceed the mean amplitude
+    compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+
+    // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
+    avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+      
+    // Count the number of bins which do not exceed the mean amplitude
+    vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
+  }
+    
+  // Calculate the mean from the remaining data points
+  _mm_store_ps(avgPointsVector, avgPointsVal);
+
+  sumMean = 0.0;
+  sumMean += avgPointsVector[0];
+  sumMean += avgPointsVector[1];
+  sumMean += avgPointsVector[2];
+  sumMean += avgPointsVector[3];
+
+  // Calculate the number of valid bins from the remaning count
+  float validBinCountVector[4] __attribute__((aligned(128)));
+  _mm_store_ps(validBinCountVector, vValidBinCount);
+
+  float validBinCount = 0;
+  validBinCount += validBinCountVector[0];
+  validBinCount += validBinCountVector[1];
+  validBinCount += validBinCountVector[2];
+  validBinCount += validBinCountVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    if(realDataPoints[number] <= meanAmplitude){
+      sumMean += realDataPoints[number];
+      validBinCount += 1.0;
+    }
+  }
+    
+  float localNoiseFloorAmplitude = 0;
+  if(validBinCount > 0.0){
+    localNoiseFloorAmplitude = sumMean / validBinCount;
+  }
+  else{
+    localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
+  }
+
+  *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the spectral noise floor of an input power spectrum
+
+  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB).  Provides a rough estimation of the signal noise floor.
+
+  \param realDataPoints The input power spectrum
+  \param num_points The number of data points in the input power spectrum vector
+  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+  float sumMean = 0.0;
+  unsigned int number;
+  // find the sum (for mean), etc
+  for(number = 0; number < num_points; number++){
+    // sum (for mean)
+    sumMean += realDataPoints[number];
+  }
+
+  // calculate the spectral mean
+  // +20 because for the comparison below we only want to throw out bins
+  // that are significantly higher (and would, thus, affect the mean more)
+  const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+
+  // now throw out any bins higher than the mean
+  sumMean = 0.0;
+  unsigned int newNumDataPoints = num_points;
+  for(number = 0; number < num_points; number++){
+    if (realDataPoints[number] <= meanAmplitude)
+      sumMean += realDataPoints[number];
+    else
+      newNumDataPoints--;
+  }
+
+  float localNoiseFloorAmplitude = 0.0;
+  if (newNumDataPoints == 0)             // in the odd case that all
+    localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
+  else
+    localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
+
+  *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H */
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index 8f57a9b90d..b1c55fc05a 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -66,6 +66,7 @@ void *make_aligned_buffer(unsigned int len, unsigned int size) {
   int ret;
   ret = posix_memalign((void**)&buf, 16, len * size);
   assert(ret == 0);
+  memset(buf, 0x00, len*size);
   return buf;
 }
 
@@ -220,32 +221,32 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
     assert(inputsig.size() != 0);
 }
 
-inline void run_cast_test1(volk_fn_1arg func, void *buff, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buff, vlen, arch.c_str());
+inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+    while(iter--) func(buffs[0], vlen, arch.c_str());
 }
 
-inline void run_cast_test2(volk_fn_2arg func, void *outbuff, std::vector<void *> &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(outbuff, inbuffs[0], vlen, arch.c_str());
+inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+    while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str());
 }
 
-inline void run_cast_test3(volk_fn_3arg func, void *outbuff, std::vector<void *> &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(outbuff, inbuffs[0], inbuffs[1], vlen, arch.c_str());
+inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+    while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
 }
 
-inline void run_cast_test4(volk_fn_4arg func, void *outbuff, std::vector<void *> &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(outbuff, inbuffs[0], inbuffs[1], inbuffs[2], vlen, arch.c_str());
+inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+    while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
 }
 
-inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, void *buff, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(buff, scalar, vlen, arch.c_str());
+inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+    while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
 }
 
-inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, void *outbuff, std::vector<void *> &inbuffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(outbuff, inbuffs[0], scalar, vlen, arch.c_str());
+inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+    while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
 }
 
-inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, void *outbuff, std::vector<void *> &inbuffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
-    while(iter--) func(outbuff, inbuffs[0], inbuffs[1], scalar, vlen, arch.c_str());
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+    while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
 }
 
 template <class t>
@@ -253,7 +254,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
     bool fail = false;
     int print_max_errs = 10;
     for(int i=0; i<vlen; i++) {
-        if(((t *)(in1))[i] < 1e-30) continue; //below around here we'll start to get roundoff errors due to float precision
+        if(((t *)(in1))[i] < 1e-30) continue; //this is a hack: below around here we'll start to get roundoff errors due to limited precision
         if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) {
             fail=true;
             if(print_max_errs-- > 0) {
@@ -291,74 +292,70 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
     std::vector<volk_type_t> inputsig, outputsig;
     get_signatures_from_name(inputsig, outputsig, name);
     
-    std::vector<volk_type_t> inputsc, outputsc;
+    //pull the input scalars into their own vector
+    std::vector<volk_type_t> inputsc;
     for(int i=0; i<inputsig.size(); i++) {
         if(inputsig[i].is_scalar) {
             inputsc.push_back(inputsig[i]);
             inputsig.erase(inputsig.begin() + i);
         }
     }
-    for(int i=0; i<outputsig.size(); i++) {
-        if(outputsig[i].is_scalar) {
-            outputsc.push_back(outputsig[i]);
-            outputsig.erase(outputsig.begin() + i);
-        }
-    }
-    assert(outputsc.size() == 0); //we don't do output scalars yet
 
     //for(int i=0; i<inputsig.size(); i++) std::cout << "Input: " << inputsig[i].str << std::endl;
     //for(int i=0; i<outputsig.size(); i++) std::cout << "Output: " << outputsig[i].str << std::endl;
-    std::vector<void *> inbuffs, outbuffs;
+    std::vector<void *> inbuffs;
+
+    make_buffer_for_signature(inbuffs, inputsig, vlen);
+    for(int i=0; i<inbuffs.size(); i++) {
+        load_random_data(inbuffs[i], inputsig[i], vlen);        
+    }
     
-    if(outputsig.size() == 0) { //we're operating in place...
-        //assert(inputsig.size() == 1); //we only support 0 output 1 input right now...
-        make_buffer_for_signature(inbuffs, inputsig, vlen); //let's make an input buffer
-        load_random_data(inbuffs[0], inputsig[0], vlen); //and load it with random data
-        BOOST_FOREACH(std::string arch, arch_list) { //then copy the same random data to each output buffer
-            make_buffer_for_signature(outbuffs, inputsig, vlen);
-            memcpy(outbuffs.back(), inbuffs[0], vlen*inputsig[0].size*(inputsig[0].is_complex?2:1));
-        }
-    } else {
-        make_buffer_for_signature(inbuffs, inputsig, vlen);
-        BOOST_FOREACH(std::string arch, arch_list) {
-            make_buffer_for_signature(outbuffs, outputsig, vlen);
+    //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
+    std::vector<std::vector<void *> > test_data;
+    for(int i=0; i<arch_list.size(); i++) {
+        std::vector<void *> arch_buffs;
+        for(int j=0; j<outputsig.size(); j++) {
+            arch_buffs.push_back(make_aligned_buffer(vlen, outputsig[j].size*(outputsig[j].is_complex ? 2 : 1)));
         }
-    
-        //and set the input buffers to something random
-        for(int i=0; i<inbuffs.size(); i++) {
-            load_random_data(inbuffs[i], inputsig[i], vlen);        
+        for(int j=0; j<inputsig.size(); j++) {
+            arch_buffs.push_back(inbuffs[j]);
         }
+        test_data.push_back(arch_buffs);
     }
+    
+    std::vector<volk_type_t> both_sigs;
+    both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
+    both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
 
     //now run the test
     clock_t start, end;
     for(int i = 0; i < arch_list.size(); i++) {
         start = clock();
 
-        switch(inputsig.size() + outputsig.size()) {
+        switch(both_sigs.size()) {
             case 1:
                 if(inputsc.size() == 0) {
-                    run_cast_test1((volk_fn_1arg)(manual_func), outbuffs[i], vlen, iter, arch_list[i]); 
+                    run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); 
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 127.0, vlen, iter, arch_list[i]);
+                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]);
                 } else throw "unsupported 1 arg function >1 scalars";
                 break;
             case 2:
                 if(inputsc.size() == 0) {
-                    run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
+                    run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 127.0, vlen, iter, arch_list[i]);
+                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]);
                 } else throw "unsupported 2 arg function >1 scalars";
                 break;
             case 3:
                 if(inputsc.size() == 0) {
-                    run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
+                    run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), outbuffs[i], inbuffs, 127.0, vlen, iter, arch_list[i]);
+                    run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]);
                 } else throw "unsupported 3 arg function >1 scalars";
                 break;
             case 4:
-                run_cast_test4((volk_fn_4arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
+                run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
                 break;
             default:
                 throw "no function handler for this signature";
@@ -375,61 +372,63 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
         if(arch_list[i] == "generic") generic_offset=i;
 
     //now compare
-    if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know
-    //TODO: loop over the output signature as well
+    //if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know
+    
     bool fail = false;
+    bool fail_global = false;
     for(int i=0; i<arch_list.size(); i++) {
         if(i != generic_offset) {
-            if(outputsig[0].is_float) {
-                if(outputsig[0].size == 8) {
-                    fail = fcompare((double *) outbuffs[generic_offset], (double *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
-                } else {
-                    fail = fcompare((float *) outbuffs[generic_offset], (float *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
-                }
-            } else {
-                //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
-                switch(outputsig[0].size) {
-                case 8:
-                    if(outputsig[0].is_signed) {
-                        fail = icompare((int64_t *) outbuffs[generic_offset], (int64_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
-                    } else {
-                        fail = icompare((uint64_t *) outbuffs[generic_offset], (uint64_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
-                    }
-                    break;
-                case 4:
-                    if(outputsig[0].is_signed) {
-                        fail = icompare((int32_t *) outbuffs[generic_offset], (int32_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+            for(int j=0; j<both_sigs.size(); j++) {
+                if(both_sigs[j].is_float) {
+                    if(both_sigs[j].size == 8) {
+                        fail = fcompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
                     } else {
-                        fail = icompare((uint32_t *) outbuffs[generic_offset], (uint32_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                        fail = fcompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
                     }
-                    break;
-                case 2:
-                    if(outputsig[0].is_signed) {
-                        fail = icompare((int16_t *) outbuffs[generic_offset], (int16_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
-                    } else {
-                        fail = icompare((uint16_t *) outbuffs[generic_offset], (uint16_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
-                    }
-                    break;
-                case 1:
-                    if(outputsig[0].is_signed) {
-                        fail = icompare((int8_t *) outbuffs[generic_offset], (int8_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
-                    } else {
-                        fail = icompare((uint8_t *) outbuffs[generic_offset], (uint8_t *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                } else {
+                    //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
+                    switch(both_sigs[j].size) {
+                    case 8:
+                        if(both_sigs[j].is_signed) {
+                            fail = icompare((int64_t *) test_data[generic_offset][j], (int64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        } else {
+                            fail = icompare((uint64_t *) test_data[generic_offset][j], (uint64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        }
+                        break;
+                    case 4:
+                        if(both_sigs[j].is_signed) {
+                            fail = icompare((int32_t *) test_data[generic_offset][j], (int32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        } else {
+                            fail = icompare((uint32_t *) test_data[generic_offset][j], (uint32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        }
+                        break;
+                    case 2:
+                        if(both_sigs[j].is_signed) {
+                            fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        } else {
+                            fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        }
+                        break;
+                    case 1:
+                        if(both_sigs[j].is_signed) {
+                            fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        } else {
+                            fail = icompare((uint8_t *) test_data[generic_offset][j], (uint8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol);
+                        }
+                        break;
+                    default:
+                        fail=1;
                     }
-                    break;
-                default:
-                    fail=1;
                 }
-                    
+                if(fail) {
+                    fail_global = true;
+                    std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
+                }
                 //fail = memcmp(outbuffs[generic_offset], outbuffs[i], outputsig[0].size * vlen * (outputsig[0].is_complex ? 2:1));
             }
-            if(fail) {
-                std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
-            }
         }
     }
-
-    return fail;
+    return fail_global;
 }
 
 
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index f813e843f2..4dd7f75993 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -7,13 +7,13 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     //in order...
 //    VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 2046, 10000);
     VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 0, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a16, 1, 2046, 10000);
     VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a16, 1e-5, 2046, 10000);
@@ -21,25 +21,25 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_16i_convert_8i_a16, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_16i_max_star_16i_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_max_star_16i_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_16u_byteswap_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_add_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_32f_power_32fc_a16, 1e-4, 2046, 1000);
-//    VOLK_RUN_TESTS(volk_32f_calc_spectral_noise_floor_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 2046, 10000);
@@ -49,19 +49,19 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 1, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 0, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 2046, 10000);
-    //VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000);
-    //VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 1, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 1, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a16, 1e-4, 2046, 10000);
@@ -84,8 +84,8 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     VOLK_RUN_TESTS(volk_64f_x2_min_64f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_64u_byteswap_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_64u_popcnt_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a16, 1e-4, 2046, 10000);
-- 
cgit v1.2.3


From fa8c8c8e9fcd74eda5edb58edc89be97bc4bfa0a Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Fri, 21 Jan 2011 15:29:08 -0800
Subject: Volk: added ability to spec scalar in test invocation

---
 .../volk/volk_16ic_deinterleave_real_8i_a16.h      |   4 +-
 volk/lib/qa_utils.cc                               |   8 +-
 volk/lib/qa_utils.h                                |   4 +-
 volk/lib/testqa.cc                                 | 172 ++++++++++-----------
 4 files changed, 94 insertions(+), 94 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
index 55a25702e2..cfbebd57b7 100644
--- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
+++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
@@ -53,7 +53,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con
   number = sixteenthPoints * 16;
   int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
   for(; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
     int16ComplexVectorPtr++;
   }
 }
@@ -71,7 +71,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, c
   int16_t* complexVectorPtr = (int16_t*)complexVector;
   int8_t* iBufferPtr = iBuffer;
   for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+    *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ / 256));
     complexVectorPtr++;
   }
 }
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index b1c55fc05a..67ce5ddefd 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -282,7 +282,7 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) {
     return fail;
 }
 
-bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, int vlen, int iter) {
+bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, float scalar, int vlen, int iter) {
     std::cout << "RUN_VOLK_TESTS: " << name << std::endl;
     
     //first let's get a list of available architectures for the test
@@ -337,21 +337,21 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
                 if(inputsc.size() == 0) {
                     run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); 
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]);
+                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
                 } else throw "unsupported 1 arg function >1 scalars";
                 break;
             case 2:
                 if(inputsc.size() == 0) {
                     run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]);
+                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
                 } else throw "unsupported 2 arg function >1 scalars";
                 break;
             case 3:
                 if(inputsc.size() == 0) {
                     run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]);
+                    run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
                 } else throw "unsupported 3 arg function >1 scalars";
                 break;
             case 4:
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
index 79fc8f0067..e2539060aa 100644
--- a/volk/lib/qa_utils.h
+++ b/volk/lib/qa_utils.h
@@ -18,9 +18,9 @@ volk_type_t volk_type_from_string(std::string);
 float uniform(void);
 void random_floats(float *buf, unsigned n);
 
-bool run_volk_tests(const int[], void(*)(), std::string, float, int, int);
+bool run_volk_tests(const int[], void(*)(), std::string, float, float, int, int);
 
-#define VOLK_RUN_TESTS(func, tol, len, iter) BOOST_CHECK_EQUAL(run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, len, iter), 0)
+#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_CHECK_EQUAL(run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter), 0)
 
 typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
 typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 4dd7f75993..9f4934dc09 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -7,93 +7,93 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     //in order...
 //    VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a16, 1e-4, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 0, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a16, 1, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a16, 1e-5, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16i_convert_8i_a16, 0, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16i_max_star_16i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_16u_byteswap_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_add_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_32f_power_32fc_a16, 1e-4, 2046, 1000);
-    VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a16, 1, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 0, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a16, 1, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a16, 1e-5, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a16, 1e-4, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_convert_8i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_max_star_16i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a16, 0, 0, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a16, 1e-4, 0, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_16u_byteswap_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_add_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_32f_power_32fc_a16, 1e-4, 0, 2046, 1000);
+    VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 20.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 10.0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a16, 0, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 0, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a16, 1, 2<<31, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 0, 128, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 0, 128, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16, 1e-4, 10, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 0, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 0, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_normalize_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_sqrt_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32i_x2_and_32i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32i_x2_or_32i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32u_byteswap_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_32u_popcnt_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_64f_convert_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_64f_x2_max_64f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_64f_x2_min_64f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_64u_byteswap_a16, 1e-4, 2046, 10000);
-//    VOLK_RUN_TESTS(volk_64u_popcnt_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8i_convert_16i_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8i_convert_16i_u, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a16, 1e-4, 2046, 10000);
-    VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 0, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_normalize_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a16, 1e-4, 4, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_sqrt_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32i_x2_and_32i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32i_x2_or_32i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32u_byteswap_a16, 0, 0, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_32u_popcnt_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_64f_convert_32f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_64f_x2_max_64f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_64f_x2_min_64f_a16, 1e-4, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_64u_byteswap_a16, 0, 0, 2046, 10000);
+//    VOLK_RUN_TESTS(volk_64u_popcnt_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a16, 0, 256, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8i_convert_16i_a16, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a16, 1e-4, 100, 2046, 10000);
+    VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 2046, 10000);
 
 }
-- 
cgit v1.2.3


From f832c9789be9fec46e211be4fb2355013d19c000 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Fri, 21 Jan 2011 18:24:02 -0800
Subject: Volk: Small changes to speed things up.

---
 volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h | 4 ++--
 volk/include/volk/volk_32f_s32f_convert_8i_a16.h       | 2 +-
 volk/lib/qa_utils.cc                                   | 2 +-
 volk/lib/testqa.cc                                     | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
index cfbebd57b7..55a25702e2 100644
--- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
+++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
@@ -53,7 +53,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con
   number = sixteenthPoints * 16;
   int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
   for(; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
+    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
     int16ComplexVectorPtr++;
   }
 }
@@ -71,7 +71,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, c
   int16_t* complexVectorPtr = (int16_t*)complexVector;
   int8_t* iBufferPtr = iBuffer;
   for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ / 256));
+    *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
     complexVectorPtr++;
   }
 }
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h
index c91448951e..f64f2a2130 100644
--- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h
@@ -106,7 +106,7 @@ static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, co
   unsigned int number = 0;
 
   for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
+    *outputVectorPtr++ = (int8_t)(*inputVectorPtr++  * scalar);
   }
 }
 #endif /* LV_HAVE_GENERIC */
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index 67ce5ddefd..9cafd459fa 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -32,7 +32,7 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) {
         if(type.size == 8) random_floats<double>((double *)data, n);
         else random_floats<float>((float *)data, n);
     } else {
-        float int_max = pow(2, type.size*8);
+        float int_max = float(uint64_t(2) << (type.size*8));
         if(type.is_signed) int_max /= 2.0;
         for(int i=0; i<n; i++) {
             float scaled_rand = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * int_max;
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 9f4934dc09..4cef7b443e 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -40,7 +40,7 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 0, 0, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 0, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 1, 32768, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1, 32768, 2046, 10000);
-- 
cgit v1.2.3


From f47466012c733526e04f8ceeb66a3677eec38cd9 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Tue, 25 Jan 2011 18:20:36 -0800
Subject: Volk: Orc impl for complex multiply fixed. Maybe some room for
 optimization.

---
 volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h   |  5 ++---
 volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc | 17 ++++++++++++++---
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h
index 224ab19c8e..b4214f5d2d 100644
--- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h
+++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h
@@ -81,10 +81,9 @@ static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, co
     \param bVector One of the vectors to be multiplied
     \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
   */
-extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points);
+extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
 static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-    static const float mask = -0.0;
-    volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points);
+    volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
 
diff --git a/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc
index b72dfe8e73..a27d722cdb 100644
--- a/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc
+++ b/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc
@@ -1,7 +1,18 @@
 .function volk_32fc_x2_multiply_32fc_a16_orc_impl
 .source 8 src1
 .source 8 src2
-.floatparam 4 mask
 .dest 8 dst
-.temp 8 tmp
-x2 mulf dst, src1, src2
+.temp 8 iqprod
+.temp 4 real
+.temp 4 imag
+.temp 4 ac
+.temp 4 bd
+.temp 8 swapped
+x2 mulf iqprod, src1, src2
+splitql bd, ac, iqprod
+subf real, ac, bd
+swaplq swapped, src1
+x2 mulf iqprod, swapped, src2
+splitql bd, ac, iqprod
+addf imag, ac, bd
+mergelq dst, real, imag
-- 
cgit v1.2.3


From 2a4c4f89187bf75caa34c7bc52fc32310a75c9f2 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Wed, 26 Jan 2011 15:28:35 -0800
Subject: Volk: fixed volk_8i_s32f_convert_32f_a16_orc_impl.

---
 volk/include/volk/volk_8i_s32f_convert_32f_a16.h   | 3 ++-
 volk/lib/testqa.cc                                 | 6 +++---
 volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc | 6 ++++--
 3 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h
index d5c8eeb51c..99a24ec10b 100644
--- a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h
+++ b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h
@@ -96,7 +96,8 @@ static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, con
   */
 extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
 static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points);
+    float invscalar = 1.0 / scalar;
+    volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, invscalar, num_points);
 }
 #endif /* LV_HAVE_ORC */
 
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 4cef7b443e..d6b9e347d5 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -49,8 +49,8 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 0, 128, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 0, 128, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 1, 128, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 0, 2046, 10000);
@@ -60,7 +60,7 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 2046, 10000);
 //    VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 0, 0, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 0, 32768, 2046, 10000);
+    VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 1, 32768, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 0, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 0, 2046, 10000);
diff --git a/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc b/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc
index 4e33f7b3b6..8f6e157e91 100644
--- a/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc
+++ b/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc
@@ -1,9 +1,11 @@
 .function volk_8i_s32f_convert_32f_a16_orc_impl
-.source 2 src
+.source 1 src
 .dest 4 dst
 .floatparam 4 scalar
 .temp 4 flsrc
 .temp 4 lsrc
-convswl lsrc, src
+.temp 2 ssrc
+convsbw ssrc, src
+convswl lsrc, ssrc
 convlf flsrc, lsrc
 mulf dst, flsrc, scalar
-- 
cgit v1.2.3


From e34a484084a5224ec3412bd7d6c6f285301f5d43 Mon Sep 17 00:00:00 2001
From: Nick Foster <nick@nerdnetworks.org>
Date: Wed, 26 Jan 2011 15:47:56 -0800
Subject: Volk: renamed volk_32fc_32f_power_32fc_a16 to
 volk_32fc_s32f_power_32fc_a16

---
 volk/include/volk/Makefile.am                     |   2 +-
 volk/include/volk/volk_32fc_32f_power_32fc_a16.h  | 109 ----------------------
 volk/include/volk/volk_32fc_s32f_power_32fc_a16.h | 109 ++++++++++++++++++++++
 volk/lib/testqa.cc                                |   2 +-
 4 files changed, 111 insertions(+), 111 deletions(-)
 delete mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32fc_s32f_power_32fc_a16.h

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 1eb46b602d..eb97775b0a 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -62,7 +62,7 @@ volkinclude_HEADERS = \
 	volk_32f_accumulator_s32f_a16.h \
 	volk_32f_x2_add_32f_a16.h \
 	volk_32fc_32f_multiply_32fc_a16.h \
-	volk_32fc_32f_power_32fc_a16.h \
+	volk_32fc_s32f_power_32fc_a16.h \
 	volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h \
 	volk_32fc_s32f_atan2_32f_a16.h \
 	volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \
diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_32f_power_32fc_a16.h
deleted file mode 100644
index 6f9e9e3ee1..0000000000
--- a/volk/include/volk/volk_32fc_32f_power_32fc_a16.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32f_power_32fc_a16_H
-#define INCLUDED_volk_32fc_32f_power_32fc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-
-#if LV_HAVE_LIB_SIMDMATH
-#include <simdmath.h>
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-/*!
-  \brief Takes each the input complex vector value to the specified power and stores the results in the return vector
-  \param cVector The vector where the results will be stored
-  \param aVector The complex vector of values to be taken to a power
-  \param power The power value to be applied to each data point
-  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
-*/
-static inline void volk_32fc_32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-
-#if LV_HAVE_LIB_SIMDMATH
-  __m128 vPower = _mm_set_ps1(power);
-  
-  __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
-  for(;number < quarterPoints; number++){
-    
-    cplxValue1 = _mm_load_ps((float*)aPtr); 
-    aPtr += 2;
-    
-    cplxValue2 = _mm_load_ps((float*)aPtr); 
-    aPtr += 2;
-    
-    // Convert to polar coordinates
-    
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-    
-    phase = atan2f4(qValue, iValue); // Calculate the Phase
-    
-    magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
-    
-    // Now calculate the power of the polar coordinate data
-    magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
-    
-    phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
-    
-    // Convert back to cartesian coordinates
-    iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
-    qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
-    
-    cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
-    cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
-    
-    _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
-    
-    cPtr += 2;
-    
-    _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
-    
-    cPtr += 2;
-  }
-  
-  number = quarterPoints * 4;
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-  lv_32fc_t complexPower;
-  ((float*)&complexPower)[0] = power;
-  ((float*)&complexPower)[1] = 0;
-  for(;number < num_points; number++){
-    *cPtr++ = lv_cpow((*aPtr++), complexPower);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Takes each the input complex vector value to the specified power and stores the results in the return vector
-    \param cVector The vector where the results will be stored
-    \param aVector The complex vector of values to be taken to a power
-    \param power The power value to be applied to each data point
-    \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
-  */
-static inline void volk_32fc_32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
-  lv_32fc_t* cPtr = cVector;
-  const lv_32fc_t* aPtr = aVector;
-  unsigned int number = 0;
-  lv_32fc_t complexPower;
-  ((float*)&complexPower)[0] = power;
-  ((float*)&complexPower)[1] = 0.0;
-
-  for(number = 0; number < num_points; number++){
-    *cPtr++ = lv_cpow((*aPtr++), complexPower);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_32f_power_32fc_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h
new file mode 100644
index 0000000000..3507fdb3cd
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a16_H
+#define INCLUDED_volk_32fc_s32f_power_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Takes each the input complex vector value to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The complex vector of values to be taken to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+*/
+static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  
+  lv_32fc_t* cPtr = cVector;
+  const lv_32fc_t* aPtr = aVector;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 vPower = _mm_set_ps1(power);
+  
+  __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
+  for(;number < quarterPoints; number++){
+    
+    cplxValue1 = _mm_load_ps((float*)aPtr); 
+    aPtr += 2;
+    
+    cplxValue2 = _mm_load_ps((float*)aPtr); 
+    aPtr += 2;
+    
+    // Convert to polar coordinates
+    
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+    
+    phase = atan2f4(qValue, iValue); // Calculate the Phase
+    
+    magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
+    
+    // Now calculate the power of the polar coordinate data
+    magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
+    
+    phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
+    
+    // Convert back to cartesian coordinates
+    iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
+    qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
+    
+    cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
+    cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
+    
+    _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
+    
+    cPtr += 2;
+    
+    _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
+    
+    cPtr += 2;
+  }
+  
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  lv_32fc_t complexPower;
+  ((float*)&complexPower)[0] = power;
+  ((float*)&complexPower)[1] = 0;
+  for(;number < num_points; number++){
+    *cPtr++ = lv_cpow((*aPtr++), complexPower);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Takes each the input complex vector value to the specified power and stores the results in the return vector
+    \param cVector The vector where the results will be stored
+    \param aVector The complex vector of values to be taken to a power
+    \param power The power value to be applied to each data point
+    \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+  */
+static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+  lv_32fc_t* cPtr = cVector;
+  const lv_32fc_t* aPtr = aVector;
+  unsigned int number = 0;
+  lv_32fc_t complexPower;
+  ((float*)&complexPower)[0] = power;
+  ((float*)&complexPower)[1] = 0.0;
+
+  for(number = 0; number < num_points; number++){
+    *cPtr++ = lv_cpow((*aPtr++), complexPower);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a16_H */
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index e9734411b9..f336708568 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE(volk_test_all) {
     VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32f_x2_add_32f_a16, 1e-4, 0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a16, 1e-4, 0, 2046, 10000);
-    VOLK_RUN_TESTS(volk_32fc_32f_power_32fc_a16, 1e-4, 0, 2046, 1000);
+    VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a16, 1e-4, 0, 2046, 1000);
     VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 20.0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 10.0, 2046, 10000);
     VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000);
-- 
cgit v1.2.3


From 6503e3b21978b71908400c994148836bec4a97b9 Mon Sep 17 00:00:00 2001
From: Tom Rondeau <trondeau@vt.edu>
Date: Sun, 30 Jan 2011 12:35:07 -0500
Subject: volk: Updating build structure to work when orc is not installed.

Distcheck passes for me if liborc is installed or not.
---
 volk/config/orc.m4                 | 24 +++++++++++-------------
 volk/configure.ac                  |  5 +----
 volk/include/volk/make_set_simd.py |  1 -
 volk/lib/Makefile.am               |  5 ++---
 4 files changed, 14 insertions(+), 21 deletions(-)

(limited to 'volk/include')

diff --git a/volk/config/orc.m4 b/volk/config/orc.m4
index a4653400cc..df0f3d6f33 100644
--- a/volk/config/orc.m4
+++ b/volk/config/orc.m4
@@ -7,10 +7,9 @@ AC_DEFUN([ORC_CHECK],
 [
   ORC_REQ=ifelse([$1], , "0.4.10", [$1])
   
-  enable_orc = auto
   if test "x$enable_orc" != "xno" ; then
     PKG_CHECK_MODULES(ORC, orc-0.4 >= $ORC_REQ, [
-      AC_DEFINE(HAVE_ORC, 1, [Use Orc])
+      AC_DEFINE(LV_HAVE_ORC, 1, [Use Orc])
       if test "x$ORCC" = "x" ; then
         ORCC=`$PKG_CONFIG --variable=orcc orc-0.4`
       fi
@@ -21,32 +20,31 @@ AC_DEFUN([ORC_CHECK],
       AC_SUBST(ORCC_FLAGS)
       AC_SUBST(ORC_LDFLAGS)
       AC_SUBST(ORC_CFLAGS)
-      HAVE_ORC=yes
-      HAVE_ORCC=yes
+      LV_HAVE_ORC=yes
+      LV_HAVE_ORCC=yes
       if test "x$cross_compiling" = "xyes" ; then
-        HAVE_ORCC=no
+        LV_HAVE_ORCC=no
       fi
     ], [
       if test "x$enable_orc" = "xyes" ; then
         AC_MSG_ERROR([--enable-orc specified, but Orc >= $ORC_REQ not found])
       fi
       AC_DEFINE(DISABLE_ORC, 1, [Disable Orc])
-      HAVE_ORC=no
-      HAVE_ORCC=no
+      LV_HAVE_ORC=no
+      LV_HAVE_ORCC=no
     ])
   else
     AC_DEFINE(DISABLE_ORC, 1, [Disable Orc])
-    HAVE_ORC=no
-    HAVE_ORCC=no
+    LV_HAVE_ORC=no
+    LV_HAVE_ORCC=no
   fi
-  AM_CONDITIONAL(HAVE_ORC, [test "x$HAVE_ORC" = "xyes"])
-  AM_CONDITIONAL(HAVE_ORCC, [test "x$HAVE_ORCC" = "xyes"])
-
+  AM_CONDITIONAL(LV_HAVE_ORC, [test "x$LV_HAVE_ORC" = "xyes"])
+  AM_CONDITIONAL(LV_HAVE_ORCC, [test "x$LV_HAVE_ORCC" = "xyes"])
 ]))
 
 AC_DEFUN([ORC_OUTPUT],
 [
-  if test "$HAVE_ORC" = yes ; then
+  if test "$LV_HAVE_ORC" = yes ; then
     printf "configure: *** Orc acceleration enabled.\n"
   else
     if test "x$enable_orc" = "xno" ; then
diff --git a/volk/configure.ac b/volk/configure.ac
index eeb7441baf..c493adad62 100644
--- a/volk/configure.ac
+++ b/volk/configure.ac
@@ -68,12 +68,9 @@ AC_CONFIG_FILES([\
 	  include/Makefile \
 	  include/volk/Makefile \
 	  lib/Makefile \
+	  orc/Makefile \
 	  volk.pc \
 	])
-	
-if test "$HAVE_ORC" = yes; then
-  AC_CONFIG_FILES([orc/Makefile])
-fi
 
 AC_OUTPUT
 
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
index f2b7c06562..019833b436 100644
--- a/volk/include/volk/make_set_simd.py
+++ b/volk/include/volk/make_set_simd.py
@@ -280,7 +280,6 @@ def make_set_simd(dom) :
     tempstring = tempstring + "  ;;\n"
     tempstring = tempstring + "  esac\n"
     tempstring = tempstring + "  LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n"
-    tempstring = tempstring + "  AM_CONDITIONAL(LV_HAVE_ORC, [test \"$LV_HAVE_ORC\" = \"yes\"])\n";
     tempstring = tempstring + "])\n"
    
     return tempstring;
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am
index 6f3d7fd864..af7c7f3356 100644
--- a/volk/lib/Makefile.am
+++ b/volk/lib/Makefile.am
@@ -45,7 +45,7 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \
 
 
 # list of programs run by "make check" and "make distcheck"
-TESTS = testqa
+#TESTS = testqa
 #orc stuff gets built in the ORC directory conditional to ORC being enabled.
 #it gets linked in during the build of libvolk as an added library.
 #there might be a better way to do this.
@@ -77,7 +77,7 @@ libvolk_la_SOURCES = 		\
 volk_orc_LDFLAGS = \
 	$(ORC_LDFLAGS) \
 	-lorc-0.4
-	
+
 volk_orc_LIBADD = \
 	../orc/libvolk_orc.la
 
@@ -103,7 +103,6 @@ endif
 #libvolk_qa_la_LIBADD = \
 #	libvolk.la \
 #	libvolk_runtime.la
-	
 
 # ----------------------------------------------------------------
 # headers that don't get installed
-- 
cgit v1.2.3