volk: new unaligned versions of float multipliers.

author: Tom Rondeau <trondeau@vt.edu> 2012-02-04 11:04:30 -0500
committer: Tom Rondeau <trondeau@vt.edu> 2012-02-13 14:56:39 -0500
commit: f028a198cb47e0486ad41a29bd3b3dcf0663d766 (patch)
tree: 569f14c836d7c2757ca85547a34fbc737669ad29 /volk
parent: 3a21ccb8cdef50daa52f5d26293df3a03c821c99 (diff)
6 files changed, 231 insertions, 14 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 7da8651e9f..f5f730df12 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -77,6 +77,7 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32f_x2_max_32f_a, 1e-4, 0, 204600, 2000, &results);
     VOLK_PROFILE(volk_32f_x2_min_32f_a, 1e-4, 0, 204600, 2000, &results);
     VOLK_PROFILE(volk_32f_x2_multiply_32f_a, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32f_x2_multiply_32f_u, 1e-4, 0, 204600, 10000, &results);
     VOLK_PROFILE(volk_32f_s32f_normalize_a, 1e-4, 100, 204600, 10000, &results);
     VOLK_PROFILE(volk_32f_s32f_power_32f_a, 1e-4, 4, 204600, 100, &results);
     VOLK_PROFILE(volk_32f_sqrt_32f_a, 1e-4, 0, 204600, 100, &results);
diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index eea006bf2f..312ff2d5a1 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -54,6 +54,7 @@ volkinclude_HEADERS = \
 	volk_32f_accumulator_s32f_a.h \
 	volk_32f_x2_add_32f_a.h \
 	volk_32f_s32f_multiply_32f_a.h \
+	volk_32f_s32f_multiply_32f_u.h \
 	volk_32fc_32f_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_u.h \
@@ -100,6 +101,7 @@ volkinclude_HEADERS = \
 	volk_32f_x2_max_32f_a.h \
 	volk_32f_x2_min_32f_a.h \
 	volk_32f_x2_multiply_32f_a.h \
+	volk_32f_x2_multiply_32f_u.h \
 	volk_32f_s32f_normalize_a.h \
 	volk_32f_s32f_power_32f_a.h \
 	volk_32f_sqrt_32f_a.h \
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
new file mode 100644
index 0000000000..0e700060f7
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
@@ -0,0 +1,102 @@
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplies each value of aVector by scalar (SSE version using unaligned loads/stores)
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied
+  \param scalar The scalar value each element of aVector is multiplied by
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m128 aVal, bVal, cVal;
+    bVal = _mm_set_ps1(scalar); // Copy the scalar into all four lanes once, outside the loop
+    for(;number < quarterPoints; number++){
+      // Load four floats; loadu places no alignment requirement on aPtr
+      aVal = _mm_loadu_ps(aPtr); 
+      // Multiply each of the four lanes by the scalar
+      cVal = _mm_mul_ps(aVal, bVal); 
+      // storeu likewise places no alignment requirement on cPtr
+      _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      cPtr += 4;
+    }
+    // Handle the leftover (num_points % 4) values one at a time
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies each value of aVector by scalar (AVX version using unaligned loads/stores)
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied
+  \param scalar The scalar value each element of aVector is multiplied by
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m256 aVal, bVal, cVal;
+    bVal = _mm256_set1_ps(scalar); // Copy the scalar into all eight lanes once, outside the loop
+    for(;number < eighthPoints; number++){
+      // Load eight floats; loadu places no alignment requirement on aPtr
+      aVal = _mm256_loadu_ps(aPtr); 
+      // Multiply each of the eight lanes by the scalar
+      cVal = _mm256_mul_ps(aVal, bVal); 
+      // storeu likewise places no alignment requirement on cPtr
+      _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 8;
+      cPtr += 8;
+    }
+    // Handle the leftover (num_points % 8) values one at a time
+    number = eighthPoints * 8;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * scalar;
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies each value of aVector by scalar (plain C loop, no SIMD intrinsics)
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied
+  \param scalar The scalar value each element of aVector is multiplied by
+  \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const float* inputPtr = aVector;
+  float* outputPtr = cVector;
+  for(number = 0; number < num_points; number++){
+    *outputPtr = (*inputPtr) * scalar;
+    inputPtr++;
+    outputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h
new file mode 100644
index 0000000000..6c3ce5d83f
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_multiply_32f_u.h
@@ -0,0 +1,106 @@
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
+#define INCLUDED_volk_32f_x2_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplies the two input vectors element by element and stores the results in the third vector (SSE version using unaligned loads/stores)
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      // Load four floats from each input; loadu places no alignment requirement on the pointers
+      aVal = _mm_loadu_ps(aPtr); 
+      bVal = _mm_loadu_ps(bPtr);
+      // Multiply the four pairs of lanes
+      cVal = _mm_mul_ps(aVal, bVal); 
+      // storeu likewise places no alignment requirement on cPtr
+      _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+    // Handle the leftover (num_points % 4) values one at a time
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies the two input vectors element by element and stores the results in the third vector (AVX version using unaligned loads/stores)
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m256 aVal, bVal, cVal;
+    for(;number < eighthPoints; number++){
+      // Load eight floats from each input; loadu places no alignment requirement on the pointers
+      aVal = _mm256_loadu_ps(aPtr); 
+      bVal = _mm256_loadu_ps(bPtr);
+      // Multiply the eight pairs of lanes
+      cVal = _mm256_mul_ps(aVal, bVal); 
+      // storeu likewise places no alignment requirement on cPtr
+      _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 8;
+      bPtr += 8;
+      cPtr += 8;
+    }
+    // Handle the leftover (num_points % 8) values one at a time
+    number = eighthPoints * 8;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplies the two input vectors element by element and stores the results in the third vector (plain C loop, no SIMD intrinsics)
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+    // One product per iteration; all three pointers advance together
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
index 450a89066e..218c450f82 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
@@ -8,13 +8,13 @@
 
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
-  /*!
+/*!
   \brief Multiplies the input vector by a scalar and stores the results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector The vector to be multiplied
-    \param scalar The complex scalar to multiply aVector
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied
+  \param scalar The complex scalar to multiply aVector
+  \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+*/
 static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
   unsigned int number = 0;
     const unsigned int halfPoints = num_points / 2;
@@ -52,13 +52,13 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons
 #endif /* LV_HAVE_SSE */
 
 #ifdef LV_HAVE_GENERIC
-  /*!
+/*!
   \brief Multiplies the input vector by a scalar and stores the results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector The vector to be multiplied
-    \param scalar The complex scalar to multiply aVector
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied
+  \param scalar The complex scalar to multiply aVector
+  \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+*/
 static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
     lv_32fc_t* cPtr = cVector;
     const lv_32fc_t* aPtr = aVector;
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index fbd4bdea56..97624775ee 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -37,7 +37,6 @@ VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
 VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 1);
@@ -59,7 +58,6 @@ VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 1);
@@ -90,3 +88,11 @@ VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 1);
 VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_u, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 20460, 1);
author	Tom Rondeau <trondeau@vt.edu>	2012-02-04 11:04:30 -0500
committer	Tom Rondeau <trondeau@vt.edu>	2012-02-13 14:56:39 -0500
commit	f028a198cb47e0486ad41a29bd3b3dcf0663d766 (patch)
tree	569f14c836d7c2757ca85547a34fbc737669ad29 /volk
parent	3a21ccb8cdef50daa52f5d26293df3a03c821c99 (diff)