summaryrefslogtreecommitdiff
path: root/volk
diff options
context:
space:
mode:
author: Tom Rondeau <trondeau@vt.edu> 2012-02-06 10:31:28 -0500
committer: Tom Rondeau <trondeau@vt.edu> 2012-02-13 14:56:39 -0500
commit: 298623615b249de459cd12b5507dab921fc3210b (patch)
tree: fb1b3d7f3e115a0b18d9048df4dd9ba73eb6f01b /volk
parent: 47c390286d49e00498a3443a3dcb9f83d11c7ecc (diff)
volk: added unaligned version of adding 2 vectors.
Diffstat (limited to 'volk')
-rw-r--r-- volk/apps/volk_profile.cc 1
-rw-r--r-- volk/include/volk/Makefile.am 1
-rw-r--r-- volk/include/volk/volk_32f_x2_add_32f_u.h 66
-rw-r--r-- volk/lib/testqa.cc 1
4 files changed, 69 insertions, 0 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index f5f730df12..712c32bce9 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -34,6 +34,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_32f_x2_add_32f_u, 1e-4, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results);
VOLK_PROFILE(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 204600, 50, &results);
VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 204600, 1000, &results);
diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 312ff2d5a1..20864efbe5 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -53,6 +53,7 @@ volkinclude_HEADERS = \
volk_16u_byteswap_a.h \
volk_32f_accumulator_s32f_a.h \
volk_32f_x2_add_32f_a.h \
+ volk_32f_x2_add_32f_u.h \
volk_32f_s32f_multiply_32f_a.h \
volk_32f_s32f_multiply_32f_u.h \
volk_32fc_32f_multiply_32fc_a.h \
diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h
new file mode 100644
index 0000000000..e360a79580
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_add_32f_u.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
+#define INCLUDED_volk_32f_x2_add_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Adds the two input vectors element by element and stores the results in the third vector, using unaligned SSE loads and stores
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // Number of complete groups of four floats
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+
+ __m128 aVal, bVal, cVal;
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_loadu_ps(aPtr); // Unaligned load of four floats from each input
+ bVal = _mm_loadu_ps(bPtr);
+
+ cVal = _mm_add_ps(aVal, bVal); // Four element-wise sums at once
+
+ _mm_storeu_ps(cPtr,cVal); // Store the four sums into cVector with an unaligned store
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4; // Index of the first value not covered by the SSE loop
+ for(;number < num_points; number++){ // Add the remaining (num_points % 4) values one at a time
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Adds the two input vectors element by element and stores the results in the third vector, using a plain scalar loop
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++) + (*bPtr++); // Write one sum, then advance all three pointers to the next element
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 97624775ee..b00ea0b640 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -22,6 +22,7 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1);
VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1);