diff options
author | Tom Rondeau <trondeau@vt.edu> | 2012-02-06 10:31:28 -0500 |
---|---|---|
committer | Tom Rondeau <trondeau@vt.edu> | 2012-02-13 14:56:39 -0500 |
commit | 298623615b249de459cd12b5507dab921fc3210b (patch) | |
tree | fb1b3d7f3e115a0b18d9048df4dd9ba73eb6f01b /volk | |
parent | 47c390286d49e00498a3443a3dcb9f83d11c7ecc (diff) |
volk: added unaligned version of adding 2 vectors.
Diffstat (limited to 'volk')
-rw-r--r-- | volk/apps/volk_profile.cc | 1 | ||||
-rw-r--r-- | volk/include/volk/Makefile.am | 1 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_add_32f_u.h | 66 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 1 |
4 files changed, 69 insertions, 0 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index f5f730df12..712c32bce9 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -34,6 +34,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_x2_add_32f_u, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results); VOLK_PROFILE(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 204600, 50, &results); VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 204600, 1000, &results); diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 312ff2d5a1..20864efbe5 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -53,6 +53,7 @@ volkinclude_HEADERS = \ volk_16u_byteswap_a.h \ volk_32f_accumulator_s32f_a.h \ volk_32f_x2_add_32f_a.h \ + volk_32f_x2_add_32f_u.h \ volk_32f_s32f_multiply_32f_a.h \ volk_32f_s32f_multiply_32f_u.h \ volk_32fc_32f_multiply_32fc_a.h \ diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h new file mode 100644 index 0000000000..e360a79580 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_add_32f_u.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_u_H +#define INCLUDED_volk_32f_x2_add_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */ diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 97624775ee..b00ea0b640 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -22,6 +22,7 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1); VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1); |