diff options
author | Nathan West <nathan.west@okstate.edu> | 2014-10-18 17:56:56 -0500 |
---|---|---|
committer | Nathan West <nathan.west@okstate.edu> | 2014-10-19 18:25:24 -0500 |
commit | 520ac293c30d725225c5f984b8bf55e6f1caecf3 (patch) | |
tree | 9885692f453a7ce28051a17313e1c378692751c2 | |
parent | 2e26c38cb01fee825d1f8ada952e56284c8ee8f1 (diff) |
volk: add neon support for 32fc_s32fc_multiply_32fc
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h | 41 |
1 files changed, 37 insertions, 4 deletions
/*
 * Patch context: diff --git a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
 * b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h, index 945b4b5a2c..474b982887 100644,
 * first hunk at @@ -252,6 +252,43 @@. The kernel below is inserted after the #endif that
 * closes the SSE3 kernel (LV_HAVE_SSE) and before the #ifdef LV_HAVE_GENERIC section.
 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
/*!
  \brief Multiplies every element of the input complex vector by a complex scalar and stores the results in the output vector
  \param cVector The vector where the results will be stored
  \param aVector The complex input vector to be scaled
  \param scalar The complex scalar each element of aVector is multiplied by
  \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
*/
static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number;
    const unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val;      /* val[0] = four real parts, val[1] = four imaginary parts */
    float32x4x2_t scalar_val; /* val[0] = real(scalar) in every lane, val[1] = imag(scalar) in every lane */
    float32x4x2_t product;

    /*
     * scalar is ONE complex value, i.e. two floats. Broadcast each component
     * to all four lanes with a one-element duplicating load. A vld2q_f32 here
     * would read eight floats (past the end of scalar) and leave garbage in
     * lanes 1..3.
     * NOTE(review): like the loads/stores below, this relies on lv_32fc_t
     * being laid out as {real, imag} floats -- confirm against volk_complex.h.
     */
    scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);

    /* Main loop: four complex points per iteration. */
    for(number = 0; number < quarter_points; ++number) {
        /* De-interleaving load: splits re/im of four complex inputs. */
        a_val = vld2q_f32((const float*)aPtr);

        /* (a.re + j a.im) * (s.re + j s.im):
         *   real = a.re*s.re - a.im*s.im
         *   imag = a.im*s.re + a.re*s.im
         */
        product.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
        product.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);

        /* vmlsq subtracts the product (real part), vmlaq adds it (imaginary part). */
        product.val[0] = vmlsq_f32(product.val[0], a_val.val[1], scalar_val.val[1]);
        product.val[1] = vmlaq_f32(product.val[1], a_val.val[0], scalar_val.val[1]);

        /* Re-interleaving store. Advance cPtr (not cVector) so the tail loop
         * below continues right after the last vectorised output. */
        vst2q_f32((float*)cPtr, product);
        aPtr += 4;
        cPtr += 4;
    }

    /* Tail: the remaining num_points % 4 points, one at a time. */
    for(number = quarter_points*4; number < num_points; number++){
        *cPtr++ = *aPtr++ * scalar;
    }
}
#endif /* LV_HAVE_NEON */
@@ -285,8 +322,4 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c } #endif /* LV_HAVE_GENERIC */ - - - - #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */ |