diff options
author | Abhishek Bhowmick <abhowmick22@gmail.com> | 2014-06-03 16:15:13 +0530 |
---|---|---|
committer | Tom Rondeau <tom@trondeau.com> | 2014-10-15 10:29:03 -0400 |
commit | 655b2ca7d0ce014ecf5a2f4c8184c09c0fc5a946 (patch) | |
tree | c0886152b77959644d8105c82584653e80d37519 | |
parent | bb79c5b07b6c7156a355d5d055f9efa5e10be974 (diff) |
volk: Added proto-kernels for convert, multiply, conjugate, deinterleave, magnitude, mag-square, psd functions.
-rw-r--r-- | volk/kernels/volk/volk_16i_s32f_convert_32f.h | 114 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_s32f_convert_16i.h | 114 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_conjugate_32fc.h | 76 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h | 51 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h | 102 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h | 44 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_magnitude_32f.h | 92 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_magnitude_squared_32f.h | 88 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h | 92 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_multiply_32fc.h | 98 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h | 106 |
11 files changed, 977 insertions, 0 deletions
diff --git a/volk/kernels/volk/volk_16i_s32f_convert_32f.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h index a810a601a0..e9890841f2 100644 --- a/volk/kernels/volk/volk_16i_s32f_convert_32f.h +++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h @@ -4,6 +4,63 @@ #include <inttypes.h> #include <stdio.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal, inputVal2; + __m128 ret; + __m256 output; + __m256 dummy = _mm256_setzero_ps(); + + for(;number < eighthPoints; number++){ + + // Load the 8 values + //inputVal = _mm_loadu_si128((__m128i*)inputPtr); + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + output = _mm256_insertf128_ps(dummy, ret, 0); + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + output = _mm256_insertf128_ps(output, ret, 1); + + _mm256_storeu_ps(outputVectorPtr, output); + + outputVectorPtr += 8; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE4_1 #include <smmintrin.h> @@ -126,6 +183,63 @@ static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const #include <inttypes.h> #include <stdio.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal, inputVal2; + __m128 ret; + __m256 output; + __m256 dummy = _mm256_setzero_ps(); + + for(;number < eighthPoints; number++){ + + // Load the 8 values + //inputVal = _mm_loadu_si128((__m128i*)inputPtr); + inputVal = _mm_load_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + output = _mm256_insertf128_ps(dummy, ret, 0); + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + output = _mm256_insertf128_ps(output, ret, 1); + + _mm256_store_ps(outputVectorPtr, output); + + outputVectorPtr += 8; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE4_1 #include <smmintrin.h> diff --git a/volk/kernels/volk/volk_32f_s32f_convert_16i.h b/volk/kernels/volk/volk_32f_s32f_convert_16i.h index 9fd758655f..7a53da6dee 100644 --- a/volk/kernels/volk/volk_32f_s32f_convert_16i.h +++ b/volk/kernels/volk/volk_32f_s32f_convert_16i.h @@ -5,6 +5,63 @@ #include <stdio.h> #include <math.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal, ret; + __m256i intInputVal; + __m128i intInputVal1, intInputVal2; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + + for(;number < eighthPoints; number++){ + inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; + + // Scale and clip + ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val); + + intInputVal = _mm256_cvtps_epi32(ret); + + intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); + intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE2 #include <emmintrin.h> /*! @@ -158,6 +215,63 @@ static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, cons #include <stdio.h> #include <math.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal, ret; + __m256i intInputVal; + __m128i intInputVal1, intInputVal2; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + + for(;number < eighthPoints; number++){ + inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + + // Scale and clip + ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val); + + intInputVal = _mm256_cvtps_epi32(ret); + + intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); + intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE2 #include <emmintrin.h> /*! diff --git a/volk/kernels/volk/volk_32fc_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_conjugate_32fc.h index 0f74b01816..80159d434f 100644 --- a/volk/kernels/volk/volk_32fc_conjugate_32fc.h +++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h @@ -6,6 +6,44 @@ #include <volk/volk_complex.h> #include <float.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m256 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); + + for(;number < quarterPoints; number++){ + + x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm256_xor_ps(x, conjugator); // conjugate register + + _mm256_storeu_ps((float*)c,x); // Store the results back into the C container + + a += 4; + c += 4; + } + + number = quarterPoints * 4; + + for(;number < num_points; number++) { + *c++ = lv_conj(*a++); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> /*! @@ -70,6 +108,44 @@ static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv #include <volk/volk_complex.h> #include <float.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m256 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); + + for(;number < quarterPoints; number++){ + + x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm256_xor_ps(x, conjugator); // conjugate register + + _mm256_store_ps((float*)c,x); // Store the results back into the C container + + a += 4; + c += 4; + } + + number = quarterPoints * 4; + + for(;number < num_points; number++) { + *c++ = lv_conj(*a++); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> /*! diff --git a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h index 0d33ed7e28..aad9fbe4db 100644 --- a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h @@ -4,6 +4,57 @@ #include <inttypes.h> #include <stdio.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + // Mask for real and imaginary parts + int realMask = 0x88; + int imagMask = 0xdd; + const unsigned int eighthPoints = num_points / 8; + __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; + for(;number < eighthPoints; number++){ + + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + + // Arrange in i1i2i3i4 format + iValue = _mm256_shuffle_ps(complex1, complex2, realMask); + // Arrange in q1q2q3q4 format + qValue = _mm256_shuffle_ps(complex1, complex2, imagMask); + + _mm256_store_ps(iBufferPtr, iValue); + _mm256_store_ps(qBufferPtr, qValue); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE #include <xmmintrin.h> /*! diff --git a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h index 4a4c5509bd..b0c93ff038 100644 --- a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h @@ -4,6 +4,57 @@ #include <inttypes.h> #include <stdio.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int quarterPoints = num_points / 4; + __m256 cplxValue; + __m128 complexH, complexL, fVal; + __m256d dVal; + + for(;number < quarterPoints; number++){ + + cplxValue = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; + + complexH = _mm256_extractf128_ps(cplxValue, 1); + complexL = _mm256_extractf128_ps(cplxValue, 0); + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm256_cvtps_pd(fVal); + _mm256_storeu_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm256_cvtps_pd(fVal); + _mm256_storeu_pd(qBufferPtr, dVal); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE2 #include <emmintrin.h> /*! @@ -82,6 +133,57 @@ static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, double #include <inttypes.h> #include <stdio.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int quarterPoints = num_points / 4; + __m256 cplxValue; + __m128 complexH, complexL, fVal; + __m256d dVal; + + for(;number < quarterPoints; number++){ + + cplxValue = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + complexH = _mm256_extractf128_ps(cplxValue, 1); + complexL = _mm256_extractf128_ps(cplxValue, 0); + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm256_cvtps_pd(fVal); + _mm256_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm256_cvtps_pd(fVal); + _mm256_store_pd(qBufferPtr, dVal); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE2 #include <emmintrin.h> /*! diff --git a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h index b1968296f5..9b5c57bc08 100644 --- a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h +++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h @@ -4,6 +4,50 @@ #include <inttypes.h> #include <stdio.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! + \brief Deinterleaves the complex vector into Q vector data + \param complexVector The complex input vector + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + int imagMask = 0xdd; + const float* complexVectorPtr = (const float*)complexVector; + float* qBufferPtr = qBuffer; + + __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; + for(;number < eighthPoints; number++){ + + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + + // Arrange in q1q2q3q4 format + qValue = _mm256_shuffle_ps(complex1, complex2, imagMask); + //iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm256_store_ps(qBufferPtr, qValue); + + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE #include <xmmintrin.h> /*! diff --git a/volk/kernels/volk/volk_32fc_magnitude_32f.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h index b6da7f3630..f611091693 100644 --- a/volk/kernels/volk/volk_32fc_magnitude_32f.h +++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h @@ -5,6 +5,52 @@ #include <stdio.h> #include <math.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m256 cplxValue1, cplxValue2, complex1, complex2, result; + for(;number < eighthPoints; number++){ + cplxValue1 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue2 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + + result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values + + result = _mm256_sqrt_ps(result); + + _mm256_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> /*! @@ -123,6 +169,52 @@ static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector, const #include <stdio.h> #include <math.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m256 cplxValue1, cplxValue2, complex1, complex2, result; + for(;number < eighthPoints; number++){ + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + + result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values + + result = _mm256_sqrt_ps(result); + + _mm256_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> /*! diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h index 878794ba79..be85bda1ac 100644 --- a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h +++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h @@ -5,6 +5,50 @@ #include <stdio.h> #include <math.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m256 cplxValue1, cplxValue2, complex1, complex2, result; + for(;number < eighthPoints; number++){ + cplxValue1 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue2 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + + result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values + + _mm256_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> /*! @@ -119,6 +163,50 @@ static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVecto #include <stdio.h> #include <math.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m256 cplxValue1, cplxValue2, complex1, complex2, result; + for(;number < eighthPoints; number++){ + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + + result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values + + _mm256_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> /*! diff --git a/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h index e73eb09f8f..9ec92e2ce3 100644 --- a/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h +++ b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h @@ -5,6 +5,98 @@ #include <stdio.h> #include <math.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + +#ifdef LV_HAVE_LIB_SIMDMATH +#include <simdmath.h> +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + +#ifdef LV_HAVE_LIB_SIMDMATH + __m256 magScalar = _mm256_set1_ps(10.0); + magScalar = _mm256_div_ps(magScalar, logf4(magScalar)); + + __m256 invRBW = _mm256_set1_ps(iRBW); + + __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor); + + __m256 power; + __m256 input1, input2; + const uint64_t eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the complex values + input1 =_mm256_load_ps(inputPtr); + inputPtr += 8; + input2 =_mm256_load_ps(inputPtr); + inputPtr += 8; + + // Apply the normalization factor + input1 = _mm256_mul_ps(input1, invNormalizationFactor); + input2 = _mm256_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm256_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm256_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20); + inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31); + + power = _mm256_hadd_ps(inputVal1, inputVal2); + + // Divide by the rbw + power = _mm256_mul_ps(power, invRBW); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm256_mul_ps(power, magScalar); + + // Store the floating point results + _mm256_store_ps(destPtr, power); + + destPtr += 8; + } + + number = eighthPoints*8; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } + +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h index 7c723bcdb6..cf08e91518 100644 --- a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h +++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h @@ -6,6 +6,55 @@ #include <volk/volk_complex.h> #include <float.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m256 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < quarterPoints; number++){ + + x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... + y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... + + tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... + + x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ... + + tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm256_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + + for(; number < num_points; number++) { + *c++ = (*a++) * (*b++); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> /*! @@ -83,6 +132,55 @@ static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const #include <volk/volk_complex.h> #include <float.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m256 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < quarterPoints; number++){ + + x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... + y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... + + tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... + + x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ... + + tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm256_store_ps((float*)c,z); // Store the results back into the C container + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + + for(; number < num_points; number++) { + *c++ = (*a++) * (*b++); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> /*! diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h index dbc123ff25..82070e07da 100644 --- a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h +++ b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h @@ -6,6 +6,59 @@ #include <volk/volk_complex.h> #include <float.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m256 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); + + for(;number < quarterPoints; number++){ + + x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... + y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... + + y = _mm256_xor_ps(y, conjugator); // conjugate y + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... + + tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... + + x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ... + + tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ... + + _mm256_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + + for(; number < num_points; number++) { + *c++ = (*a++) * lv_conj(*b++); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> /*! @@ -87,6 +140,59 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVect #include <volk/volk_complex.h> #include <float.h> +#ifdef LV_HAVE_AVX +#include <immintrin.h> + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m256 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); + + for(;number < quarterPoints; number++){ + + x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... + y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... + + y = _mm256_xor_ps(y, conjugator); // conjugate y + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... + + tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... + + x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ... + + tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ... + + _mm256_store_ps((float*)c,z); // Store the results back into the C container + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + + for(; number < num_points; number++) { + *c++ = (*a++) * lv_conj(*b++); + } +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> /*! |