volk: Added proto-kernels for convert, multiply, conjugate, deinterleave, magnitude, mag-square, psd functions.

author: Abhishek Bhowmick <abhowmick22@gmail.com> 2014-06-03 16:15:13 +0530
committer: Tom Rondeau <tom@trondeau.com> 2014-10-15 10:29:03 -0400
commit: 655b2ca7d0ce014ecf5a2f4c8184c09c0fc5a946 (patch)
tree: c0886152b77959644d8105c82584653e80d37519
parent: bb79c5b07b6c7156a355d5d055f9efa5e10be974 (diff)
11 files changed, 977 insertions, 0 deletions
diff --git a/volk/kernels/volk/volk_16i_s32f_convert_32f.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
index a810a601a0..e9890841f2 100644
--- a/volk/kernels/volk/volk_16i_s32f_convert_32f.h
+++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -4,6 +4,63 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+     float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal, inputVal2;
+    __m128 ret;
+    __m256 output;
+    __m256 dummy = _mm256_setzero_ps();
+
+    for(;number < eighthPoints; number++){
+
+      // Load the 8 values
+      //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      // Shift the input data to the right by 64 bits ( 8 bytes )
+      inputVal2 = _mm_srli_si128(inputVal, 8);
+
+      // Convert the lower 4 values into 32 bit words
+      inputVal = _mm_cvtepi16_epi32(inputVal);
+      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      output = _mm256_insertf128_ps(dummy, ret, 0);
+
+      ret = _mm_cvtepi32_ps(inputVal2);
+      ret = _mm_mul_ps(ret, invScalar);
+      output = _mm256_insertf128_ps(output, ret, 1);
+      
+      _mm256_storeu_ps(outputVectorPtr, output);
+
+      outputVectorPtr += 8;
+
+      inputPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) / scalar;
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
 
@@ -126,6 +183,63 @@ static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+     float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal, inputVal2;
+    __m128 ret;
+    __m256 output;
+    __m256 dummy = _mm256_setzero_ps();
+
+    for(;number < eighthPoints; number++){
+
+      // Load the 8 values
+      //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+      inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+      // Shift the input data to the right by 64 bits ( 8 bytes )
+      inputVal2 = _mm_srli_si128(inputVal, 8);
+
+      // Convert the lower 4 values into 32 bit words
+      inputVal = _mm_cvtepi16_epi32(inputVal);
+      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      output = _mm256_insertf128_ps(dummy, ret, 0);
+
+      ret = _mm_cvtepi32_ps(inputVal2);
+      ret = _mm_mul_ps(ret, invScalar);
+      output = _mm256_insertf128_ps(output, ret, 1);
+      
+      _mm256_store_ps(outputVectorPtr, output);
+
+      outputVectorPtr += 8;
+
+      inputPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) / scalar;
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
 
diff --git a/volk/kernels/volk/volk_32f_s32f_convert_16i.h b/volk/kernels/volk/volk_32f_s32f_convert_16i.h
index 9fd758655f..7a53da6dee 100644
--- a/volk/kernels/volk/volk_32f_s32f_convert_16i.h
+++ b/volk/kernels/volk/volk_32f_s32f_convert_16i.h
@@ -5,6 +5,63 @@
 #include <stdio.h>
 #include <math.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+
+  float min_val = -32768;
+  float max_val = 32767;
+  float r;
+
+  __m256 vScalar = _mm256_set1_ps(scalar);
+  __m256 inputVal, ret;
+  __m256i intInputVal;
+  __m128i intInputVal1, intInputVal2;
+  __m256 vmin_val = _mm256_set1_ps(min_val);
+  __m256 vmax_val = _mm256_set1_ps(max_val);
+
+  for(;number < eighthPoints; number++){
+    inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+
+    // Scale and clip
+    ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
+   
+    intInputVal = _mm256_cvtps_epi32(ret);
+
+    intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+    intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int16_t)rintf(r);
+  }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
   /*!
@@ -158,6 +215,63 @@ static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, cons
 #include <stdio.h>
 #include <math.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+
+  float min_val = -32768;
+  float max_val = 32767;
+  float r;
+
+  __m256 vScalar = _mm256_set1_ps(scalar);
+  __m256 inputVal, ret;
+  __m256i intInputVal;
+  __m128i intInputVal1, intInputVal2;
+  __m256 vmin_val = _mm256_set1_ps(min_val);
+  __m256 vmax_val = _mm256_set1_ps(max_val);
+
+  for(;number < eighthPoints; number++){
+    inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+
+    // Scale and clip
+    ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
+   
+    intInputVal = _mm256_cvtps_epi32(ret);
+
+    intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+    intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int16_t)rintf(r);
+  }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
   /*!
diff --git a/volk/kernels/volk/volk_32fc_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
index 0f74b01816..80159d434f 100644
--- a/volk/kernels/volk/volk_32fc_conjugate_32fc.h
+++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
@@ -6,6 +6,44 @@
 #include <volk/volk_complex.h>
 #include <float.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Takes the conjugate of a complex vector.
+    \param cVector The vector where the results will be stored
+    \param aVector Vector to be conjugated
+    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+  */
+static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m256 x;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+
+    __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+    for(;number < quarterPoints; number++){
+
+      x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+      x = _mm256_xor_ps(x, conjugator); // conjugate register
+
+      _mm256_storeu_ps((float*)c,x); // Store the results back into the C container
+
+      a += 4;
+      c += 4;
+    }
+
+    number = quarterPoints * 4;
+
+    for(;number < num_points; number++) {
+      *c++ = lv_conj(*a++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
@@ -70,6 +108,44 @@ static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv
 #include <volk/volk_complex.h>
 #include <float.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Takes the conjugate of a complex vector.
+    \param cVector The vector where the results will be stored
+    \param aVector Vector to be conjugated
+    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+  */
+static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m256 x;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+
+    __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+    for(;number < quarterPoints; number++){
+
+      x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+      x = _mm256_xor_ps(x, conjugator); // conjugate register
+
+      _mm256_store_ps((float*)c,x); // Store the results back into the C container
+
+      a += 4;
+      c += 4;
+    }
+
+    number = quarterPoints * 4;
+
+    for(;number < num_points; number++) {
+      *c++ = lv_conj(*a++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
index 0d33ed7e28..aad9fbe4db 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
@@ -4,6 +4,57 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  // Mask for real and imaginary parts
+  int realMask = 0x88;
+  int imagMask = 0xdd;
+  const unsigned int eighthPoints = num_points / 8;
+  __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+  for(;number < eighthPoints; number++){
+
+    cplxValue1 = _mm256_load_ps(complexVectorPtr);
+    complexVectorPtr += 8;
+
+    cplxValue2 = _mm256_load_ps(complexVectorPtr);
+    complexVectorPtr += 8;
+
+    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+   
+    // Arrange in i1i2i3i4 format
+    iValue = _mm256_shuffle_ps(complex1, complex2, realMask);
+    // Arrange in q1q2q3q4 format
+    qValue = _mm256_shuffle_ps(complex1, complex2, imagMask);
+
+    _mm256_store_ps(iBufferPtr, iValue);
+    _mm256_store_ps(qBufferPtr, qValue);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
 /*!
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
index 4a4c5509bd..b0c93ff038 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
@@ -4,6 +4,57 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int quarterPoints = num_points / 4;
+    __m256 cplxValue; 
+    __m128 complexH, complexL, fVal;
+    __m256d dVal;
+
+    for(;number < quarterPoints; number++){
+
+      cplxValue = _mm256_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 8;
+
+      complexH = _mm256_extractf128_ps(cplxValue, 1);
+      complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+      // Arrange in i1i2i1i2 format
+      fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2,0,2,0));
+      dVal = _mm256_cvtps_pd(fVal);
+      _mm256_storeu_pd(iBufferPtr, dVal);
+
+      // Arrange in q1q2q1q2 format
+      fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3,1,3,1));
+      dVal = _mm256_cvtps_pd(fVal);
+      _mm256_storeu_pd(qBufferPtr, dVal);
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      *iBufferPtr++ = *complexVectorPtr++;
+      *qBufferPtr++ = *complexVectorPtr++;
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
 /*!
@@ -82,6 +133,57 @@ static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, double
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int quarterPoints = num_points / 4;
+    __m256 cplxValue; 
+    __m128 complexH, complexL, fVal;
+    __m256d dVal;
+
+    for(;number < quarterPoints; number++){
+
+      cplxValue = _mm256_load_ps(complexVectorPtr);
+      complexVectorPtr += 8;
+
+      complexH = _mm256_extractf128_ps(cplxValue, 1);
+      complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+      // Arrange in i1i2i1i2 format
+      fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2,0,2,0));
+      dVal = _mm256_cvtps_pd(fVal);
+      _mm256_store_pd(iBufferPtr, dVal);
+
+      // Arrange in q1q2q1q2 format
+      fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3,1,3,1));
+      dVal = _mm256_cvtps_pd(fVal);
+      _mm256_store_pd(qBufferPtr, dVal);
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      *iBufferPtr++ = *complexVectorPtr++;
+      *qBufferPtr++ = *complexVectorPtr++;
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
 /*!
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
index b1968296f5..9b5c57bc08 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
@@ -4,6 +4,50 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into Q vector data
+  \param complexVector The complex input vector
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+  int imagMask = 0xdd;
+  const float* complexVectorPtr = (const float*)complexVector;
+  float* qBufferPtr = qBuffer;
+
+  __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+  for(;number < eighthPoints; number++){
+
+    cplxValue1 = _mm256_load_ps(complexVectorPtr);
+    complexVectorPtr += 8;
+
+    cplxValue2 = _mm256_load_ps(complexVectorPtr);
+    complexVectorPtr += 8;
+
+    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+    
+    // Arrange in q1q2q3q4 format
+    qValue = _mm256_shuffle_ps(complex1, complex2, imagMask);  
+    //iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    _mm256_store_ps(qBufferPtr, qValue);
+
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
 /*!
diff --git a/volk/kernels/volk/volk_32fc_magnitude_32f.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h
index b6da7f3630..f611091693 100644
--- a/volk/kernels/volk/volk_32fc_magnitude_32f.h
+++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h
@@ -5,6 +5,52 @@
 #include <stdio.h>
 #include <math.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m256 cplxValue1, cplxValue2, complex1, complex2, result;
+    for(;number < eighthPoints; number++){
+      cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 8;
+
+      cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 8;
+
+      cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+      complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+      
+      result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+
+      result = _mm256_sqrt_ps(result);
+
+      _mm256_storeu_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
@@ -123,6 +169,52 @@ static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector, const
 #include <stdio.h>
 #include <math.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m256 cplxValue1, cplxValue2, complex1, complex2, result;
+    for(;number < eighthPoints; number++){
+      cplxValue1 = _mm256_load_ps(complexVectorPtr);
+      complexVectorPtr += 8;
+
+      cplxValue2 = _mm256_load_ps(complexVectorPtr);
+      complexVectorPtr += 8;
+
+      cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+      
+      complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+      complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+      result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+
+      result = _mm256_sqrt_ps(result);
+
+      _mm256_store_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
index 878794ba79..be85bda1ac 100644
--- a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
+++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
@@ -5,6 +5,50 @@
 #include <stdio.h>
 #include <math.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m256 cplxValue1, cplxValue2, complex1, complex2, result;
+    for(;number < eighthPoints; number++){
+      cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 8;
+
+      cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 8;
+
+      cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+      complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+      result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+
+      _mm256_storeu_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
@@ -119,6 +163,50 @@ static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVecto
 #include <stdio.h>
 #include <math.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m256 cplxValue1, cplxValue2, complex1, complex2, result;
+    for(;number < eighthPoints; number++){
+      cplxValue1 = _mm256_load_ps(complexVectorPtr);
+      complexVectorPtr += 8;
+
+      cplxValue2 = _mm256_load_ps(complexVectorPtr);
+      complexVectorPtr += 8;
+
+      cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+      complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+      result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+
+      _mm256_store_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
diff --git a/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
index e73eb09f8f..9ec92e2ce3 100644
--- a/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
+++ b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
@@ -5,6 +5,98 @@
 #include <stdio.h>
 #include <math.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided against all the input values before the power is calculated
+  \param rbw The resolution bandwith of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  const float* inputPtr = (const float*)complexFFTInput;
+  float* destPtr = logPowerOutput;
+  uint64_t number = 0;
+  const float iRBW = 1.0 / rbw;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+  __m256 magScalar = _mm256_set1_ps(10.0);
+  magScalar = _mm256_div_ps(magScalar, logf4(magScalar));
+
+  __m256 invRBW = _mm256_set1_ps(iRBW);
+
+  __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor);
+
+  __m256 power;
+  __m256 input1, input2;
+  const uint64_t eighthPoints = num_points / 8;
+  for(;number < eighthPoints; number++){
+    // Load the complex values
+    input1 =_mm256_load_ps(inputPtr);
+    inputPtr += 8;
+    input2 =_mm256_load_ps(inputPtr);
+    inputPtr += 8;
+
+    // Apply the normalization factor
+    input1 = _mm256_mul_ps(input1, invNormalizationFactor);
+    input2 = _mm256_mul_ps(input2, invNormalizationFactor);
+
+    // Multiply each value by itself
+    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+    input1 = _mm256_mul_ps(input1, input1);
+    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+    input2 = _mm256_mul_ps(input2, input2);
+
+    // Horizontal add, to add (r*r) + (i*i) for each complex value
+    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+    inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20);
+    inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31);
+
+    power = _mm256_hadd_ps(inputVal1, inputVal2);
+
+    // Divide by the rbw
+    power = _mm256_mul_ps(power, invRBW);
+
+    // Calculate the natural log power
+    power = logf4(power);
+
+    // Convert to log10 and multiply by 10.0
+    power = _mm256_mul_ps(power, magScalar);
+
+    // Store the floating point results
+    _mm256_store_ps(destPtr, power);
+
+    destPtr += 8;
+  }
+
+  number = eighthPoints*8;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the FFT for any remaining points
+  for(; number < num_points; number++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+    destPtr++;
+  }
+
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
 
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
index 7c723bcdb6..cf08e91518 100644
--- a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -6,6 +6,55 @@
 #include <volk/volk_complex.h>
 #include <float.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m256 x, y, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for(;number < quarterPoints; number++){
+
+      x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+      y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+
+      yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+      yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+
+      tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+      x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
+
+      tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+      z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+      _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
+
+      a += 4;
+      b += 4;
+      c += 4;
+    }
+
+    number = quarterPoints * 4;
+
+    for(; number < num_points; number++) {
+      *c++ = (*a++) * (*b++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
@@ -83,6 +132,55 @@ static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const
 #include <volk/volk_complex.h>
 #include <float.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m256 x, y, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for(;number < quarterPoints; number++){
+
+      x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+      y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+
+      yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+      yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+
+      tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+      x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
+
+      tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+      z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+      _mm256_store_ps((float*)c,z); // Store the results back into the C container
+
+      a += 4;
+      b += 4;
+      c += 4;
+    }
+
+    number = quarterPoints * 4;
+
+    for(; number < num_points; number++) {
+      *c++ = (*a++) * (*b++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
index dbc123ff25..82070e07da 100644
--- a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
@@ -6,6 +6,59 @@
 #include <volk/volk_complex.h>
 #include <float.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector First vector to be multiplied
+    \param bVector Second vector that is conjugated before being multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m256 x, y, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+    for(;number < quarterPoints; number++){
+
+      x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+      y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+
+      y = _mm256_xor_ps(y, conjugator); // conjugate y
+
+      yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+      yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+
+      tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+      x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
+
+      tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+      z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ...
+
+      _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
+
+      a += 4;
+      b += 4;
+      c += 4;
+    }
+
+    number = quarterPoints * 4;
+
+    for(; number < num_points; number++) {
+      *c++ = (*a++) * lv_conj(*b++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
@@ -87,6 +140,59 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVect
 #include <volk/volk_complex.h>
 #include <float.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector First vector to be multiplied
+    \param bVector Second vector that is conjugated before being multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m256 x, y, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+    for(;number < quarterPoints; number++){
+
+      x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+      y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+
+      y = _mm256_xor_ps(y, conjugator); // conjugate y
+
+      yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+      yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+
+      tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+      x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
+
+      tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+      z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ...
+
+      _mm256_store_ps((float*)c,z); // Store the results back into the C container
+
+      a += 4;
+      b += 4;
+      c += 4;
+    }
+
+    number = quarterPoints * 4;
+
+    for(; number < num_points; number++) {
+      *c++ = (*a++) * lv_conj(*b++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
author	Abhishek Bhowmick <abhowmick22@gmail.com>	2014-06-03 16:15:13 +0530
committer	Tom Rondeau <tom@trondeau.com>	2014-10-15 10:29:03 -0400
commit	655b2ca7d0ce014ecf5a2f4c8184c09c0fc5a946 (patch)
tree	c0886152b77959644d8105c82584653e80d37519
parent	bb79c5b07b6c7156a355d5d055f9efa5e10be974 (diff)