doc/doxygen-3.6/volk__32fc__32f__dot__prod__32fc__a_8h_source.html

00001 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
00002 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
00003
00004 #include <volk/volk_common.h>
00005 #include<stdio.h>
00006
00007
00008 #ifdef LV_HAVE_GENERIC
00009
00010
00011 static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
00012
00013   float res[2];
00014   float *realpt = &res[0], *imagpt = &res[1];
00015   const float* aPtr = (float*)input;
00016   const float* bPtr=  taps;
00017   unsigned int number = 0;
00018
00019   *realpt = 0;
00020   *imagpt = 0;
00021
00022   for(number = 0; number < num_points; number++){
00023     *realpt += ((*aPtr++) * (*bPtr));
00024     *imagpt += ((*aPtr++) * (*bPtr++));
00025   }
00026
00027   *result = *(lv_32fc_t*)(&res[0]);
00028 }
00029
00030 #endif /*LV_HAVE_GENERIC*/
00031
00032
00033 #ifdef LV_HAVE_SSE
00034
00035
00036 static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points) {
00037
00038   unsigned int number = 0;
00039   const unsigned int sixteenthPoints = num_points / 8;
00040
00041   float res[2];
00042   float *realpt = &res[0], *imagpt = &res[1];
00043   const float* aPtr = (float*)input;
00044   const float* bPtr = taps;
00045
00046   __m128 a0Val, a1Val, a2Val, a3Val;
00047   __m128 b0Val, b1Val, b2Val, b3Val;
00048   __m128 x0Val, x1Val, x2Val, x3Val;
00049   __m128 c0Val, c1Val, c2Val, c3Val;
00050
00051   __m128 dotProdVal0 = _mm_setzero_ps();
00052   __m128 dotProdVal1 = _mm_setzero_ps();
00053   __m128 dotProdVal2 = _mm_setzero_ps();
00054   __m128 dotProdVal3 = _mm_setzero_ps();
00055
00056   for(;number < sixteenthPoints; number++){
00057
00058     a0Val = _mm_load_ps(aPtr);
00059     a1Val = _mm_load_ps(aPtr+4);
00060     a2Val = _mm_load_ps(aPtr+8);
00061     a3Val = _mm_load_ps(aPtr+12);
00062
00063     x0Val = _mm_load_ps(bPtr);
00064     x1Val = _mm_load_ps(bPtr);
00065     x2Val = _mm_load_ps(bPtr+4);
00066     x3Val = _mm_load_ps(bPtr+4);
00067     b0Val = _mm_unpacklo_ps(x0Val, x1Val);
00068     b1Val = _mm_unpackhi_ps(x0Val, x1Val);
00069     b2Val = _mm_unpacklo_ps(x2Val, x3Val);
00070     b3Val = _mm_unpackhi_ps(x2Val, x3Val);
00071
00072     c0Val = _mm_mul_ps(a0Val, b0Val);
00073     c1Val = _mm_mul_ps(a1Val, b1Val);
00074     c2Val = _mm_mul_ps(a2Val, b2Val);
00075     c3Val = _mm_mul_ps(a3Val, b3Val);
00076
00077     dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
00078     dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
00079     dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
00080     dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
00081
00082     aPtr += 16;
00083     bPtr += 8;
00084   }
00085
00086   dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
00087   dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
00088   dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
00089
00090   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00091
00092   _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
00093
00094   *realpt = dotProductVector[0];
00095   *imagpt = dotProductVector[1];
00096   *realpt += dotProductVector[2];
00097   *imagpt += dotProductVector[3];
00098
00099   number = sixteenthPoints*8;
00100   for(;number < num_points; number++){
00101     *realpt += ((*aPtr++) * (*bPtr));
00102     *imagpt += ((*aPtr++) * (*bPtr++));
00103   }
00104
00105   *result = *(lv_32fc_t*)(&res[0]);
00106 }
00107
00108 #endif /*LV_HAVE_SSE*/
00109
00110
00111 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/