doc/doxygen-3.6/volk__16i__32fc__dot__prod__32fc__a_8h_source.html

00001 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
00002 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
00003
00004 #include <volk/volk_common.h>
00005 #include<stdio.h>
00006
00007
00008 #ifdef LV_HAVE_GENERIC
00009
00010
00011 static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
00012
00013   static const int N_UNROLL = 4;
00014
00015   lv_32fc_t acc0 = 0;
00016   lv_32fc_t acc1 = 0;
00017   lv_32fc_t acc2 = 0;
00018   lv_32fc_t acc3 = 0;
00019
00020   unsigned i = 0;
00021   unsigned n = (num_points / N_UNROLL) * N_UNROLL;
00022
00023   for(i = 0; i < n; i += N_UNROLL) {
00024     acc0 += taps[i + 0] * (float)input[i + 0];
00025     acc1 += taps[i + 1] * (float)input[i + 1];
00026     acc2 += taps[i + 2] * (float)input[i + 2];
00027     acc3 += taps[i + 3] * (float)input[i + 3];
00028   }
00029
00030   for(; i < num_points; i++) {
00031     acc0 += taps[i] * (float)input[i];
00032   }
00033
00034   *result = acc0 + acc1 + acc2 + acc3;
00035 }
00036
00037 #endif /*LV_HAVE_GENERIC*/
00038
00039
00040 #if LV_HAVE_SSE && LV_HAVE_MMX
00041
00042
00043 static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
00044
00045   unsigned int number = 0;
00046   const unsigned int sixteenthPoints = num_points / 8;
00047
00048   float res[2];
00049   float *realpt = &res[0], *imagpt = &res[1];
00050   const short* aPtr = input;
00051   const float* bPtr = (float*)taps;
00052
00053   __m64  m0, m1;
00054   __m128 f0, f1, f2, f3;
00055   __m128 a0Val, a1Val, a2Val, a3Val;
00056   __m128 b0Val, b1Val, b2Val, b3Val;
00057   __m128 c0Val, c1Val, c2Val, c3Val;
00058
00059   __m128 dotProdVal0 = _mm_setzero_ps();
00060   __m128 dotProdVal1 = _mm_setzero_ps();
00061   __m128 dotProdVal2 = _mm_setzero_ps();
00062   __m128 dotProdVal3 = _mm_setzero_ps();
00063
00064   for(;number < sixteenthPoints; number++){
00065
00066     m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
00067     m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
00068     f0 = _mm_cvtpi16_ps(m0);
00069     f1 = _mm_cvtpi16_ps(m0);
00070     f2 = _mm_cvtpi16_ps(m1);
00071     f3 = _mm_cvtpi16_ps(m1);
00072
00073     a0Val = _mm_unpacklo_ps(f0, f1);
00074     a1Val = _mm_unpackhi_ps(f0, f1);
00075     a2Val = _mm_unpacklo_ps(f2, f3);
00076     a3Val = _mm_unpackhi_ps(f2, f3);
00077
00078     b0Val = _mm_load_ps(bPtr);
00079     b1Val = _mm_load_ps(bPtr+4);
00080     b2Val = _mm_load_ps(bPtr+8);
00081     b3Val = _mm_load_ps(bPtr+12);
00082
00083     c0Val = _mm_mul_ps(a0Val, b0Val);
00084     c1Val = _mm_mul_ps(a1Val, b1Val);
00085     c2Val = _mm_mul_ps(a2Val, b2Val);
00086     c3Val = _mm_mul_ps(a3Val, b3Val);
00087
00088     dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
00089     dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
00090     dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
00091     dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
00092
00093     aPtr += 8;
00094     bPtr += 16;
00095   }
00096
00097   dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
00098   dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
00099   dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
00100
00101   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00102
00103   _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
00104
00105   *realpt = dotProductVector[0];
00106   *imagpt = dotProductVector[1];
00107   *realpt += dotProductVector[2];
00108   *imagpt += dotProductVector[3];
00109
00110   number = sixteenthPoints*8;
00111   for(;number < num_points; number++){
00112     *realpt += ((*aPtr)   * (*bPtr++));
00113     *imagpt += ((*aPtr++) * (*bPtr++));
00114   }
00115
00116   *result = *(lv_32fc_t*)(&res[0]);
00117 }
00118
00119 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
00120
00121
00122 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/