doc/doxygen-3.6/volk__32fc__x2__conjugate__dot__prod__32fc__u_8h_source.html

00001 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
00002 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
00003
00004
00005 #include<volk/volk_complex.h>
00006
00007
00008 #ifdef LV_HAVE_GENERIC
00009
00010
00011 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
00012
00013   float * res = (float*) result;
00014   float * in = (float*) input;
00015   float * tp = (float*) taps;
00016   unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
00017   unsigned int isodd = (num_bytes >> 3) &1;
00018
00019
00020
00021   float sum0[2] = {0,0};
00022   float sum1[2] = {0,0};
00023   unsigned int i = 0;
00024
00025
00026   for(i = 0; i < n_2_ccomplex_blocks; ++i) {
00027
00028     sum0[0] += in[0] * tp[0] + in[1] * tp[1];
00029     sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
00030     sum1[0] += in[2] * tp[2] + in[3] * tp[3];
00031     sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
00032
00033
00034     in += 4;
00035     tp += 4;
00036
00037   }
00038
00039
00040   res[0] = sum0[0] + sum1[0];
00041   res[1] = sum0[1] + sum1[1];
00042
00043
00044
00045   for(i = 0; i < isodd; ++i) {
00046
00047
00048     *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
00049
00050   }
00051   /*
00052   for(i = 0; i < num_bytes >> 3; ++i) {
00053     *result += input[i] * conjf(taps[i]);
00054   }
00055   */
00056 }
00057
00058 #endif /*LV_HAVE_GENERIC*/
00059
00060 #ifdef LV_HAVE_SSE3
00061
00062 #include <xmmintrin.h>
00063 #include <pmmintrin.h>
00064 #include <mmintrin.h>
00065
00066
00067 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
00068
00069   // Variable never used?
00070   //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
00071
00072   union HalfMask {
00073     uint32_t intRep[4];
00074     __m128 vec;
00075     } halfMask;
00076
00077   union NegMask {
00078     int intRep[4];
00079     __m128 vec;
00080   } negMask;
00081
00082   unsigned int offset = 0;
00083   float Rsum=0, Isum=0;
00084   float Im,Re;
00085
00086   __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
00087   __m128 zv = {0,0,0,0};
00088
00089   halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
00090   halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
00091
00092   negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
00093   negMask.intRep[1] = negMask.intRep[3] = 0;
00094
00095   // main loop
00096   while(num_bytes >= 4*sizeof(float)){
00097
00098     in1 = _mm_loadu_ps( (float*) (input+offset) );
00099     in2 = _mm_loadu_ps( (float*) (taps+offset) );
00100     Rv = _mm_mul_ps(in1, in2);
00101     fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
00102     Iv = _mm_mul_ps(in1, fehg);
00103     Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
00104     Ivm = _mm_xor_ps( negMask.vec, Iv );
00105     Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
00106     _mm_store_ss( &Im, Is );
00107     _mm_store_ss( &Re, Rs );
00108     num_bytes -= 4*sizeof(float);
00109     offset += 2;
00110     Rsum += Re;
00111     Isum += Im;
00112   }
00113
00114   // handle the last complex case ...
00115   if(num_bytes > 0){
00116
00117     if(num_bytes != 4){
00118       // bad things are happening
00119     }
00120
00121     in1 = _mm_loadu_ps( (float*) (input+offset) );
00122     in2 = _mm_loadu_ps( (float*) (taps+offset) );
00123     Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
00124     fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
00125     Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
00126     Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
00127     Ivm = _mm_xor_ps( negMask.vec, Iv );
00128     Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
00129     _mm_store_ss( &Im, Is );
00130     _mm_store_ss( &Re, Rs );
00131     Rsum += Re;
00132     Isum += Im;
00133   }
00134
00135   result[0] = lv_cmake(Rsum,Isum);
00136   return;
00137 }
00138
00139 #endif /*LV_HAVE_SSE3*/
00140
00141
00142 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
00143
00144
00145