GNU Radio 3.5.3.2 C++ API
volk_32fc_x2_conjugate_dot_prod_32fc_u.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
00002 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
00003 
00004 
00005 #include<volk/volk_complex.h>
00006 
00007 
00008 #ifdef LV_HAVE_GENERIC
00009 
00010 
00011 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
00012   
00013   float * res = (float*) result;
00014   float * in = (float*) input;
00015   float * tp = (float*) taps;
00016   unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
00017   unsigned int isodd = (num_bytes >> 3) &1;
00018   
00019   
00020   
00021   float sum0[2] = {0,0};
00022   float sum1[2] = {0,0};
00023   unsigned int i = 0;
00024 
00025   
00026   for(i = 0; i < n_2_ccomplex_blocks; ++i) {
00027     
00028     sum0[0] += in[0] * tp[0] + in[1] * tp[1];
00029     sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
00030     sum1[0] += in[2] * tp[2] + in[3] * tp[3];
00031     sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
00032     
00033     
00034     in += 4;
00035     tp += 4;
00036 
00037   }
00038  
00039   
00040   res[0] = sum0[0] + sum1[0];
00041   res[1] = sum0[1] + sum1[1];
00042   
00043   
00044   
00045   for(i = 0; i < isodd; ++i) {
00046 
00047 
00048     *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
00049 
00050   }
00051   /*
00052   for(i = 0; i < num_bytes >> 3; ++i) {
00053     *result += input[i] * conjf(taps[i]);
00054   }
00055   */
00056 }
00057 
00058 #endif /*LV_HAVE_GENERIC*/
00059 
00060 #ifdef LV_HAVE_SSE3
00061 
00062 #include <xmmintrin.h>
00063 #include <pmmintrin.h>
00064 #include <mmintrin.h>
00065 
00066 
00067 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
00068 
00069   // Variable never used?
00070   //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
00071 
00072   union HalfMask {
00073     uint32_t intRep[4];
00074     __m128 vec;
00075     } halfMask;
00076  
00077   union NegMask {
00078     int intRep[4];
00079     __m128 vec;
00080   } negMask;
00081 
00082   unsigned int offset = 0;
00083   float Rsum=0, Isum=0;
00084   float Im,Re;
00085 
00086   __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
00087   __m128 zv = {0,0,0,0};
00088   
00089   halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
00090   halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
00091 
00092   negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
00093   negMask.intRep[1] = negMask.intRep[3] = 0;
00094   
00095   // main loop
00096   while(num_bytes >= 4*sizeof(float)){
00097 
00098     in1 = _mm_loadu_ps( (float*) (input+offset) );
00099     in2 = _mm_loadu_ps( (float*) (taps+offset) );
00100     Rv = _mm_mul_ps(in1, in2);
00101     fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
00102     Iv = _mm_mul_ps(in1, fehg);
00103     Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
00104     Ivm = _mm_xor_ps( negMask.vec, Iv );
00105     Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
00106     _mm_store_ss( &Im, Is );
00107     _mm_store_ss( &Re, Rs );
00108     num_bytes -= 4*sizeof(float);
00109     offset += 2;
00110     Rsum += Re;
00111     Isum += Im;
00112   }
00113 
00114   // handle the last complex case ...
00115   if(num_bytes > 0){
00116 
00117     if(num_bytes != 4){
00118       // bad things are happening
00119     }
00120 
00121     in1 = _mm_loadu_ps( (float*) (input+offset) );
00122     in2 = _mm_loadu_ps( (float*) (taps+offset) );
00123     Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
00124     fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
00125     Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
00126     Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
00127     Ivm = _mm_xor_ps( negMask.vec, Iv );
00128     Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
00129     _mm_store_ss( &Im, Is );
00130     _mm_store_ss( &Re, Rs );
00131     Rsum += Re;
00132     Isum += Im;
00133   }
00134 
00135   result[0] = lv_cmake(Rsum,Isum);
00136   return;
00137 }
00138 
00139 #endif /*LV_HAVE_SSE3*/
00140 
00141 
00142 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
00143 
00144 
00145