GNU Radio 3.6.5 C++ API
|
00001 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H 00002 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H 00003 00004 00005 #include<volk/volk_complex.h> 00006 00007 00008 #ifdef LV_HAVE_GENERIC 00009 00010 00011 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { 00012 00013 float * res = (float*) result; 00014 float * in = (float*) input; 00015 float * tp = (float*) taps; 00016 unsigned int n_2_ccomplex_blocks = num_bytes >> 4; 00017 unsigned int isodd = (num_bytes >> 3) &1; 00018 00019 00020 00021 float sum0[2] = {0,0}; 00022 float sum1[2] = {0,0}; 00023 unsigned int i = 0; 00024 00025 00026 for(i = 0; i < n_2_ccomplex_blocks; ++i) { 00027 00028 sum0[0] += in[0] * tp[0] + in[1] * tp[1]; 00029 sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; 00030 sum1[0] += in[2] * tp[2] + in[3] * tp[3]; 00031 sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; 00032 00033 00034 in += 4; 00035 tp += 4; 00036 00037 } 00038 00039 00040 res[0] = sum0[0] + sum1[0]; 00041 res[1] = sum0[1] + sum1[1]; 00042 00043 00044 00045 for(i = 0; i < isodd; ++i) { 00046 00047 00048 *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); 00049 00050 } 00051 /* 00052 for(i = 0; i < num_bytes >> 3; ++i) { 00053 *result += input[i] * conjf(taps[i]); 00054 } 00055 */ 00056 } 00057 00058 #endif /*LV_HAVE_GENERIC*/ 00059 00060 #ifdef LV_HAVE_SSE3 00061 00062 #include <xmmintrin.h> 00063 #include <pmmintrin.h> 00064 #include <mmintrin.h> 00065 00066 00067 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { 00068 00069 // Variable never used? 00070 //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; 00071 00072 union HalfMask { 00073 uint32_t intRep[4]; 00074 __m128 vec; 00075 } halfMask; 00076 00077 union NegMask { 00078 int intRep[4]; 00079 __m128 vec; 00080 } negMask; 00081 00082 unsigned int offset = 0; 00083 float Rsum=0, Isum=0; 00084 float Im,Re; 00085 00086 __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; 00087 __m128 zv = {0,0,0,0}; 00088 00089 halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; 00090 halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; 00091 00092 negMask.intRep[0] = negMask.intRep[2] = 0x80000000; 00093 negMask.intRep[1] = negMask.intRep[3] = 0; 00094 00095 // main loop 00096 while(num_bytes >= 4*sizeof(float)){ 00097 00098 in1 = _mm_loadu_ps( (float*) (input+offset) ); 00099 in2 = _mm_loadu_ps( (float*) (taps+offset) ); 00100 Rv = _mm_mul_ps(in1, in2); 00101 fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); 00102 Iv = _mm_mul_ps(in1, fehg); 00103 Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); 00104 Ivm = _mm_xor_ps( negMask.vec, Iv ); 00105 Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); 00106 _mm_store_ss( &Im, Is ); 00107 _mm_store_ss( &Re, Rs ); 00108 num_bytes -= 4*sizeof(float); 00109 offset += 2; 00110 Rsum += Re; 00111 Isum += Im; 00112 } 00113 00114 // handle the last complex case ... 00115 if(num_bytes > 0){ 00116 00117 if(num_bytes != 4){ 00118 // bad things are happening 00119 } 00120 00121 in1 = _mm_loadu_ps( (float*) (input+offset) ); 00122 in2 = _mm_loadu_ps( (float*) (taps+offset) ); 00123 Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec); 00124 fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); 00125 Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec); 00126 Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); 00127 Ivm = _mm_xor_ps( negMask.vec, Iv ); 00128 Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); 00129 _mm_store_ss( &Im, Is ); 00130 _mm_store_ss( &Re, Rs ); 00131 Rsum += Re; 00132 Isum += Im; 00133 } 00134 00135 result[0] = lv_cmake(Rsum,Isum); 00136 return; 00137 } 00138 00139 #endif /*LV_HAVE_SSE3*/ 00140 00141 00142 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ 00143 00144 00145