Statistics
| Branch: | Tag: | Revision:

root / volk / include / volk / volk_32fc_x2_conjugate_dot_prod_32fc_u.h @ ccfac187

History | View | Annotate | Download (3.3 kB)

1
#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
2
#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
3
4
5
#include<volk/volk_complex.h>
6
7
8
#ifdef LV_HAVE_GENERIC
9
10
11
/*
 * Portable C conjugate dot product of two complex float vectors.
 *
 * Accumulates input[k] * conj(taps[k]) over every complex element and writes
 * the total to *result.
 *
 *   result    - receives the accumulated complex sum (one element is written)
 *   input     - first vector
 *   taps      - second vector; each element is conjugated before multiplying
 *   num_bytes - vector length in bytes; every 8 bytes are treated as one
 *               complex element, so (num_bytes >> 3) elements are processed
 */
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {

  /* View the complex arrays as interleaved (real, imag) float pairs. */
  float* out = (float*) result;
  const float* a = (const float*) input;
  const float* b = (const float*) taps;

  const unsigned int num_points = num_bytes >> 3;  /* complex elements      */
  const unsigned int num_pairs = num_bytes >> 4;   /* blocks of two elements */

  /* Two independent running sums: one for the first element of each pair,
     one for the second. They are combined after the loop. */
  float first_re = 0, first_im = 0;
  float second_re = 0, second_im = 0;
  unsigned int remaining;

  for(remaining = num_pairs; remaining != 0; --remaining, a += 4, b += 4) {
    /* (ar + j*ai) * (br - j*bi) = (ar*br + ai*bi) + j*(ai*br - ar*bi) */
    first_re += a[0] * b[0] + a[1] * b[1];
    first_im += (-a[0] * b[1]) + a[1] * b[0];
    second_re += a[2] * b[2] + a[3] * b[3];
    second_im += (-a[2] * b[3]) + a[3] * b[2];
  }

  out[0] = first_re + second_re;
  out[1] = first_im + second_im;

  /* An odd element count leaves one trailing element not covered above. */
  if(num_points & 1) {
    const unsigned int last = num_points - 1;
    *result += input[last] * lv_conj(taps[last]);
  }
}
57
58
#endif /*LV_HAVE_GENERIC*/
59
60
#ifdef LV_HAVE_SSE3
61
62
#include <xmmintrin.h>
63
#include <pmmintrin.h>
64
#include <mmintrin.h>
65
66
67
/*
 * SSE3 conjugate dot product of two complex float vectors (unaligned loads).
 *
 * Accumulates input[k] * conj(taps[k]) over every complex element and writes
 * the total to result[0].
 *
 *   result    - receives the accumulated complex sum (one element is written)
 *   input     - first vector; no alignment is required
 *   taps      - second vector; each element is conjugated before multiplying;
 *               no alignment is required
 *   num_bytes - vector length in bytes; every 8 bytes are treated as one
 *               complex element, so (num_bytes >> 3) elements are processed
 *               and any trailing partial element is ignored
 *
 * Two elements are handled per 128-bit load. A trailing odd element is
 * handled with scalar arithmetic so that no bytes beyond the end of either
 * buffer are ever read.
 */
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {

  /* Sign-bit mask for lanes 0 and 2; XOR-ing with it negates those lanes. */
  union NegMask {
    uint32_t intRep[4];
    __m128 vec;
  } negMask;

  const unsigned int num_points = num_bytes >> 3;  /* complex elements      */
  const unsigned int num_pairs = num_points >> 1;  /* 16-byte vector blocks */
  unsigned int pair;
  unsigned int offset = 0;
  float Rsum = 0, Isum = 0;
  float Im, Re;

  __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
  const __m128 zv = _mm_setzero_ps();

  negMask.intRep[0] = negMask.intRep[2] = 0x80000000u;
  negMask.intRep[1] = negMask.intRep[3] = 0;

  /* main loop: two complex elements per iteration */
  for(pair = 0; pair < num_pairs; ++pair) {

    in1 = _mm_loadu_ps( (const float*) (input+offset) );   /* ar0 ai0 ar1 ai1 */
    in2 = _mm_loadu_ps( (const float*) (taps+offset) );    /* br0 bi0 br1 bi1 */

    /* Real part: ar*br + ai*bi, summed across both elements. */
    Rv = _mm_mul_ps(in1, in2);
    Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv), zv );

    /* Imaginary part: swap re/im of the taps, multiply, negate the ar*bi
       lanes, then sum: ai*br - ar*bi across both elements. */
    fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
    Iv = _mm_mul_ps(in1, fehg);
    Ivm = _mm_xor_ps( negMask.vec, Iv );
    Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv), zv );

    _mm_store_ss( &Im, Is );
    _mm_store_ss( &Re, Rs );

    offset += 2;
    Rsum += Re;
    Isum += Im;
  }

  /* Trailing odd element: a 16-byte vector load here would read 8 bytes past
     the end of both buffers, so compute this one product in scalar code. */
  if(num_points & 1) {
    const float* in = (const float*) (input+offset);
    const float* tp = (const float*) (taps+offset);

    Rsum += in[0] * tp[0] + in[1] * tp[1];
    Isum += (-in[0] * tp[1]) + in[1] * tp[0];
  }

  result[0] = lv_cmake(Rsum,Isum);
  return;
}
137
138
#endif /*LV_HAVE_SSE3*/
139
140
141
#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
142
143
144