GNU Radio 3.7.3 C++ API
volk_32fc_x2_conjugate_dot_prod_32fc.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
2 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
3 
4 
5 #include<volk/volk_complex.h>
6 
7 
8 #ifdef LV_HAVE_GENERIC
9 
10 
11 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
12 
13  const unsigned int num_bytes = num_points*8;
14 
15  float * res = (float*) result;
16  float * in = (float*) input;
17  float * tp = (float*) taps;
18  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
19  unsigned int isodd = (num_bytes >> 3) &1;
20 
21 
22 
23  float sum0[2] = {0,0};
24  float sum1[2] = {0,0};
25  unsigned int i = 0;
26 
27 
28  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
29 
30  sum0[0] += in[0] * tp[0] + in[1] * tp[1];
31  sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
32  sum1[0] += in[2] * tp[2] + in[3] * tp[3];
33  sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
34 
35 
36  in += 4;
37  tp += 4;
38 
39  }
40 
41 
42  res[0] = sum0[0] + sum1[0];
43  res[1] = sum0[1] + sum1[1];
44 
45 
46 
47  for(i = 0; i < isodd; ++i) {
48 
49 
50  *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
51 
52  }
53  /*
54  for(i = 0; i < num_bytes >> 3; ++i) {
55  *result += input[i] * conjf(taps[i]);
56  }
57  */
58 }
59 
60 #endif /*LV_HAVE_GENERIC*/
61 
62 #ifdef LV_HAVE_SSE3
63 
64 #include <xmmintrin.h>
65 #include <pmmintrin.h>
66 #include <mmintrin.h>
67 
68 
69 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
70 
71  unsigned int num_bytes = num_points*8;
72 
73  // Variable never used?
74  //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
75 
76  union HalfMask {
77  uint32_t intRep[4];
78  __m128 vec;
79  } halfMask;
80 
81  union NegMask {
82  int intRep[4];
83  __m128 vec;
84  } negMask;
85 
86  unsigned int offset = 0;
87  float Rsum=0, Isum=0;
88  float Im,Re;
89 
90  __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
91  __m128 zv = {0,0,0,0};
92 
93  halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
94  halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
95 
96  negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
97  negMask.intRep[1] = negMask.intRep[3] = 0;
98 
99  // main loop
100  while(num_bytes >= 4*sizeof(float)){
101 
102  in1 = _mm_loadu_ps( (float*) (input+offset) );
103  in2 = _mm_loadu_ps( (float*) (taps+offset) );
104  Rv = _mm_mul_ps(in1, in2);
105  fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
106  Iv = _mm_mul_ps(in1, fehg);
107  Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
108  Ivm = _mm_xor_ps( negMask.vec, Iv );
109  Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
110  _mm_store_ss( &Im, Is );
111  _mm_store_ss( &Re, Rs );
112  num_bytes -= 4*sizeof(float);
113  offset += 2;
114  Rsum += Re;
115  Isum += Im;
116  }
117 
118  // handle the last complex case ...
119  if(num_bytes > 0){
120 
121  if(num_bytes != 4){
122  // bad things are happening
123  }
124 
125  in1 = _mm_loadu_ps( (float*) (input+offset) );
126  in2 = _mm_loadu_ps( (float*) (taps+offset) );
127  Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
128  fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
129  Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
130  Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
131  Ivm = _mm_xor_ps( negMask.vec, Iv );
132  Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
133  _mm_store_ss( &Im, Is );
134  _mm_store_ss( &Re, Rs );
135  Rsum += Re;
136  Isum += Im;
137  }
138 
139  result[0] = lv_cmake(Rsum,Isum);
140  return;
141 }
142 
143 #endif /*LV_HAVE_SSE3*/
144 
145 
146 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
147 
148 
149 
150 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
151 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
152 
153 #include <volk/volk_common.h>
154 #include<volk/volk_complex.h>
155 #include<stdio.h>
156 
157 
158 #ifdef LV_HAVE_GENERIC
159 
160 
161 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
162 
163  const unsigned int num_bytes = num_points*8;
164 
165  float * res = (float*) result;
166  float * in = (float*) input;
167  float * tp = (float*) taps;
168  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
169  unsigned int isodd = (num_bytes >> 3) &1;
170 
171 
172 
173  float sum0[2] = {0,0};
174  float sum1[2] = {0,0};
175  unsigned int i = 0;
176 
177 
178  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
179 
180 
181  sum0[0] += in[0] * tp[0] + in[1] * tp[1];
182  sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
183  sum1[0] += in[2] * tp[2] + in[3] * tp[3];
184  sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
185 
186 
187  in += 4;
188  tp += 4;
189 
190  }
191 
192 
193  res[0] = sum0[0] + sum1[0];
194  res[1] = sum0[1] + sum1[1];
195 
196 
197 
198  for(i = 0; i < isodd; ++i) {
199 
200 
201  *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
202 
203  }
204  /*
205  for(i = 0; i < num_bytes >> 3; ++i) {
206  *result += input[i] * conjf(taps[i]);
207  }
208  */
209 }
210 
211 #endif /*LV_HAVE_GENERIC*/
212 
213 
214 #if LV_HAVE_SSE && LV_HAVE_64
215 
216 
217 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
218 
219  const unsigned int num_bytes = num_points*8;
220 
221  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
222 
223 
224 
225 
226  asm volatile
227  (
228  "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
229  "# const float *taps, unsigned num_bytes)\n\t"
230  "# float sum0 = 0;\n\t"
231  "# float sum1 = 0;\n\t"
232  "# float sum2 = 0;\n\t"
233  "# float sum3 = 0;\n\t"
234  "# do {\n\t"
235  "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
236  "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
237  "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
238  "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
239  "# input += 4;\n\t"
240  "# taps += 4; \n\t"
241  "# } while (--n_2_ccomplex_blocks != 0);\n\t"
242  "# result[0] = sum0 + sum2;\n\t"
243  "# result[1] = sum1 + sum3;\n\t"
244  "# TODO: prefetch and better scheduling\n\t"
245  " xor %%r9, %%r9\n\t"
246  " xor %%r10, %%r10\n\t"
247  " movq %[conjugator], %%r9\n\t"
248  " movq %%rcx, %%rax\n\t"
249  " movaps 0(%%r9), %%xmm8\n\t"
250  " movq %%rcx, %%r8\n\t"
251  " movq %[rsi], %%r9\n\t"
252  " movq %[rdx], %%r10\n\t"
253  " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
254  " movaps 0(%%r9), %%xmm0\n\t"
255  " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
256  " movups 0(%%r10), %%xmm2\n\t"
257  " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
258  " shr $4, %%r8\n\t"
259  " xorps %%xmm8, %%xmm2\n\t"
260  " jmp .%=L1_test\n\t"
261  " # 4 taps / loop\n\t"
262  " # something like ?? cycles / loop\n\t"
263  ".%=Loop1: \n\t"
264  "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
265  "# movaps (%%r9), %%xmmA\n\t"
266  "# movaps (%%r10), %%xmmB\n\t"
267  "# movaps %%xmmA, %%xmmZ\n\t"
268  "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
269  "# mulps %%xmmB, %%xmmA\n\t"
270  "# mulps %%xmmZ, %%xmmB\n\t"
271  "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
272  "# xorps %%xmmPN, %%xmmA\n\t"
273  "# movaps %%xmmA, %%xmmZ\n\t"
274  "# unpcklps %%xmmB, %%xmmA\n\t"
275  "# unpckhps %%xmmB, %%xmmZ\n\t"
276  "# movaps %%xmmZ, %%xmmY\n\t"
277  "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
278  "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
279  "# addps %%xmmZ, %%xmmA\n\t"
280  "# addps %%xmmA, %%xmmC\n\t"
281  "# A=xmm0, B=xmm2, Z=xmm4\n\t"
282  "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
283  " movaps 16(%%r9), %%xmm1\n\t"
284  " movaps %%xmm0, %%xmm4\n\t"
285  " mulps %%xmm2, %%xmm0\n\t"
286  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
287  " movaps 16(%%r10), %%xmm3\n\t"
288  " movaps %%xmm1, %%xmm5\n\t"
289  " xorps %%xmm8, %%xmm3\n\t"
290  " addps %%xmm0, %%xmm6\n\t"
291  " mulps %%xmm3, %%xmm1\n\t"
292  " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
293  " addps %%xmm1, %%xmm6\n\t"
294  " mulps %%xmm4, %%xmm2\n\t"
295  " movaps 32(%%r9), %%xmm0\n\t"
296  " addps %%xmm2, %%xmm7\n\t"
297  " mulps %%xmm5, %%xmm3\n\t"
298  " add $32, %%r9\n\t"
299  " movaps 32(%%r10), %%xmm2\n\t"
300  " addps %%xmm3, %%xmm7\n\t"
301  " add $32, %%r10\n\t"
302  " xorps %%xmm8, %%xmm2\n\t"
303  ".%=L1_test:\n\t"
304  " dec %%rax\n\t"
305  " jge .%=Loop1\n\t"
306  " # We've handled the bulk of multiplies up to here.\n\t"
307  " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
308  " # If so, we've got 2 more taps to do.\n\t"
309  " and $1, %%r8\n\t"
310  " je .%=Leven\n\t"
311  " # The count was odd, do 2 more taps.\n\t"
312  " # Note that we've already got mm0/mm2 preloaded\n\t"
313  " # from the main loop.\n\t"
314  " movaps %%xmm0, %%xmm4\n\t"
315  " mulps %%xmm2, %%xmm0\n\t"
316  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
317  " addps %%xmm0, %%xmm6\n\t"
318  " mulps %%xmm4, %%xmm2\n\t"
319  " addps %%xmm2, %%xmm7\n\t"
320  ".%=Leven:\n\t"
321  " # neg inversor\n\t"
322  " xorps %%xmm1, %%xmm1\n\t"
323  " mov $0x80000000, %%r9\n\t"
324  " movd %%r9, %%xmm1\n\t"
325  " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
326  " # pfpnacc\n\t"
327  " xorps %%xmm1, %%xmm6\n\t"
328  " movaps %%xmm6, %%xmm2\n\t"
329  " unpcklps %%xmm7, %%xmm6\n\t"
330  " unpckhps %%xmm7, %%xmm2\n\t"
331  " movaps %%xmm2, %%xmm3\n\t"
332  " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
333  " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
334  " addps %%xmm2, %%xmm6\n\t"
335  " # xmm6 = r1 i2 r3 i4\n\t"
336  " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
337  " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
338  " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
339  :
340  :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
341  :"rax", "r8", "r9", "r10"
342  );
343 
344 
345  int getem = num_bytes % 16;
346 
347 
348  for(; getem > 0; getem -= 8) {
349 
350 
351  *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
352 
353  }
354 
355  return;
356 }
357 #endif
358 
359 #if LV_HAVE_SSE && LV_HAVE_32
360 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
361 
362  const unsigned int num_bytes = num_points*8;
363 
364  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
365 
366  int bound = num_bytes >> 4;
367  int leftovers = num_bytes % 16;
368 
369 
370  asm volatile
371  (
372  " #pushl %%ebp\n\t"
373  " #movl %%esp, %%ebp\n\t"
374  " #movl 12(%%ebp), %%eax # input\n\t"
375  " #movl 16(%%ebp), %%edx # taps\n\t"
376  " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
377  " movaps 0(%[conjugator]), %%xmm1\n\t"
378  " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
379  " movaps 0(%[eax]), %%xmm0\n\t"
380  " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
381  " movaps 0(%[edx]), %%xmm2\n\t"
382  " movl %[ecx], (%[out])\n\t"
383  " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
384 
385  " xorps %%xmm1, %%xmm2\n\t"
386  " jmp .%=L1_test\n\t"
387  " # 4 taps / loop\n\t"
388  " # something like ?? cycles / loop\n\t"
389  ".%=Loop1: \n\t"
390  "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
391  "# movaps (%[eax]), %%xmmA\n\t"
392  "# movaps (%[edx]), %%xmmB\n\t"
393  "# movaps %%xmmA, %%xmmZ\n\t"
394  "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
395  "# mulps %%xmmB, %%xmmA\n\t"
396  "# mulps %%xmmZ, %%xmmB\n\t"
397  "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
398  "# xorps %%xmmPN, %%xmmA\n\t"
399  "# movaps %%xmmA, %%xmmZ\n\t"
400  "# unpcklps %%xmmB, %%xmmA\n\t"
401  "# unpckhps %%xmmB, %%xmmZ\n\t"
402  "# movaps %%xmmZ, %%xmmY\n\t"
403  "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
404  "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
405  "# addps %%xmmZ, %%xmmA\n\t"
406  "# addps %%xmmA, %%xmmC\n\t"
407  "# A=xmm0, B=xmm2, Z=xmm4\n\t"
408  "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
409  " movaps 16(%[edx]), %%xmm3\n\t"
410  " movaps %%xmm0, %%xmm4\n\t"
411  " xorps %%xmm1, %%xmm3\n\t"
412  " mulps %%xmm2, %%xmm0\n\t"
413  " movaps 16(%[eax]), %%xmm1\n\t"
414  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
415  " movaps %%xmm1, %%xmm5\n\t"
416  " addps %%xmm0, %%xmm6\n\t"
417  " mulps %%xmm3, %%xmm1\n\t"
418  " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
419  " addps %%xmm1, %%xmm6\n\t"
420  " movaps 0(%[conjugator]), %%xmm1\n\t"
421  " mulps %%xmm4, %%xmm2\n\t"
422  " movaps 32(%[eax]), %%xmm0\n\t"
423  " addps %%xmm2, %%xmm7\n\t"
424  " mulps %%xmm5, %%xmm3\n\t"
425  " addl $32, %[eax]\n\t"
426  " movaps 32(%[edx]), %%xmm2\n\t"
427  " addps %%xmm3, %%xmm7\n\t"
428  " xorps %%xmm1, %%xmm2\n\t"
429  " addl $32, %[edx]\n\t"
430  ".%=L1_test:\n\t"
431  " decl %[ecx]\n\t"
432  " jge .%=Loop1\n\t"
433  " # We've handled the bulk of multiplies up to here.\n\t"
434  " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
435  " # If so, we've got 2 more taps to do.\n\t"
436  " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
437  " shrl $4, %[ecx]\n\t"
438  " andl $1, %[ecx]\n\t"
439  " je .%=Leven\n\t"
440  " # The count was odd, do 2 more taps.\n\t"
441  " # Note that we've already got mm0/mm2 preloaded\n\t"
442  " # from the main loop.\n\t"
443  " movaps %%xmm0, %%xmm4\n\t"
444  " mulps %%xmm2, %%xmm0\n\t"
445  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
446  " addps %%xmm0, %%xmm6\n\t"
447  " mulps %%xmm4, %%xmm2\n\t"
448  " addps %%xmm2, %%xmm7\n\t"
449  ".%=Leven:\n\t"
450  " # neg inversor\n\t"
451  " #movl 8(%%ebp), %[eax] \n\t"
452  " xorps %%xmm1, %%xmm1\n\t"
453  " movl $0x80000000, (%[out])\n\t"
454  " movss (%[out]), %%xmm1\n\t"
455  " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
456  " # pfpnacc\n\t"
457  " xorps %%xmm1, %%xmm6\n\t"
458  " movaps %%xmm6, %%xmm2\n\t"
459  " unpcklps %%xmm7, %%xmm6\n\t"
460  " unpckhps %%xmm7, %%xmm2\n\t"
461  " movaps %%xmm2, %%xmm3\n\t"
462  " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
463  " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
464  " addps %%xmm2, %%xmm6\n\t"
465  " # xmm6 = r1 i2 r3 i4\n\t"
466  " #movl 8(%%ebp), %[eax] # @result\n\t"
467  " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
468  " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
469  " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
470  " #popl %%ebp\n\t"
471  :
472  : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
473  );
474 
475 
476 
477 
478  printf("%d, %d\n", leftovers, bound);
479 
480  for(; leftovers > 0; leftovers -= 8) {
481 
482 
483  *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
484 
485  }
486 
487  return;
488 
489 
490 
491 
492 
493 
494 }
495 
496 #endif /*LV_HAVE_SSE*/
497 
498 
499 
500 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
#define lv_conj(x)
Definition: volk_complex.h:80
unsigned int uint32_t
Definition: stdint.h:80
#define lv_cmake(r, i)
Definition: volk_complex.h:59
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56
uint32_t i[4]
Definition: volk_common.h:80