Statistics
| Branch: | Tag: | Revision:

root / volk / include / volk / volk_32fc_x2_dot_prod_32fc_a.h @ ccfac187

History | View | Annotate | Download (13.7 kB)

1
#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
2
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
3
4
#include <volk/volk_common.h>
5
#include <volk/volk_complex.h>
6
#include <stdio.h>
7
#include <string.h>
8
9
10
#ifdef LV_HAVE_GENERIC 
11
12
13
static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
14
  
15
  float * res = (float*) result;
16
  float * in = (float*) input;
17
  float * tp = (float*) taps;
18
  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
19
  unsigned int isodd = (num_bytes >> 3) &1;
20
  
21
  
22
  
23
  float sum0[2] = {0,0};
24
  float sum1[2] = {0,0};
25
  unsigned int i = 0;
26
27
  
28
  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
29
    
30
31
    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
32
    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
33
    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
34
    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
35
    
36
    
37
    in += 4;
38
    tp += 4;
39
40
  }
41
42
  
43
  res[0] = sum0[0] + sum1[0];
44
  res[1] = sum0[1] + sum1[1];
45
  
46
  
47
  
48
  for(i = 0; i < isodd; ++i) {
49
50
51
    *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
52
53
  }
54
55
}
56
57
#endif /*LV_HAVE_GENERIC*/
58
59
60
#if LV_HAVE_SSE && LV_HAVE_64
61
62
63
static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
64
  
65
66
  asm 
67
    (
68
     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
69
     "#                         const float *taps, unsigned num_bytes)\n\t"
70
     "#    float sum0 = 0;\n\t"
71
     "#    float sum1 = 0;\n\t"
72
     "#    float sum2 = 0;\n\t"
73
     "#    float sum3 = 0;\n\t"
74
     "#    do {\n\t"
75
     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
76
     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
77
     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
78
     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
79
     "#      input += 4;\n\t"
80
     "#      taps += 4;  \n\t"
81
     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
82
     "#    result[0] = sum0 + sum2;\n\t"
83
     "#    result[1] = sum1 + sum3;\n\t"
84
     "# TODO: prefetch and better scheduling\n\t"
85
     "  xor    %%r9,  %%r9\n\t"
86
     "  xor    %%r10, %%r10\n\t"
87
     "  movq   %%rcx, %%rax\n\t"
88
     "  movq   %%rcx, %%r8\n\t"
89
     "  movq   %[rsi],  %%r9\n\t"
90
     "  movq   %[rdx], %%r10\n\t"
91
     "        xorps        %%xmm6, %%xmm6                # zero accumulators\n\t"
92
     "        movaps        0(%%r9), %%xmm0\n\t"
93
     "        xorps        %%xmm7, %%xmm7                # zero accumulators\n\t"
94
     "        movaps        0(%%r10), %%xmm2\n\t"
95
     "        shr        $5, %%rax                # rax = n_2_ccomplex_blocks / 2\n\t"
96
     "  shr     $4, %%r8\n\t"
97
     "        jmp        .%=L1_test\n\t"
98
     "        # 4 taps / loop\n\t"
99
     "        # something like ?? cycles / loop\n\t"
100
     ".%=Loop1:        \n\t"
101
     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
102
     "#        movaps        (%%r9), %%xmmA\n\t"
103
     "#        movaps        (%%r10), %%xmmB\n\t"
104
     "#        movaps        %%xmmA, %%xmmZ\n\t"
105
     "#        shufps        $0xb1, %%xmmZ, %%xmmZ        # swap internals\n\t"
106
     "#        mulps        %%xmmB, %%xmmA\n\t"
107
     "#        mulps        %%xmmZ, %%xmmB\n\t"
108
     "#        # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
109
     "#        xorps        %%xmmPN, %%xmmA\n\t"
110
     "#        movaps        %%xmmA, %%xmmZ\n\t"
111
     "#        unpcklps %%xmmB, %%xmmA\n\t"
112
     "#        unpckhps %%xmmB, %%xmmZ\n\t"
113
     "#        movaps        %%xmmZ, %%xmmY\n\t"
114
     "#        shufps        $0x44, %%xmmA, %%xmmZ        # b01000100\n\t"
115
     "#        shufps        $0xee, %%xmmY, %%xmmA        # b11101110\n\t"
116
     "#        addps        %%xmmZ, %%xmmA\n\t"
117
     "#        addps        %%xmmA, %%xmmC\n\t"
118
     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
119
     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
120
     "        movaps        16(%%r9), %%xmm1\n\t"
121
     "        movaps        %%xmm0, %%xmm4\n\t"
122
     "        mulps        %%xmm2, %%xmm0\n\t"
123
     "        shufps        $0xb1, %%xmm4, %%xmm4        # swap internals\n\t"
124
     "        movaps        16(%%r10), %%xmm3\n\t"
125
     "        movaps        %%xmm1, %%xmm5\n\t"
126
     "        addps        %%xmm0, %%xmm6\n\t"
127
     "        mulps        %%xmm3, %%xmm1\n\t"
128
     "        shufps        $0xb1, %%xmm5, %%xmm5        # swap internals\n\t"
129
     "        addps        %%xmm1, %%xmm6\n\t"
130
     "        mulps        %%xmm4, %%xmm2\n\t"
131
     "        movaps        32(%%r9), %%xmm0\n\t"
132
     "        addps        %%xmm2, %%xmm7\n\t"
133
     "        mulps        %%xmm5, %%xmm3\n\t"
134
     "        add        $32, %%r9\n\t"
135
     "        movaps        32(%%r10), %%xmm2\n\t"
136
     "        addps        %%xmm3, %%xmm7\n\t"
137
     "        add        $32, %%r10\n\t"
138
     ".%=L1_test:\n\t"
139
     "        dec        %%rax\n\t"
140
     "        jge        .%=Loop1\n\t"
141
     "        # We've handled the bulk of multiplies up to here.\n\t"
142
     "        # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
143
     "        # If so, we've got 2 more taps to do.\n\t"
144
     "        and        $1, %%r8\n\t"
145
     "        je        .%=Leven\n\t"
146
     "        # The count was odd, do 2 more taps.\n\t"
147
     "        # Note that we've already got mm0/mm2 preloaded\n\t"
148
     "        # from the main loop.\n\t"
149
     "        movaps        %%xmm0, %%xmm4\n\t"
150
     "        mulps        %%xmm2, %%xmm0\n\t"
151
     "        shufps        $0xb1, %%xmm4, %%xmm4        # swap internals\n\t"
152
     "        addps        %%xmm0, %%xmm6\n\t"
153
     "        mulps        %%xmm4, %%xmm2\n\t"
154
     "        addps        %%xmm2, %%xmm7\n\t"
155
     ".%=Leven:\n\t"
156
     "        # neg inversor\n\t"
157
     "        xorps        %%xmm1, %%xmm1\n\t"
158
     "        mov        $0x80000000, %%r9\n\t"
159
     "        movd        %%r9, %%xmm1\n\t"
160
     "        shufps        $0x11, %%xmm1, %%xmm1        # b00010001 # 0 -0 0 -0\n\t"
161
     "        # pfpnacc\n\t"
162
     "        xorps        %%xmm1, %%xmm6\n\t"
163
     "        movaps        %%xmm6, %%xmm2\n\t"
164
     "        unpcklps %%xmm7, %%xmm6\n\t"
165
     "        unpckhps %%xmm7, %%xmm2\n\t"
166
     "        movaps        %%xmm2, %%xmm3\n\t"
167
     "        shufps        $0x44, %%xmm6, %%xmm2        # b01000100\n\t"
168
     "        shufps        $0xee, %%xmm3, %%xmm6        # b11101110\n\t"
169
     "        addps        %%xmm2, %%xmm6\n\t"
170
     "                                        # xmm6 = r1 i2 r3 i4\n\t"
171
     "        movhlps        %%xmm6, %%xmm4                # xmm4 = r3 i4 ?? ??\n\t"
172
     "        addps        %%xmm4, %%xmm6                # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
173
     "        movlps        %%xmm6, (%[rdi])                # store low 2x32 bits (complex) to memory\n\t"
174
     :
175
     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
176
     :"rax", "r8", "r9", "r10"
177
     );
178
  
179
  
180
  int getem = num_bytes % 16;
181
  
182
  
183
  for(; getem > 0; getem -= 8) {
184
  
185
    
186
    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
187
  
188
  }
189
190
  return;
191
  
192
}
193
194
#endif
195
196
#if LV_HAVE_SSE && LV_HAVE_32
197
198
static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
199
  
200
  asm volatile 
201
    (
202
     "        #pushl        %%ebp\n\t"
203
     "        #movl        %%esp, %%ebp\n\t"
204
     "        movl        12(%%ebp), %%eax                # input\n\t"
205
     "        movl        16(%%ebp), %%edx                # taps\n\t"
206
     "        movl        20(%%ebp), %%ecx                # n_bytes\n\t"
207
     "        xorps        %%xmm6, %%xmm6                # zero accumulators\n\t"
208
     "        movaps        0(%%eax), %%xmm0\n\t"
209
     "        xorps        %%xmm7, %%xmm7                # zero accumulators\n\t"
210
     "        movaps        0(%%edx), %%xmm2\n\t"
211
     "        shrl        $5, %%ecx                # ecx = n_2_ccomplex_blocks / 2\n\t"
212
     "        jmp        .%=L1_test\n\t"
213
     "        # 4 taps / loop\n\t"
214
     "        # something like ?? cycles / loop\n\t"
215
     ".%=Loop1:        \n\t"
216
     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
217
     "#        movaps        (%%eax), %%xmmA\n\t"
218
     "#        movaps        (%%edx), %%xmmB\n\t"
219
     "#        movaps        %%xmmA, %%xmmZ\n\t"
220
     "#        shufps        $0xb1, %%xmmZ, %%xmmZ        # swap internals\n\t"
221
     "#        mulps        %%xmmB, %%xmmA\n\t"
222
     "#        mulps        %%xmmZ, %%xmmB\n\t"
223
     "#        # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
224
     "#        xorps        %%xmmPN, %%xmmA\n\t"
225
     "#        movaps        %%xmmA, %%xmmZ\n\t"
226
     "#        unpcklps %%xmmB, %%xmmA\n\t"
227
     "#        unpckhps %%xmmB, %%xmmZ\n\t"
228
     "#        movaps        %%xmmZ, %%xmmY\n\t"
229
     "#        shufps        $0x44, %%xmmA, %%xmmZ        # b01000100\n\t"
230
     "#        shufps        $0xee, %%xmmY, %%xmmA        # b11101110\n\t"
231
     "#        addps        %%xmmZ, %%xmmA\n\t"
232
     "#        addps        %%xmmA, %%xmmC\n\t"
233
     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
234
     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
235
     "        movaps        16(%%eax), %%xmm1\n\t"
236
     "        movaps        %%xmm0, %%xmm4\n\t"
237
     "        mulps        %%xmm2, %%xmm0\n\t"
238
     "        shufps        $0xb1, %%xmm4, %%xmm4        # swap internals\n\t"
239
     "        movaps        16(%%edx), %%xmm3\n\t"
240
     "        movaps        %%xmm1, %%xmm5\n\t"
241
     "        addps        %%xmm0, %%xmm6\n\t"
242
     "        mulps        %%xmm3, %%xmm1\n\t"
243
     "        shufps        $0xb1, %%xmm5, %%xmm5        # swap internals\n\t"
244
     "        addps        %%xmm1, %%xmm6\n\t"
245
     "        mulps        %%xmm4, %%xmm2\n\t"
246
     "        movaps        32(%%eax), %%xmm0\n\t"
247
     "        addps        %%xmm2, %%xmm7\n\t"
248
     "        mulps        %%xmm5, %%xmm3\n\t"
249
     "        addl        $32, %%eax\n\t"
250
     "        movaps        32(%%edx), %%xmm2\n\t"
251
     "        addps        %%xmm3, %%xmm7\n\t"
252
     "        addl        $32, %%edx\n\t"
253
     ".%=L1_test:\n\t"
254
     "        decl        %%ecx\n\t"
255
     "        jge        .%=Loop1\n\t"
256
     "        # We've handled the bulk of multiplies up to here.\n\t"
257
     "        # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
258
     "        # If so, we've got 2 more taps to do.\n\t"
259
     "        movl        20(%%ebp), %%ecx                # n_2_ccomplex_blocks\n\t"
260
     "  shrl    $4, %%ecx\n\t"
261
     "        andl        $1, %%ecx\n\t"
262
     "        je        .%=Leven\n\t"
263
     "        # The count was odd, do 2 more taps.\n\t"
264
     "        # Note that we've already got mm0/mm2 preloaded\n\t"
265
     "        # from the main loop.\n\t"
266
     "        movaps        %%xmm0, %%xmm4\n\t"
267
     "        mulps        %%xmm2, %%xmm0\n\t"
268
     "        shufps        $0xb1, %%xmm4, %%xmm4        # swap internals\n\t"
269
     "        addps        %%xmm0, %%xmm6\n\t"
270
     "        mulps        %%xmm4, %%xmm2\n\t"
271
     "        addps        %%xmm2, %%xmm7\n\t"
272
     ".%=Leven:\n\t"
273
     "        # neg inversor\n\t"
274
     "  movl 8(%%ebp), %%eax \n\t"
275
     "        xorps        %%xmm1, %%xmm1\n\t"
276
     "  movl        $0x80000000, (%%eax)\n\t"
277
     "        movss        (%%eax), %%xmm1\n\t"
278
     "        shufps        $0x11, %%xmm1, %%xmm1        # b00010001 # 0 -0 0 -0\n\t"
279
     "        # pfpnacc\n\t"
280
     "        xorps        %%xmm1, %%xmm6\n\t"
281
     "        movaps        %%xmm6, %%xmm2\n\t"
282
     "        unpcklps %%xmm7, %%xmm6\n\t"
283
     "        unpckhps %%xmm7, %%xmm2\n\t"
284
     "        movaps        %%xmm2, %%xmm3\n\t"
285
     "        shufps        $0x44, %%xmm6, %%xmm2        # b01000100\n\t"
286
     "        shufps        $0xee, %%xmm3, %%xmm6        # b11101110\n\t"
287
     "        addps        %%xmm2, %%xmm6\n\t"
288
     "                                        # xmm6 = r1 i2 r3 i4\n\t"
289
     "        #movl        8(%%ebp), %%eax                # @result\n\t"
290
     "        movhlps        %%xmm6, %%xmm4                # xmm4 = r3 i4 ?? ??\n\t"
291
     "        addps        %%xmm4, %%xmm6                # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
292
     "        movlps        %%xmm6, (%%eax)                # store low 2x32 bits (complex) to memory\n\t"
293
     "        #popl        %%ebp\n\t"
294
     :
295
     :
296
     : "eax", "ecx", "edx"
297
     );
298
299
  
300
  int getem = num_bytes % 16;
301
  
302
  for(; getem > 0; getem -= 8) {
303
    
304
    
305
    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
306
    
307
  }
308
  
309
  return;
310
  
311
  
312
  
313
314
  
315
  
316
}
317
318
#endif /*LV_HAVE_SSE*/  
319
320
#ifdef LV_HAVE_SSE3
321
322
#include <pmmintrin.h>
323
324
static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
325
  
326
327
  lv_32fc_t dotProduct;
328
  memset(&dotProduct, 0x0, 2*sizeof(float));
329
330
  unsigned int number = 0;
331
  const unsigned int halfPoints = num_bytes >> 4;
332
333
  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
334
335
  const lv_32fc_t* a = input;
336
  const lv_32fc_t* b = taps;
337
338
  dotProdVal = _mm_setzero_ps();
339
340
  for(;number < halfPoints; number++){
341
      
342
    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
343
    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
344
      
345
    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
346
    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
347
      
348
    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
349
      
350
    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
351
      
352
    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
353
      
354
    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
355
356
    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
357
358
    a += 2;
359
    b += 2;
360
  }
361
362
  __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
363
364
  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
365
366
  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
367
368
  if((num_bytes >> 2) != 0) {
369
    dotProduct += (*a) * (*b);
370
  }
371
372
  *result = dotProduct;
373
}  
374
375
#endif /*LV_HAVE_SSE3*/
376
377
#ifdef LV_HAVE_SSE4_1
378
379
#include <smmintrin.h>
380
381
static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
382
  volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes);
383
  // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
384
   /* 
385
    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
386
    float *p_input, *p_taps;
387
    __m64 *p_result;
388
389
    p_result = (__m64*)result;
390
    p_input = (float*)input;
391
    p_taps = (float*)taps;
392
393
    static const __m128i neg = {0x000000000000000080000000};
394
395
    int i = 0;
396
  
397
    int bound = (num_bytes >> 5);
398
    int leftovers = (num_bytes & 24) >> 3;
399
400
    real0 = _mm_sub_ps(real0, real0);
401
    real1 = _mm_sub_ps(real1, real1);
402
    im0 = _mm_sub_ps(im0, im0);
403
    im1 = _mm_sub_ps(im1, im1);
404
  
405
    for(; i < bound; ++i) {
406
  
407
    
408
    xmm0 = _mm_load_ps(p_input);
409
    xmm1 = _mm_load_ps(p_taps);
410
    
411
    p_input += 4;
412
    p_taps += 4;
413
    
414
    xmm2 = _mm_load_ps(p_input);
415
    xmm3 = _mm_load_ps(p_taps);
416
    
417
    p_input += 4;
418
    p_taps += 4;
419
    
420
    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
421
    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
422
    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
423
    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
424
    
425
    //imaginary vector from input
426
    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
427
    //real vector from input
428
    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
429
    //imaginary vector from taps
430
    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
431
    //real vector from taps
432
    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
433
    
434
    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
435
    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
436
    
437
    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
438
    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
439
    
440
    real0 = _mm_add_ps(xmm4, real0);
441
    real1 = _mm_add_ps(xmm5, real1);
442
    im0 = _mm_add_ps(xmm6, im0);
443
    im1 = _mm_add_ps(xmm7, im1);
444
    
445
    }
446
447
448
    
449
    
450
    real1 = _mm_xor_ps(real1, (__m128)neg);
451
    
452
  
453
    im0 = _mm_add_ps(im0, im1);
454
    real0 = _mm_add_ps(real0, real1);
455
  
456
    im0 = _mm_add_ps(im0, real0);
457
  
458
    _mm_storel_pi(p_result, im0);
459
  
460
    for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
461
    
462
    *result += input[i] * taps[i];
463
    }
464
  */
465
}  
466
467
#endif /*LV_HAVE_SSE4_1*/
468
469
#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/