Changeset 8971

Show
Ignore:
Timestamp:
07/22/08 00:23:05
Author:
eb
Message:

Working AltiVec version of gr_fir_fff. About 3x faster on PS3.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • gnuradio/branches/developers/eb/vmx/gnuradio-core/src/lib/filter/Makefile.am

    r8969 r8971  
    172172        gr_cpu_powerpc.cc \ 
    173173        gr_fir_fff_altivec.cc \ 
    174         gr_altivec.c 
     174        gr_altivec.c \ 
     175        dotprod_fff_altivec.c 
    175176 
    176177powerpc_qa_CODE = \ 
     
    292293noinst_HEADERS =                        \ 
    293294        assembly.h                      \ 
     295        dotprod_fff_altivec.h           \ 
    294296        gr_fir_scc_simd.h               \ 
    295297        gr_fir_scc_x86.h                \ 
  • gnuradio/branches/developers/eb/vmx/gnuradio-core/src/lib/filter/gr_fir_fff_altivec.cc

    r8970 r8971  
    2929#include <gr_math.h> 
    3030#include <gr_altivec.h> 
    31  
    32 extern "C" { 
    33  
    34 #if 0 
    35  
    36 float 
    37 dotprod_fff_altivec(const float *a, const float *b, size_t n) 
    38 
    39   float sum = 0; 
    40   for (size_t i = 0; i < n; i++){ 
    41     sum += a[i] * b[i]; 
    42   } 
    43   return sum; 
    44 
    45  
    46 #else 
    47 /* 
    48  *  preconditions: 
    49  * 
    50  *    n > 0 and a multiple of 4 
    51  *    a   4-byte aligned 
    52  *    b  16-byte aligned 
    53  */ 
    54 float 
    55 dotprod_fff_altivec(const float *_a, const float *_b, size_t n) 
    56 
    57   const vector float *a = (const vector float *) _a; 
    58   const vector float *b = (const vector float *) _b; 
    59  
    60   static const size_t UNROLL_CNT = 4; 
    61  
    62   n = gr_p2_round_down(n, 4); 
    63   size_t loop_cnt = n / (UNROLL_CNT * FLOATS_PER_VEC); 
    64   size_t nleft = n % (UNROLL_CNT * FLOATS_PER_VEC); 
    65  
    66   // printf("n = %zd, loop_cnt = %zd, nleft = %zd\n", n, loop_cnt, nleft); 
    67  
    68   // Used with vperm to build a* from p* 
    69   vector unsigned char lvsl_a = vec_lvsl(0, _a); 
    70  
    71   vector float p0, p1, p2, p3; 
    72   vector float a0, a1, a2, a3; 
    73   vector float b0, b1, b2, b3; 
    74   vector float acc0 = {0, 0, 0, 0}; 
    75   vector float acc1 = {0, 0, 0, 0}; 
    76   vector float acc2 = {0, 0, 0, 0}; 
    77   vector float acc3 = {0, 0, 0, 0}; 
    78  
    79   // wind in 
    80  
    81   register int r0vs = 0 * VS; 
    82   register int r1vs = 1 * VS; 
    83   register int r2vs = 2 * VS; 
    84   register int r3vs = 3 * VS; 
    85  
    86   p0 = vec_ld(r0vs, a); 
    87   p1 = vec_ld(r1vs, a); 
    88   p2 = vec_ld(r2vs, a); 
    89   p3 = vec_ld(r3vs, a); 
    90   a += UNROLL_CNT; 
    91  
    92   a0 = vec_perm(p0, p1, lvsl_a); 
    93   b0 = vec_ld(r0vs, b); 
    94   p0 = vec_ld(r0vs, a); 
    95  
    96   for (size_t i = 0; i < loop_cnt; i++){ 
    97  
    98     a1 = vec_perm(p1, p2, lvsl_a); 
    99     b1 = vec_ld(r1vs, b); 
    100     p1 = vec_ld(r1vs, a); 
    101     acc0 = vec_madd(a0, b0, acc0); 
    102  
    103     a2 = vec_perm(p2, p3, lvsl_a); 
    104     b2 = vec_ld(r2vs, b); 
    105     p2 = vec_ld(r2vs, a); 
    106     acc1 = vec_madd(a1, b1, acc1); 
    107  
    108     a3 = vec_perm(p3, p0, lvsl_a); 
    109     b3 = vec_ld(r3vs, b); 
    110     p3 = vec_ld(r3vs, a); 
    111     acc2 = vec_madd(a2, b2, acc2); 
    112  
    113     a += UNROLL_CNT; 
    114     b += UNROLL_CNT; 
    115  
    116     a0 = vec_perm(p0, p1, lvsl_a); 
    117     b0 = vec_ld(r0vs, b); 
    118     p0 = vec_ld(r0vs, a); 
    119     acc3 = vec_madd(a3, b3, acc3); 
    120   } 
    121  
    122   /* 
    123    * The compiler ought to be able to figure out that 0, 4, 8 and 12 
    124    * are the only possible values for nleft. 
    125    */ 
    126   switch (nleft){ 
    127   case 0: 
    128     break; 
    129      
    130   case 4: 
    131     acc0 = vec_madd(a0, b0, acc0); 
    132     break; 
    133  
    134   case 8: 
    135     a1 = vec_perm(p1, p2, lvsl_a); 
    136     b1 = vec_ld(r1vs, b); 
    137     acc0 = vec_madd(a0, b0, acc0); 
    138     acc1 = vec_madd(a1, b1, acc1); 
    139     break; 
    140  
    141   case 12: 
    142     a1 = vec_perm(p1, p2, lvsl_a); 
    143     b1 = vec_ld(r1vs, b); 
    144     acc0 = vec_madd(a0, b0, acc0); 
    145     a2 = vec_perm(p2, p3, lvsl_a); 
    146     b2 = vec_ld(r2vs, b); 
    147     acc1 = vec_madd(a1, b1, acc1); 
    148     acc2 = vec_madd(a2, b2, acc2); 
    149     break; 
    150   } 
    151              
    152   acc0 = acc0 + acc1; 
    153   acc2 = acc2 + acc3; 
    154   acc0 = acc0 + acc2; 
    155  
    156   return horizontal_add_f(acc0); 
    157 
    158  
    159 #endif 
    160 
     31#include <dotprod_fff_altivec.h> 
    16132 
    16233gr_fir_fff_altivec::gr_fir_fff_altivec()