Statistics
| Branch: | Tag: | Revision:

root / gnuradio-core / src / lib / filter / dotprod_fff_altivec.c @ 3ac56587

History | View | Annotate | Download (3.6 kB)

1 2c8ea58e eb
/* -*- c++ -*- */
2 2c8ea58e eb
/*
3 2c8ea58e eb
 * Copyright 2008 Free Software Foundation, Inc.
4 2c8ea58e eb
 * 
5 2c8ea58e eb
 * This file is part of GNU Radio
6 2c8ea58e eb
 * 
7 2c8ea58e eb
 * GNU Radio is free software; you can redistribute it and/or modify
8 2c8ea58e eb
 * it under the terms of the GNU General Public License as published by
9 2c8ea58e eb
 * the Free Software Foundation; either version 3, or (at your option)
10 2c8ea58e eb
 * any later version.
11 2c8ea58e eb
 * 
12 2c8ea58e eb
 * GNU Radio is distributed in the hope that it will be useful,
13 2c8ea58e eb
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 2c8ea58e eb
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 2c8ea58e eb
 * GNU General Public License for more details.
16 2c8ea58e eb
 * 
17 2c8ea58e eb
 * You should have received a copy of the GNU General Public License along
18 2c8ea58e eb
 * with this program; if not, write to the Free Software Foundation, Inc.,
19 2c8ea58e eb
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 2c8ea58e eb
 */
21 2c8ea58e eb
22 2c8ea58e eb
#ifdef HAVE_CONFIG_H
23 2c8ea58e eb
#include <config.h>
24 2c8ea58e eb
#endif
25 2c8ea58e eb
#include <dotprod_fff_altivec.h>
26 2c8ea58e eb
#include <gr_altivec.h>
27 2c8ea58e eb
28 2c8ea58e eb
/*!
29 2c8ea58e eb
 * \param x any value
30 2c8ea58e eb
 * \param pow2 must be a power of 2
31 2c8ea58e eb
 * \returns \p x rounded down to a multiple of \p pow2.
32 2c8ea58e eb
 */
33 2c8ea58e eb
static inline size_t
34 2c8ea58e eb
gr_p2_round_down(size_t x, size_t pow2)
35 2c8ea58e eb
{
36 2c8ea58e eb
  return x & -pow2;
37 2c8ea58e eb
}
38 2c8ea58e eb
39 2c8ea58e eb
40 2c8ea58e eb
#if 0
41 2c8ea58e eb
42 2c8ea58e eb
float
43 2c8ea58e eb
dotprod_fff_altivec(const float *a, const float *b, size_t n)
44 2c8ea58e eb
{
45 2c8ea58e eb
  float        sum = 0;
46 2c8ea58e eb
  for (size_t i = 0; i < n; i++){
47 2c8ea58e eb
    sum += a[i] * b[i];
48 2c8ea58e eb
  }
49 2c8ea58e eb
  return sum;
50 2c8ea58e eb
}
51 2c8ea58e eb
52 2c8ea58e eb
#else
53 2c8ea58e eb
54 2c8ea58e eb
/*
55 2c8ea58e eb
 *  preconditions:
56 2c8ea58e eb
 *
57 2c8ea58e eb
 *    n > 0 and a multiple of 4
58 2c8ea58e eb
 *    a   4-byte aligned
59 2c8ea58e eb
 *    b  16-byte aligned
60 2c8ea58e eb
 */
61 2c8ea58e eb
float
62 2c8ea58e eb
dotprod_fff_altivec(const float *_a, const float *_b, size_t n)
63 2c8ea58e eb
{
64 a1d3c023 eb
  const vec_float4 *a = (const vec_float4 *) _a;
65 a1d3c023 eb
  const vec_float4 *b = (const vec_float4 *) _b;
66 2c8ea58e eb
67 2c8ea58e eb
  static const size_t UNROLL_CNT = 4;
68 2c8ea58e eb
69 2c8ea58e eb
  n = gr_p2_round_down(n, 4);
70 2c8ea58e eb
  size_t loop_cnt = n / (UNROLL_CNT * FLOATS_PER_VEC);
71 2c8ea58e eb
  size_t nleft = n % (UNROLL_CNT * FLOATS_PER_VEC);
72 2c8ea58e eb
73 2c8ea58e eb
  // printf("n = %zd, loop_cnt = %zd, nleft = %zd\n", n, loop_cnt, nleft);
74 2c8ea58e eb
75 2c8ea58e eb
  // Used with vperm to build a* from p*
76 a1d3c023 eb
  vec_uchar16 lvsl_a = vec_lvsl(0, _a);
77 a1d3c023 eb
78 a1d3c023 eb
  vec_float4 p0, p1, p2, p3;
79 a1d3c023 eb
  vec_float4 a0, a1, a2, a3;
80 a1d3c023 eb
  vec_float4 b0, b1, b2, b3;
81 a1d3c023 eb
  vec_float4 acc0 = {0, 0, 0, 0};
82 a1d3c023 eb
  vec_float4 acc1 = {0, 0, 0, 0};
83 a1d3c023 eb
  vec_float4 acc2 = {0, 0, 0, 0};
84 a1d3c023 eb
  vec_float4 acc3 = {0, 0, 0, 0};
85 2c8ea58e eb
86 2c8ea58e eb
  // wind in
87 2c8ea58e eb
88 2c8ea58e eb
  p0 = vec_ld(0*VS, a);
89 2c8ea58e eb
  p1 = vec_ld(1*VS, a);
90 2c8ea58e eb
  p2 = vec_ld(2*VS, a);
91 2c8ea58e eb
  p3 = vec_ld(3*VS, a);
92 2c8ea58e eb
  a += UNROLL_CNT;
93 2c8ea58e eb
94 2c8ea58e eb
  a0 = vec_perm(p0, p1, lvsl_a);
95 2c8ea58e eb
  b0 = vec_ld(0*VS, b);
96 2c8ea58e eb
  p0 = vec_ld(0*VS, a);
97 2c8ea58e eb
98 2c8ea58e eb
  size_t i;
99 2c8ea58e eb
  for (i = 0; i < loop_cnt; i++){
100 2c8ea58e eb
101 2c8ea58e eb
    a1 = vec_perm(p1, p2, lvsl_a);
102 2c8ea58e eb
    b1 = vec_ld(1*VS, b);
103 2c8ea58e eb
    p1 = vec_ld(1*VS, a);
104 2c8ea58e eb
    acc0 = vec_madd(a0, b0, acc0);
105 2c8ea58e eb
106 2c8ea58e eb
    a2 = vec_perm(p2, p3, lvsl_a);
107 2c8ea58e eb
    b2 = vec_ld(2*VS, b);
108 2c8ea58e eb
    p2 = vec_ld(2*VS, a);
109 2c8ea58e eb
    acc1 = vec_madd(a1, b1, acc1);
110 2c8ea58e eb
111 2c8ea58e eb
    a3 = vec_perm(p3, p0, lvsl_a);
112 2c8ea58e eb
    b3 = vec_ld(3*VS, b);
113 2c8ea58e eb
    p3 = vec_ld(3*VS, a);
114 2c8ea58e eb
    acc2 = vec_madd(a2, b2, acc2);
115 2c8ea58e eb
116 2c8ea58e eb
    a += UNROLL_CNT;
117 2c8ea58e eb
    b += UNROLL_CNT;
118 2c8ea58e eb
119 2c8ea58e eb
    a0 = vec_perm(p0, p1, lvsl_a);
120 2c8ea58e eb
    b0 = vec_ld(0*VS, b);
121 2c8ea58e eb
    p0 = vec_ld(0*VS, a);
122 2c8ea58e eb
    acc3 = vec_madd(a3, b3, acc3);
123 2c8ea58e eb
  }
124 2c8ea58e eb
125 2c8ea58e eb
  /*
126 2c8ea58e eb
   * The compiler ought to be able to figure out that 0, 4, 8 and 12
127 2c8ea58e eb
   * are the only possible values for nleft.
128 2c8ea58e eb
   */
129 2c8ea58e eb
  switch (nleft){
130 2c8ea58e eb
  case 0:
131 2c8ea58e eb
    break;
132 2c8ea58e eb
    
133 2c8ea58e eb
  case 4:
134 2c8ea58e eb
    acc0 = vec_madd(a0, b0, acc0);
135 2c8ea58e eb
    break;
136 2c8ea58e eb
137 2c8ea58e eb
  case 8:
138 2c8ea58e eb
    a1 = vec_perm(p1, p2, lvsl_a);
139 2c8ea58e eb
    b1 = vec_ld(1*VS, b);
140 2c8ea58e eb
    acc0 = vec_madd(a0, b0, acc0);
141 2c8ea58e eb
    acc1 = vec_madd(a1, b1, acc1);
142 2c8ea58e eb
    break;
143 2c8ea58e eb
144 2c8ea58e eb
  case 12:
145 2c8ea58e eb
    a1 = vec_perm(p1, p2, lvsl_a);
146 2c8ea58e eb
    b1 = vec_ld(1*VS, b);
147 2c8ea58e eb
    acc0 = vec_madd(a0, b0, acc0);
148 2c8ea58e eb
    a2 = vec_perm(p2, p3, lvsl_a);
149 2c8ea58e eb
    b2 = vec_ld(2*VS, b);
150 2c8ea58e eb
    acc1 = vec_madd(a1, b1, acc1);
151 2c8ea58e eb
    acc2 = vec_madd(a2, b2, acc2);
152 2c8ea58e eb
    break;
153 2c8ea58e eb
  }
154 2c8ea58e eb
            
155 2c8ea58e eb
  acc0 = acc0 + acc1;
156 2c8ea58e eb
  acc2 = acc2 + acc3;
157 2c8ea58e eb
  acc0 = acc0 + acc2;
158 2c8ea58e eb
159 2c8ea58e eb
  return horizontal_add_f(acc0);
160 2c8ea58e eb
}
161 2c8ea58e eb
162 2c8ea58e eb
#endif