root / gnuradio-core / src / lib / filter / dotprod_fff_altivec.c @ 3ac56587
History | View | Annotate | Download (3.6 kB)
| 1 | 2c8ea58e | eb | /* -*- c++ -*- */
|
|---|---|---|---|
| 2 | 2c8ea58e | eb | /*
|
| 3 | 2c8ea58e | eb | * Copyright 2008 Free Software Foundation, Inc. |
| 4 | 2c8ea58e | eb | * |
| 5 | 2c8ea58e | eb | * This file is part of GNU Radio |
| 6 | 2c8ea58e | eb | * |
| 7 | 2c8ea58e | eb | * GNU Radio is free software; you can redistribute it and/or modify |
| 8 | 2c8ea58e | eb | * it under the terms of the GNU General Public License as published by |
| 9 | 2c8ea58e | eb | * the Free Software Foundation; either version 3, or (at your option) |
| 10 | 2c8ea58e | eb | * any later version. |
| 11 | 2c8ea58e | eb | * |
| 12 | 2c8ea58e | eb | * GNU Radio is distributed in the hope that it will be useful, |
| 13 | 2c8ea58e | eb | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | 2c8ea58e | eb | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | 2c8ea58e | eb | * GNU General Public License for more details. |
| 16 | 2c8ea58e | eb | * |
| 17 | 2c8ea58e | eb | * You should have received a copy of the GNU General Public License along |
| 18 | 2c8ea58e | eb | * with this program; if not, write to the Free Software Foundation, Inc., |
| 19 | 2c8ea58e | eb | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | 2c8ea58e | eb | */ |
| 21 | 2c8ea58e | eb | |
| 22 | 2c8ea58e | eb | #ifdef HAVE_CONFIG_H
|
| 23 | 2c8ea58e | eb | #include <config.h> |
| 24 | 2c8ea58e | eb | #endif
|
| 25 | 2c8ea58e | eb | #include <dotprod_fff_altivec.h> |
| 26 | 2c8ea58e | eb | #include <gr_altivec.h> |
| 27 | 2c8ea58e | eb | |
| 28 | 2c8ea58e | eb | /*!
|
| 29 | 2c8ea58e | eb | * \param x any value |
| 30 | 2c8ea58e | eb | * \param pow2 must be a power of 2 |
| 31 | 2c8ea58e | eb | * \returns \p x rounded down to a multiple of \p pow2. |
| 32 | 2c8ea58e | eb | */ |
| 33 | 2c8ea58e | eb | static inline size_t |
| 34 | 2c8ea58e | eb | gr_p2_round_down(size_t x, size_t pow2) |
| 35 | 2c8ea58e | eb | {
|
| 36 | 2c8ea58e | eb | return x & -pow2;
|
| 37 | 2c8ea58e | eb | } |
| 38 | 2c8ea58e | eb | |
| 39 | 2c8ea58e | eb | |
| 40 | 2c8ea58e | eb | #if 0
|
| 41 | 2c8ea58e | eb | |
| 42 | 2c8ea58e | eb | float |
| 43 | 2c8ea58e | eb | dotprod_fff_altivec(const float *a, const float *b, size_t n) |
| 44 | 2c8ea58e | eb | {
|
| 45 | 2c8ea58e | eb | float sum = 0; |
| 46 | 2c8ea58e | eb | for (size_t i = 0; i < n; i++){
|
| 47 | 2c8ea58e | eb | sum += a[i] * b[i]; |
| 48 | 2c8ea58e | eb | } |
| 49 | 2c8ea58e | eb | return sum; |
| 50 | 2c8ea58e | eb | } |
| 51 | 2c8ea58e | eb | |
| 52 | 2c8ea58e | eb | #else |
| 53 | 2c8ea58e | eb | |
| 54 | 2c8ea58e | eb | /*
|
| 55 | 2c8ea58e | eb | * preconditions: |
| 56 | 2c8ea58e | eb | * |
| 57 | 2c8ea58e | eb | * n > 0 and a multiple of 4 |
| 58 | 2c8ea58e | eb | * a 4-byte aligned |
| 59 | 2c8ea58e | eb | * b 16-byte aligned |
| 60 | 2c8ea58e | eb | */ |
| 61 | 2c8ea58e | eb | float
|
| 62 | 2c8ea58e | eb | dotprod_fff_altivec(const float *_a, const float *_b, size_t n) |
| 63 | 2c8ea58e | eb | {
|
| 64 | a1d3c023 | eb | const vec_float4 *a = (const vec_float4 *) _a; |
| 65 | a1d3c023 | eb | const vec_float4 *b = (const vec_float4 *) _b; |
| 66 | 2c8ea58e | eb | |
| 67 | 2c8ea58e | eb | static const size_t UNROLL_CNT = 4; |
| 68 | 2c8ea58e | eb | |
| 69 | 2c8ea58e | eb | n = gr_p2_round_down(n, 4);
|
| 70 | 2c8ea58e | eb | size_t loop_cnt = n / (UNROLL_CNT * FLOATS_PER_VEC); |
| 71 | 2c8ea58e | eb | size_t nleft = n % (UNROLL_CNT * FLOATS_PER_VEC); |
| 72 | 2c8ea58e | eb | |
| 73 | 2c8ea58e | eb | // printf("n = %zd, loop_cnt = %zd, nleft = %zd\n", n, loop_cnt, nleft);
|
| 74 | 2c8ea58e | eb | |
| 75 | 2c8ea58e | eb | // Used with vperm to build a* from p*
|
| 76 | a1d3c023 | eb | vec_uchar16 lvsl_a = vec_lvsl(0, _a);
|
| 77 | a1d3c023 | eb | |
| 78 | a1d3c023 | eb | vec_float4 p0, p1, p2, p3; |
| 79 | a1d3c023 | eb | vec_float4 a0, a1, a2, a3; |
| 80 | a1d3c023 | eb | vec_float4 b0, b1, b2, b3; |
| 81 | a1d3c023 | eb | vec_float4 acc0 = {0, 0, 0, 0};
|
| 82 | a1d3c023 | eb | vec_float4 acc1 = {0, 0, 0, 0};
|
| 83 | a1d3c023 | eb | vec_float4 acc2 = {0, 0, 0, 0};
|
| 84 | a1d3c023 | eb | vec_float4 acc3 = {0, 0, 0, 0};
|
| 85 | 2c8ea58e | eb | |
| 86 | 2c8ea58e | eb | // wind in
|
| 87 | 2c8ea58e | eb | |
| 88 | 2c8ea58e | eb | p0 = vec_ld(0*VS, a);
|
| 89 | 2c8ea58e | eb | p1 = vec_ld(1*VS, a);
|
| 90 | 2c8ea58e | eb | p2 = vec_ld(2*VS, a);
|
| 91 | 2c8ea58e | eb | p3 = vec_ld(3*VS, a);
|
| 92 | 2c8ea58e | eb | a += UNROLL_CNT; |
| 93 | 2c8ea58e | eb | |
| 94 | 2c8ea58e | eb | a0 = vec_perm(p0, p1, lvsl_a); |
| 95 | 2c8ea58e | eb | b0 = vec_ld(0*VS, b);
|
| 96 | 2c8ea58e | eb | p0 = vec_ld(0*VS, a);
|
| 97 | 2c8ea58e | eb | |
| 98 | 2c8ea58e | eb | size_t i; |
| 99 | 2c8ea58e | eb | for (i = 0; i < loop_cnt; i++){ |
| 100 | 2c8ea58e | eb | |
| 101 | 2c8ea58e | eb | a1 = vec_perm(p1, p2, lvsl_a); |
| 102 | 2c8ea58e | eb | b1 = vec_ld(1*VS, b);
|
| 103 | 2c8ea58e | eb | p1 = vec_ld(1*VS, a);
|
| 104 | 2c8ea58e | eb | acc0 = vec_madd(a0, b0, acc0); |
| 105 | 2c8ea58e | eb | |
| 106 | 2c8ea58e | eb | a2 = vec_perm(p2, p3, lvsl_a); |
| 107 | 2c8ea58e | eb | b2 = vec_ld(2*VS, b);
|
| 108 | 2c8ea58e | eb | p2 = vec_ld(2*VS, a);
|
| 109 | 2c8ea58e | eb | acc1 = vec_madd(a1, b1, acc1); |
| 110 | 2c8ea58e | eb | |
| 111 | 2c8ea58e | eb | a3 = vec_perm(p3, p0, lvsl_a); |
| 112 | 2c8ea58e | eb | b3 = vec_ld(3*VS, b);
|
| 113 | 2c8ea58e | eb | p3 = vec_ld(3*VS, a);
|
| 114 | 2c8ea58e | eb | acc2 = vec_madd(a2, b2, acc2); |
| 115 | 2c8ea58e | eb | |
| 116 | 2c8ea58e | eb | a += UNROLL_CNT; |
| 117 | 2c8ea58e | eb | b += UNROLL_CNT; |
| 118 | 2c8ea58e | eb | |
| 119 | 2c8ea58e | eb | a0 = vec_perm(p0, p1, lvsl_a); |
| 120 | 2c8ea58e | eb | b0 = vec_ld(0*VS, b);
|
| 121 | 2c8ea58e | eb | p0 = vec_ld(0*VS, a);
|
| 122 | 2c8ea58e | eb | acc3 = vec_madd(a3, b3, acc3); |
| 123 | 2c8ea58e | eb | } |
| 124 | 2c8ea58e | eb | |
| 125 | 2c8ea58e | eb | /*
|
| 126 | 2c8ea58e | eb | * The compiler ought to be able to figure out that 0, 4, 8 and 12 |
| 127 | 2c8ea58e | eb | * are the only possible values for nleft. |
| 128 | 2c8ea58e | eb | */ |
| 129 | 2c8ea58e | eb | switch (nleft){
|
| 130 | 2c8ea58e | eb | case 0: |
| 131 | 2c8ea58e | eb | break;
|
| 132 | 2c8ea58e | eb | |
| 133 | 2c8ea58e | eb | case 4: |
| 134 | 2c8ea58e | eb | acc0 = vec_madd(a0, b0, acc0); |
| 135 | 2c8ea58e | eb | break;
|
| 136 | 2c8ea58e | eb | |
| 137 | 2c8ea58e | eb | case 8: |
| 138 | 2c8ea58e | eb | a1 = vec_perm(p1, p2, lvsl_a); |
| 139 | 2c8ea58e | eb | b1 = vec_ld(1*VS, b);
|
| 140 | 2c8ea58e | eb | acc0 = vec_madd(a0, b0, acc0); |
| 141 | 2c8ea58e | eb | acc1 = vec_madd(a1, b1, acc1); |
| 142 | 2c8ea58e | eb | break;
|
| 143 | 2c8ea58e | eb | |
| 144 | 2c8ea58e | eb | case 12: |
| 145 | 2c8ea58e | eb | a1 = vec_perm(p1, p2, lvsl_a); |
| 146 | 2c8ea58e | eb | b1 = vec_ld(1*VS, b);
|
| 147 | 2c8ea58e | eb | acc0 = vec_madd(a0, b0, acc0); |
| 148 | 2c8ea58e | eb | a2 = vec_perm(p2, p3, lvsl_a); |
| 149 | 2c8ea58e | eb | b2 = vec_ld(2*VS, b);
|
| 150 | 2c8ea58e | eb | acc1 = vec_madd(a1, b1, acc1); |
| 151 | 2c8ea58e | eb | acc2 = vec_madd(a2, b2, acc2); |
| 152 | 2c8ea58e | eb | break;
|
| 153 | 2c8ea58e | eb | } |
| 154 | 2c8ea58e | eb | |
| 155 | 2c8ea58e | eb | acc0 = acc0 + acc1; |
| 156 | 2c8ea58e | eb | acc2 = acc2 + acc3; |
| 157 | 2c8ea58e | eb | acc0 = acc0 + acc2; |
| 158 | 2c8ea58e | eb | |
| 159 | 2c8ea58e | eb | return horizontal_add_f(acc0);
|
| 160 | 2c8ea58e | eb | } |
| 161 | 2c8ea58e | eb | |
| 162 | 2c8ea58e | eb | #endif |