| 31 | | |
|---|
| 32 | | extern "C" { |
|---|
| 33 | | |
|---|
| 34 | | #if 0 |
|---|
| 35 | | |
|---|
/*
 * Scalar reference implementation of the float dot product.
 *
 * a, b: input arrays, each holding at least n floats
 * n:    number of element pairs to multiply and accumulate
 *
 * Returns the sum of a[i] * b[i] for i in [0, n), accumulated in
 * single precision from the first element to the last.  Returns 0
 * when n is 0.
 */
float
dotprod_fff_altivec(const float *a, const float *b, size_t n)
{
  const float *pa = a;
  const float *pb = b;
  float acc = 0;

  // Walk both arrays in lock step, front to back.
  for (size_t remaining = n; remaining != 0; remaining--){
    acc += *pa++ * *pb++;
  }

  return acc;
}
|---|
| 45 | | |
|---|
| 46 | | #else |
|---|
| 47 | | /* |
|---|
| 48 | | * preconditions: |
|---|
| 49 | | * |
|---|
| 50 | | * n > 0 and a multiple of 4 |
|---|
| 51 | | * a 4-byte aligned |
|---|
| 52 | | * b 16-byte aligned |
|---|
| 53 | | */ |
|---|
| 54 | | float |
|---|
| 55 | | dotprod_fff_altivec(const float *_a, const float *_b, size_t n) |
|---|
| 56 | | { |
|---|
| 57 | | const vector float *a = (const vector float *) _a; |
|---|
| 58 | | const vector float *b = (const vector float *) _b; |
|---|
| 59 | | |
|---|
| 60 | | static const size_t UNROLL_CNT = 4; |
|---|
| 61 | | |
|---|
| 62 | | n = gr_p2_round_down(n, 4); |
|---|
| 63 | | size_t loop_cnt = n / (UNROLL_CNT * FLOATS_PER_VEC); |
|---|
| 64 | | size_t nleft = n % (UNROLL_CNT * FLOATS_PER_VEC); |
|---|
| 65 | | |
|---|
| 66 | | // printf("n = %zd, loop_cnt = %zd, nleft = %zd\n", n, loop_cnt, nleft); |
|---|
| 67 | | |
|---|
| 68 | | // Used with vperm to build a* from p* |
|---|
| 69 | | vector unsigned char lvsl_a = vec_lvsl(0, _a); |
|---|
| 70 | | |
|---|
| 71 | | vector float p0, p1, p2, p3; |
|---|
| 72 | | vector float a0, a1, a2, a3; |
|---|
| 73 | | vector float b0, b1, b2, b3; |
|---|
| 74 | | vector float acc0 = {0, 0, 0, 0}; |
|---|
| 75 | | vector float acc1 = {0, 0, 0, 0}; |
|---|
| 76 | | vector float acc2 = {0, 0, 0, 0}; |
|---|
| 77 | | vector float acc3 = {0, 0, 0, 0}; |
|---|
| 78 | | |
|---|
| 79 | | // wind in |
|---|
| 80 | | |
|---|
| 81 | | register int r0vs = 0 * VS; |
|---|
| 82 | | register int r1vs = 1 * VS; |
|---|
| 83 | | register int r2vs = 2 * VS; |
|---|
| 84 | | register int r3vs = 3 * VS; |
|---|
| 85 | | |
|---|
| 86 | | p0 = vec_ld(r0vs, a); |
|---|
| 87 | | p1 = vec_ld(r1vs, a); |
|---|
| 88 | | p2 = vec_ld(r2vs, a); |
|---|
| 89 | | p3 = vec_ld(r3vs, a); |
|---|
| 90 | | a += UNROLL_CNT; |
|---|
| 91 | | |
|---|
| 92 | | a0 = vec_perm(p0, p1, lvsl_a); |
|---|
| 93 | | b0 = vec_ld(r0vs, b); |
|---|
| 94 | | p0 = vec_ld(r0vs, a); |
|---|
| 95 | | |
|---|
| 96 | | for (size_t i = 0; i < loop_cnt; i++){ |
|---|
| 97 | | |
|---|
| 98 | | a1 = vec_perm(p1, p2, lvsl_a); |
|---|
| 99 | | b1 = vec_ld(r1vs, b); |
|---|
| 100 | | p1 = vec_ld(r1vs, a); |
|---|
| 101 | | acc0 = vec_madd(a0, b0, acc0); |
|---|
| 102 | | |
|---|
| 103 | | a2 = vec_perm(p2, p3, lvsl_a); |
|---|
| 104 | | b2 = vec_ld(r2vs, b); |
|---|
| 105 | | p2 = vec_ld(r2vs, a); |
|---|
| 106 | | acc1 = vec_madd(a1, b1, acc1); |
|---|
| 107 | | |
|---|
| 108 | | a3 = vec_perm(p3, p0, lvsl_a); |
|---|
| 109 | | b3 = vec_ld(r3vs, b); |
|---|
| 110 | | p3 = vec_ld(r3vs, a); |
|---|
| 111 | | acc2 = vec_madd(a2, b2, acc2); |
|---|
| 112 | | |
|---|
| 113 | | a += UNROLL_CNT; |
|---|
| 114 | | b += UNROLL_CNT; |
|---|
| 115 | | |
|---|
| 116 | | a0 = vec_perm(p0, p1, lvsl_a); |
|---|
| 117 | | b0 = vec_ld(r0vs, b); |
|---|
| 118 | | p0 = vec_ld(r0vs, a); |
|---|
| 119 | | acc3 = vec_madd(a3, b3, acc3); |
|---|
| 120 | | } |
|---|
| 121 | | |
|---|
| 122 | | /* |
|---|
| 123 | | * The compiler ought to be able to figure out that 0, 4, 8 and 12 |
|---|
| 124 | | * are the only possible values for nleft. |
|---|
| 125 | | */ |
|---|
| 126 | | switch (nleft){ |
|---|
| 127 | | case 0: |
|---|
| 128 | | break; |
|---|
| 129 | | |
|---|
| 130 | | case 4: |
|---|
| 131 | | acc0 = vec_madd(a0, b0, acc0); |
|---|
| 132 | | break; |
|---|
| 133 | | |
|---|
| 134 | | case 8: |
|---|
| 135 | | a1 = vec_perm(p1, p2, lvsl_a); |
|---|
| 136 | | b1 = vec_ld(r1vs, b); |
|---|
| 137 | | acc0 = vec_madd(a0, b0, acc0); |
|---|
| 138 | | acc1 = vec_madd(a1, b1, acc1); |
|---|
| 139 | | break; |
|---|
| 140 | | |
|---|
| 141 | | case 12: |
|---|
| 142 | | a1 = vec_perm(p1, p2, lvsl_a); |
|---|
| 143 | | b1 = vec_ld(r1vs, b); |
|---|
| 144 | | acc0 = vec_madd(a0, b0, acc0); |
|---|
| 145 | | a2 = vec_perm(p2, p3, lvsl_a); |
|---|
| 146 | | b2 = vec_ld(r2vs, b); |
|---|
| 147 | | acc1 = vec_madd(a1, b1, acc1); |
|---|
| 148 | | acc2 = vec_madd(a2, b2, acc2); |
|---|
| 149 | | break; |
|---|
| 150 | | } |
|---|
| 151 | | |
|---|
| 152 | | acc0 = acc0 + acc1; |
|---|
| 153 | | acc2 = acc2 + acc3; |
|---|
| 154 | | acc0 = acc0 + acc2; |
|---|
| 155 | | |
|---|
| 156 | | return horizontal_add_f(acc0); |
|---|
| 157 | | } |
|---|
| 158 | | |
|---|
| 159 | | #endif |
|---|
| 160 | | } |
|---|
| | 31 | #include <dotprod_fff_altivec.h> |
|---|