| | 157 | |
|---|
| | 158 | #else |
|---|
| | 159 | /* |
|---|
| | | * dotprod_fff_vmx: multiply-accumulate the float arrays _a and _b with |
|---|
| | | * AltiVec/VMX vector operations and return the reduced result. |
|---|
| | | * |
|---|
| | 160 | * preconditions: |
|---|
| | 161 | * |
|---|
| | 162 | * n > 0 and a multiple of 4 |
|---|
| | 163 | * a 4-byte aligned |
|---|
| | 164 | * b 16-byte aligned |
|---|
| | | * |
|---|
| | | * parameters: |
|---|
| | | * |
|---|
| | | * _a first input array; may be unaligned with respect to 16 bytes |
|---|
| | | * _b second input array; loaded directly with vec_ld |
|---|
| | | * n number of floats to process from each array |
|---|
| | | * |
|---|
| | | * returns: |
|---|
| | | * |
|---|
| | | * horizontal_add_f() of the combined accumulator. That helper is defined |
|---|
| | | * elsewhere; presumably it sums the four lanes, giving the dot product |
|---|
| | | * (TODO confirm). |
|---|
| | | * |
|---|
| | | * how it works: |
|---|
| | | * |
|---|
| | | * - Each vector of a is assembled by vec_perm from two consecutive vec_ld |
|---|
| | | * results (p0..p3), using the control vector returned by vec_lvsl(0, _a). |
|---|
| | | * - The main loop consumes UNROLL_CNT vectors per iteration into four |
|---|
| | | * independent accumulators (acc0..acc3), which are added together at |
|---|
| | | * the end. |
|---|
| | | * - The loop is software pipelined: the loads and permutes for the next |
|---|
| | | * step are interleaved with the vec_madd of the current step, and the p |
|---|
| | | * registers are overwritten as soon as they have been consumed, so the |
|---|
| | | * statement order must not be changed. |
|---|
| | | * - The switch after the loop handles the nleft/4 leftover vectors using |
|---|
| | | * the values that are already in flight. |
|---|
| | | * |
|---|
| | | * NOTE(review): VS and FLOATS_PER_VEC are defined elsewhere; presumably 16 |
|---|
| | | * bytes per vector and 4 floats per vector. The default case of the switch |
|---|
| | | * is unreachable only under that assumption - confirm. |
|---|
| | | * |
|---|
| | | * NOTE(review): loads are issued ahead of use. The wind-in performs five |
|---|
| | | * vec_ld from a without looking at n, and every loop iteration loads the |
|---|
| | | * vectors of a and the b0 for the following step, so memory beyond the n |
|---|
| | | * elements is read. Confirm that callers' buffers tolerate this. |
|---|
| | | * |
|---|
| | | * NOTE(review): the multiple-of-4 precondition is only checked by assert(); |
|---|
| | | * if assertions are compiled out, the integer division nleft/4 silently |
|---|
| | | * drops any remainder. |
|---|
| | 165 | */ |
|---|
| | 166 | float |
|---|
| | 167 | dotprod_fff_vmx(const float *_a, const float *_b, size_t n) |
|---|
| | 168 | { |
|---|
| | 169 | const vector float *a = (const vector float *) _a; |
|---|
| | 170 | const vector float *b = (const vector float *) _b; |
|---|
| | 171 | |
|---|
| | 172 | static const size_t UNROLL_CNT = 4; |
|---|
| | 173 | |
|---|
| | 174 | size_t loop_cnt = n / (UNROLL_CNT * FLOATS_PER_VEC); |
|---|
| | 175 | size_t nleft = n % (UNROLL_CNT * FLOATS_PER_VEC); |
|---|
| | 176 | |
|---|
| | 177 | // printf("n = %zd, loop_cnt = %zd, nleft = %zd\n", n, loop_cnt, nleft); |
|---|
| | 178 | |
|---|
| | 179 | // lvsl_a is the permute control vector derived from the address of _a; vec_perm uses it to build each a* from two consecutive p* loads |
|---|
| | 180 | vector unsigned char lvsl_a = vec_lvsl(0, _a); |
|---|
| | 181 | |
|---|
| | 182 | vector float p0, p1, p2, p3; |
|---|
| | 183 | vector float a0, a1, a2, a3; |
|---|
| | 184 | vector float b0, b1, b2, b3; |
|---|
| | 185 | vector float acc0 = {0, 0, 0, 0}; |
|---|
| | 186 | vector float acc1 = {0, 0, 0, 0}; |
|---|
| | 187 | vector float acc2 = {0, 0, 0, 0}; |
|---|
| | 188 | vector float acc3 = {0, 0, 0, 0}; |
|---|
| | 189 | |
|---|
| | 190 | // wind in: preload p0..p3, form the first a0/b0, and start the p0 load for the next group so the loop body runs one step ahead |
|---|
| | 191 | |
|---|
| | 192 | p0 = vec_ld(0*VS, a); |
|---|
| | 193 | p1 = vec_ld(1*VS, a); |
|---|
| | 194 | p2 = vec_ld(2*VS, a); |
|---|
| | 195 | p3 = vec_ld(3*VS, a); |
|---|
| | 196 | |
|---|
| | 197 | a0 = vec_perm(p0, p1, lvsl_a); |
|---|
| | 198 | b0 = vec_ld(0*VS, b); |
|---|
| | 199 | p0 = vec_ld((UNROLL_CNT + 0)*VS, a); |
|---|
| | 200 | |
|---|
| | 201 | for (size_t i = 0; i < loop_cnt; i++){ |
|---|
| | 202 | |
|---|
| | 203 | a1 = vec_perm(p1, p2, lvsl_a); |
|---|
| | 204 | b1 = vec_ld(1*VS, b); |
|---|
| | 205 | p1 = vec_ld((UNROLL_CNT + 1)*VS, a); |
|---|
| | 206 | acc0 = vec_madd(a0, b0, acc0); |
|---|
| | 207 | |
|---|
| | 208 | a2 = vec_perm(p2, p3, lvsl_a); |
|---|
| | 209 | b2 = vec_ld(2*VS, b); |
|---|
| | 210 | p2 = vec_ld((UNROLL_CNT + 2)*VS, a); |
|---|
| | 211 | acc1 = vec_madd(a1, b1, acc1); |
|---|
| | 212 | |
|---|
| | 213 | a3 = vec_perm(p3, p0, lvsl_a); |
|---|
| | 214 | b3 = vec_ld(3*VS, b); |
|---|
| | 215 | p3 = vec_ld((UNROLL_CNT + 3)*VS, a); |
|---|
| | 216 | acc2 = vec_madd(a2, b2, acc2); |
|---|
| | 217 | |
|---|
| | 218 | a += UNROLL_CNT; |
|---|
| | 219 | b += UNROLL_CNT; |
|---|
| | 220 | |
|---|
| | 221 | a0 = vec_perm(p0, p1, lvsl_a); |
|---|
| | 222 | b0 = vec_ld(0*VS, b); |
|---|
| | 223 | p0 = vec_ld((UNROLL_CNT + 0)*VS, a); |
|---|
| | 224 | acc3 = vec_madd(a3, b3, acc3); |
|---|
| | 225 | } |
|---|
| | 226 | |
|---|
| | 227 | assert((nleft % 4) == 0); |
|---|
| | 228 | |
|---|
| | 229 | switch (nleft/4){ |
|---|
| | 230 | case 0: |
|---|
| | 231 | break; |
|---|
| | 232 | |
|---|
| | 233 | case 1: |
|---|
| | 234 | acc0 = vec_madd(a0, b0, acc0); |
|---|
| | 235 | break; |
|---|
| | 236 | |
|---|
| | 237 | case 2: |
|---|
| | 238 | a1 = vec_perm(p1, p2, lvsl_a); |
|---|
| | 239 | b1 = vec_ld(1*VS, b); |
|---|
| | 240 | acc0 = vec_madd(a0, b0, acc0); |
|---|
| | 241 | acc1 = vec_madd(a1, b1, acc1); |
|---|
| | 242 | break; |
|---|
| | 243 | |
|---|
| | 244 | case 3: |
|---|
| | 245 | a1 = vec_perm(p1, p2, lvsl_a); |
|---|
| | 246 | b1 = vec_ld(1*VS, b); |
|---|
| | 247 | acc0 = vec_madd(a0, b0, acc0); |
|---|
| | 248 | a2 = vec_perm(p2, p3, lvsl_a); |
|---|
| | 249 | b2 = vec_ld(2*VS, b); |
|---|
| | 250 | acc1 = vec_madd(a1, b1, acc1); |
|---|
| | 251 | acc2 = vec_madd(a2, b2, acc2); |
|---|
| | 252 | break; |
|---|
| | 253 | |
|---|
| | 254 | default: |
|---|
| | 255 | assert(0); |
|---|
| | 256 | break; |
|---|
| | 257 | } |
|---|
| | 258 | |
|---|
| | 259 | acc0 = acc0 + acc1; |
|---|
| | 260 | acc2 = acc2 + acc3; |
|---|
| | 261 | acc0 = acc0 + acc2; |
|---|
| | 262 | |
|---|
| | 263 | return horizontal_add_f(acc0); |
|---|
| | 264 | } |
|---|
| | 265 | |
|---|