root / gnuradio-core / src / lib / filter / ccomplex_dotprod_sse.S @ a34f397a
History | View | Annotate | Download (4.3 kB)
| 1 | # |
|---|---|
| 2 | # Copyright 2002 Free Software Foundation, Inc. |
| 3 | # |
| 4 | # This file is part of GNU Radio |
| 5 | # |
| 6 | # GNU Radio is free software; you can redistribute it and/or modify |
| 7 | # it under the terms of the GNU General Public License as published by |
| 8 | # the Free Software Foundation; either version 3, or (at your option) |
| 9 | # any later version. |
| 10 | # |
| 11 | # GNU Radio is distributed in the hope that it will be useful, |
| 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | # GNU General Public License for more details. |
| 15 | # |
| 16 | # You should have received a copy of the GNU General Public License |
| 17 | # along with GNU Radio; see the file COPYING. If not, write to |
| 18 | # the Free Software Foundation, Inc., 51 Franklin Street, |
| 19 | # Boston, MA 02110-1301, USA. |
| 20 | # |
| 21 | |
| 22 | |
| 23 | # input and taps are guaranteed to be 16 byte aligned. |
| 24 | # n_2_ccomplex_blocks is != 0 |
| 25 | # |
| 26 | # |
| 27 | # ccomplex_dotprod_generic (const float *input, |
| 28 | # const float *taps, unsigned n_2_ccomplex_blocks, float *result) |
| 29 | # { |
| 30 | # float sum0 = 0; |
| 31 | # float sum1 = 0; |
| 32 | # float sum2 = 0; |
| 33 | # float sum3 = 0; |
| 34 | # |
| 35 | # do { |
| 36 | # |
| 37 | # sum0 += input[0] * taps[0] - input[1] * taps[1]; |
| 38 | # sum1 += input[0] * taps[1] + input[1] * taps[0]; |
| 39 | # sum2 += input[2] * taps[2] - input[3] * taps[3]; |
| 40 | # sum3 += input[2] * taps[3] + input[3] * taps[2]; |
| 41 | # |
| 42 | # input += 4; |
| 43 | # taps += 4; |
| 44 | # |
| 45 | # } while (--n_2_ccomplex_blocks != 0); |
| 46 | # |
| 47 | # |
| 48 | # result[0] = sum0 + sum2; |
| 49 | # result[1] = sum1 + sum3; |
| 50 | # } |
| 51 | # |
| 52 | |
| 53 | # TODO: prefetch and better scheduling |
| 54 | |
#include "assembly.h"

        .file   "ccomplex_dotprod_sse.S"
        .version        "01.01"

#
# ccomplex_dotprod_sse (const float *input, const float *taps,
#                       unsigned n_2_ccomplex_blocks, float *result)
#
# Complex dot product of two arrays of interleaved (re, im) floats.
# One "block" is 16 bytes, i.e. two complex values; the third argument
# counts such blocks.  A single complex sum (two floats) is stored
# to result.
#
# Syntax:  GNU as, AT&T operand order, preprocessed by cpp.
# ABI:     32-bit x86, all arguments on the stack:
#            8(%ebp)  input
#           12(%ebp)  taps
#           16(%ebp)  n_2_ccomplex_blocks
#           20(%ebp)  result
# Needs:   input and taps 16-byte aligned (they are read with movaps).
# Reads:   exactly 16 * n_2_ccomplex_blocks bytes from input and from
#          taps.  Every load sits at the top of the code that consumes
#          it, so nothing beyond the last block is ever touched.  A
#          count of zero reads nothing and stores 0 + 0i.
# Clobbers: %eax, %ecx, %edx, %xmm0-%xmm7, EFLAGS.  The stack slot of
#          the third argument is overwritten (used as scratch memory
#          for the sign mask).
# Returns: no value is set up in %eax; on exit it holds the result
#          pointer.
#
# Accumulators (lanes listed low to high), summed over all blocks:
#   xmm6 = in0*t0  in1*t1  in2*t2  in3*t3
#   xmm7 = in1*t0  in0*t1  in3*t2  in2*t3
#
# TODO: prefetch and better scheduling
#

        .text
        .p2align 4
        .globl GLOB_SYMB(ccomplex_dotprod_sse)
        DEF_FUNC_HEAD(ccomplex_dotprod_sse)
GLOB_SYMB(ccomplex_dotprod_sse):
        pushl   %ebp
        movl    %esp, %ebp
        movl    8(%ebp), %eax           # input
        movl    12(%ebp), %edx          # taps
        movl    16(%ebp), %ecx          # n_2_ccomplex_blocks

        xorps   %xmm6, %xmm6            # zero accumulators
        xorps   %xmm7, %xmm7

        shrl    $1, %ecx                # ecx = number of block pairs
        jz      .Ltail                  # fewer than two blocks: skip main loop

#
# Main loop: two blocks (four complex values) per iteration.
# Invariant at .Loop1: ecx = block pairs still to do, always > 0.
#
# A=xmm0, B=xmm2, Z=xmm4 (A with re/im swapped)
# A'=xmm1, B'=xmm3, Z'=xmm5
#
        .p2align 4
.Loop1:
        movaps  0(%eax), %xmm0          # A  = input block 0
        movaps  0(%edx), %xmm2          # B  = taps block 0
        movaps  16(%eax), %xmm1         # A' = input block 1
        movaps  16(%edx), %xmm3         # B' = taps block 1

        movaps  %xmm0, %xmm4
        movaps  %xmm1, %xmm5
        shufps  $0xb1, %xmm4, %xmm4     # Z  = A  with each re/im pair swapped
        shufps  $0xb1, %xmm5, %xmm5     # Z' = A' with each re/im pair swapped

        mulps   %xmm2, %xmm0            # A  * B
        mulps   %xmm3, %xmm1            # A' * B'
        mulps   %xmm4, %xmm2            # Z  * B
        mulps   %xmm5, %xmm3            # Z' * B'

        # Same accumulation order as before, so sums are bit-identical.
        addps   %xmm0, %xmm6
        addps   %xmm1, %xmm6
        addps   %xmm2, %xmm7
        addps   %xmm3, %xmm7

        addl    $32, %eax
        addl    $32, %edx

        decl    %ecx
        jnz     .Loop1

.Ltail:
        # We have handled the bulk of the multiplies up to here.
        # Now see whether the original n_2_ccomplex_blocks was odd;
        # when it was, one more block (two complex values) remains
        # at eax/edx.

        testl   $1, 16(%ebp)            # n_2_ccomplex_blocks & 1
        jz      .Leven

        movaps  0(%eax), %xmm0          # last input block
        movaps  0(%edx), %xmm2          # last taps block
        movaps  %xmm0, %xmm4
        shufps  $0xb1, %xmm4, %xmm4     # swap internals
        mulps   %xmm2, %xmm0
        mulps   %xmm4, %xmm2
        addps   %xmm0, %xmm6
        addps   %xmm2, %xmm7

.Leven:
        # Build the sign mask  0 -0 0 -0  in xmm1.  SSE1 has no
        # register-to-xmm integer move, so the constant goes through
        # memory; the (already consumed) count argument slot is used
        # as scratch.  A movss load zero-fills lanes 1-3.
        movl    $0x80000000, 16(%ebp)
        movss   16(%ebp), %xmm1         # xmm1 = -0 0 0 0
        shufps  $0x11, %xmm1, %xmm1     # b00010001 # 0 -0 0 -0

        # SSE replacement for pfpnacc: negate the im*im products,
        # then fold the four lanes of xmm6/xmm7 pairwise.
        xorps   %xmm1, %xmm6            # xmm6 = P0 -P1 P2 -P3

        movaps  %xmm6, %xmm2
        unpcklps %xmm7, %xmm6           # xmm6 = P0 Q0 -P1 Q1
        unpckhps %xmm7, %xmm2           # xmm2 = P2 Q2 -P3 Q3
        movaps  %xmm2, %xmm3
        shufps  $0x44, %xmm6, %xmm2     # b01000100 # xmm2 =  P2 Q2  P0 Q0
        shufps  $0xee, %xmm3, %xmm6     # b11101110 # xmm6 = -P1 Q1 -P3 Q3
        addps   %xmm2, %xmm6            # xmm6 = re im re im (partial sums)

        movl    20(%ebp), %eax          # @result
        movhlps %xmm6, %xmm4            # xmm4 = upper re/im pair, ?? ??
        addps   %xmm4, %xmm6            # xmm6 = total re, total im, ?? ??
        movlps  %xmm6, (%eax)           # store low 2x32 bits (complex) to memory

        popl    %ebp
        ret

FUNC_TAIL(ccomplex_dotprod_sse)
        .ident  "Hand coded x86 SSE assembly"

#if defined(__linux__) && defined(__ELF__)
        .section .note.GNU-stack,"",%progbits
#endif