Statistics
| Branch: | Tag: | Revision:

root / gnuradio-core / src / lib / filter / ccomplex_dotprod_sse.S @ a34f397a

History | View | Annotate | Download (4.3 kB)

1
#
2
# Copyright 2002 Free Software Foundation, Inc.
3
# 
4
# This file is part of GNU Radio
5
# 
6
# GNU Radio is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 3, or (at your option)
9
# any later version.
10
# 
11
# GNU Radio is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
# GNU General Public License for more details.
15
# 
16
# You should have received a copy of the GNU General Public License
17
# along with GNU Radio; see the file COPYING.  If not, write to
18
# the Free Software Foundation, Inc., 51 Franklin Street,
19
# Boston, MA 02110-1301, USA.
20
# 
21
22
23
# input and taps are guaranteed to be 16 byte aligned.
24
# n_2_ccomplex_blocks is != 0
25
#	
26
#
27
#  ccomplex_dotprod_generic (const float *input,
28
#                         const float *taps, unsigned n_2_ccomplex_blocks, float *result)
29
#  {
30
#    float sum0 = 0;
31
#    float sum1 = 0;
32
#    float sum2 = 0;
33
#    float sum3 = 0;
34
#  
35
#    do {
36
#
37
#      sum0 += input[0] * taps[0] - input[1] * taps[1];
38
#      sum1 += input[0] * taps[1] + input[1] * taps[0];
39
#      sum2 += input[2] * taps[2] - input[3] * taps[3];
40
#      sum3 += input[2] * taps[3] + input[3] * taps[2];
41
#  
42
#      input += 4;
43
#      taps += 4;  
44
#  
45
#    } while (--n_2_ccomplex_blocks != 0);
46
#  
47
#  
48
#    result[0] = sum0 + sum2;
49
#    result[1] = sum1 + sum3;
50
#  }
51
#
52
53
# TODO: prefetch and better scheduling
54
55
#include "assembly.h"
56
57
	.file	"ccomplex_dotprod_sse.S"
58
	.version	"01.01"
59
.text
60
	.p2align 4
61
#
# ccomplex_dotprod_sse -- complex dot product of input[] and taps[] using SSE.
#
# Calling convention: 32-bit x86, all arguments on the stack (read via %ebp):
#   input                at  8(%ebp)  const float *, interleaved re/im pairs
#   taps                 at 12(%ebp)  const float *, interleaved re/im pairs
#   n_2_ccomplex_blocks  at 16(%ebp)  unsigned, count of 16-byte blocks
#                                     (one block = 2 complex values)
#   result               at 20(%ebp)  float *, receives result[0] = real part
#                                     and result[1] = imaginary part
#
# Both input and taps must be 16-byte aligned (movaps is used for all loads).
#
# Clobbers: eax, ecx, edx, xmm0-xmm7, flags.  The stack slot of the third
# argument, 16(%ebp), is reused as scratch memory for the sign mask.
#
# Register roles:
#   eax  = input cursor, edx = taps cursor, ecx = remaining block pairs
#   xmm6 = per-lane running sum P of input[k] * taps[k]
#   xmm7 = per-lane running sum Q of (pair-swapped input)[k] * taps[k]
#
# Each block is loaded right before it is consumed.  The earlier
# software-pipelined form preloaded the following block at the bottom of
# the main loop, which read 16 bytes past the end of both arrays whenever
# the block count was even.  The order of the floating point operations
# is unchanged, so the computed result is the same.
#
.globl GLOB_SYMB(ccomplex_dotprod_sse)
	DEF_FUNC_HEAD(ccomplex_dotprod_sse)
GLOB_SYMB(ccomplex_dotprod_sse):
	pushl	%ebp
	movl	%esp, %ebp
	movl	8(%ebp), %eax		# input
	movl	12(%ebp), %edx		# taps
	movl	16(%ebp), %ecx		# n_2_ccomplex_blocks

	xorps	%xmm6, %xmm6		# P accumulator = 0
	xorps	%xmm7, %xmm7		# Q accumulator = 0

	shrl	$1, %ecx		# ecx = n_2_ccomplex_blocks / 2
	jz	.Ltail			# fewer than two blocks: skip main loop

	#
	# Main loop: two blocks (4 complex taps) per iteration.
	#
	# A=xmm0, B=xmm2, Z=xmm4	(first block: input, taps, swapped input)
	# A'=xmm1, B'=xmm3, Z'=xmm5	(second block)
	#

	.p2align 4
.Loop1:
	movaps	0(%eax), %xmm0		# A  = input block 0
	movaps	0(%edx), %xmm2		# B  = taps block 0
	movaps	16(%eax), %xmm1		# A' = input block 1
	movaps	16(%edx), %xmm3		# B' = taps block 1

	movaps	%xmm0, %xmm4
	mulps	%xmm2, %xmm0		# A * B
	shufps	$0xb1, %xmm4, %xmm4	# Z = A with re/im swapped in each pair
	movaps	%xmm1, %xmm5
	addps	%xmm0, %xmm6		# P += A * B
	mulps	%xmm3, %xmm1		# A' * B'
	shufps	$0xb1, %xmm5, %xmm5	# Z' = A' with re/im swapped in each pair
	addps	%xmm1, %xmm6		# P += A' * B'
	mulps	%xmm4, %xmm2		# Z * B
	addps	%xmm2, %xmm7		# Q += Z * B
	mulps	%xmm5, %xmm3		# Z' * B'
	addps	%xmm3, %xmm7		# Q += Z' * B'

	addl	$32, %eax		# advance both cursors by two blocks
	addl	$32, %edx

	decl	%ecx
	jnz	.Loop1

.Ltail:
	# The bulk of the multiplies is done.  When the original
	# n_2_ccomplex_blocks was odd, one more block (2 taps) remains.
	# The count must be tested before 16(%ebp) is reused as scratch below.

	testl	$1, 16(%ebp)		# low bit of n_2_ccomplex_blocks
	jz	.Leven

	# The count was odd: load and process the final block.

	movaps	0(%eax), %xmm0		# A = last input block
	movaps	0(%edx), %xmm2		# B = last taps block
	movaps	%xmm0, %xmm4
	mulps	%xmm2, %xmm0
	shufps	$0xb1, %xmm4, %xmm4	# swap internals
	addps	%xmm0, %xmm6		# P += A * B
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm7		# Q += Z * B

.Leven:
	# Build the sign mask that negates lanes 1 and 3 of P.
	# The real part is P0 - P1 + P2 - P3, the imaginary part is
	# Q0 + Q1 + Q2 + Q3.
	xorps	%xmm1, %xmm1
	movl	$0x80000000, 16(%ebp)	# sign bit only; argument slot as scratch
	movss	16(%ebp), %xmm1		# xmm1 = -0 0 0 0   (lane 0 first)
	shufps	$0x11, %xmm1, %xmm1	# b00010001 # xmm1 = 0 -0 0 -0

	# SSE replacement for pfpnacc
	xorps	%xmm1, %xmm6		# xmm6 =  P0 -P1  P2 -P3

	movaps	%xmm6, %xmm2
	unpcklps %xmm7, %xmm6		# xmm6 =  P0  Q0 -P1  Q1
	unpckhps %xmm7, %xmm2		# xmm2 =  P2  Q2 -P3  Q3
	movaps	%xmm2, %xmm3
	shufps	$0x44, %xmm6, %xmm2	# b01000100 # xmm2 =  P2  Q2  P0  Q0
	shufps	$0xee, %xmm3, %xmm6	# b11101110 # xmm6 = -P1  Q1 -P3  Q3
	addps	%xmm2, %xmm6		# xmm6 = re_a im_a re_b im_b (partial sums)

	movl	20(%ebp), %eax		# @result
	movhlps	%xmm6, %xmm4		# xmm4 = re_b im_b ?? ??
	addps	%xmm4, %xmm6		# xmm6 = re im ?? ??  (upper lanes unused)
	movlps	%xmm6, (%eax)		# store low 2x32 bits (complex) to memory

	popl	%ebp
	ret

FUNC_TAIL(ccomplex_dotprod_sse)
194
	.ident	"Hand coded x86 SSE assembly"
195
196
#if defined(__linux__) && defined(__ELF__)
197
.section .note.GNU-stack,"",%progbits
198
#endif