Statistics
| Branch: | Tag: | Revision:

root / gnuradio-core / src / lib / filter / complex_dotprod_sse64.S @ 0d4c6442

History | View | Annotate | Download (4.4 kB)

1
#
2
# Copyright 2002,2005 Free Software Foundation, Inc.
3
# 
4
# This file is part of GNU Radio
5
# 
6
# GNU Radio is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 3, or (at your option)
9
# any later version.
10
# 
11
# GNU Radio is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
# GNU General Public License for more details.
15
# 
16
# You should have received a copy of the GNU General Public License
17
# along with GNU Radio; see the file COPYING.  If not, write to
18
# the Free Software Foundation, Inc., 51 Franklin Street,
19
# Boston, MA 02110-1301, USA.
20
# 
21
22
23
# input and taps are guarenteed to be 16 byte aligned.
24
# n_2_complex_blocks is != 0
25
#	
26
#
27
#  complex_dotprod_generic (const short *input,
28
#                         const float *taps, unsigned n_2_complex_blocks, float *result)
29
#  {
30
#    float sum0 = 0;
31
#    float sum1 = 0;
32
#    float sum2 = 0;
33
#    float sum3 = 0;
34
#  
35
#    do {
36
#  
37
#      sum0 += input[0] * taps[0];
38
#      sum1 += input[0] * taps[1];
39
#      sum2 += input[1] * taps[2];
40
#      sum3 += input[1] * taps[3];
41
#  
42
#      input += 2;
43
#      taps += 4;
44
#  
45
#    } while (--n_2_complex_blocks != 0);
46
#  
47
#  
48
#    result[0] = sum0 + sum2;
49
#    result[1] = sum1 + sum3;
50
#  }
51
#
52
53
# TODO: prefetch and better scheduling
54
55
#include "assembly.h"
56
57
58
	.file	"complex_dotprod_sse64.S"
59
	.version	"01.01"
60
.text
61
	.p2align 4
62
.globl GLOB_SYMB(complex_dotprod_sse)
63
	DEF_FUNC_HEAD(complex_dotprod_sse)
64
GLOB_SYMB(complex_dotprod_sse):
65
66
	# intput: rdi, taps: rsi, n_2_ccomplex_blocks: rdx, result: rcx
67
68
	mov     %rdx, %rax
69
70
	
71
	# xmm0 xmm1 xmm2 xmm3 are used to hold taps and the result of mults
72
	# xmm4 xmm5 xmm6 xmm7 are used to hold the accumulated results
73
74
	xorps	%xmm4, %xmm4		# zero two accumulators
75
	xorps	%xmm5, %xmm5		# xmm5 holds zero for use below
76
77
	# first handle any non-zero remainder of (n_2_complex_blocks % 4)
78
79
	and	$0x3, %rax
80
	jmp	.L1_test
81
82
	.p2align 4
83
.loop1:	
84
85
	pxor    %mm0, %mm0
86
	punpcklwd       0(%rdi), %mm0
87
	psrad   $16, %mm0
88
	cvtpi2ps %mm0, %xmm0
89
	shufps	$0x50, %xmm0, %xmm0
90
91
	mulps	(%rsi), %xmm0
92
	add	$0x10, %rsi
93
	add	$4, %rdi
94
	addps	%xmm0, %xmm4
95
.L1_test:	
96
	dec	%rax
97
	jge	.loop1
98
99
	
100
	# set up for primary loop which is unrolled 4 times
101
	
102
	movaps	%xmm5, %xmm6		# zero remaining accumulators
103
	shr	$2, %rdx		# n_2_complex_blocks / 4
104
	movaps	%xmm5, %xmm7 
105
106
	je	.cleanup		# if zero, take short path
107
108
	# finish setup and loop priming
109
110
	pxor    %mm0, %mm0
111
	punpcklwd       0(%rdi), %mm0
112
	psrad   $16, %mm0
113
	cvtpi2ps %mm0, %xmm0
114
	shufps	$0x50, %xmm0, %xmm0
115
116
	movaps	%xmm5, %xmm2
117
118
	pxor    %mm1, %mm1
119
	punpcklwd       4(%rdi), %mm1
120
	psrad   $16, %mm1
121
	cvtpi2ps %mm1, %xmm1
122
	shufps	$0x50, %xmm1, %xmm1
123
124
	movaps	%xmm5, %xmm3
125
126
	# we know rax is not zero, we checked above,
127
	# hence enter loop at top
128
129
	.p2align 4
130
.loop2:
131
	mulps	(%rsi), %xmm0
132
	addps	%xmm2, %xmm6
133
134
	pxor    %mm2, %mm2
135
	punpcklwd       8(%rdi), %mm2
136
	psrad   $16, %mm2
137
	cvtpi2ps %mm2, %xmm2
138
	shufps	$0x50, %xmm2, %xmm2
139
140
	mulps	0x10(%rsi), %xmm1
141
	addps	%xmm3, %xmm7
142
143
	pxor    %mm3, %mm3
144
	punpcklwd       12(%rdi), %mm3
145
	psrad   $16, %mm3
146
	cvtpi2ps %mm3, %xmm3
147
	shufps	$0x50, %xmm3, %xmm3
148
149
	mulps	0x20(%rsi), %xmm2
150
	addps	%xmm0, %xmm4
151
152
	pxor    %mm0, %mm0
153
	punpcklwd       16(%rdi), %mm0
154
	psrad   $16, %mm0
155
	cvtpi2ps %mm0, %xmm0
156
	shufps	$0x50, %xmm0, %xmm0
157
158
	mulps	0x30(%rsi), %xmm3
159
	addps	%xmm1, %xmm5
160
161
	pxor    %mm1, %mm1
162
	punpcklwd       20(%rdi), %mm1
163
	psrad   $16, %mm1
164
	cvtpi2ps %mm1, %xmm1
165
	shufps	$0x50, %xmm1, %xmm1
166
167
	add	$0x40, %rsi
168
	add	$0x10, %rdi
169
	dec	%rdx
170
	jne	.loop2
171
172
	# OK, now we've done with all the multiplies, but
173
	# we still need to handle the unaccumulated
174
	# products in xmm2 and xmm3
175
176
	addps	%xmm2, %xmm6
177
	addps	%xmm3, %xmm7
178
179
	# now we want to add all accumulators into xmm4
180
181
	addps	%xmm5, %xmm4
182
	addps	%xmm6, %xmm7
183
	addps	%xmm7, %xmm4
184
185
	
186
	# At this point, xmm4 contains 2x2 partial sums.  We need
187
	# to compute a "horizontal complex add" across xmm4.  
188
	
189
.cleanup:				# xmm4 = r1 i2 r3 i4
190
	movhlps	%xmm4, %xmm0		# xmm0 = ?? ?? r1 r2
191
	addps	%xmm4, %xmm0		# xmm0 = ?? ?? r1+r3 i2+i4
192
	movlps	%xmm0, (%rcx)		# store low 2x32 bits (complex) to memory
193
194
	emms
195
	retq
196
197
FUNC_TAIL(complex_dotprod_sse)
198
	.ident	"Hand coded x86_64 SSE assembly"
199
200
#if defined(__linux__) && defined(__ELF__)
201
.section .note.GNU-stack,"",%progbits
202
#endif