Statistics
| Branch: | Tag: | Revision:

root / gnuradio-core / src / lib / filter / fcomplex_dotprod_sse.S @ e4eb47f0

History | View | Annotate | Download (4 kB)

1 5d69a524 jcorgan
#
2 5d69a524 jcorgan
# Copyright 2002 Free Software Foundation, Inc.
3 5d69a524 jcorgan
# 
4 5d69a524 jcorgan
# This file is part of GNU Radio
5 5d69a524 jcorgan
# 
6 5d69a524 jcorgan
# GNU Radio is free software; you can redistribute it and/or modify
7 5d69a524 jcorgan
# it under the terms of the GNU General Public License as published by
8 937b719d eb
# the Free Software Foundation; either version 3, or (at your option)
9 5d69a524 jcorgan
# any later version.
10 5d69a524 jcorgan
# 
11 5d69a524 jcorgan
# GNU Radio is distributed in the hope that it will be useful,
12 5d69a524 jcorgan
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13 5d69a524 jcorgan
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 5d69a524 jcorgan
# GNU General Public License for more details.
15 5d69a524 jcorgan
# 
16 5d69a524 jcorgan
# You should have received a copy of the GNU General Public License
17 5d69a524 jcorgan
# along with GNU Radio; see the file COPYING.  If not, write to
18 86f5c924 eb
# the Free Software Foundation, Inc., 51 Franklin Street,
19 86f5c924 eb
# Boston, MA 02110-1301, USA.
20 5d69a524 jcorgan
# 
21 5d69a524 jcorgan
22 5d69a524 jcorgan
23 5d69a524 jcorgan
# input and taps are guaranteed to be 16 byte aligned.
24 5d69a524 jcorgan
# n_2_complex_blocks is != 0
25 5d69a524 jcorgan
#	
26 5d69a524 jcorgan
#
27 5d69a524 jcorgan
#  fcomplex_dotprod_generic (const float *input,
28 5d69a524 jcorgan
#                         const float *taps, unsigned n_2_complex_blocks, float *result)
29 5d69a524 jcorgan
#  {
30 5d69a524 jcorgan
#    float sum0 = 0;
31 5d69a524 jcorgan
#    float sum1 = 0;
32 5d69a524 jcorgan
#    float sum2 = 0;
33 5d69a524 jcorgan
#    float sum3 = 0;
34 5d69a524 jcorgan
#  
35 5d69a524 jcorgan
#    do {
36 5d69a524 jcorgan
#  
37 5d69a524 jcorgan
#      sum0 += input[0] * taps[0];
38 5d69a524 jcorgan
#      sum1 += input[0] * taps[1];
39 5d69a524 jcorgan
#      sum2 += input[1] * taps[2];
40 5d69a524 jcorgan
#      sum3 += input[1] * taps[3];
41 5d69a524 jcorgan
#  
42 5d69a524 jcorgan
#      input += 2;
43 5d69a524 jcorgan
#      taps += 4;
44 5d69a524 jcorgan
#  
45 5d69a524 jcorgan
#    } while (--n_2_complex_blocks != 0);
46 5d69a524 jcorgan
#  
47 5d69a524 jcorgan
#  
48 5d69a524 jcorgan
#    result[0] = sum0 + sum2;
49 5d69a524 jcorgan
#    result[1] = sum1 + sum3;
50 5d69a524 jcorgan
#  }
51 5d69a524 jcorgan
#
52 5d69a524 jcorgan
53 5d69a524 jcorgan
# TODO: prefetch and better scheduling
54 5d69a524 jcorgan
55 5d69a524 jcorgan
#include "assembly.h"
56 5d69a524 jcorgan
57 5d69a524 jcorgan
58 5d69a524 jcorgan
	.file	"fcomplex_dotprod_sse.S"
59 5d69a524 jcorgan
	.version	"01.01"
60 5d69a524 jcorgan
.text
	.p2align 4
#
# fcomplex_dotprod_sse -- four-lane SSE multiply-accumulate reduced to a
# pair of floats.
#
# Arguments are read from the stack, relative to %ebp after the prologue:
#   arg at  8(%ebp): input, consumed 2 floats (8 bytes) per block
#   arg at 12(%ebp): taps, consumed 4 floats (16 bytes) per block; mulps
#                    reads it directly, so it must be 16-byte aligned
#   arg at 16(%ebp): n_2_complex_blocks, the number of blocks
#   arg at 20(%ebp): result, two floats are stored here
#
# Per block the four lanes accumulate
#   in[0]*taps[0], in[0]*taps[1], in[1]*taps[2], in[1]*taps[3]
# and on exit result[0] = lane0 + lane2, result[1] = lane1 + lane3.
#
# Registers written: eax, ecx, edx, xmm0-xmm7, flags.  %ebp is saved and
# restored.
#
# Every load stays inside the n_2_complex_blocks * 8 bytes of input and
# the n_2_complex_blocks * 16 bytes of taps: the main loop fetches its
# four input pairs at the top of each iteration and never reads ahead
# into a following group.
#
.globl GLOB_SYMB(fcomplex_dotprod_sse)
	DEF_FUNC_HEAD(fcomplex_dotprod_sse)
GLOB_SYMB(fcomplex_dotprod_sse):
	pushl	%ebp
	movl	%esp, %ebp
	movl	8(%ebp), %eax		# eax = input cursor
	movl	12(%ebp), %edx		# edx = taps cursor
	movl	16(%ebp), %ecx		# ecx = n_2_complex_blocks

	# xmm0 xmm1 xmm2 xmm3 hold the shuffled input, then the products
	# xmm4 xmm5 xmm6 xmm7 hold the accumulated results

	xorps	%xmm4, %xmm4		# zero all four accumulators
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	# first handle any non-zero remainder of (n_2_complex_blocks % 4),
	# one block per pass, accumulating into xmm4 only

	andl	$0x3, %ecx
	jmp	.L1_test

	.p2align 4
.Loop1:
	movlps	0(%eax), %xmm0		# low half of xmm0 = in[0] in[1]
	shufps	$0x50, %xmm0, %xmm0	# b01010000: in[0] in[0] in[1] in[1]
	mulps	(%edx), %xmm0
	addl	$0x10, %edx
	addl	$8, %eax
	addps	%xmm0, %xmm4
.L1_test:
	decl	%ecx
	jge	.Loop1			# loop while the count before decl was above zero

	# primary loop, unrolled 4 times: one block per accumulator

	movl	16(%ebp), %ecx
	shrl	$2, %ecx		# n_2_complex_blocks / 4
	je	.Lcleanup		# no full group of four: xmm4 is already complete

	# ecx is known to be non-zero here, hence enter the loop at the top

	.p2align 4
.Loop2:
	# fetch the four input pairs of this group only (no read-ahead)
	movlps	0x00(%eax), %xmm0
	movlps	0x08(%eax), %xmm1
	movlps	0x10(%eax), %xmm2
	movlps	0x18(%eax), %xmm3

	# duplicate each float so it lines up with its pair of taps
	shufps	$0x50, %xmm0, %xmm0
	shufps	$0x50, %xmm1, %xmm1
	shufps	$0x50, %xmm2, %xmm2
	shufps	$0x50, %xmm3, %xmm3

	mulps	0x00(%edx), %xmm0
	mulps	0x10(%edx), %xmm1
	mulps	0x20(%edx), %xmm2
	mulps	0x30(%edx), %xmm3

	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm5
	addps	%xmm2, %xmm6
	addps	%xmm3, %xmm7

	addl	$0x40, %edx		# 4 blocks * 16 bytes of taps
	addl	$0x20, %eax		# 4 blocks * 8 bytes of input
	decl	%ecx
	jne	.Loop2

	# now fold all accumulators into xmm4

	addps	%xmm5, %xmm4
	addps	%xmm6, %xmm7
	addps	%xmm7, %xmm4

	# At this point xmm4 holds two partial results: lanes 0,1 and
	# lanes 2,3.  Add the upper pair onto the lower pair.

.Lcleanup:				# xmm4 lanes 0..3 = r1 i2 r3 i4
	movl	20(%ebp), %eax		# eax = result pointer
	movhlps	%xmm4, %xmm0		# xmm0 lanes 0,1 = r3 i4 (lanes 2,3 are leftovers)
	addps	%xmm4, %xmm0		# xmm0 lanes 0,1 = r1+r3 i2+i4
	movlps	%xmm0, (%eax)		# store low 2x32 bits to memory

	popl	%ebp
	ret

FUNC_TAIL(fcomplex_dotprod_sse)
184 5d69a524 jcorgan
	.ident	"Hand coded x86 SSE assembly"
185 0d4c6442 eb
186 0d4c6442 eb
#if defined(__linux__) && defined(__ELF__)
187 0d4c6442 eb
.section .note.GNU-stack,"",%progbits
188 0d4c6442 eb
#endif