Statistics
| Branch: | Tag: | Revision:

root / gnuradio-core / src / lib / filter / ccomplex_dotprod_3dnowext64.S @ 7fda6b2c

History | View | Annotate | Download (3.8 kB)

1 5d69a524 jcorgan
#
2 5d69a524 jcorgan
# Copyright 2002,2005 Free Software Foundation, Inc.
3 5d69a524 jcorgan
# 
4 5d69a524 jcorgan
# This file is part of GNU Radio
5 5d69a524 jcorgan
# 
6 5d69a524 jcorgan
# GNU Radio is free software; you can redistribute it and/or modify
7 5d69a524 jcorgan
# it under the terms of the GNU General Public License as published by
8 937b719d eb
# the Free Software Foundation; either version 3, or (at your option)
9 5d69a524 jcorgan
# any later version.
10 5d69a524 jcorgan
# 
11 5d69a524 jcorgan
# GNU Radio is distributed in the hope that it will be useful,
12 5d69a524 jcorgan
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13 5d69a524 jcorgan
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 5d69a524 jcorgan
# GNU General Public License for more details.
15 5d69a524 jcorgan
# 
16 5d69a524 jcorgan
# You should have received a copy of the GNU General Public License
17 5d69a524 jcorgan
# along with GNU Radio; see the file COPYING.  If not, write to
18 86f5c924 eb
# the Free Software Foundation, Inc., 51 Franklin Street,
19 86f5c924 eb
# Boston, MA 02110-1301, USA.
20 5d69a524 jcorgan
# 
21 5d69a524 jcorgan
22 5d69a524 jcorgan
23 5d69a524 jcorgan
# input and taps are guaranteed to be 16 byte aligned.
24 5d69a524 jcorgan
# n_2_ccomplex_blocks is != 0
25 5d69a524 jcorgan
#	
26 5d69a524 jcorgan
#
27 5d69a524 jcorgan
#  ccomplex_dotprod_generic (const float *input,
28 5d69a524 jcorgan
#                         const float *taps, unsigned n_2_ccomplex_blocks, float *result)
29 5d69a524 jcorgan
#  {
30 5d69a524 jcorgan
#    float sum0 = 0;
31 5d69a524 jcorgan
#    float sum1 = 0;
32 5d69a524 jcorgan
#    float sum2 = 0;
33 5d69a524 jcorgan
#    float sum3 = 0;
34 5d69a524 jcorgan
#  
35 5d69a524 jcorgan
#    do {
36 5d69a524 jcorgan
#  
37 5d69a524 jcorgan
#      sum0 += input[0] * taps[0] - input[1] * taps[1];
38 5d69a524 jcorgan
#      sum1 += input[0] * taps[1] + input[1] * taps[0];
39 5d69a524 jcorgan
#      sum2 += input[2] * taps[2] - input[3] * taps[3];
40 5d69a524 jcorgan
#      sum3 += input[2] * taps[3] + input[3] * taps[2];
41 5d69a524 jcorgan
#  
42 5d69a524 jcorgan
#      input += 4;
43 5d69a524 jcorgan
#      taps += 4;
44 5d69a524 jcorgan
#  
45 5d69a524 jcorgan
#    } while (--n_2_ccomplex_blocks != 0);
46 5d69a524 jcorgan
#  
47 5d69a524 jcorgan
#  
48 5d69a524 jcorgan
#    result[0] = sum0 + sum2;
49 5d69a524 jcorgan
#    result[1] = sum1 + sum3;
50 5d69a524 jcorgan
#  }
51 5d69a524 jcorgan
#  		
52 5d69a524 jcorgan
53 5d69a524 jcorgan
# TODO: prefetch and better scheduling
54 5d69a524 jcorgan
55 5d69a524 jcorgan
#include "assembly.h"
56 5d69a524 jcorgan
57 5d69a524 jcorgan
	.file	"ccomplex_dotprod_3dnowext64.S"
58 5d69a524 jcorgan
	.version	"01.01"
59 5d69a524 jcorgan
.text
60 5d69a524 jcorgan
	.p2align 4
61 5d69a524 jcorgan
.globl GLOB_SYMB(ccomplex_dotprod_3dnowext)
	DEF_FUNC_HEAD(ccomplex_dotprod_3dnowext)
GLOB_SYMB(ccomplex_dotprod_3dnowext):

	# Complex dot product of input and taps (3DNow!Ext, x86-64).
	#
	# Arguments:
	#   rdi = input
	#   rsi = taps
	#   edx = n_2_ccomplex_blocks (32-bit unsigned, must be != 0)
	#   rcx = result; 8 bytes (result[0], result[1]) are stored
	#
	# Register roles:
	#   rax     = main-loop iterations left (n_2_ccomplex_blocks / 2)
	#   mm0/mm1 = A values (input), always loaded one block ahead
	#   mm2/mm3 = B values (taps),  always loaded one block ahead
	#   mm4/mm5 = Z temporaries (swapped copies of mm0/mm1)
	#   mm6/mm7 = the two running partial sums
	#
	# Modifies rax, rdi, rsi, mm0-mm7 and the flags.  femms is
	# executed before returning.
	#
	# NOTE(review): when n_2_ccomplex_blocks is even, the preloads at
	# the bottom of the last loop iteration read 16 bytes beyond the
	# last processed block of both arrays.  Presumably callers keep
	# that memory readable -- confirm before relying on this routine
	# with tightly sized buffers.

	# The count is a 32-bit unsigned, so only edx is defined on entry.
	# The 32-bit move zero-extends into rax; the upper half of rdx is
	# never consulted.
	mov	%edx, %eax

	# zero accumulators

	pxor	%mm6, %mm6		# mm6 = 0 0
	pxor	%mm7, %mm7		# mm7 = 0 0

	# preload the first block (2 complex input values, 2 complex taps)
	movq	0(%rdi), %mm0
	movq	0(%rsi), %mm2

	shr	$1, %rax		# rax = n_2_ccomplex_blocks / 2

	movq	8(%rdi), %mm1
	movq	8(%rsi), %mm3

	jmp	.L1_test

	#
	# 4 taps / loop
	# something like ?? cycles / loop
	#

	.p2align 4
.Loop1:

# complex prod: C += A * B,  w/ temp Z
#
#	movq	0(%rdi), %mmA
#	movq	0(%rsi), %mmB
#	pswapd	%mmA, %mmZ
#	pfmul	%mmB, %mmA
#	pfmul	%mmZ, %mmB
#	pfpnacc	%mmB, %mmA
#	pfadd	%mmA, %mmC

# first pair:  A=mm0, B=mm2, Z=mm4
# second pair: A=mm1, B=mm3, Z=mm5

	pswapd	%mm0, %mm4
	pfmul	%mm2, %mm0
	pswapd	%mm1, %mm5
	pfmul	%mm4, %mm2
	pfmul	%mm3, %mm1
	pfpnacc	%mm2, %mm0
	pfmul	%mm5, %mm3
	movq	16(%rsi), %mm2		# taps for the unrolled half
	pfpnacc	%mm3, %mm1
	movq	24(%rsi), %mm3

	pfadd	%mm0, %mm6
	movq	16(%rdi), %mm0		# input for the unrolled half
	pfadd	%mm1, %mm7
	movq	24(%rdi), %mm1

# unroll

	pswapd	%mm0, %mm4
	pfmul	%mm2, %mm0
	pswapd	%mm1, %mm5
	pfmul	%mm4, %mm2
	pfmul	%mm3, %mm1
	pfpnacc	%mm2, %mm0
	pfmul	%mm5, %mm3
	movq	32(%rsi), %mm2		# preload taps for the next pass
	pfpnacc	%mm3, %mm1
	movq	40(%rsi), %mm3

	pfadd	%mm0, %mm6
	movq	32(%rdi), %mm0		# preload input for the next pass
	pfadd	%mm1, %mm7
	movq	40(%rdi), %mm1

	add	$32, %rsi
	add	$32, %rdi

.L1_test:
	dec	%rax
	jge	.Loop1

	# We have handled the bulk of the multiplies up to here.
	# Check whether the original n_2_ccomplex_blocks was odd.
	# When it was, there are 2 more taps to do.

	test	$1, %edx
	jz	.Leven

	# The count was odd, do 2 more taps.
	# Note that mm0/mm2 & mm1/mm3 are already preloaded
	# (by the main loop, or by the entry code when it never ran).

# first pair:  A=mm0, B=mm2, Z=mm4
# second pair: A=mm1, B=mm3, Z=mm5

	pswapd	%mm0, %mm4
	pfmul	%mm2, %mm0
	pswapd	%mm1, %mm5
	pfmul	%mm4, %mm2
	pfmul	%mm3, %mm1
	pfpnacc	%mm2, %mm0
	pfmul	%mm5, %mm3
	pfpnacc	%mm3, %mm1

	pfadd	%mm0, %mm6
	pfadd	%mm1, %mm7

.Leven:
	# at this point mm6 and mm7 contain partial sums

	pfadd	%mm7, %mm6

	movq	%mm6, (%rcx)	 	# result

	femms

	retq

FUNC_TAIL(ccomplex_dotprod_3dnowext)
187 5d69a524 jcorgan
	.ident	"Hand coded x86_64 3DNow!Ext assembly"
188 5d69a524 jcorgan
189 0d4c6442 eb
190 0d4c6442 eb
#if defined(__linux__) && defined(__ELF__)
191 0d4c6442 eb
.section .note.GNU-stack,"",%progbits
192 0d4c6442 eb
#endif