Statistics
| Branch: | Tag: | Revision:

root / gnuradio-core / src / lib / filter / fcomplex_dotprod_3dnow64.S @ 5d69a524

History | View | Annotate | Download (3.2 kB)

1
#
2
# Copyright 2002,2005 Free Software Foundation, Inc.
3
# 
4
# This file is part of GNU Radio
5
# 
6
# GNU Radio is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2, or (at your option)
9
# any later version.
10
# 
11
# GNU Radio is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
# GNU General Public License for more details.
15
# 
16
# You should have received a copy of the GNU General Public License
17
# along with GNU Radio; see the file COPYING.  If not, write to
18
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19
# Boston, MA 02111-1307, USA.
20
# 
21
22
# input and taps are guaranteed to be 16 byte aligned.
23
# n_2_complex_blocks is != 0
24
#	
25
#
26
#  fcomplex_dotprod_generic (const float *input,
27
#                         const float *taps, unsigned n_2_complex_blocks, float *result)
28
#  {
29
#    float sum0 = 0;
30
#    float sum1 = 0;
31
#    float sum2 = 0;
32
#    float sum3 = 0;
33
#  
34
#    do {
35
#  
36
#      sum0 += input[0] * taps[0];
37
#      sum1 += input[0] * taps[1];
38
#      sum2 += input[1] * taps[2];
39
#      sum3 += input[1] * taps[3];
40
#  
41
#      input += 2;
42
#      taps += 4;
43
#  
44
#    } while (--n_2_complex_blocks != 0);
45
#  
46
#  
47
#    result[0] = sum0 + sum2;
48
#    result[1] = sum1 + sum3;
49
#  }
50
#  		
51
52
#include "assembly.h"
53
54
55
	.file	"fcomplex_dotprod_3dnow64.S"
56
	.version	"01.01"
57
.text
58
	.p2align 4
	#
	# fcomplex_dotprod_3dnow -- x86-64 MMX/3DNow! routine, AT&T syntax.
	#
	# Accumulates, over n_2_complex_blocks blocks, the products
	#   input[0]*taps[0], input[0]*taps[1], input[1]*taps[2], input[1]*taps[3]
	# and stores two floats at result: the sum of the first and third
	# products, then the sum of the second and fourth.  Each block
	# consumes one quadword (two floats) of input and two quadwords
	# (four floats) of taps.
	#
	# Arguments (first four System V AMD64 integer argument registers):
	#   rdi = input, rsi = taps, rdx = n_2_complex_blocks, rcx = result
	# Modifies: rax, rdx, rsi, rdi, mm0-mm7 and the flags.
	#
	# Register roles:
	#   rax      main-loop passes remaining (n_2_complex_blocks / 2)
	#   mm0 mm1  first block of a pass: input[0] and input[1], each copied
	#            to both 32-bit lanes, then multiplied by the taps
	#   mm2 mm3  second block of a pass, same layout
	#   mm4 mm5  sums of the mm0 / mm1 products
	#   mm6 mm7  sums of the mm2 / mm3 products
	#
	# The GLOB_SYMB, DEF_FUNC_HEAD and FUNC_TAIL macros come from assembly.h,
	# which is not visible here -- presumably symbol decoration and
	# symbol type/size bookkeeping; confirm against that header.
	#
59
.globl GLOB_SYMB(fcomplex_dotprod_3dnow)
60
	DEF_FUNC_HEAD(fcomplex_dotprod_3dnow)
61
GLOB_SYMB(fcomplex_dotprod_3dnow):
62
63
	# input: rdi, taps: rsi, n_2_complex_blocks: rdx, result: rcx
64
65
	mov	%rdx, %rax		# rax = count; rdx keeps the original for the odd test
66
67
	# zero accumulators
68
	
69
	pxor	%mm4, %mm4		# mm4 = 0 0
70
	pxor	%mm5, %mm5		# mm5 = 0 0 
71
	pxor	%mm6, %mm6		# mm6 = 0 0 
72
	pxor	%mm7, %mm7		# mm7 = 0 0
73
74
	shr	$1, %rax		# rax = n_2_complex_blocks / 2
75
76
	movq	0(%rdi), %mm0		# preload first block: mm0 = input[1] input[0] (high low)
77
78
	pxor	%mm2, %mm2		# no pending products for mm6 on the first pass
79
	pxor	%mm3, %mm3		# no pending products for mm7 on the first pass
80
81
	movq	%mm0, %mm1		# mm1 = input[1] input[0]
82
	punpckldq	%mm0, %mm0	# mm0 = input[0] input[0]
83
	punpckhdq	%mm1, %mm1	# mm1 = input[1] input[1]
84
85
86
	jmp	.L1_test		# enter at the test: rax is 0 when the count is 1
87
88
	#
89
	# 4 taps / loop
90
	# something like ?? cycles / loop
91
	#
92
	
93
	.p2align 4
94
.loop1:	
95
	pfmul	0(%rsi), %mm0		# mm0 = input[0] * (taps[1] taps[0])
96
	pfadd	%mm2, %mm6		# add the mm2 products left by the previous pass
97
98
	movq	8(%rdi), %mm2		# load second block: mm2 = input[3] input[2]
99
100
	pfadd	%mm3, %mm7		# add the mm3 products left by the previous pass
101
102
	pfmul	8(%rsi), %mm1		# mm1 = input[1] * (taps[3] taps[2])
103
104
	movq	%mm2, %mm3		# mm3 = input[3] input[2]
105
	punpckldq	%mm2, %mm2	# mm2 = input[2] input[2]
106
	punpckhdq	%mm3, %mm3	# mm3 = input[3] input[3]
107
108
109
	pfmul	16(%rsi), %mm2		# mm2 = input[2] * (taps[5] taps[4]), summed next pass
110
	pfadd	%mm0, %mm4		# accumulate first-block products (low lanes)
111
	# NOTE(review): the preload below reads the quadword that follows the
	# two blocks consumed by this pass.  After the last pass it is used
	# only by the odd-count tail; when the count is even it is never used
	# and lies past the n_2_complex_blocks blocks of input -- confirm that
	# callers keep that memory readable.
112
	movq	16(%rdi), %mm0		# preload first block of the next pass
113
114
	pfadd	%mm1, %mm5		# accumulate first-block products (high input float)
115
116
	movq	%mm0, %mm1		# split the preloaded block as before the loop
117
	punpckldq	%mm0, %mm0
118
119
	pfmul	24(%rsi), %mm3		# mm3 = input[3] * (taps[7] taps[6]), summed next pass
120
121
	punpckhdq	%mm1, %mm1
122
123
124
#TODO: add prefetch?
125
126
	add	$32, %rsi		# advance taps by 32 bytes (two blocks)
127
	add	$16, %rdi		# advance input by 16 bytes (two blocks)
128
129
.L1_test:
130
	dec	%rax
131
	jge	.loop1			# taken while rax is still >= 0 after the decrement
132
133
	# We've handled the bulk of multiplies up to here.
134
	# Now accumulate the final two additions and see if original
135
	# n_2_complex_blocks was odd.  If so, we've got 2 more
136
	# taps to do.
137
	
138
	pfadd	%mm2, %mm6		# flush the pending mm2 products
139
	and	$1, %rdx		# ZF = 1 when the original count was even
140
	pfadd	%mm3, %mm7		# flush the pending mm3 products; flags from the and survive
141
	je	.Leven			# branches on the ZF produced by the and above
142
	
143
	# The count was odd, do 2 more taps.
144
	# Note that we've already got mm0 and mm1 preloaded
145
	# from the main loop.
146
	
147
	pfmul	0(%rsi), %mm0		# mm0 = input[0] * (taps[1] taps[0]) of the last block
148
	pfadd	%mm0, %mm4
149
	pfmul	8(%rsi), %mm1		# mm1 = input[1] * (taps[3] taps[2]) of the last block
150
	pfadd	%mm1, %mm5
151
152
153
.Leven:
154
	# at this point mm4, mm5, mm6 and mm7 contain partial sums
155
	
156
	pfadd	%mm7, %mm6		# mm6 = mm6 + mm7
157
	pfadd	%mm5, %mm4		# mm4 = mm4 + mm5
158
	pfadd	%mm6, %mm4		# mm4 = total of all four accumulators, per lane
159
160
	movq	%mm4, (%rcx)		# result[0] = low lane, result[1] = high lane
161
	femms				# leave MMX/3DNow! state before returning
162
163
	retq
164
165
FUNC_TAIL(fcomplex_dotprod_3dnow)
166
	.ident	"Hand coded x86_64 3DNow! assembly"