Statistics
| Branch: | Tag: | Revision:

root / gnuradio-core / src / lib / filter / complex_dotprod_3dnowext64.S @ 01b6697b

History | View | Annotate | Download (3.3 kB)

1
#
2
# Copyright 2002,2005 Free Software Foundation, Inc.
3
# 
4
# This file is part of GNU Radio
5
# 
6
# GNU Radio is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 3, or (at your option)
9
# any later version.
10
# 
11
# GNU Radio is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
# GNU General Public License for more details.
15
# 
16
# You should have received a copy of the GNU General Public License
17
# along with GNU Radio; see the file COPYING.  If not, write to
18
# the Free Software Foundation, Inc., 51 Franklin Street,
19
# Boston, MA 02110-1301, USA.
20
# 
21
22
23
# input and taps are guaranteed to be 16 byte aligned.
24
# n_2_complex_blocks is != 0
25
#	
26
#
27
#  complex_dotprod_generic (const short *input,
28
#                         const float *taps, unsigned n_2_complex_blocks, float *result)
29
#  {
30
#    float sum0 = 0;
31
#    float sum1 = 0;
32
#    float sum2 = 0;
33
#    float sum3 = 0;
34
#  
35
#    do {
36
#  
37
#      sum0 += input[0] * taps[0];
38
#      sum1 += input[0] * taps[1];
39
#      sum2 += input[1] * taps[2];
40
#      sum3 += input[1] * taps[3];
41
#  
42
#      input += 2;
43
#      taps += 4;
44
#  
45
#    } while (--n_2_complex_blocks != 0);
46
#  
47
#  
48
#    result[0] = sum0 + sum2;
49
#    result[1] = sum1 + sum3;
50
#  }
51
#  		
52
53
#include "assembly.h"
54
55
56
	.file	"complex_dotprod_3dnowext64.S"
	.version	"01.01"
.text

#
# complex_dotprod_3dnowext -- 3DNow!Ext (MMX register) version of the
# reference C loop shown in the header comment of this file.
#
# Calling convention: System V AMD64.
#   rdi = input               (const short *)
#   rsi = taps                (const float *)
#   edx = n_2_complex_blocks  (unsigned, must be != 0)
#   rcx = result              (float *, two floats are stored)
#
# Register use inside the routine:
#   rax        loop passes left; one pass handles two blocks (4 taps)
#   mm0, mm1   input[0] / input[1] of the next block, converted to
#              float and duplicated in both 32-bit halves
#   mm2, mm3   the same for the second block of a pass
#   mm4..mm7   partial sums, two floats each (low half pairs with the
#              even tap index, high half with the odd one)
#
# Clobbers rax, rdx, rsi, rdi, mm0-mm7 and the flags.  femms is issued
# before returning.
#
# NOTE(review): pshufw with a memory source reads 8 bytes, so the
# preload at the bottom of the last pass touches up to 8 bytes past the
# input that is actually consumed -- confirm callers provide readable
# padding after the input buffer.
#
	.p2align 4
.globl GLOB_SYMB(complex_dotprod_3dnowext)
	DEF_FUNC_HEAD(complex_dotprod_3dnowext)
GLOB_SYMB(complex_dotprod_3dnowext):

	# input: rdi, taps: rsi, n_2_complex_blocks: edx, result: rcx

	# The block count is a 32-bit unsigned argument.  The psABI leaves
	# bits 63..32 of rdx unspecified, so copy via the 32-bit registers:
	# writing eax zero-extends into rax.
	mov	%edx, %eax

	# zero accumulators

	pxor	%mm4, %mm4		# mm4 = 0 0
	pxor	%mm5, %mm5		# mm5 = 0 0
	pxor	%mm6, %mm6		# mm6 = 0 0
	pxor	%mm7, %mm7		# mm7 = 0 0

	shr	$1, %rax		# rax = n_2_complex_blocks / 2

	# Preload the first block: broadcast each short, then convert.
	movd	0(%rdi), %mm0		# mm0 = input[1] input[0] (low 32 bits)
	pshufw	$0x55, %mm0, %mm1	# b01010101: mm1 = input[1] in every word
	pshufw	$0, %mm0, %mm0		# mm0 = input[0] in every word

	# mm2/mm3 start at zero so the first pfadd into mm6/mm7 adds nothing
	pxor	%mm2, %mm2
	pxor	%mm3, %mm3

	pi2fw	%mm1, %mm1		# mm1 = (float) input[1], twice
	pi2fw	%mm0, %mm0		# mm0 = (float) input[0], twice

	jmp	.L1_test

	#
	# 4 taps / loop
	# something like ?? cycles / loop
	#
	# Software pipelined: the products formed in mm2/mm3 are added to
	# mm6/mm7 at the top of the NEXT pass (or right after the loop),
	# and mm0/mm1 are reloaded for the next pass at the bottom.
	#

	.p2align 4
.Loop1:
	pfmul	0(%rsi), %mm0		# input[0] * taps[0], taps[1]
	pfadd	%mm2, %mm6		# fold in previous pass product

	pshufw	$0, 4(%rdi), %mm2	# mm2 = input[2] in every word

	pfmul	8(%rsi), %mm1		# input[1] * taps[2], taps[3]
	pfadd	%mm3, %mm7		# fold in previous pass product
	pi2fw	%mm2, %mm2

	pshufw	$0x55, 4(%rdi), %mm3	# b01010101: mm3 = input[3]

	pfmul	16(%rsi), %mm2		# input[2] * taps[4], taps[5]
	pi2fw	%mm3, %mm3
	pfadd	%mm0, %mm4

	pshufw	$0, 8(%rdi), %mm0	# preload input[4] for the next pass

	pfmul	24(%rsi), %mm3		# input[3] * taps[6], taps[7]
	pfadd	%mm1, %mm5

	pshufw	$0x55, 8(%rdi), %mm1	# b01010101: preload input[5]
	pi2fw	%mm0, %mm0

#TODO: add prefetch

	add	$32, %rsi		# 8 floats consumed
	add	$8, %rdi		# 4 shorts consumed
	pi2fw	%mm1, %mm1

.L1_test:
	dec	%rax
	jge	.Loop1			# signed test: stop once rax drops below 0

	# The bulk of the multiplies is done at this point.
	# Now accumulate the final two additions and see whether the
	# original n_2_complex_blocks was odd.  When it was, there are
	# 2 more taps to do.

	pfadd	%mm2, %mm6
	and	$1, %edx		# ZF = 1 when the count was even
	pfadd	%mm3, %mm7		# MMX op, leaves the flags alone
	je	.Leven

	# The count was odd, do 2 more taps.
	# Note that mm0 and mm1 are already preloaded
	# from the main loop (or from the initial preload).

	pfmul	0(%rsi), %mm0
	pfadd	%mm0, %mm4
	pfmul	8(%rsi), %mm1
	pfadd	%mm1, %mm5

.Leven:
	# at this point mm4, mm5, mm6 and mm7 contain partial sums

	pfadd	%mm7, %mm6
	pfadd	%mm5, %mm4

	pfadd	%mm6, %mm4
	movq	%mm4, (%rcx)		# result[0], result[1]

	femms

	retq

FUNC_TAIL(complex_dotprod_3dnowext)
	.ident	"Hand coded x86_64 3DNow!Ext assembly"

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif