Statistics
| Branch: | Tag: | Revision:

root / gnuradio-core / src / lib / filter / float_dotprod_3dnow64.S @ 2b60291c

History | View | Annotate | Download (3.1 kB)

1
#
2
# Copyright 2002,2005 Free Software Foundation, Inc.
3
# 
4
# This file is part of GNU Radio
5
# 
6
# GNU Radio is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 3, or (at your option)
9
# any later version.
10
# 
11
# GNU Radio is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
# GNU General Public License for more details.
15
# 
16
# You should have received a copy of the GNU General Public License
17
# along with GNU Radio; see the file COPYING.  If not, write to
18
# the Free Software Foundation, Inc., 51 Franklin Street,
19
# Boston, MA 02110-1301, USA.
20
# 
21
22
23
# input and taps are guaranteed to be 16 byte aligned.
24
# n_4_float_blocks is != 0
25
#	
26
#
27
#  float 
28
#  float_dotprod_generic (const float *input,
29
#                         const float *taps, unsigned n_4_float_blocks)
30
#  {
31
#    float sum0 = 0;
32
#    float sum1 = 0;
33
#    float sum2 = 0;
34
#    float sum3 = 0;
35
#  
36
#    do {
37
#  
38
#      sum0 += input[0] * taps[0];
39
#      sum1 += input[1] * taps[1];
40
#      sum2 += input[2] * taps[2];
41
#      sum3 += input[3] * taps[3];
42
#  
43
#      input += 4;
44
#      taps += 4;
45
#  
46
#    } while (--n_4_float_blocks != 0);
47
#  
48
#  
49
#    return sum0 + sum1 + sum2 + sum3;
50
#  }
51
#  		
52
53
#include "assembly.h"
54
55
56
	.file	"float_dotprod_3dnow64.S"
57
	.version	"01.01"
58
.text
59
	.p2align 4
60
#-----------------------------------------------------------------------
# float_dotprod_3dnow: x86-64 3DNow! dot product over blocks of 4 floats.
#
# Presumed C prototype (mirrors the reference routine shown in the file
# header; confirm against the declaring header):
#   float float_dotprod_3dnow (const float *input,
#                              const float *taps,
#                              unsigned n_4_float_blocks)
#
# In:       rdi = input, rsi = taps, edx = n_4_float_blocks (must be != 0)
# Out:      xmm0 (low float) = sum of input[i] * taps[i]
# Clobbers: rax, rdx, rdi, rsi, mm0-mm7, flags
# Stack:    leaf routine; uses 4 bytes at -8(%rsp) as scratch without
#           moving rsp (relies on the System V red zone).
#
# Accumulators: mm4/mm5 collect the products of the first 4 floats of
# each 8-float group, mm6/mm7 the products of the second 4 floats.
# mm0/mm1 always hold the next two tap pairs, loaded one step ahead.
#
# NOTE(review): the look-ahead loads at 32(%rsi) and 40(%rsi) run on
# every iteration, so with an even block count the last iteration reads
# the 16 bytes immediately after the taps that are actually consumed.
# The values are never used, but the memory must be readable - confirm
# that callers guarantee this.
#-----------------------------------------------------------------------
.globl GLOB_SYMB(float_dotprod_3dnow)
	DEF_FUNC_HEAD(float_dotprod_3dnow)
GLOB_SYMB(float_dotprod_3dnow):

	# input: rdi, taps: rsi, n_4_float_blocks: edx

	# The block count is a 32-bit unsigned argument, so only edx is
	# defined on entry.  Writing eax zero-extends into rax, which keeps
	# stale upper bits of rdx out of the loop counter.
	mov	%edx, %eax

	# zero accumulators

	pxor	%mm4, %mm4		# mm4 = 0 0
	pxor	%mm5, %mm5		# mm5 = 0 0
	pxor	%mm6, %mm6		# mm6 = 0 0
	pxor	%mm7, %mm7		# mm7 = 0 0

	shr	$1, %rax		# rax = n_4_float_blocks / 2
	movq	0(%rsi), %mm0		# preload first tap pair
	movq	8(%rsi), %mm1		# preload second tap pair
	pxor	%mm2, %mm2		# no pending products for mm6 yet
	pxor	%mm3, %mm3		# no pending products for mm7 yet
	jmp	.L1_test

	#
	# Main loop: eight taps per iteration.
	# Each multiply is paired with the add of a product computed
	# earlier and with the load of a tap pair needed later, so that
	# consecutive instructions do not depend on each other.
	#

	.p2align 4
.Loop1:
	pfmul	0(%rdi), %mm0		# mm0 = taps[0..1] * input[0..1]
	pfadd	%mm2, %mm6		# fold products left over from last pass
	movq	16(%rsi), %mm2		# mm2 = taps[4..5]

	pfmul	8(%rdi), %mm1		# mm1 = taps[2..3] * input[2..3]
	pfadd	%mm3, %mm7		# fold products left over from last pass
	movq	24(%rsi), %mm3		# mm3 = taps[6..7]

	pfmul	16(%rdi), %mm2		# mm2 = taps[4..5] * input[4..5]
	pfadd	%mm0, %mm4
	movq	32(%rsi), %mm0		# look ahead: first tap pair of next group

	pfmul	24(%rdi), %mm3		# mm3 = taps[6..7] * input[6..7]
	pfadd	%mm1, %mm5
	movq	40(%rsi), %mm1		# look ahead: second tap pair of next group

	add	$32, %rdi		# advance both pointers by 8 floats
	add	$32, %rsi
.L1_test:
	dec	%rax
	jge	.Loop1			# loop while pairs of blocks remain

	# The bulk of the multiplies is done.  Fold in the two products
	# still pending in mm2/mm3 and check whether the original
	# n_4_float_blocks was odd; in that case 4 more taps remain.
	# The pfadd between the and/je pair leaves the flags untouched.

	pfadd	%mm2, %mm6
	and	$1, %edx		# ZF = 1 when the block count was even
	pfadd	%mm3, %mm7
	je	.Leven

	# The count was odd, do 4 more taps.
	# mm0 and mm1 already hold these taps: they were loaded either
	# by the look-ahead in the main loop or by the initial preload.

	pfmul	0(%rdi), %mm0
	pfadd	%mm0, %mm4
	pfmul	8(%rdi), %mm1
	pfadd	%mm1, %mm5

.Leven:
	# at this point mm4, mm5, mm6 and mm7 contain partial sums

	pfadd	%mm7, %mm6
	pfadd	%mm5, %mm4
	pfadd	%mm6, %mm4
	pfacc	%mm4, %mm4		# low float = low + high half of mm4

	# Move the result from the MMX register file to xmm0 via a
	# scratch slot just below the stack pointer.
	movd	%mm4, -8(%rsp)
	movss	-8(%rsp), %xmm0
	femms				# leave MMX/3DNow! state before returning

	retq

FUNC_TAIL(float_dotprod_3dnow)
145
	.ident	"Hand coded x86_64 3DNow! assembly"
146
147
#if defined(__linux__) && defined(__ELF__)
148
.section .note.GNU-stack,"",%progbits
149
#endif