Statistics
| Branch: | Tag: | Revision:

root / gnuradio-core / src / lib / filter / float_dotprod_3dnow.S @ a3b19015

History | View | Annotate | Download (3.1 kB)

1 5d69a524 jcorgan
#
2 5d69a524 jcorgan
# Copyright 2002 Free Software Foundation, Inc.
3 5d69a524 jcorgan
# 
4 5d69a524 jcorgan
# This file is part of GNU Radio
5 5d69a524 jcorgan
# 
6 5d69a524 jcorgan
# GNU Radio is free software; you can redistribute it and/or modify
7 5d69a524 jcorgan
# it under the terms of the GNU General Public License as published by
8 937b719d eb
# the Free Software Foundation; either version 3, or (at your option)
9 5d69a524 jcorgan
# any later version.
10 5d69a524 jcorgan
# 
11 5d69a524 jcorgan
# GNU Radio is distributed in the hope that it will be useful,
12 5d69a524 jcorgan
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13 5d69a524 jcorgan
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 5d69a524 jcorgan
# GNU General Public License for more details.
15 5d69a524 jcorgan
# 
16 5d69a524 jcorgan
# You should have received a copy of the GNU General Public License
17 5d69a524 jcorgan
# along with GNU Radio; see the file COPYING.  If not, write to
18 86f5c924 eb
# the Free Software Foundation, Inc., 51 Franklin Street,
19 86f5c924 eb
# Boston, MA 02110-1301, USA.
20 5d69a524 jcorgan
# 
21 5d69a524 jcorgan
22 5d69a524 jcorgan
23 5d69a524 jcorgan
# input and taps are guaranteed to be 16 byte aligned.
24 5d69a524 jcorgan
# n_4_float_blocks is != 0
25 5d69a524 jcorgan
#	
26 5d69a524 jcorgan
#
27 5d69a524 jcorgan
#  float 
28 5d69a524 jcorgan
#  float_dotprod_generic (const float *input,
29 5d69a524 jcorgan
#                         const float *taps, unsigned n_4_float_blocks)
30 5d69a524 jcorgan
#  {
31 5d69a524 jcorgan
#    float sum0 = 0;
32 5d69a524 jcorgan
#    float sum1 = 0;
33 5d69a524 jcorgan
#    float sum2 = 0;
34 5d69a524 jcorgan
#    float sum3 = 0;
35 5d69a524 jcorgan
#  
36 5d69a524 jcorgan
#    do {
37 5d69a524 jcorgan
#  
38 5d69a524 jcorgan
#      sum0 += input[0] * taps[0];
39 5d69a524 jcorgan
#      sum1 += input[1] * taps[1];
40 5d69a524 jcorgan
#      sum2 += input[2] * taps[2];
41 5d69a524 jcorgan
#      sum3 += input[3] * taps[3];
42 5d69a524 jcorgan
#  
43 5d69a524 jcorgan
#      input += 4;
44 5d69a524 jcorgan
#      taps += 4;
45 5d69a524 jcorgan
#  
46 5d69a524 jcorgan
#    } while (--n_4_float_blocks != 0);
47 5d69a524 jcorgan
#  
48 5d69a524 jcorgan
#  
49 5d69a524 jcorgan
#    return sum0 + sum1 + sum2 + sum3;
50 5d69a524 jcorgan
#  }
51 5d69a524 jcorgan
#  		
52 5d69a524 jcorgan
53 5d69a524 jcorgan
#include "assembly.h"
54 5d69a524 jcorgan
55 5d69a524 jcorgan
56 5d69a524 jcorgan
	.file	"float_dotprod_3dnow.S"
57 5d69a524 jcorgan
	.version	"01.01"
58 5d69a524 jcorgan
.text
59 5d69a524 jcorgan
	.p2align 4		# align the function entry that follows to 16 bytes
60 5d69a524 jcorgan
# float_dotprod_3dnow: dot product of two float vectors using 3DNow!.
#
# Arguments are read from the stack relative to %ebp:
#   %edx = 8(%ebp)   input pointer (going by the reference C prototype
#                    in the file header)
#   %eax = 12(%ebp)  taps pointer (the loop preloads from this one)
#   %ecx = 16(%ebp)  n_4_float_blocks, the count of 4-float blocks
# Result: the float sum is left in st(0) by the final flds.
# Touches %eax, %ecx, %edx, %mm0-%mm7 and the flags; the 16(%ebp)
# argument slot is overwritten as scratch for the result.
.globl GLOB_SYMB(float_dotprod_3dnow)
61 5d69a524 jcorgan
	DEF_FUNC_HEAD(float_dotprod_3dnow)
62 5d69a524 jcorgan
GLOB_SYMB(float_dotprod_3dnow):
63 5d69a524 jcorgan
	pushl	%ebp			# save the caller frame pointer
64 5d69a524 jcorgan
	movl	%esp, %ebp		# frame pointer used to reach the arguments
65 5d69a524 jcorgan
	movl	8(%ebp), %edx		# edx = input
66 5d69a524 jcorgan
	movl	12(%ebp), %eax		# eax = taps
67 5d69a524 jcorgan
	movl	16(%ebp), %ecx		# ecx = n_4_float_blocks
68 5d69a524 jcorgan
69 5d69a524 jcorgan
	# zero accumulators (each holds two packed floats)
70 5d69a524 jcorgan
	
71 5d69a524 jcorgan
	pxor	%mm4, %mm4		# mm4 = 0 0
72 5d69a524 jcorgan
	pxor	%mm5, %mm5		# mm5 = 0 0 
73 5d69a524 jcorgan
	pxor	%mm6, %mm6		# mm6 = 0 0 
74 5d69a524 jcorgan
	pxor	%mm7, %mm7		# mm7 = 0 0
75 5d69a524 jcorgan
76 5d69a524 jcorgan
	shrl	$1, %ecx		# ecx = n_4_float_blocks / 2 = passes of the 8-tap loop
77 5d69a524 jcorgan
	movq	0(%eax), %mm0		# preload taps[0..1]
78 5d69a524 jcorgan
	movq	8(%eax), %mm1		# preload taps[2..3]
79 5d69a524 jcorgan
	pxor	%mm2, %mm2		# nothing pending for mm6 yet
80 5d69a524 jcorgan
	pxor	%mm3, %mm3		# nothing pending for mm7 yet
81 5d69a524 jcorgan
	jmp	.L1_test		# enter at the test; the loop may run zero times
82 5d69a524 jcorgan
83 5d69a524 jcorgan
	#
84 5d69a524 jcorgan
	# 8 taps / loop
85 5d69a524 jcorgan
	# something like 6 cycles / loop
86 5d69a524 jcorgan
	# products left in mm2/mm3 are accumulated one pass late
87 5d69a524 jcorgan
	
88 5d69a524 jcorgan
	.p2align 4		# align the loop entry to 16 bytes
89 e1e14bc5 eb
.Loop1:	
90 5d69a524 jcorgan
	pfmul	0(%edx), %mm0		# mm0 = taps[0..1] * input[0..1]
91 5d69a524 jcorgan
	pfadd	%mm2, %mm6		# mm6 += products pending from the previous pass
92 5d69a524 jcorgan
	movq	16(%eax), %mm2		# mm2 = taps[4..5]
93 5d69a524 jcorgan
	
94 5d69a524 jcorgan
	pfmul	8(%edx), %mm1		# mm1 = taps[2..3] * input[2..3]
95 5d69a524 jcorgan
	pfadd	%mm3, %mm7		# mm7 += products pending from the previous pass
96 5d69a524 jcorgan
	movq	24(%eax), %mm3		# mm3 = taps[6..7]
97 5d69a524 jcorgan
98 5d69a524 jcorgan
	pfmul	16(%edx), %mm2		# mm2 = taps[4..5] * input[4..5], added next pass
99 5d69a524 jcorgan
	pfadd	%mm0, %mm4		# mm4 += mm0
100 5d69a524 jcorgan
	movq	32(%eax), %mm0		# preload taps[8..9] for the next pass
101 5d69a524 jcorgan
102 5d69a524 jcorgan
	pfmul	24(%edx), %mm3		# mm3 = taps[6..7] * input[6..7], added next pass
103 5d69a524 jcorgan
	pfadd	%mm1, %mm5		# mm5 += mm1
104 5d69a524 jcorgan
	movq	40(%eax), %mm1		# preload taps[10..11] for the next pass
	# NOTE(review): the two preloads above read 16 bytes beyond the
	# taps consumed so far; with an even block count the final pass
	# reads past the last tap that is used. Confirm the taps buffer
	# is padded accordingly.
105 5d69a524 jcorgan
106 5d69a524 jcorgan
	addl	$32, %edx		# advance input by 8 floats
107 5d69a524 jcorgan
	addl	$32, %eax		# advance taps by 8 floats
108 5d69a524 jcorgan
.L1_test:
109 5d69a524 jcorgan
	decl	%ecx			# one fewer pass remaining
110 e1e14bc5 eb
	jge	.Loop1			# loop while the decremented count is >= 0 (signed)
111 5d69a524 jcorgan
112 5d69a524 jcorgan
	# The bulk of the multiplies has been handled up to here.
113 5d69a524 jcorgan
	# Now accumulate the final two additions and see if original
114 5d69a524 jcorgan
	# n_4_float_blocks was odd.  If so, there are 4 more
115 5d69a524 jcorgan
	# taps to do.
116 5d69a524 jcorgan
	
117 5d69a524 jcorgan
	movl	16(%ebp), %ecx		# reload the original n_4_float_blocks
118 5d69a524 jcorgan
	pfadd	%mm2, %mm6		# fold in the last pending products
119 5d69a524 jcorgan
	andl	$1, %ecx		# ZF is set when the count was even
120 5d69a524 jcorgan
	pfadd	%mm3, %mm7		# fold in the last pending products
121 5d69a524 jcorgan
	je	.Leven			# uses ZF from the andl; the pfadd between does not write flags
122 5d69a524 jcorgan
	
123 5d69a524 jcorgan
	# The count was odd, do 4 more taps.
124 5d69a524 jcorgan
	# Note that mm0 and mm1 are already preloaded with these taps,
125 5d69a524 jcorgan
	# either by the main loop or by the loads before it.
126 5d69a524 jcorgan
	
127 5d69a524 jcorgan
	pfmul	0(%edx), %mm0		# first two floats of the last block
128 5d69a524 jcorgan
	pfadd	%mm0, %mm4
129 5d69a524 jcorgan
	pfmul	8(%edx), %mm1		# last two floats of the last block
130 5d69a524 jcorgan
	pfadd	%mm1, %mm5
131 5d69a524 jcorgan
132 5d69a524 jcorgan
.Leven:			
133 5d69a524 jcorgan
	# at this point mm4, mm5, mm6 and mm7 contain partial sums
134 5d69a524 jcorgan
	
135 5d69a524 jcorgan
	pfadd	%mm7, %mm6
136 5d69a524 jcorgan
	pfadd	%mm5, %mm4
137 5d69a524 jcorgan
	pfadd	%mm6, %mm4		# mm4 = sum of all four accumulators
138 5d69a524 jcorgan
	pfacc	%mm4, %mm4		# horizontal add: low float = low + high
139 5d69a524 jcorgan
	
140 5d69a524 jcorgan
	movd	%mm4, 16(%ebp)		# spill the low float into the third argument slot
141 5d69a524 jcorgan
	femms				# leave MMX/3DNow! state before using x87
142 5d69a524 jcorgan
	flds	16(%ebp)		# push the result onto the x87 stack as the return value
143 5d69a524 jcorgan
144 5d69a524 jcorgan
	popl	%ebp			# restore the caller frame pointer
145 5d69a524 jcorgan
	ret
146 5d69a524 jcorgan
147 5d69a524 jcorgan
FUNC_TAIL(float_dotprod_3dnow)
148 5d69a524 jcorgan
	.ident	"Hand coded x86 3DNow! assembly"
149 0d4c6442 eb
150 0d4c6442 eb
#if defined(__linux__) && defined(__ELF__)
151 0d4c6442 eb
# Empty .note.GNU-stack section: by GNU toolchain convention this marks
# the object as not requiring an executable stack.
.section .note.GNU-stack,"",%progbits
152 0d4c6442 eb
#endif