Statistics
| Branch: | Tag: | Revision:

root / gnuradio-core / src / lib / filter / 3dnow_float_dotprod_really_simple.S @ 100e6105

History | View | Annotate | Download (2.2 kB)

1 5d69a524 jcorgan
#
2 5d69a524 jcorgan
# Copyright 2002 Free Software Foundation, Inc.
3 5d69a524 jcorgan
# 
4 5d69a524 jcorgan
# This file is part of GNU Radio
5 5d69a524 jcorgan
# 
6 5d69a524 jcorgan
# GNU Radio is free software; you can redistribute it and/or modify
7 5d69a524 jcorgan
# it under the terms of the GNU General Public License as published by
8 937b719d eb
# the Free Software Foundation; either version 3, or (at your option)
9 5d69a524 jcorgan
# any later version.
10 5d69a524 jcorgan
# 
11 5d69a524 jcorgan
# GNU Radio is distributed in the hope that it will be useful,
12 5d69a524 jcorgan
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13 5d69a524 jcorgan
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 5d69a524 jcorgan
# GNU General Public License for more details.
15 5d69a524 jcorgan
# 
16 5d69a524 jcorgan
# You should have received a copy of the GNU General Public License
17 5d69a524 jcorgan
# along with GNU Radio; see the file COPYING.  If not, write to
18 86f5c924 eb
# the Free Software Foundation, Inc., 51 Franklin Street,
19 86f5c924 eb
# Boston, MA 02110-1301, USA.
20 5d69a524 jcorgan
# 
21 5d69a524 jcorgan
22 5d69a524 jcorgan
23 5d69a524 jcorgan
# input and taps are guaranteed to be 16 byte aligned.
24 5d69a524 jcorgan
# n_4_float_blocks is != 0
25 5d69a524 jcorgan
#	
26 5d69a524 jcorgan
#
27 5d69a524 jcorgan
#  float 
28 5d69a524 jcorgan
#  sse_float_dotprod (const float *input,
29 5d69a524 jcorgan
#                  const float *taps, unsigned n_4_float_blocks)
30 5d69a524 jcorgan
#  {
31 5d69a524 jcorgan
#    float sum0 = 0;
32 5d69a524 jcorgan
#    float sum1 = 0;
33 5d69a524 jcorgan
#    float sum2 = 0;
34 5d69a524 jcorgan
#    float sum3 = 0;
35 5d69a524 jcorgan
#  
36 5d69a524 jcorgan
#    do {
37 5d69a524 jcorgan
#  
38 5d69a524 jcorgan
#      sum0 += input[0] * taps[0];
39 5d69a524 jcorgan
#      sum1 += input[1] * taps[1];
40 5d69a524 jcorgan
#      sum2 += input[2] * taps[2];
41 5d69a524 jcorgan
#      sum3 += input[3] * taps[3];
42 5d69a524 jcorgan
#  
43 5d69a524 jcorgan
#      input += 4;
44 5d69a524 jcorgan
#      taps += 4;
45 5d69a524 jcorgan
#  
46 5d69a524 jcorgan
#    } while (--n_4_float_blocks != 0);
47 5d69a524 jcorgan
#  
48 5d69a524 jcorgan
#  
49 5d69a524 jcorgan
#    return sum0 + sum1 + sum2 + sum3;
50 5d69a524 jcorgan
#  }
51 5d69a524 jcorgan
#  		
52 5d69a524 jcorgan
53 5d69a524 jcorgan
54 5d69a524 jcorgan
	.file	"3dnow_float_dotprod_really_simple.s"
55 5d69a524 jcorgan
	.version	"01.01"
56 5d69a524 jcorgan
.text
57 5d69a524 jcorgan
	.p2align 4
.globl sse_float_dotprod
	.type	 sse_float_dotprod,@function

# float sse_float_dotprod (const float *input, const float *taps,
#                          unsigned n_4_float_blocks)
#
# Dot product of n_4_float_blocks * 4 floats using AMD 3DNow! instructions.
#
# ABI:      x86 cdecl, arguments on the stack, float result in st(0).
# Args:     8(%ebp)  = input
#           12(%ebp) = taps
#           16(%ebp) = n_4_float_blocks
# Returns:  st(0) = sum of input[i] * taps[i] over all blocks,
#           0.0 when n_4_float_blocks is 0.
# Clobbers: eax, ecx, edx, mm0, mm4, flags.  The 16(%ebp) argument slot
#           is overwritten, it is reused as scratch for the result.
sse_float_dotprod:
	pushl	%ebp
	movl	%esp, %ebp
	movl	8(%ebp), %edx		# edx = input
	movl	12(%ebp), %eax		# eax = taps
	movl	16(%ebp), %ecx		# ecx = n_4_float_blocks

	# Deliberately simple: get the correct answer first, unrolling and
	# scheduling of the inner loop can come later.

	pxor	%mm4, %mm4		# mm4 = 0 0 (two running partial sums)
	shll	$1, %ecx		# one pass handles 2 floats: passes = count * 2
	jz	.Ldone1			# zero passes: skip the loop, result stays 0.0

	.p2align 4
.Lloop1:
	movq	(%eax), %mm0		# mm0 = next two taps
	pfmul	(%edx), %mm0		# multiply by the matching two inputs
	pfadd	%mm0, %mm4		# accumulate both lanes
	addl	$8, %edx		# advance both pointers by two floats
	addl	$8, %eax
	decl	%ecx
	jne	.Lloop1

.Ldone1:
	# mm4 contains two partial sums, fold them into the low lane
	pfacc	%mm4, %mm4
	movd	%mm4, 16(%ebp)		# spill through the dead argument slot
	femms				# leave MMX state before touching x87
	flds	16(%ebp)		# return value in st(0)

	popl	%ebp
	ret
.Lfe1:
	.size	 sse_float_dotprod,.Lfe1-sse_float_dotprod
95 5d69a524 jcorgan
	.ident	"Hand coded x86 3DNow! assembly"
96 0d4c6442 eb
97 0d4c6442 eb
#if defined(__linux__) && defined(__ELF__)
98 0d4c6442 eb
.section .note.GNU-stack,"",%progbits
99 0d4c6442 eb
#endif