diff options
author | Tom Rondeau <trondeau@vt.edu> | 2012-06-13 14:53:41 -0400 |
---|---|---|
committer | Tom Rondeau <trondeau@vt.edu> | 2012-06-13 14:53:41 -0400 |
commit | 9e2e896e9d4dbc4627702cde82a48e9ee5136f26 (patch) | |
tree | fb62fd58ec90f34e080d1dc2f52b2200b57fa0ed /gr-filter | |
parent | 46bd64dc8231402535bad742a74250599e595148 (diff) |
filter: process 4 vectors each time in volk dot_prod to speed up fir filters.
This makes the volk version of the SSE FIR filter the same speed as using the hand-crafted float_dotprod from before.
Diffstat (limited to 'gr-filter')
-rw-r--r-- | gr-filter/include/filter/fir_filter.h | 2 | ||||
-rw-r--r-- | gr-filter/lib/fir_filter.cc | 38 |
2 files changed, 36 insertions, 4 deletions
```diff
diff --git a/gr-filter/include/filter/fir_filter.h b/gr-filter/include/filter/fir_filter.h
index 1eb70f92a4..e6194df7b5 100644
--- a/gr-filter/include/filter/fir_filter.h
+++ b/gr-filter/include/filter/fir_filter.h
@@ -102,7 +102,7 @@ namespace gr {
         gr_complex filter(const gr_complex input[]);
         void filterN(gr_complex output[],
                      const gr_complex input[],
-                     unsigned long n)
+                     unsigned long n);
         void filterNdec(gr_complex output[],
                         const gr_complex input[],
                         unsigned long n,
diff --git a/gr-filter/lib/fir_filter.cc b/gr-filter/lib/fir_filter.cc
index 098dd8d367..c6e179246d 100644
--- a/gr-filter/lib/fir_filter.cc
+++ b/gr-filter/lib/fir_filter.cc
@@ -24,6 +24,7 @@
 #include <fft/fft.h>
 #include <volk/volk.h>
 #include <cstdio>
+#include <float_dotprod_x86.h>
 
 namespace gr {
   namespace filter {
@@ -101,12 +102,16 @@ namespace gr {
     {
       return d_ntaps;
     }
-
+
+    /*
     float
     fir_filter_fff::filter(const float input[])
     {
       volk_32f_x2_dot_prod_32f_a(d_output, input,
-                                 d_aligned_taps[d_offset], d_ntaps+3);
+                                 d_aligned_taps[d_offset],
+                                 (d_ntaps + d_offset - 1) / 4 + 1);
+      //*d_output = float_dotprod_sse(input, d_aligned_taps[d_offset],
+      //                              (d_ntaps + d_offset - 1) / 4 + 1);
       return *d_output;
     }
 
@@ -126,7 +131,34 @@ namespace gr {
         j += (d_offset == 0 ? 4 : 0);
       }
     }
+    */
+
+    float
+    fir_filter_fff::filter(const float input[])
+    {
+      //unsigned long ar = ((unsigned long) input);
+      //int off = (ar - (ar & ~15))/4;
+
+      const float *ar = (float *)((unsigned long) input & ~15);
+      unsigned al = input - ar;
+
+      volk_32f_x2_dot_prod_32f_a(d_output, ar,
+                                 d_aligned_taps[al],
+                                 (d_ntaps + al - 1) / 4 + 1);
+      //*d_output = float_dotprod_sse(input, d_aligned_taps[d_offset],
+      //                              (d_ntaps + d_offset - 1) / 4 + 1);
+      return *d_output;
+    }
 
+    void
+    fir_filter_fff::filterN(float output[],
+                            const float input[],
+                            unsigned long n)
+    {
+      for(unsigned long i = 0; i < n; i++) {
+        output[i] = filter(&input[i]);
+      }
+    }
 
     void
     fir_filter_fff::filterNdec(float output[],
@@ -136,7 +168,7 @@ namespace gr {
     {
       unsigned long j = 0;
       for(unsigned long i = 0; i < n; i++) {
-        filterN(&output[i], &input[j], 1);
+        output[i] = filter(&input[j]);
         j += decimate;
       }
     }
```