filter: process 4 vectors each time in volk dot_prod to speed up fir filters.

This makes the volk version of the SSE FIR filter the same speed as using the hand-crafted float_dotprod from before.
author: Tom Rondeau <trondeau@vt.edu> 2012-06-13 14:53:41 -0400
committer: Tom Rondeau <trondeau@vt.edu> 2012-06-13 14:53:41 -0400
commit: 9e2e896e9d4dbc4627702cde82a48e9ee5136f26 (patch)
tree: fb62fd58ec90f34e080d1dc2f52b2200b57fa0ed /gr-filter
parent: 46bd64dc8231402535bad742a74250599e595148 (diff)
2 files changed, 36 insertions, 4 deletions
diff --git a/gr-filter/include/filter/fir_filter.h b/gr-filter/include/filter/fir_filter.h
index 1eb70f92a4..e6194df7b5 100644
--- a/gr-filter/include/filter/fir_filter.h
+++ b/gr-filter/include/filter/fir_filter.h
@@ -102,7 +102,7 @@ namespace gr {
 	gr_complex filter(const gr_complex input[]);
 	void filterN(gr_complex output[],
 		     const gr_complex input[],
-		     unsigned long n)
+		     unsigned long n);
 	void filterNdec(gr_complex output[],
 			const gr_complex input[],
 			unsigned long n,
diff --git a/gr-filter/lib/fir_filter.cc b/gr-filter/lib/fir_filter.cc
index 098dd8d367..c6e179246d 100644
--- a/gr-filter/lib/fir_filter.cc
+++ b/gr-filter/lib/fir_filter.cc
@@ -24,6 +24,7 @@
 #include <fft/fft.h>
 #include <volk/volk.h>
 #include <cstdio>
+#include <float_dotprod_x86.h>
 
 namespace gr {
   namespace filter {
@@ -101,12 +102,16 @@ namespace gr {
       {
 	return d_ntaps;
       }
-      
+
+      /*      
       float
       fir_filter_fff::filter(const float input[])
       {
 	volk_32f_x2_dot_prod_32f_a(d_output, input,
-				   d_aligned_taps[d_offset], d_ntaps+3);
+				   d_aligned_taps[d_offset],
+				   (d_ntaps + d_offset - 1) / 4 + 1);
+	//*d_output = float_dotprod_sse(input, d_aligned_taps[d_offset],
+	//			      (d_ntaps + d_offset - 1) / 4 + 1);
 	return *d_output;
       }
       
@@ -126,7 +131,34 @@ namespace gr {
 	  j += (d_offset == 0 ? 4 : 0);
 	}
       }
+      */
+
+      float
+      fir_filter_fff::filter(const float input[])
+      {
+	// Returns one output sample: the volk dot product of the taps with the
+	// samples at input, started from the 16-byte boundary at or below input.
+	// The cast goes through std::size_t: unsigned long is 32-bit on LLP64.
+	const float *ar = (const float *)((std::size_t) input & ~(std::size_t) 15);
+	unsigned al = input - ar;  // floats between ar and input; picks the tap set
+
+	volk_32f_x2_dot_prod_32f_a(d_output, ar,
+				   d_aligned_taps[al],
+				   (d_ntaps + al - 1) / 4 + 1);  // ceil((d_ntaps + al) / 4) for a non-empty tap set; NOTE(review): confirm the kernel expects 4-float blocks, not points
+	//*d_output = float_dotprod_sse(input, d_aligned_taps[d_offset],
+	//			      (d_ntaps + d_offset - 1) / 4 + 1);
+	return *d_output;
+      }
       
+      void
+      fir_filter_fff::filterN(float output[],
+			      const float input[],
+			      unsigned long n)
+      {
+	// Produce n outputs; output[k] is filter() applied at input + k.
+	for(unsigned long k = 0; k != n; ++k)
+	  output[k] = filter(input + k);
+      }
       
       void
       fir_filter_fff::filterNdec(float output[],
@@ -136,7 +168,7 @@ namespace gr {
       {
 	unsigned long j = 0;
 	for(unsigned long i = 0; i < n; i++) {
-	  filterN(&output[i], &input[j], 1);
+	  output[i] = filter(&input[j]);
 	  j += decimate;
 	}
       }
author	Tom Rondeau <trondeau@vt.edu>	2012-06-13 14:53:41 -0400
committer	Tom Rondeau <trondeau@vt.edu>	2012-06-13 14:53:41 -0400
commit	9e2e896e9d4dbc4627702cde82a48e9ee5136f26 (patch)
tree	fb62fd58ec90f34e080d1dc2f52b2200b57fa0ed /gr-filter
parent	46bd64dc8231402535bad742a74250599e595148 (diff)