filter: working aligned taps loads for regular and decimating filter.

Has working QA code but needs significant performance improvements.
author: Tom Rondeau <trondeau@vt.edu> 2012-06-10 19:35:55 -0400
committer: Tom Rondeau <trondeau@vt.edu> 2012-06-10 22:35:59 -0400
commit: 227448e247ba720b87b99e1b9382cd3737241ab3 (patch)
tree: c466e391fb6391b9020457486f852f6f86c6fd20 /gr-filter
parent: 14532d8da0f40f2b58595bd7e217004bdbfc90e3 (diff)
4 files changed, 65 insertions, 17 deletions
diff --git a/gr-filter/include/filter/fir_filter.h b/gr-filter/include/filter/fir_filter.h
index fba4641bf3..1eb70f92a4 100644
--- a/gr-filter/include/filter/fir_filter.h
+++ b/gr-filter/include/filter/fir_filter.h
@@ -41,8 +41,8 @@ namespace gr {
 	void set_taps(const std::vector<float> &taps);
 	std::vector<float> taps() const;
 	unsigned int ntaps() const;
-
-	float filter(const float input[]);
+	
+	inline float filter(const float input[]);
 	void  filterN(float output[],
 		      const float input[],
 		      unsigned long n);
@@ -53,8 +53,10 @@ namespace gr {
 
       private:
 	unsigned int d_ntaps;
-	float *d_taps;
-	float *d_aligned_taps[4];
+	float  *d_taps;
+	float **d_aligned_taps;
+	int     d_offset;
+	float  *d_output;
       };
 
       /**************************************************************/
@@ -100,7 +102,7 @@ namespace gr {
 	gr_complex filter(const gr_complex input[]);
 	void filterN(gr_complex output[],
 		     const gr_complex input[],
-		     unsigned long n);
+		     unsigned long n);
 	void filterNdec(gr_complex output[],
 			const gr_complex input[],
 			unsigned long n,
diff --git a/gr-filter/lib/fir_filter.cc b/gr-filter/lib/fir_filter.cc
index 3abcda53e9..098dd8d367 100644
--- a/gr-filter/lib/fir_filter.cc
+++ b/gr-filter/lib/fir_filter.cc
@@ -23,6 +23,7 @@
 #include <filter/fir_filter.h>
 #include <fft/fft.h>
 #include <volk/volk.h>
+#include <cstdio>
 
 namespace gr {
   namespace filter {
@@ -33,15 +34,29 @@ namespace gr {
       {
 	d_taps = NULL;
 	set_taps(taps);
+	d_offset = 0;
+
+	// Make sure the output sample is always aligned, too.
+	d_output = fft::malloc_float(1);
       }
       
       fir_filter_fff::~fir_filter_fff()
       {
+	// Free taps
 	if(d_taps != NULL) {
 	  fft::free(d_taps);
 	  d_taps = NULL;
 	}
-    }
+
+	// Free all aligned taps
+	for(int i = 0; i < 4; i++) {
+	  fft::free(d_aligned_taps[i]);
+	}
+	fft::free(d_aligned_taps);
+
+	// Free output sample
+	fft::free(d_output);
+      }
       
       void
       fir_filter_fff::set_taps(const std::vector<float> &taps)
@@ -50,6 +65,11 @@ namespace gr {
 	if(d_taps != NULL) {
 	  fft::free(d_taps);
 	  d_taps = NULL;
+
+	  for(int i = 0; i < 4; i++) {
+	    fft::free(d_aligned_taps[i]);
+	  }
+	  fft::free(d_aligned_taps);
 	}
 	
 	d_ntaps = (int)taps.size();
@@ -58,6 +78,8 @@ namespace gr {
 	  d_taps[d_ntaps-i-1] = taps[i];
 	}
 
+	// Make a set of taps at all possible arch alignments
+	d_aligned_taps = (float**)malloc(4*sizeof(float**));
 	for(int i = 0; i < 4; i++) {
 	  d_aligned_taps[i] = fft::malloc_float(d_ntaps+3);
 	  memset(d_aligned_taps[i], 0, sizeof(float)*(d_ntaps+3));
@@ -83,12 +105,9 @@ namespace gr {
       float
       fir_filter_fff::filter(const float input[])
       {
-	float output;
-
-	//const float *ar = (float*)((unsigned long)input & ~15);
-
-	volk_32f_x2_dot_prod_32f_u(&output, input, d_taps, d_ntaps);
-	return output;
+	volk_32f_x2_dot_prod_32f_a(d_output, input,
+				   d_aligned_taps[d_offset], d_ntaps+3);
+	return *d_output;
       }
       
       void
@@ -96,8 +115,16 @@ namespace gr {
 			      const float input[],
 			      unsigned long n)
       {
-	for(unsigned long i = 0; i < n; i++)
-	  output[i] = filter(&input[i]);
+	unsigned long ar = ((unsigned long) input);
+	int off = (ar - (ar & ~15))/4;
+
+	int j = -off;
+	d_offset = off;
+	for(unsigned long i = 0; i < n; i++) {
+	  output[i] = filter(&input[j]);
+	  d_offset= (d_offset+1) & 0x03;
+	  j += (d_offset == 0 ? 4 : 0);
+	}
       }
       
       
@@ -108,8 +135,8 @@ namespace gr {
 				 unsigned int decimate)
       {
 	unsigned long j = 0;
-	for(unsigned long i = 0; i < n; i++){
-	  output[i] = filter(&input[j]);
+	for(unsigned long i = 0; i < n; i++) {
+	  filterN(&output[i], &input[j], 1);
 	  j += decimate;
 	}
       }
diff --git a/gr-filter/lib/fir_filter_XXX_impl.cc.t b/gr-filter/lib/fir_filter_XXX_impl.cc.t
index c3637042d7..18bec38be3 100644
--- a/gr-filter/lib/fir_filter_XXX_impl.cc.t
+++ b/gr-filter/lib/fir_filter_XXX_impl.cc.t
@@ -26,6 +26,7 @@
 
 #include "@IMPL_NAME@.h"
 #include <gr_io_signature.h>
+#include <volk/volk.h>
 
 namespace gr {
   namespace filter {
@@ -47,6 +48,10 @@ namespace gr {
       d_fir = new kernel::@BASE_NAME@(decimation, taps);
       d_updated = false;
       set_history(d_fir->ntaps());
+
+      const int alignment_multiple =
+	volk_get_alignment() / sizeof(float);
+      set_alignment(std::max(1, alignment_multiple));
     }
 
     @IMPL_NAME@::~@IMPL_NAME@()
@@ -85,7 +90,8 @@ namespace gr {
 	d_fir->filterN(out, in, noutput_items);
       }
       else {
-	d_fir->filterNdec(out, in, noutput_items, decimation());
+	d_fir->filterNdec(out, in, noutput_items,
+			  decimation());
       }
       
       return noutput_items;
diff --git a/gr-filter/python/qa_fir_filter.py b/gr-filter/python/qa_fir_filter.py
index 38bfd9ea51..93974bb89a 100755
--- a/gr-filter/python/qa_fir_filter.py
+++ b/gr-filter/python/qa_fir_filter.py
@@ -41,6 +41,19 @@ class test_filter(gr_unittest.TestCase):
         result_data = dst.data()
         self.assertFloatTuplesAlmostEqual(expected_data, result_data, 5)
 
+    def test_fir_filter_fff_002(self):
+        src_data = 10*[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+        expected_data = [0.0,] + 4*[3.5, 7.5, 1.5, 5.5, 4.5,] + [3.5, 7.5, 1.5, 5.5]
+        src = gr.vector_source_f(src_data)
+        op  = filter.fir_filter_fff(4, [0.5, 0.5])
+        dst = gr.vector_sink_f()
+        self.tb.connect(src, op, dst)
+        self.tb.run()
+        result_data = dst.data()
+        print result_data
+        print expected_data
+        self.assertFloatTuplesAlmostEqual(expected_data, result_data, 5)
+
     def test_fir_filter_ccf_001(self):
         src_data = [1+1j, 2+2j, 3+3j, 4+4j]
         expected_data = [0.5+0.5j, 1.5+1.5j, 2.5+2.5j, 3.5+3.5j]
author	Tom Rondeau <trondeau@vt.edu>	2012-06-10 19:35:55 -0400
committer	Tom Rondeau <trondeau@vt.edu>	2012-06-10 22:35:59 -0400
commit	227448e247ba720b87b99e1b9382cd3737241ab3 (patch)
tree	c466e391fb6391b9020457486f852f6f86c6fd20 /gr-filter
parent	14532d8da0f40f2b58595bd7e217004bdbfc90e3 (diff)