diff options
author | Thomas Habets <thomas@habets.se> | 2020-06-07 18:52:22 +0100 |
---|---|---|
committer | mormj <34754695+mormj@users.noreply.github.com> | 2020-06-15 08:21:35 -0400 |
commit | 518aa489498ec692dee51bb69936b620660ba113 (patch) | |
tree | 49ff128f6549ad7d91eb11020d85581f292f3f1b /gr-blocks/lib/moving_average_impl.cc | |
parent | ef825aad904b9a5ff99160d7f38181eb175137eb (diff) |
blocks/moving_average: use vector functions
This improves performance of moving average for me by about 4-5x.
My test is
[here](https://github.com/ThomasHabets/radiostuff/blob/master/amateur/listen_70cm.grc),
which processes 10MHz to find the strongest signal.
Without this PR I see `moving_average` in `top` taking about 89.4-94%
CPU. With this patch it's ~20.4-23.7%. Since CPU usage was close to
saturation without this patch, the measured 4-5x is likely a lower
bound — the true speedup may be even greater.
Test measured on a Purism Librem v2 with an Intel Core i7-6500U @ 2.5GHz.
The memory access pattern is probably worse with this patch, but at
least on my hardware on my workload this seems to be dwarfed by the
gain of using volk. It can be hard to reason about memory access
patterns, so benchmarks overrule theory.
The test only actually uses float averaging.
More benchmarking may be required.
Possible improvements:
* Add template specialization for `uint16` and `uint32`?
* Refactor the non-volk fallback to use only one loop, with a
(presumably) better memory access pattern
Diffstat (limited to 'gr-blocks/lib/moving_average_impl.cc')
-rw-r--r-- | gr-blocks/lib/moving_average_impl.cc | 114 |
1 files changed, 93 insertions, 21 deletions
diff --git a/gr-blocks/lib/moving_average_impl.cc b/gr-blocks/lib/moving_average_impl.cc index 03036e365b..7a566bac2e 100644 --- a/gr-blocks/lib/moving_average_impl.cc +++ b/gr-blocks/lib/moving_average_impl.cc @@ -15,10 +15,91 @@ #include "moving_average_impl.h" #include <gnuradio/io_signature.h> +#include <volk/volk.h> +#include <numeric> namespace gr { namespace blocks { +namespace { +template <typename T> +inline void volk_add(T* out, const T* add, unsigned int num) +{ + for (unsigned int elem = 0; elem < num; elem++) { + out[elem] += add[elem]; + } +} + +template <> +inline void volk_add<float>(float* out, const float* add, unsigned int num) +{ + volk_32f_x2_add_32f(out, out, add, num); +} + +template <> +inline void volk_add<gr_complex>(gr_complex* out, const gr_complex* add, unsigned int num) +{ + volk_32fc_x2_add_32fc(out, out, add, num); +} + +template <typename T> +inline void volk_sub(T* out, const T* sub, unsigned int num) +{ + for (unsigned int elem = 0; elem < num; elem++) { + out[elem] -= sub[elem]; + } +} + +template <> +inline void volk_sub<float>(float* out, const float* sub, unsigned int num) +{ + volk_32f_x2_subtract_32f(out, out, sub, num); +} + +// For gr_complex, do some checking that the types are as we expect before +// allowing the specialization. +// That long `enable_if` boils down to the word `void`. +template <> +inline typename std::enable_if<std::is_same<gr_complex::value_type, float>::value>::type +volk_sub<gr_complex>(gr_complex* out, const gr_complex* sub, unsigned int num) +{ + // std::complex is required to be implemented by adjacent and non-padded + // imaginary and real values, so it's safe to treat as such. + auto fout = reinterpret_cast<gr_complex::value_type*>(out); + auto fsub = reinterpret_cast<const gr_complex::value_type*>(sub); + + // But just to be sure, let's double-check that. 
+ static_assert(sizeof(gr_complex) == 2 * sizeof(gr_complex::value_type), + "Can't happen: sizeof(gr_complex) != 2*sizeof(gr_complex::value_type)"); + + volk_32f_x2_subtract_32f(fout, fout, fsub, 2 * num); +} + +template <typename T> +inline void volk_mul(T* out, const T* in, const T* scale, unsigned int num) +{ + for (unsigned int elem = 0; elem < num; elem++) { + out[elem] = in[elem] * scale[elem]; + } +} + +template <> +inline void +volk_mul<float>(float* out, const float* in, const float* mul, unsigned int num) +{ + volk_32f_x2_multiply_32f(out, in, mul, num); +} + +template <> +inline void volk_mul<gr_complex>(gr_complex* out, + const gr_complex* in, + const gr_complex* mul, + unsigned int num) +{ + volk_32fc_x2_multiply_32fc(out, in, mul, num); +} +} // namespace + template <class T> typename moving_average<T>::sptr moving_average<T>::make(int length, T scale, int max_iter, unsigned int vlen) @@ -44,11 +125,11 @@ moving_average_impl<T>::moving_average_impl(int length, d_updated(false) { this->set_history(length); - // we don't have C++11's <array>, so initialize the stored vector instead // we store this vector so that work() doesn't spend its time allocating and freeing // vector storage if (d_vlen > 1) { - d_sum = std::vector<T>(d_vlen); + d_sum = volk::vector<T>(d_vlen); + d_scales = volk::vector<T>(d_vlen, d_scale); } } @@ -87,21 +168,18 @@ int moving_average_impl<T>::work(int noutput_items, if (d_updated) { d_length = d_new_length; d_scale = d_new_scale; + std::fill(std::begin(d_scales), std::end(d_scales), d_scale); this->set_history(d_length); d_updated = false; return 0; // history requirements might have changed } - const T* in = (const T*)input_items[0]; - T* out = (T*)output_items[0]; + const T* in = static_cast<const T*>(input_items[0]); + T* out = static_cast<T*>(output_items[0]); - unsigned int num_iter = - (unsigned int)((noutput_items > d_max_iter) ? 
d_max_iter : noutput_items); + const unsigned int num_iter = std::min(noutput_items, d_max_iter); if (d_vlen == 1) { - T sum = in[0]; - for (int i = 1; i < d_length - 1; i++) { - sum += in[i]; - } + T sum = std::accumulate(&in[0], &in[d_length - 1], T{}); for (unsigned int i = 0; i < num_iter; i++) { sum += in[i + d_length - 1]; @@ -111,22 +189,16 @@ int moving_average_impl<T>::work(int noutput_items, } else { // d_vlen > 1 // gets automatically optimized well - for (unsigned int elem = 0; elem < d_vlen; elem++) { - d_sum[elem] = in[elem]; - } + std::copy(&in[0], &in[d_vlen], std::begin(d_sum)); for (int i = 1; i < d_length - 1; i++) { - for (unsigned int elem = 0; elem < d_vlen; elem++) { - d_sum[elem] += in[i * d_vlen + elem]; - } + volk_add(d_sum.data(), &in[i * d_vlen], d_vlen); } for (unsigned int i = 0; i < num_iter; i++) { - for (unsigned int elem = 0; elem < d_vlen; elem++) { - d_sum[elem] += in[(i + d_length - 1) * d_vlen + elem]; - out[i * d_vlen + elem] = d_sum[elem] * d_scale; - d_sum[elem] -= in[i * d_vlen + elem]; - } + volk_add(d_sum.data(), &in[(i + d_length - 1) * d_vlen], d_vlen); + volk_mul(&out[i * d_vlen], d_sum.data(), d_scales.data(), d_vlen); + volk_sub(d_sum.data(), &in[i * d_vlen], d_vlen); } } return num_iter; |