1 files changed, 93 insertions, 21 deletions
diff --git a/gr-blocks/lib/moving_average_impl.cc b/gr-blocks/lib/moving_average_impl.cc
index 03036e365b..7a566bac2e 100644
--- a/gr-blocks/lib/moving_average_impl.cc
+++ b/gr-blocks/lib/moving_average_impl.cc
@@ -15,10 +15,91 @@
 
 #include "moving_average_impl.h"
 #include <gnuradio/io_signature.h>
+#include <volk/volk.h>
+#include <numeric>
 
 namespace gr {
 namespace blocks {
 
+namespace {
+template <typename T>
+inline void volk_add(T* out, const T* add, unsigned int num)
+{
+    for (unsigned int elem = 0; elem < num; elem++) {
+        out[elem] += add[elem];
+    }
+}
+
+template <>
+inline void volk_add<float>(float* out, const float* add, unsigned int num)
+{
+    volk_32f_x2_add_32f(out, out, add, num);
+}
+
+template <>
+inline void volk_add<gr_complex>(gr_complex* out, const gr_complex* add, unsigned int num)
+{
+    volk_32fc_x2_add_32fc(out, out, add, num);
+}
+
+template <typename T>
+inline void volk_sub(T* out, const T* sub, unsigned int num)
+{
+    for (unsigned int elem = 0; elem < num; elem++) {
+        out[elem] -= sub[elem];
+    }
+}
+
+template <>
+inline void volk_sub<float>(float* out, const float* sub, unsigned int num)
+{
+    volk_32f_x2_subtract_32f(out, out, sub, num);
+}
+
+// For gr_complex, do some checking that the types are as we expect before
+// allowing the specialization.
+// That long `enable_if` boils down to the word `void`.
+template <>
+inline typename std::enable_if<std::is_same<gr_complex::value_type, float>::value>::type
+volk_sub<gr_complex>(gr_complex* out, const gr_complex* sub, unsigned int num)
+{
+    // std::complex is required to be implemented by adjacent and non-padded
+    // imaginary and real values, so it's safe to treat as such.
+    auto fout = reinterpret_cast<gr_complex::value_type*>(out);
+    auto fsub = reinterpret_cast<const gr_complex::value_type*>(sub);
+
+    // But just to be sure, let's double-check that.
+    static_assert(sizeof(gr_complex) == 2 * sizeof(gr_complex::value_type),
+                  "Can't happen: sizeof(gr_complex) != 2*sizeof(gr_complex::value_type)");
+
+    volk_32f_x2_subtract_32f(fout, fout, fsub, 2 * num);
+}
+
+template <typename T>
+inline void volk_mul(T* out, const T* in, const T* scale, unsigned int num)
+{
+    for (unsigned int elem = 0; elem < num; elem++) {
+        out[elem] = in[elem] * scale[elem];
+    }
+}
+
+template <>
+inline void
+volk_mul<float>(float* out, const float* in, const float* mul, unsigned int num)
+{
+    volk_32f_x2_multiply_32f(out, in, mul, num);
+}
+
+template <>
+inline void volk_mul<gr_complex>(gr_complex* out,
+                                 const gr_complex* in,
+                                 const gr_complex* mul,
+                                 unsigned int num)
+{
+    volk_32fc_x2_multiply_32fc(out, in, mul, num);
+}
+} // namespace
+
 template <class T>
 typename moving_average<T>::sptr
 moving_average<T>::make(int length, T scale, int max_iter, unsigned int vlen)
@@ -44,11 +125,11 @@ moving_average_impl<T>::moving_average_impl(int length,
       d_updated(false)
 {
     this->set_history(length);
-    // we don't have C++11's <array>, so initialize the stored vector instead
     // we store this vector so that work() doesn't spend its time allocating and freeing
     // vector storage
     if (d_vlen > 1) {
-        d_sum = std::vector<T>(d_vlen);
+        d_sum = volk::vector<T>(d_vlen);
+        d_scales = volk::vector<T>(d_vlen, d_scale);
     }
 }
 
@@ -87,21 +168,18 @@ int moving_average_impl<T>::work(int noutput_items,
     if (d_updated) {
         d_length = d_new_length;
         d_scale = d_new_scale;
+        std::fill(std::begin(d_scales), std::end(d_scales), d_scale);
         this->set_history(d_length);
         d_updated = false;
         return 0; // history requirements might have changed
     }
 
-    const T* in = (const T*)input_items[0];
-    T* out = (T*)output_items[0];
+    const T* in = static_cast<const T*>(input_items[0]);
+    T* out = static_cast<T*>(output_items[0]);
 
-    unsigned int num_iter =
-        (unsigned int)((noutput_items > d_max_iter) ? d_max_iter : noutput_items);
+    const unsigned int num_iter = std::min(noutput_items, d_max_iter);
     if (d_vlen == 1) {
-        T sum = in[0];
-        for (int i = 1; i < d_length - 1; i++) {
-            sum += in[i];
-        }
+        T sum = std::accumulate(&in[0], &in[d_length - 1], T{});
 
         for (unsigned int i = 0; i < num_iter; i++) {
             sum += in[i + d_length - 1];
@@ -111,22 +189,16 @@ int moving_average_impl<T>::work(int noutput_items,
 
     } else { // d_vlen > 1
         // gets automatically optimized well
-        for (unsigned int elem = 0; elem < d_vlen; elem++) {
-            d_sum[elem] = in[elem];
-        }
+        std::copy(&in[0], &in[d_vlen], std::begin(d_sum));
 
         for (int i = 1; i < d_length - 1; i++) {
-            for (unsigned int elem = 0; elem < d_vlen; elem++) {
-                d_sum[elem] += in[i * d_vlen + elem];
-            }
+            volk_add(d_sum.data(), &in[i * d_vlen], d_vlen);
         }
 
         for (unsigned int i = 0; i < num_iter; i++) {
-            for (unsigned int elem = 0; elem < d_vlen; elem++) {
-                d_sum[elem] += in[(i + d_length - 1) * d_vlen + elem];
-                out[i * d_vlen + elem] = d_sum[elem] * d_scale;
-                d_sum[elem] -= in[i * d_vlen + elem];
-            }
+            volk_add(d_sum.data(), &in[(i + d_length - 1) * d_vlen], d_vlen);
+            volk_mul(&out[i * d_vlen], d_sum.data(), d_scales.data(), d_vlen);
+            volk_sub(d_sum.data(), &in[i * d_vlen], d_vlen);
         }
     }
     return num_iter;