diff options
author | Thomas Habets <thomas@habets.se> | 2020-06-07 18:52:22 +0100 |
---|---|---|
committer | mormj <34754695+mormj@users.noreply.github.com> | 2020-06-15 08:21:35 -0400 |
commit | 518aa489498ec692dee51bb69936b620660ba113 (patch) | |
tree | 49ff128f6549ad7d91eb11020d85581f292f3f1b /gr-blocks/lib/moving_average_impl.cc | |
parent | ef825aad904b9a5ff99160d7f38181eb175137eb (diff) |
blocks/moving_average: use vector functions
This improves performance of moving average for me by about 4-5x.
My test is
[here](https://github.com/ThomasHabets/radiostuff/blob/master/amateur/listen_70cm.grc),
which processes 10MHz to find the strongest signal.
Without this PR I see `moving_average` in `top` taking about 89.4-94%
CPU. With this patch it's ~20.4-23.7%. Since CPU usage was close to
saturation without this patch, the measured 4-5x is likely a lower
bound — the true speedup may be even greater.
Test measured on a Purism Librem v2 with an Intel Core i7-6500U @ 2.5GHz.
The memory access pattern is probably worse with this patch, but at
least on my hardware on my workload this seems to be dwarfed by the
gain of using volk. It can be hard to reason about memory access
patterns, so benchmarks overrule theory.
The test only actually uses float averaging.
More benchmarking may be required.
Possible improvements:
* Add template specialization for `uint16` and `uint32`?
* Refactor the non-volk fallback to use only one loop, with a
(presumably) better memory access pattern
Diffstat (limited to 'gr-blocks/lib/moving_average_impl.cc')
-rw-r--r-- | gr-blocks/lib/moving_average_impl.cc | 114 |
1 files changed, 93 insertions, 21 deletions
diff --git a/gr-blocks/lib/moving_average_impl.cc b/gr-blocks/lib/moving_average_impl.cc index 03036e365b..7a566bac2e 100644 --- a/gr-blocks/lib/moving_average_impl.cc +++ b/gr-blocks/lib/moving_average_impl.cc @@ -15,10 +15,91 @@ #include "moving_average_impl.h" #include <gnuradio/io_signature.h> +#include <volk/volk.h> +#include <numeric> namespace gr { namespace blocks { +namespace { +template <typename T> +inline void volk_add(T* out, const T* add, unsigned int num) +{ + for (unsigned int elem = 0; elem < num; elem++) { + out[elem] += add[elem]; + } +} + +template <> +inline void volk_add<float>(float* out, const float* add, unsigned int num) +{ + volk_32f_x2_add_32f(out, out, add, num); +} + +template <> +inline void volk_add<gr_complex>(gr_complex* out, const gr_complex* add, unsigned int num) +{ + volk_32fc_x2_add_32fc(out, out, add, num); +} + +template <typename T> +inline void volk_sub(T* out, const T* sub, unsigned int num) +{ + for (unsigned int elem = 0; elem < num; elem++) { + out[elem] -= sub[elem]; + } +} + +template <> +inline void volk_sub<float>(float* out, const float* sub, unsigned int num) +{ + volk_32f_x2_subtract_32f(out, out, sub, num); +} + +// For gr_complex, do some checking that the types are as we expect before +// allowing the specialization. +// That long `enable_if` boils down to the word `void`. +template <> +inline typename std::enable_if<std::is_same<gr_complex::value_type, float>::value>::type +volk_sub<gr_complex>(gr_complex* out, const gr_complex* sub, unsigned int num) +{ + // std::complex is required to be implemented by adjacent and non-padded + // imaginary and real values, so it's safe to treat as such. + auto fout = reinterpret_cast<gr_complex::value_type*>(out); + auto fsub = reinterpret_cast<const gr_complex::value_type*>(sub); + + // But just to be sure, let's double-check that. 
+ static_assert(sizeof(gr_complex) == 2 * sizeof(gr_complex::value_type), + "Can't happen: sizeof(gr_complex) != 2*sizeof(gr_complex::value_type)"); + + volk_32f_x2_subtract_32f(fout, fout, fsub, 2 * num); +} + +template <typename T> +inline void volk_mul(T* out, const T* in, const T* scale, unsigned int num) +{ + for (unsigned int elem = 0; elem < num; elem++) { + out[elem] = in[elem] * scale[elem]; + } +} + +template <> +inline void +volk_mul<float>(float* out, const float* in, const float* mul, unsigned int num) +{ + volk_32f_x2_multiply_32f(out, in, mul, num); +} + +template <> +inline void volk_mul<gr_complex>(gr_complex* out, + const gr_complex* in, + const gr_complex* mul, + unsigned int num) +{ + volk_32fc_x2_multiply_32fc(out, in, mul, num); +} +} // namespace + template <class T> typename moving_average<T>::sptr moving_average<T>::make(int length, T scale, int max_iter, unsigned int vlen) @@ -44,11 +125,11 @@ moving_average_impl<T>::moving_average_impl(int length, d_updated(false) { this->set_history(length); - // we don't have C++11's <array>, so initialize the stored vector instead // we store this vector so that work() doesn't spend its time allocating and freeing // vector storage if (d_vlen > 1) { - d_sum = std::vector<T>(d_vlen); + d_sum = volk::vector<T>(d_vlen); + d_scales = volk::vector<T>(d_vlen, d_scale); } } @@ -87,21 +168,18 @@ int moving_average_impl<T>::work(int noutput_items, if (d_updated) { d_length = d_new_length; d_scale = d_new_scale; + std::fill(std::begin(d_scales), std::end(d_scales), d_scale); this->set_history(d_length); d_updated = false; return 0; // history requirements might have changed } - const T* in = (const T*)input_items[0]; - T* out = (T*)output_items[0]; + const T* in = static_cast<const T*>(input_items[0]); + T* out = static_cast<T*>(output_items[0]); - unsigned int num_iter = - (unsigned int)((noutput_items > d_max_iter) ? 
d_max_iter : noutput_items); + const unsigned int num_iter = std::min(noutput_items, d_max_iter); if (d_vlen == 1) { - T sum = in[0]; - for (int i = 1; i < d_length - 1; i++) { - sum += in[i]; - } + T sum = std::accumulate(&in[0], &in[d_length - 1], T{}); for (unsigned int i = 0; i < num_iter; i++) { sum += in[i + d_length - 1]; @@ -111,22 +189,16 @@ int moving_average_impl<T>::work(int noutput_items, } else { // d_vlen > 1 // gets automatically optimized well - for (unsigned int elem = 0; elem < d_vlen; elem++) { - d_sum[elem] = in[elem]; - } + std::copy(&in[0], &in[d_vlen], std::begin(d_sum)); for (int i = 1; i < d_length - 1; i++) { - for (unsigned int elem = 0; elem < d_vlen; elem++) { - d_sum[elem] += in[i * d_vlen + elem]; - } + volk_add(d_sum.data(), &in[i * d_vlen], d_vlen); } for (unsigned int i = 0; i < num_iter; i++) { - for (unsigned int elem = 0; elem < d_vlen; elem++) { - d_sum[elem] += in[(i + d_length - 1) * d_vlen + elem]; - out[i * d_vlen + elem] = d_sum[elem] * d_scale; - d_sum[elem] -= in[i * d_vlen + elem]; - } + volk_add(d_sum.data(), &in[(i + d_length - 1) * d_vlen], d_vlen); + volk_mul(&out[i * d_vlen], d_sum.data(), d_scales.data(), d_vlen); + volk_sub(d_sum.data(), &in[i * d_vlen], d_vlen); } } return num_iter; |