10 files changed, 203 insertions, 233 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 007166bbf6..066e973f6a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -419,6 +419,20 @@ message(STATUS "  Override with -DENABLE_INTERNAL_VOLK=ON/OFF")
 find_package(LOG4CPP REQUIRED)
 
 ########################################################################
+# Setup Native Capabilities Flag
+########################################################################
+option(ENABLE_NATIVE "Enable native build optimizations" OFF)
+IF(UNIX)
+    IF (ENABLE_NATIVE)
+        MESSAGE(STATUS "Found GNU Radio native optimization flag.  Setting native CPU optimization flags.")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -ftree-vectorize")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -ftree-vectorize")
+    ELSE (ENABLE_NATIVE)
+        MESSAGE(STATUS "Not using additional GNU Radio native architecture optimizations.")
+    ENDIF (ENABLE_NATIVE)
+ENDIF(UNIX)
+
+########################################################################
 # Disable complex math NaN/INFO range checking for performance
 ########################################################################
 check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE)
diff --git a/gnuradio-runtime/include/gnuradio/math.h b/gnuradio-runtime/include/gnuradio/math.h
index c0e1a7dc88..f628a5875c 100644
--- a/gnuradio-runtime/include/gnuradio/math.h
+++ b/gnuradio-runtime/include/gnuradio/math.h
@@ -31,12 +31,30 @@
 #define GR_M_LOG2E 1.4426950408889634074          /* log_2 e */
 #define GR_M_PI 3.14159265358979323846            /* pi */
 #define GR_M_PI_4 0.78539816339744830961566084582 /* pi/4 */
-#define GR_M_TWOPI (2 * GR_M_PI)                  /* 2*pi */
+#define GR_M_TWOPI 6.28318530717958647692         /* 2*pi */
 #define GR_M_SQRT2 1.41421356237309504880         /* sqrt(2) */
+#define GR_M_ONE_OVER_2PI 0.15915494309189533577  /* 1 / (2*pi) */
+#define GR_M_MINUS_TWO_PI -6.28318530717958647692 /* - 2*pi */
 
 
 namespace gr {
 
+static inline void
+fast_cc_multiply(gr_complex& out, const gr_complex cc1, const gr_complex cc2)
+{
+    // The built-in complex.h multiply has significant NaN/INF checking that
+    // considerably slows down performance.  While on some compilers the
+    // -fcx-limit-range flag can be used, this fast function makes the math consistent
+    // in terms of performance for the Costas loop.
+    float o_r, o_i;
+
+    o_r = (cc1.real() * cc2.real()) - (cc1.imag() * cc2.imag());
+    o_i = (cc1.real() * cc2.imag()) + (cc1.imag() * cc2.real());
+
+    out.real(o_r);
+    out.imag(o_i);
+}
+
 static inline bool is_power_of_2(long x) { return x != 0 && (x & (x - 1)) == 0; }
 
 /*!
@@ -62,10 +80,7 @@ static inline float fast_atan2f(gr_complex z) { return fast_atan2f(z.imag(), z.r
 /* This bounds x by +/- clip without a branch */
 static inline float branchless_clip(float x, float clip)
 {
-    float x1 = fabsf(x + clip);
-    float x2 = fabsf(x - clip);
-    x1 -= x2;
-    return 0.5 * x1;
+    return 0.5 * (std::abs(x + clip) - std::abs(x - clip));
 }
 
 static inline float clip(float x, float clip)
diff --git a/gnuradio-runtime/include/gnuradio/nco.h b/gnuradio-runtime/include/gnuradio/nco.h
index 6542fe58de..4bcdfe686b 100644
--- a/gnuradio-runtime/include/gnuradio/nco.h
+++ b/gnuradio-runtime/include/gnuradio/nco.h
@@ -44,27 +44,15 @@ public:
     void adjust_freq(double delta_angle_rate) { phase_inc += delta_angle_rate; }
 
     // increment current phase angle
-    void step()
-    {
-        phase += phase_inc;
-        if (fabs(phase) > GR_M_PI) {
-            while (phase > GR_M_PI)
-                phase -= 2 * GR_M_PI;
-
-            while (phase < -GR_M_PI)
-                phase += 2 * GR_M_PI;
-        }
-    }
-
-    void step(int n)
+    void step(int n = 1)
     {
         phase += phase_inc * n;
         if (fabs(phase) > GR_M_PI) {
             while (phase > GR_M_PI)
-                phase -= 2 * GR_M_PI;
+                phase -= GR_M_TWOPI;
 
             while (phase < -GR_M_PI)
-                phase += 2 * GR_M_PI;
+                phase += GR_M_TWOPI;
         }
     }
 
@@ -73,7 +61,7 @@ public:
     double get_freq() const { return phase_inc; }
 
     // compute sin and cos for current phase angle
-    void sincos(float* sinx, float* cosx) const;
+    void sincos(float* sinx, float* cosx) const { gr::sincosf(phase, sinx, cosx); }
 
     // compute cos or sin for current phase angle
     float cos() const { return std::cos(phase); }
@@ -94,12 +82,6 @@ protected:
 };
 
 template <class o_type, class i_type>
-void nco<o_type, i_type>::sincos(float* sinx, float* cosx) const
-{
-    gr::sincosf(phase, sinx, cosx);
-}
-
-template <class o_type, class i_type>
 void nco<o_type, i_type>::sin(float* output, int noutput_items, double ampl)
 {
     for (int i = 0; i < noutput_items; i++) {
diff --git a/gnuradio-runtime/include/gnuradio/sincos.h b/gnuradio-runtime/include/gnuradio/sincos.h
index 15241195d0..cd0f6eb0dd 100644
--- a/gnuradio-runtime/include/gnuradio/sincos.h
+++ b/gnuradio-runtime/include/gnuradio/sincos.h
@@ -12,12 +12,48 @@
 #define INCLUDED_GR_SINCOS_H
 
 #include <gnuradio/api.h>
+#include <cmath>
 
 namespace gr {
 
-// compute sine and cosine at the same time
-GR_RUNTIME_API void sincos(double x, double* sin, double* cos);
-GR_RUNTIME_API void sincosf(float x, float* sin, float* cos);
+#if defined(HAVE_SINCOS)
+
+inline void sincos(double x, double* sinx, double* cosx) { ::sincos(x, sinx, cosx); }
+
+#else
+
+inline void sincos(double x, double* sinx, double* cosx)
+{
+    *sinx = ::sin(x);
+    *cosx = ::cos(x);
+}
+
+#endif
+
+// ----------------------------------------------------------------
+
+#if defined(HAVE_SINCOSF)
+
+inline void sincosf(float x, float* sinx, float* cosx) { ::sincosf(x, sinx, cosx); }
+
+#elif defined(HAVE_SINF) && defined(HAVE_COSF)
+
+inline void sincosf(float x, float* sinx, float* cosx)
+{
+    *sinx = ::sinf(x);
+    *cosx = ::cosf(x);
+}
+
+#else
+
+inline void sincosf(float x, float* sinx, float* cosx)
+{
+    *sinx = ::sin(x);
+    *cosx = ::cos(x);
+}
+
+#endif
+
 } // namespace gr
 
 #endif /* INCLUDED_GR_SINCOS_H */
diff --git a/gnuradio-runtime/lib/CMakeLists.txt b/gnuradio-runtime/lib/CMakeLists.txt
index 04cd7b72c7..a56c68cfd7 100644
--- a/gnuradio-runtime/lib/CMakeLists.txt
+++ b/gnuradio-runtime/lib/CMakeLists.txt
@@ -106,7 +106,6 @@ target_sources(gnuradio-runtime PRIVATE
   ${CMAKE_CURRENT_SOURCE_DIR}/math/fast_atan2f.cc
   ${CMAKE_CURRENT_SOURCE_DIR}/math/fxpt.cc
   ${CMAKE_CURRENT_SOURCE_DIR}/math/random.cc
-  ${CMAKE_CURRENT_SOURCE_DIR}/math/sincos.cc
   )
 
 # Controlport
diff --git a/gnuradio-runtime/lib/math/sincos.cc b/gnuradio-runtime/lib/math/sincos.cc
deleted file mode 100644
index b093e09a4a..0000000000
--- a/gnuradio-runtime/lib/math/sincos.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* -*- c++ -*- */
-/*
- * Copyright 2004,2010,2013 Free Software Foundation, Inc.
- *
- * This file is part of GNU Radio
- *
- * SPDX-License-Identifier: GPL-3.0-or-later
- *
- */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE // ask for GNU extensions if available
-#endif
-
-#include <gnuradio/sincos.h>
-#include <math.h>
-
-namespace gr {
-
-#if defined(HAVE_SINCOS)
-
-void sincos(double x, double* sinx, double* cosx) { ::sincos(x, sinx, cosx); }
-
-#else
-
-void sincos(double x, double* sinx, double* cosx)
-{
-    *sinx = ::sin(x);
-    *cosx = ::cos(x);
-}
-
-#endif
-
-// ----------------------------------------------------------------
-
-#if defined(HAVE_SINCOSF)
-
-void sincosf(float x, float* sinx, float* cosx) { ::sincosf(x, sinx, cosx); }
-
-#elif defined(HAVE_SINF) && defined(HAVE_COSF)
-
-void sincosf(float x, float* sinx, float* cosx)
-{
-    *sinx = ::sinf(x);
-    *cosx = ::cosf(x);
-}
-
-#else
-
-void sincosf(float x, float* sinx, float* cosx)
-{
-    *sinx = ::sin(x);
-    *cosx = ::cos(x);
-}
-
-#endif
-
-} /* namespace gr */
diff --git a/gr-blocks/include/gnuradio/blocks/control_loop.h b/gr-blocks/include/gnuradio/blocks/control_loop.h
index 7aa43b531c..64d5ade620 100644
--- a/gr-blocks/include/gnuradio/blocks/control_loop.h
+++ b/gr-blocks/include/gnuradio/blocks/control_loop.h
@@ -12,6 +12,7 @@
 #define GR_BLOCKS_CONTROL_LOOP
 
 #include <gnuradio/blocks/api.h>
+#include <gnuradio/math.h>
 
 namespace gr {
 namespace blocks {
@@ -72,7 +73,11 @@ public:
     /*! \brief Advance the control loop based on the current gain
      *  settings and the inputted error signal.
      */
-    void advance_loop(float error);
+    void advance_loop(float error)
+    {
+        d_freq = d_freq + d_beta * error;
+        d_phase = d_phase + d_freq + d_alpha * error;
+    }
 
     /*! \brief Keep the phase between -2pi and 2pi.
      *
@@ -87,7 +92,13 @@ public:
      * method in case another way is desired as this is fairly
      * heavy-handed.
      */
-    void phase_wrap();
+    void phase_wrap()
+    {
+        while (d_phase > GR_M_TWOPI)
+            d_phase -= GR_M_TWOPI;
+        while (d_phase < -GR_M_TWOPI)
+            d_phase += GR_M_TWOPI;
+    }
 
     /*! \brief Keep the frequency between d_min_freq and d_max_freq.
      *
@@ -102,7 +113,14 @@ public:
      * method in case another way is desired as this is fairly
      * heavy-handed.
      */
-    void frequency_limit();
+    void frequency_limit()
+    {
+        if (d_freq > d_max_freq)
+            d_freq = d_max_freq;
+        else if (d_freq < d_min_freq)
+            d_freq = d_min_freq;
+    }
+
 
     /*******************************************************************
      * SET FUNCTIONS
diff --git a/gr-blocks/lib/control_loop.cc b/gr-blocks/lib/control_loop.cc
index 099bc20fb9..f98d0c795b 100644
--- a/gr-blocks/lib/control_loop.cc
+++ b/gr-blocks/lib/control_loop.cc
@@ -40,28 +40,6 @@ void control_loop::update_gains()
     d_beta = (4 * d_loop_bw * d_loop_bw) / denom;
 }
 
-void control_loop::advance_loop(float error)
-{
-    d_freq = d_freq + d_beta * error;
-    d_phase = d_phase + d_freq + d_alpha * error;
-}
-
-void control_loop::phase_wrap()
-{
-    while (d_phase > M_TWOPI)
-        d_phase -= M_TWOPI;
-    while (d_phase < -M_TWOPI)
-        d_phase += M_TWOPI;
-}
-
-void control_loop::frequency_limit()
-{
-    if (d_freq > d_max_freq)
-        d_freq = d_max_freq;
-    else if (d_freq < d_min_freq)
-        d_freq = d_min_freq;
-}
-
 /*******************************************************************
  * SET FUNCTIONS
  *******************************************************************/
diff --git a/gr-digital/lib/costas_loop_cc_impl.cc b/gr-digital/lib/costas_loop_cc_impl.cc
index 930183c0c0..5f2cba275e 100644
--- a/gr-digital/lib/costas_loop_cc_impl.cc
+++ b/gr-digital/lib/costas_loop_cc_impl.cc
@@ -37,7 +37,8 @@ costas_loop_cc_impl::costas_loop_cc_impl(float loop_bw, unsigned int order, bool
       blocks::control_loop(loop_bw, 1.0, -1.0),
       d_error(0),
       d_noise(1.0),
-      d_phase_detector(choose_phase_detector(order, use_snr))
+      d_use_snr(use_snr),
+      d_order(order)
 {
     message_port_register_in(pmt::mp("noise"));
     set_msg_handler(pmt::mp("noise"),
@@ -46,96 +47,6 @@ costas_loop_cc_impl::costas_loop_cc_impl(float loop_bw, unsigned int order, bool
 
 costas_loop_cc_impl::~costas_loop_cc_impl() {}
 
-costas_loop_cc_impl::d_phase_detector_t
-costas_loop_cc_impl::choose_phase_detector(unsigned int order, bool use_snr)
-{
-    switch (order) {
-    case 2:
-        if (use_snr) {
-            return &costas_loop_cc_impl::phase_detector_snr_2;
-        }
-        return &costas_loop_cc_impl::phase_detector_2;
-
-    case 4:
-        if (use_snr) {
-            return &costas_loop_cc_impl::phase_detector_snr_4;
-        }
-        return &costas_loop_cc_impl::phase_detector_4;
-
-    case 8:
-        if (use_snr) {
-            return &costas_loop_cc_impl::phase_detector_snr_8;
-        }
-        return &costas_loop_cc_impl::phase_detector_8;
-    }
-    throw std::invalid_argument("order must be 2, 4, or 8");
-}
-
-float costas_loop_cc_impl::phase_detector_8(gr_complex sample) const
-{
-    /* This technique splits the 8PSK constellation into 2 squashed
-       QPSK constellations, one when I is larger than Q and one
-       where Q is larger than I. The error is then calculated
-       proportionally to these squashed constellations by the const
-       K = sqrt(2)-1.
-
-       The signal magnitude must be > 1 or K will incorrectly bias
-       the error value.
-
-       Ref: Z. Huang, Z. Yi, M. Zhang, K. Wang, "8PSK demodulation for
-       new generation DVB-S2", IEEE Proc. Int. Conf. Communications,
-       Circuits and Systems, Vol. 2, pp. 1447 - 1450, 2004.
-    */
-
-    const float K = (sqrtf(2.0) - 1);
-    if (fabsf(sample.real()) >= fabsf(sample.imag())) {
-        return ((sample.real() > 0 ? 1.0 : -1.0) * sample.imag() -
-                (sample.imag() > 0 ? 1.0 : -1.0) * sample.real() * K);
-    } else {
-        return ((sample.real() > 0 ? 1.0 : -1.0) * sample.imag() * K -
-                (sample.imag() > 0 ? 1.0 : -1.0) * sample.real());
-    }
-}
-
-float costas_loop_cc_impl::phase_detector_4(gr_complex sample) const
-{
-    return ((sample.real() > 0 ? 1.0 : -1.0) * sample.imag() -
-            (sample.imag() > 0 ? 1.0 : -1.0) * sample.real());
-}
-
-float costas_loop_cc_impl::phase_detector_2(gr_complex sample) const
-{
-    return (sample.real() * sample.imag());
-}
-
-float costas_loop_cc_impl::phase_detector_snr_8(gr_complex sample) const
-{
-    const float K = (sqrtf(2.0) - 1);
-    const float snr = std::norm(sample) / d_noise;
-    if (fabsf(sample.real()) >= fabsf(sample.imag())) {
-        return ((blocks::tanhf_lut(snr * sample.real()) * sample.imag()) -
-                (blocks::tanhf_lut(snr * sample.imag()) * sample.real() * K));
-    } else {
-        return ((blocks::tanhf_lut(snr * sample.real()) * sample.imag() * K) -
-                (blocks::tanhf_lut(snr * sample.imag()) * sample.real()));
-    }
-}
-
-float costas_loop_cc_impl::phase_detector_snr_4(gr_complex sample) const
-{
-    const float snr = std::norm(sample) / d_noise;
-    return ((blocks::tanhf_lut(snr * sample.real()) * sample.imag()) -
-            (blocks::tanhf_lut(snr * sample.imag()) * sample.real()));
-}
-
-float costas_loop_cc_impl::phase_detector_snr_2(gr_complex sample) const
-{
-    const float snr = std::norm(sample) / d_noise;
-    return blocks::tanhf_lut(snr * sample.real()) * sample.imag();
-}
-
-float costas_loop_cc_impl::error() const { return d_error; }
-
 void costas_loop_cc_impl::handle_set_noise(pmt::pmt_t msg)
 {
     if (pmt::is_real(msg)) {
@@ -154,8 +65,6 @@ int costas_loop_cc_impl::work(int noutput_items,
     float* phase_optr = output_items.size() >= 3 ? (float*)output_items[2] : NULL;
     float* error_optr = output_items.size() >= 4 ? (float*)output_items[3] : NULL;
 
-    gr_complex nco_out;
-
     std::vector<tag_t> tags;
     get_tags_in_range(tags,
                       0,
@@ -163,6 +72,15 @@ int costas_loop_cc_impl::work(int noutput_items,
                       nitems_read(0) + noutput_items,
                       pmt::intern("phase_est"));
 
+    // Get this out of the for loop if not used:
+    bool has_additional_outputs = false;
+    if (freq_optr)
+        has_additional_outputs = true;
+    else if (phase_optr)
+        has_additional_outputs = true;
+    else if (error_optr)
+        has_additional_outputs = true;
+
     for (int i = 0; i < noutput_items; i++) {
         if (!tags.empty()) {
             if (tags[0].offset - nitems_read(0) == (size_t)i) {
@@ -171,22 +89,46 @@ int costas_loop_cc_impl::work(int noutput_items,
             }
         }
 
-        nco_out = gr_expj(-d_phase);
-        optr[i] = iptr[i] * nco_out;
-
-        d_error = (*this.*d_phase_detector)(optr[i]);
+        const gr_complex nco_out = gr_expj(-d_phase);
+
+        gr::fast_cc_multiply(optr[i], iptr[i], nco_out);
+
+        // EXPENSIVE LINE with function pointer, switch was about 20% faster in testing.
+        // Left in for logic justification/reference. d_error = phase_detector_2(optr[i]);
+        switch (d_order) {
+        case 2:
+            if (d_use_snr)
+                d_error = phase_detector_snr_2(optr[i]);
+            else
+                d_error = phase_detector_2(optr[i]);
+            break;
+        case 4:
+            if (d_use_snr)
+                d_error = phase_detector_snr_4(optr[i]);
+            else
+                d_error = phase_detector_4(optr[i]);
+            break;
+        case 8:
+            if (d_use_snr)
+                d_error = phase_detector_snr_8(optr[i]);
+            else
+                d_error = phase_detector_8(optr[i]);
+            break;
+        }
         d_error = gr::branchless_clip(d_error, 1.0);
 
         advance_loop(d_error);
         phase_wrap();
         frequency_limit();
 
-        if (freq_optr != NULL)
-            freq_optr[i] = d_freq;
-        if (phase_optr != NULL)
-            phase_optr[i] = d_phase;
-        if (error_optr != NULL)
-            error_optr[i] = d_error;
+        if (has_additional_outputs) {
+            if (freq_optr)
+                freq_optr[i] = d_freq;
+            if (phase_optr)
+                phase_optr[i] = d_phase;
+            if (error_optr)
+                error_optr[i] = d_error;
+        }
     }
 
     return noutput_items;
diff --git a/gr-digital/lib/costas_loop_cc_impl.h b/gr-digital/lib/costas_loop_cc_impl.h
index 7d90f9fc12..fa33bbec8a 100644
--- a/gr-digital/lib/costas_loop_cc_impl.h
+++ b/gr-digital/lib/costas_loop_cc_impl.h
@@ -22,28 +22,60 @@ class costas_loop_cc_impl : public costas_loop_cc
 private:
     float d_error;
     float d_noise;
+    bool d_use_snr;
+    int d_order;
 
     /*! \brief the phase detector circuit for 8th-order PSK loops.
      *
      *  \param sample complex sample
      *  \return the phase error
      */
-    float phase_detector_8(gr_complex sample) const; // for 8PSK
+    float phase_detector_8(gr_complex sample) const // for 8PSK
+    {
+        /* This technique splits the 8PSK constellation into 2 squashed
+   QPSK constellations, one when I is larger than Q and one
+   where Q is larger than I. The error is then calculated
+   proportionally to these squashed constellations by the const
+   K = sqrt(2)-1.
+
+   The signal magnitude must be > 1 or K will incorrectly bias
+   the error value.
+
+   Ref: Z. Huang, Z. Yi, M. Zhang, K. Wang, "8PSK demodulation for
+   new generation DVB-S2", IEEE Proc. Int. Conf. Communications,
+   Circuits and Systems, Vol. 2, pp. 1447 - 1450, 2004.
+         */
+
+        const float K = (sqrtf(2.0) - 1);
+        if (fabsf(sample.real()) >= fabsf(sample.imag())) {
+            return ((sample.real() > 0.0f ? 1.0f : -1.0f) * sample.imag() -
+                    (sample.imag() > 0.0f ? 1.0f : -1.0f) * sample.real() * K);
+        } else {
+            return ((sample.real() > 0.0f ? 1.0f : -1.0f) * sample.imag() * K -
+                    (sample.imag() > 0.0f ? 1.0f : -1.0f) * sample.real());
+        }
+    };
 
     /*! \brief the phase detector circuit for fourth-order loops.
      *
      *  \param sample complex sample
      *  \return the phase error
      */
-    float phase_detector_4(gr_complex sample) const; // for QPSK
+    float phase_detector_4(gr_complex sample) const // for QPSK
+    {
+        return ((sample.real() > 0.0f ? 1.0f : -1.0f) * sample.imag() -
+                (sample.imag() > 0.0f ? 1.0f : -1.0f) * sample.real());
+    };
 
     /*! \brief the phase detector circuit for second-order loops.
      *
      *  \param sample a complex sample
      *  \return the phase error
      */
-    float phase_detector_2(gr_complex sample) const; // for BPSK
-
+    float phase_detector_2(gr_complex sample) const // for BPSK
+    {
+        return (sample.real() * sample.imag());
+    }
 
     /*! \brief the phase detector circuit for 8th-order PSK
      *  loops. Uses tanh instead of slicing and the noise estimate
@@ -52,7 +84,18 @@ private:
      *  \param sample complex sample
      *  \return the phase error
      */
-    float phase_detector_snr_8(gr_complex sample) const; // for 8PSK
+    float phase_detector_snr_8(gr_complex sample) const // for 8PSK
+    {
+        const float K = (sqrtf(2.0) - 1.0);
+        const float snr = std::norm(sample) / d_noise;
+        if (fabsf(sample.real()) >= fabsf(sample.imag())) {
+            return ((blocks::tanhf_lut(snr * sample.real()) * sample.imag()) -
+                    (blocks::tanhf_lut(snr * sample.imag()) * sample.real() * K));
+        } else {
+            return ((blocks::tanhf_lut(snr * sample.real()) * sample.imag() * K) -
+                    (blocks::tanhf_lut(snr * sample.imag()) * sample.real()));
+        }
+    };
 
     /*! \brief the phase detector circuit for fourth-order
      *  loops. Uses tanh instead of slicing and the noise estimate
@@ -61,7 +104,12 @@ private:
      *  \param sample complex sample
      *  \return the phase error
      */
-    float phase_detector_snr_4(gr_complex sample) const; // for QPSK
+    float phase_detector_snr_4(gr_complex sample) const // for QPSK
+    {
+        const float snr = std::norm(sample) / d_noise;
+        return ((blocks::tanhf_lut(snr * sample.real()) * sample.imag()) -
+                (blocks::tanhf_lut(snr * sample.imag()) * sample.real()));
+    };
 
     /*! \brief the phase detector circuit for second-order
      *  loops. Uses tanh instead of slicing and the noise estimate
@@ -70,17 +118,17 @@ private:
      *  \param sample a complex sample
      *  \return the phase error
      */
-    float phase_detector_snr_2(gr_complex sample) const; // for BPSK
-
-    typedef float (costas_loop_cc_impl::*d_phase_detector_t)(gr_complex sample) const;
-    static d_phase_detector_t choose_phase_detector(unsigned int order, bool use_snr);
-    const d_phase_detector_t d_phase_detector;
+    float phase_detector_snr_2(gr_complex sample) const // for BPSK
+    {
+        const float snr = std::norm(sample) / d_noise;
+        return blocks::tanhf_lut(snr * sample.real()) * sample.imag();
+    };
 
 public:
     costas_loop_cc_impl(float loop_bw, unsigned int order, bool use_snr = false);
     ~costas_loop_cc_impl();
 
-    float error() const;
+    float error() const { return d_error; };
 
     void handle_set_noise(pmt::pmt_t msg);