AEC3: Multichannel suppressor

This change adds multichannel support to the AEC3 suppressor.
Processing of mono capture is bit-exact to the previous code.

Bug: webrtc:10913
Change-Id: I89affe3e066021bc34e4b525edf44dd3bea68365
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/158882
Commit-Queue: Gustaf Ullberg <gustaf@webrtc.org>
Reviewed-by: Per Åhgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#29692}
This commit is contained in:
Gustaf Ullberg 2019-11-05 15:19:02 +01:00 committed by Commit Bot
parent 3ee47de99b
commit 5ea5749a86
7 changed files with 323 additions and 278 deletions

View File

@ -41,6 +41,8 @@ rtc_library("aec3") {
"decimator.cc",
"decimator.h",
"delay_estimate.h",
"dominant_nearend_detector.cc",
"dominant_nearend_detector.h",
"downsampled_render_buffer.cc",
"downsampled_render_buffer.h",
"echo_audibility.cc",

View File

@ -0,0 +1,76 @@
/*
* Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/aec3/dominant_nearend_detector.h"
#include <algorithm>
#include <numeric>
namespace webrtc {
// Builds a detector that tracks `num_capture_channels` capture channels.
// The thresholds and durations are copied out of `config`; nothing else from
// `config` is retained.
DominantNearendDetector::DominantNearendDetector(
const EchoCanceller3Config::Suppressor::DominantNearendDetection config,
size_t num_capture_channels)
: enr_threshold_(config.enr_threshold),
enr_exit_threshold_(config.enr_exit_threshold),
snr_threshold_(config.snr_threshold),
hold_duration_(config.hold_duration),
trigger_threshold_(config.trigger_threshold),
use_during_initial_phase_(config.use_during_initial_phase),
num_capture_channels_(num_capture_channels),
// One trigger counter and one hold counter per capture channel. The vectors
// are sized from the member (declared before them in the class), and their
// elements are value-initialized, i.e. every counter starts at zero.
trigger_counters_(num_capture_channels_),
hold_counters_(num_capture_channels_) {}
// Re-evaluates the nearend/echo state selection from the latest spectra.
//
// Each of the three views is indexed by capture channel and must hold at
// least `num_capture_channels_` spectra. For every channel the energy in the
// low-frequency bins of the nearend, residual echo and comfort noise spectra
// is compared, the channel's trigger and hold counters are advanced, and the
// detector reports the nearend state if at least one channel still has a
// positive hold counter afterwards.
//
// `initial_state` disables the detection of new nearend activity unless the
// configuration allows detection during the initial phase.
void DominantNearendDetector::Update(
    rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
        nearend_spectrum,
    rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
        residual_echo_spectrum,
    rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
        comfort_noise_spectrum,
    bool initial_state) {
  // Sums the spectrum over bins 1 through 15; bin 0 is deliberately left out.
  const auto sum_low_bins = [](rtc::ArrayView<const float> bins) {
    RTC_DCHECK_LE(16, bins.size());
    return std::accumulate(bins.begin() + 1, bins.begin() + 16, 0.f);
  };

  const bool detection_allowed = use_during_initial_phase_ || !initial_state;
  bool any_channel_holding = false;

  for (size_t ch = 0; ch < num_capture_channels_; ++ch) {
    const float nearend_energy = sum_low_bins(nearend_spectrum[ch]);
    const float echo_energy = sum_low_bins(residual_echo_spectrum[ch]);
    const float noise_energy = sum_low_bins(comfort_noise_spectrum[ch]);

    int& trigger = trigger_counters_[ch];
    int& hold = hold_counters_[ch];

    // The nearend counts as strong when it is sufficiently above both the
    // residual echo and the nearend noise.
    const bool strong_nearend =
        detection_allowed && echo_energy < enr_threshold_ * nearend_energy &&
        nearend_energy > snr_threshold_ * noise_energy;

    if (!strong_nearend) {
      // Gradually forget earlier strong-nearend observations.
      trigger = std::max(0, trigger - 1);
    } else if (++trigger >= trigger_threshold_) {
      // Enough consecutive evidence: (re)start the hold period and keep the
      // trigger counter saturated at the threshold.
      hold = hold_duration_;
      trigger = trigger_threshold_;
    }

    // A strong echo terminates the hold period immediately.
    const bool strong_echo =
        echo_energy > enr_exit_threshold_ * nearend_energy &&
        echo_energy > snr_threshold_ * noise_energy;
    if (strong_echo) {
      hold = 0;
    }

    // The hold period shrinks by one on every update, never below zero.
    hold = std::max(0, hold - 1);
    if (hold > 0) {
      any_channel_holding = true;
    }
  }

  // A single channel in its hold period is enough for the nearend state.
  nearend_state_ = any_channel_holding;
}
} // namespace webrtc

View File

@ -0,0 +1,56 @@
/*
* Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AEC3_DOMINANT_NEAREND_DETECTOR_H_
#define MODULES_AUDIO_PROCESSING_AEC3_DOMINANT_NEAREND_DETECTOR_H_
#include <vector>
#include "api/array_view.h"
#include "api/audio/echo_canceller3_config.h"
#include "modules/audio_processing/aec3/aec3_common.h"
namespace webrtc {
// Class for selecting whether the suppressor is in the nearend or echo state.
// The decision is made per capture channel from low-frequency energies of the
// nearend, residual echo and comfort noise spectra; the detector as a whole is
// in the nearend state when at least one channel is.
class DominantNearendDetector {
public:
// Creates a detector for `num_capture_channels` capture channels using the
// thresholds and durations in `config`.
DominantNearendDetector(
const EchoCanceller3Config::Suppressor::DominantNearendDetection config,
size_t num_capture_channels);
// Returns whether the current state is the nearend state.
// The value reflects the most recent call to Update(); it is false before the
// first call.
bool IsNearendState() const { return nearend_state_; }
// Updates the state selection based on latest spectral estimates.
// Each view is indexed by capture channel. `initial_state` suppresses the
// detection of new nearend activity unless the configuration enables the
// detector during the initial phase.
void Update(rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
nearend_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
residual_echo_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
comfort_noise_spectrum,
bool initial_state);
private:
// Strong nearend requires echo energy < enr_threshold_ * nearend energy.
const float enr_threshold_;
// The hold period is cancelled when echo energy exceeds
// enr_exit_threshold_ * nearend energy (and also exceeds the noise bound).
const float enr_exit_threshold_;
// Factor applied to the comfort noise energy that the nearend (for entering)
// or the echo (for exiting) must exceed.
const float snr_threshold_;
// Value loaded into a channel's hold counter when nearend mode is flagged;
// the counter is decremented once per Update() call.
const int hold_duration_;
// Trigger counter value at which nearend mode is flagged; the counter is
// capped at this value.
const int trigger_threshold_;
// When true, detection also runs while Update() is given initial_state=true.
const bool use_during_initial_phase_;
// Number of capture channels processed by Update().
const size_t num_capture_channels_;
// Result of the latest Update(): true if any channel's hold counter is > 0.
bool nearend_state_ = false;
// Per-channel count of recent strong-nearend observations.
std::vector<int> trigger_counters_;
// Per-channel remaining hold period, in Update() calls.
std::vector<int> hold_counters_;
};
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AEC3_DOMINANT_NEAREND_DETECTOR_H_

View File

@ -148,7 +148,7 @@ class EchoRemoverImpl final : public EchoRemover {
const size_t num_capture_channels_;
const bool use_shadow_filter_output_;
Subtractor subtractor_;
std::vector<std::unique_ptr<SuppressionGain>> suppression_gains_;
SuppressionGain suppression_gain_;
ComfortNoiseGenerator cng_;
SuppressionFilter suppression_filter_;
RenderSignalAnalyzer render_signal_analyzer_;
@ -195,7 +195,10 @@ EchoRemoverImpl::EchoRemoverImpl(const EchoCanceller3Config& config,
num_capture_channels_,
data_dumper_.get(),
optimization_),
suppression_gains_(num_capture_channels_),
suppression_gain_(config_,
optimization_,
sample_rate_hz,
num_capture_channels),
cng_(optimization_, num_capture_channels_),
suppression_filter_(optimization_,
sample_rate_hz_,
@ -203,9 +206,9 @@ EchoRemoverImpl::EchoRemoverImpl(const EchoCanceller3Config& config,
render_signal_analyzer_(config_),
residual_echo_estimator_(config_, num_render_channels),
aec_state_(config_, num_capture_channels_),
e_old_(num_capture_channels_),
y_old_(num_capture_channels_),
e_heap_(NumChannelsOnHeap(num_capture_channels_)),
e_old_(num_capture_channels_, {0.f}),
y_old_(num_capture_channels_, {0.f}),
e_heap_(NumChannelsOnHeap(num_capture_channels_), {0.f}),
Y2_heap_(NumChannelsOnHeap(num_capture_channels_)),
E2_heap_(NumChannelsOnHeap(num_capture_channels_)),
R2_heap_(NumChannelsOnHeap(num_capture_channels_)),
@ -216,16 +219,6 @@ EchoRemoverImpl::EchoRemoverImpl(const EchoCanceller3Config& config,
high_band_comfort_noise_heap_(NumChannelsOnHeap(num_capture_channels_)),
subtractor_output_heap_(NumChannelsOnHeap(num_capture_channels_)) {
RTC_DCHECK(ValidFullBandRate(sample_rate_hz));
for (auto& e_k : e_heap_) {
e_k.fill(0.f);
}
for (size_t ch = 0; ch < num_capture_channels_; ++ch) {
suppression_gains_[ch] = std::make_unique<SuppressionGain>(
config_, optimization_, sample_rate_hz);
e_old_[ch].fill(0.f);
y_old_[ch].fill(0.f);
}
}
EchoRemoverImpl::~EchoRemoverImpl() = default;
@ -343,9 +336,7 @@ void EchoRemoverImpl::ProcessCapture(
if (echo_path_variability.delay_change !=
EchoPathVariability::DelayAdjustment::kNone) {
for (size_t ch = 0; ch < num_capture_channels_; ++ch) {
suppression_gains_[ch]->SetInitialState(true);
}
suppression_gain_.SetInitialState(true);
}
}
if (gain_change_hangover_ > 0) {
@ -359,9 +350,7 @@ void EchoRemoverImpl::ProcessCapture(
// State transition.
if (aec_state_.TransitionTriggered()) {
subtractor_.ExitInitialState();
for (size_t ch = 0; ch < num_capture_channels_; ++ch) {
suppression_gains_[ch]->SetInitialState(false);
}
suppression_gain_.SetInitialState(false);
}
// Perform linear echo cancellation.
@ -390,10 +379,6 @@ void EchoRemoverImpl::ProcessCapture(
1);
data_dumper_->DumpWav("aec3_output_linear2", kBlockSize, &e[0][0], 16000, 1);
float high_bands_gain = 1.f;
std::array<float, kFftLengthBy2Plus1> G;
G.fill(1.f);
// Estimate the residual echo power.
residual_echo_estimator_.Estimate(aec_state_, *render_buffer, S2_linear, Y2,
R2);
@ -402,34 +387,27 @@ void EchoRemoverImpl::ProcessCapture(
cng_.Compute(aec_state_.SaturatedCapture(), Y2, comfort_noise,
high_band_comfort_noise);
for (size_t ch = 0; ch < num_capture_channels_; ++ch) {
// Suppressor echo estimate.
const auto& echo_spectrum =
aec_state_.UsableLinearEstimate() ? S2_linear[ch] : R2[ch];
// Suppressor nearend estimate.
std::array<float, kFftLengthBy2Plus1> nearend_spectrum_bounded;
if (aec_state_.UsableLinearEstimate()) {
// Suppressor nearend estimate.
if (aec_state_.UsableLinearEstimate()) {
// E2 is bound by Y2.
for (size_t ch = 0; ch < num_capture_channels_; ++ch) {
std::transform(E2[ch].begin(), E2[ch].end(), Y2[ch].begin(),
nearend_spectrum_bounded.begin(),
E2[ch].begin(),
[](float a, float b) { return std::min(a, b); });
}
const auto& nearend_spectrum =
aec_state_.UsableLinearEstimate() ? nearend_spectrum_bounded : Y2[ch];
// Compute preferred gains for each channel. The minimum gain determines the
// final gain.
float high_bands_gain_channel;
std::array<float, kFftLengthBy2Plus1> G_channel;
suppression_gains_[ch]->GetGain(nearend_spectrum, echo_spectrum, R2[ch],
cng_.NoiseSpectrum()[ch],
render_signal_analyzer_, aec_state_, x,
&high_bands_gain_channel, &G_channel);
high_bands_gain = std::min(high_bands_gain, high_bands_gain_channel);
std::transform(G.begin(), G.end(), G_channel.begin(), G.begin(),
[](float a, float b) { return std::min(a, b); });
}
const auto& nearend_spectrum = aec_state_.UsableLinearEstimate() ? E2 : Y2;
// Suppressor echo estimate.
const auto& echo_spectrum =
aec_state_.UsableLinearEstimate() ? S2_linear : R2;
// Compute preferred gains.
float high_bands_gain;
std::array<float, kFftLengthBy2Plus1> G;
suppression_gain_.GetGain(nearend_spectrum, echo_spectrum, R2,
cng_.NoiseSpectrum(), render_signal_analyzer_,
aec_state_, x, &high_bands_gain, &G);
suppression_filter_.ApplyGain(comfort_noise, high_band_comfort_noise, G,
high_bands_gain, Y_fft, y);

View File

@ -25,8 +25,10 @@
namespace webrtc {
namespace {
// Adjust the gains according to the presence of known external filters.
void AdjustForExternalFilters(std::array<float, kFftLengthBy2Plus1>* gain) {
void PostprocessGains(std::array<float, kFftLengthBy2Plus1>* gain) {
// TODO(gustaf): Investigate if this can be relaxed to achieve higher
// transparency above 2 kHz.
// Limit the low frequency gains to avoid the impact of the high-pass filter
// on the lower-frequency gain influencing the overall achieved gain.
(*gain)[0] = (*gain)[1] = std::min((*gain)[1], (*gain)[2]);
@ -41,6 +43,21 @@ void AdjustForExternalFilters(std::array<float, kFftLengthBy2Plus1>* gain) {
gain->begin() + kAntiAliasingImpactLimit, gain->end() - 1,
[min_upper_gain](float& a) { a = std::min(a, min_upper_gain); });
(*gain)[kFftLengthBy2] = (*gain)[kFftLengthBy2Minus1];
// Limits the gain in the frequencies for which the adaptive filter has not
// converged.
// TODO(peah): Make adaptive to take the actual filter error into account.
constexpr size_t kUpperAccurateBandPlus1 = 29;
constexpr float oneByBandsInSum =
1 / static_cast<float>(kUpperAccurateBandPlus1 - 20);
const float hf_gain_bound =
std::accumulate(gain->begin() + 20,
gain->begin() + kUpperAccurateBandPlus1, 0.f) *
oneByBandsInSum;
std::for_each(gain->begin() + kUpperAccurateBandPlus1, gain->end(),
[hf_gain_bound](float& a) { a = std::min(a, hf_gain_bound); });
}
// Scales the echo according to assessed audibility at the other end.
@ -79,33 +96,14 @@ void WeightEchoForAudibility(const EchoCanceller3Config& config,
weigh(threshold, normalizer, 7, kFftLengthBy2Plus1, echo, weighted_echo);
}
// TODO(peah): Make adaptive to take the actual filter error into account.
constexpr size_t kUpperAccurateBandPlus1 = 29;
// Limits the gain in the frequencies for which the adaptive filter has not
// converged. Currently, these frequencies are not hardcoded to the frequencies
// which are typically not excited by speech.
// TODO(peah): Make adaptive to take the actual filter error into account.
void AdjustNonConvergedFrequencies(
std::array<float, kFftLengthBy2Plus1>* gain) {
constexpr float oneByBandsInSum =
1 / static_cast<float>(kUpperAccurateBandPlus1 - 20);
const float hf_gain_bound =
std::accumulate(gain->begin() + 20,
gain->begin() + kUpperAccurateBandPlus1, 0.f) *
oneByBandsInSum;
std::for_each(gain->begin() + kUpperAccurateBandPlus1, gain->end(),
[hf_gain_bound](float& a) { a = std::min(a, hf_gain_bound); });
}
} // namespace
int SuppressionGain::instance_count_ = 0;
float SuppressionGain::UpperBandsGain(
const std::array<float, kFftLengthBy2Plus1>& echo_spectrum,
const std::array<float, kFftLengthBy2Plus1>& comfort_noise_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> echo_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
comfort_noise_spectrum,
const absl::optional<int>& narrow_peak_band,
bool saturated_echo,
const std::vector<std::vector<std::vector<float>>>& render,
@ -161,18 +159,22 @@ float SuppressionGain::UpperBandsGain(
anti_howling_gain = 0.01f * sqrtf(low_band_energy / high_band_energy);
}
// Bound the upper gain during significant echo activity.
auto low_frequency_energy = [](rtc::ArrayView<const float> spectrum) {
RTC_DCHECK_LE(16, spectrum.size());
return std::accumulate(spectrum.begin() + 1, spectrum.begin() + 16, 0.f);
};
const float echo_sum = low_frequency_energy(echo_spectrum);
const float noise_sum = low_frequency_energy(comfort_noise_spectrum);
const auto& cfg = config_.suppressor.high_bands_suppression;
float gain_bound = 1.f;
if (echo_sum > cfg.enr_threshold * noise_sum &&
!dominant_nearend_detector_.IsNearendState()) {
gain_bound = cfg.max_gain_during_echo;
if (!dominant_nearend_detector_.IsNearendState()) {
// Bound the upper gain during significant echo activity.
const auto& cfg = config_.suppressor.high_bands_suppression;
auto low_frequency_energy = [](rtc::ArrayView<const float> spectrum) {
RTC_DCHECK_LE(16, spectrum.size());
return std::accumulate(spectrum.begin() + 1, spectrum.begin() + 16, 0.f);
};
for (size_t ch = 0; ch < num_capture_channels_; ++ch) {
const float echo_sum = low_frequency_energy(echo_spectrum[ch]);
const float noise_sum = low_frequency_energy(comfort_noise_spectrum[ch]);
if (echo_sum > cfg.enr_threshold * noise_sum) {
gain_bound = cfg.max_gain_during_echo;
break;
}
}
}
// Choose the gain as the minimum of the lower and upper gains.
@ -184,8 +186,6 @@ void SuppressionGain::GainToNoAudibleEcho(
const std::array<float, kFftLengthBy2Plus1>& nearend,
const std::array<float, kFftLengthBy2Plus1>& echo,
const std::array<float, kFftLengthBy2Plus1>& masker,
const std::array<float, kFftLengthBy2Plus1>& min_gain,
const std::array<float, kFftLengthBy2Plus1>& max_gain,
std::array<float, kFftLengthBy2Plus1>* gain) const {
const auto& p = dominant_nearend_detector_.IsNearendState() ? nearend_params_
: normal_params_;
@ -198,7 +198,7 @@ void SuppressionGain::GainToNoAudibleEcho(
(p.enr_suppress_[k] - p.enr_transparent_[k]);
g = std::max(g, p.emr_transparent_[k] / emr);
}
(*gain)[k] = std::max(std::min(g, max_gain[k]), min_gain[k]);
(*gain)[k] = g;
}
}
@ -206,6 +206,8 @@ void SuppressionGain::GainToNoAudibleEcho(
// above the zero sample values.
void SuppressionGain::GetMinGain(
rtc::ArrayView<const float> weighted_residual_echo,
rtc::ArrayView<const float> last_nearend,
rtc::ArrayView<const float> last_echo,
bool low_noise_render,
bool saturated_echo,
rtc::ArrayView<float> min_gain) const {
@ -227,7 +229,7 @@ void SuppressionGain::GetMinGain(
// Make sure the gains of the low frequencies do not decrease too
// quickly after strong nearend.
if (last_nearend_[k] > last_echo_[k]) {
if (last_nearend[k] > last_echo[k]) {
min_gain[k] = std::max(min_gain[k], last_gain_[k] * dec);
min_gain[k] = std::min(min_gain[k], 1.f);
}
@ -249,79 +251,91 @@ void SuppressionGain::GetMaxGain(rtc::ArrayView<float> max_gain) const {
}
}
// TODO(peah): Add further optimizations, in particular for the divisions.
void SuppressionGain::LowerBandGain(
bool low_noise_render,
const AecState& aec_state,
const std::array<float, kFftLengthBy2Plus1>& suppressor_input,
const std::array<float, kFftLengthBy2Plus1>& nearend,
const std::array<float, kFftLengthBy2Plus1>& residual_echo,
const std::array<float, kFftLengthBy2Plus1>& comfort_noise,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
suppressor_input,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> residual_echo,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> comfort_noise,
std::array<float, kFftLengthBy2Plus1>* gain) {
gain->fill(1.f);
const bool saturated_echo = aec_state.SaturatedEcho();
// Weight echo power in terms of audibility. // Precompute 1/weighted echo
// (note that when the echo is zero, the precomputed value is never used).
std::array<float, kFftLengthBy2Plus1> weighted_residual_echo;
WeightEchoForAudibility(config_, residual_echo, weighted_residual_echo);
std::array<float, kFftLengthBy2Plus1> min_gain;
GetMinGain(weighted_residual_echo, low_noise_render, saturated_echo,
min_gain);
std::array<float, kFftLengthBy2Plus1> max_gain;
GetMaxGain(max_gain);
GainToNoAudibleEcho(nearend, weighted_residual_echo, comfort_noise, min_gain,
max_gain, gain);
AdjustForExternalFilters(gain);
for (size_t ch = 0; ch < num_capture_channels_; ++ch) {
std::array<float, kFftLengthBy2Plus1> G;
std::array<float, kFftLengthBy2Plus1> nearend;
nearend_smoothers_[ch].Average(suppressor_input[ch], nearend);
// Adjust the gain for frequencies which have not yet converged.
AdjustNonConvergedFrequencies(gain);
// Weight echo power in terms of audibility.
std::array<float, kFftLengthBy2Plus1> weighted_residual_echo;
WeightEchoForAudibility(config_, residual_echo[ch], weighted_residual_echo);
// Store data required for the gain computation of the next block.
std::copy(nearend.begin(), nearend.end(), last_nearend_.begin());
std::copy(weighted_residual_echo.begin(), weighted_residual_echo.end(),
last_echo_.begin());
std::array<float, kFftLengthBy2Plus1> min_gain;
GetMinGain(weighted_residual_echo, last_nearend_[ch], last_echo_[ch],
low_noise_render, saturated_echo, min_gain);
GainToNoAudibleEcho(nearend, weighted_residual_echo, comfort_noise[0], &G);
// Clamp gains.
for (size_t k = 0; k < gain->size(); ++k) {
G[k] = std::max(std::min(G[k], max_gain[k]), min_gain[k]);
(*gain)[k] = std::min((*gain)[k], G[k]);
}
// Store data required for the gain computation of the next block.
std::copy(nearend.begin(), nearend.end(), last_nearend_[ch].begin());
std::copy(weighted_residual_echo.begin(), weighted_residual_echo.end(),
last_echo_[ch].begin());
}
// Limit high-frequency gains.
PostprocessGains(gain);
// Store computed gains.
std::copy(gain->begin(), gain->end(), last_gain_.begin());
aec3::VectorMath(optimization_).Sqrt(*gain);
// Debug outputs for the purpose of development and analysis.
data_dumper_->DumpRaw("aec3_suppressor_min_gain", min_gain);
data_dumper_->DumpRaw("aec3_suppressor_max_gain", max_gain);
data_dumper_->DumpRaw("aec3_dominant_nearend",
dominant_nearend_detector_.IsNearendState());
// Transform gains to amplitude domain.
aec3::VectorMath(optimization_).Sqrt(*gain);
}
SuppressionGain::SuppressionGain(const EchoCanceller3Config& config,
Aec3Optimization optimization,
int sample_rate_hz)
int sample_rate_hz,
size_t num_capture_channels)
: data_dumper_(
new ApmDataDumper(rtc::AtomicOps::Increment(&instance_count_))),
optimization_(optimization),
config_(config),
num_capture_channels_(num_capture_channels),
state_change_duration_blocks_(
static_cast<int>(config_.filter.config_change_duration_blocks)),
moving_average_(kFftLengthBy2Plus1,
config.suppressor.nearend_average_blocks),
last_nearend_(num_capture_channels_, {0}),
last_echo_(num_capture_channels_, {0}),
nearend_smoothers_(
num_capture_channels_,
aec3::MovingAverage(kFftLengthBy2Plus1,
config.suppressor.nearend_average_blocks)),
nearend_params_(config_.suppressor.nearend_tuning),
normal_params_(config_.suppressor.normal_tuning),
dominant_nearend_detector_(
config_.suppressor.dominant_nearend_detection) {
dominant_nearend_detector_(config_.suppressor.dominant_nearend_detection,
num_capture_channels_) {
RTC_DCHECK_LT(0, state_change_duration_blocks_);
one_by_state_change_duration_blocks_ = 1.f / state_change_duration_blocks_;
last_gain_.fill(1.f);
last_nearend_.fill(0.f);
last_echo_.fill(0.f);
}
SuppressionGain::~SuppressionGain() = default;
void SuppressionGain::GetGain(
const std::array<float, kFftLengthBy2Plus1>& nearend_spectrum,
const std::array<float, kFftLengthBy2Plus1>& echo_spectrum,
const std::array<float, kFftLengthBy2Plus1>& residual_echo_spectrum,
const std::array<float, kFftLengthBy2Plus1>& comfort_noise_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
nearend_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> echo_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
residual_echo_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
comfort_noise_spectrum,
const RenderSignalAnalyzer& render_signal_analyzer,
const AecState& aec_state,
const std::vector<std::vector<std::vector<float>>>& render,
@ -337,18 +351,20 @@ void SuppressionGain::GetGain(
return;
}
std::array<float, kFftLengthBy2Plus1> nearend_average;
moving_average_.Average(nearend_spectrum, nearend_average);
// Update the state selection.
// Update the nearend state selection.
dominant_nearend_detector_.Update(nearend_spectrum, residual_echo_spectrum,
comfort_noise_spectrum, initial_state_);
// Compute gain for the lower band.
bool low_noise_render = low_render_detector_.Detect(render);
LowerBandGain(low_noise_render, aec_state, nearend_spectrum, nearend_average,
LowerBandGain(low_noise_render, aec_state, nearend_spectrum,
residual_echo_spectrum, comfort_noise_spectrum, low_band_gain);
if (cfg.enforce_empty_higher_bands) {
*high_bands_gain = 0.f;
return;
}
// Compute the gain for the upper bands.
const absl::optional<int> narrow_peak_band =
render_signal_analyzer.NarrowPeakBand();
@ -356,9 +372,6 @@ void SuppressionGain::GetGain(
*high_bands_gain =
UpperBandsGain(echo_spectrum, comfort_noise_spectrum, narrow_peak_band,
aec_state.SaturatedEcho(), render, *low_band_gain);
if (cfg.enforce_empty_higher_bands) {
*high_bands_gain = 0.f;
}
}
void SuppressionGain::SetInitialState(bool state) {
@ -394,54 +407,6 @@ bool SuppressionGain::LowNoiseRenderDetector::Detect(
return low_noise_render;
}
SuppressionGain::DominantNearendDetector::DominantNearendDetector(
const EchoCanceller3Config::Suppressor::DominantNearendDetection config)
: enr_threshold_(config.enr_threshold),
enr_exit_threshold_(config.enr_exit_threshold),
snr_threshold_(config.snr_threshold),
hold_duration_(config.hold_duration),
trigger_threshold_(config.trigger_threshold),
use_during_initial_phase_(config.use_during_initial_phase) {}
void SuppressionGain::DominantNearendDetector::Update(
rtc::ArrayView<const float> nearend_spectrum,
rtc::ArrayView<const float> residual_echo_spectrum,
rtc::ArrayView<const float> comfort_noise_spectrum,
bool initial_state) {
auto low_frequency_energy = [](rtc::ArrayView<const float> spectrum) {
RTC_DCHECK_LE(16, spectrum.size());
return std::accumulate(spectrum.begin() + 1, spectrum.begin() + 16, 0.f);
};
const float ne_sum = low_frequency_energy(nearend_spectrum);
const float echo_sum = low_frequency_energy(residual_echo_spectrum);
const float noise_sum = low_frequency_energy(comfort_noise_spectrum);
// Detect strong active nearend if the nearend is sufficiently stronger than
// the echo and the nearend noise.
if ((!initial_state || use_during_initial_phase_) &&
echo_sum < enr_threshold_ * ne_sum &&
ne_sum > snr_threshold_ * noise_sum) {
if (++trigger_counter_ >= trigger_threshold_) {
// After a period of strong active nearend activity, flag nearend mode.
hold_counter_ = hold_duration_;
trigger_counter_ = trigger_threshold_;
}
} else {
// Forget previously detected strong active nearend activity.
trigger_counter_ = std::max(0, trigger_counter_ - 1);
}
// Exit nearend-state early at strong echo.
if (echo_sum > enr_exit_threshold_ * ne_sum &&
echo_sum > snr_threshold_ * noise_sum) {
hold_counter_ = 0;
}
// Remain in any nearend mode for a certain duration.
hold_counter_ = std::max(0, hold_counter_ - 1);
nearend_state_ = hold_counter_ > 0;
}
SuppressionGain::GainParameters::GainParameters(
const EchoCanceller3Config::Suppressor::Tuning& tuning)
: max_inc_factor(tuning.max_inc_factor),

View File

@ -20,6 +20,7 @@
#include "api/audio/echo_canceller3_config.h"
#include "modules/audio_processing/aec3/aec3_common.h"
#include "modules/audio_processing/aec3/aec_state.h"
#include "modules/audio_processing/aec3/dominant_nearend_detector.h"
#include "modules/audio_processing/aec3/fft_data.h"
#include "modules/audio_processing/aec3/moving_average.h"
#include "modules/audio_processing/aec3/render_signal_analyzer.h"
@ -32,13 +33,17 @@ class SuppressionGain {
public:
SuppressionGain(const EchoCanceller3Config& config,
Aec3Optimization optimization,
int sample_rate_hz);
int sample_rate_hz,
size_t num_capture_channels);
~SuppressionGain();
void GetGain(
const std::array<float, kFftLengthBy2Plus1>& nearend_spectrum,
const std::array<float, kFftLengthBy2Plus1>& echo_spectrum,
const std::array<float, kFftLengthBy2Plus1>& residual_echo_spectrum,
const std::array<float, kFftLengthBy2Plus1>& comfort_noise_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
nearend_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> echo_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
residual_echo_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
comfort_noise_spectrum,
const RenderSignalAnalyzer& render_signal_analyzer,
const AecState& aec_state,
const std::vector<std::vector<std::vector<float>>>& render,
@ -51,31 +56,31 @@ class SuppressionGain {
private:
// Computes the gain to apply for the bands beyond the first band.
float UpperBandsGain(
const std::array<float, kFftLengthBy2Plus1>& echo_spectrum,
const std::array<float, kFftLengthBy2Plus1>& comfort_noise_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> echo_spectrum,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
comfort_noise_spectrum,
const absl::optional<int>& narrow_peak_band,
bool saturated_echo,
const std::vector<std::vector<std::vector<float>>>& render,
const std::array<float, kFftLengthBy2Plus1>& low_band_gain) const;
void GainToNoAudibleEcho(
const std::array<float, kFftLengthBy2Plus1>& nearend,
const std::array<float, kFftLengthBy2Plus1>& echo,
const std::array<float, kFftLengthBy2Plus1>& masker,
const std::array<float, kFftLengthBy2Plus1>& min_gain,
const std::array<float, kFftLengthBy2Plus1>& max_gain,
std::array<float, kFftLengthBy2Plus1>* gain) const;
void GainToNoAudibleEcho(const std::array<float, kFftLengthBy2Plus1>& nearend,
const std::array<float, kFftLengthBy2Plus1>& echo,
const std::array<float, kFftLengthBy2Plus1>& masker,
std::array<float, kFftLengthBy2Plus1>* gain) const;
void LowerBandGain(
bool stationary_with_low_power,
const AecState& aec_state,
const std::array<float, kFftLengthBy2Plus1>& suppressor_input,
const std::array<float, kFftLengthBy2Plus1>& nearend,
const std::array<float, kFftLengthBy2Plus1>& residual_echo,
const std::array<float, kFftLengthBy2Plus1>& comfort_noise,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>>
suppressor_input,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> residual_echo,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> comfort_noise,
std::array<float, kFftLengthBy2Plus1>* gain);
void GetMinGain(rtc::ArrayView<const float> weighted_residual_echo,
rtc::ArrayView<const float> last_nearend,
rtc::ArrayView<const float> last_echo,
bool low_noise_render,
bool saturated_echo,
rtc::ArrayView<float> min_gain) const;
@ -90,35 +95,6 @@ class SuppressionGain {
float average_power_ = 32768.f * 32768.f;
};
// Class for selecting whether the suppressor is in the nearend or echo state.
class DominantNearendDetector {
public:
explicit DominantNearendDetector(
const EchoCanceller3Config::Suppressor::DominantNearendDetection
config);
// Returns whether the current state is the nearend state.
bool IsNearendState() const { return nearend_state_; }
// Updates the state selection based on latest spectral estimates.
void Update(rtc::ArrayView<const float> nearend_spectrum,
rtc::ArrayView<const float> residual_echo_spectrum,
rtc::ArrayView<const float> comfort_noise_spectrum,
bool initial_state);
private:
const float enr_threshold_;
const float enr_exit_threshold_;
const float snr_threshold_;
const int hold_duration_;
const int trigger_threshold_;
const bool use_during_initial_phase_;
bool nearend_state_ = false;
int trigger_counter_ = 0;
int hold_counter_ = 0;
};
struct GainParameters {
explicit GainParameters(
const EchoCanceller3Config::Suppressor::Tuning& tuning);
@ -133,15 +109,15 @@ class SuppressionGain {
std::unique_ptr<ApmDataDumper> data_dumper_;
const Aec3Optimization optimization_;
const EchoCanceller3Config config_;
const size_t num_capture_channels_;
const int state_change_duration_blocks_;
float one_by_state_change_duration_blocks_;
std::array<float, kFftLengthBy2Plus1> last_gain_;
std::array<float, kFftLengthBy2Plus1> last_nearend_;
std::array<float, kFftLengthBy2Plus1> last_echo_;
std::vector<std::array<float, kFftLengthBy2Plus1>> last_nearend_;
std::vector<std::array<float, kFftLengthBy2Plus1>> last_echo_;
LowNoiseRenderDetector low_render_detector_;
bool initial_state_ = true;
int initial_state_change_counter_ = 0;
aec3::MovingAverage moving_average_;
std::vector<aec3::MovingAverage> nearend_smoothers_;
const GainParameters nearend_params_;
const GainParameters normal_params_;
DominantNearendDetector dominant_nearend_detector_;

View File

@ -26,16 +26,15 @@ namespace aec3 {
// Verifies that the check for non-null output gains works.
TEST(SuppressionGain, NullOutputGains) {
std::array<float, kFftLengthBy2Plus1> E2;
std::array<float, kFftLengthBy2Plus1> R2;
std::array<float, kFftLengthBy2Plus1> S2;
std::array<float, kFftLengthBy2Plus1> N2;
std::vector<std::array<float, kFftLengthBy2Plus1>> E2(1, {0.f});
std::vector<std::array<float, kFftLengthBy2Plus1>> R2(1, {0.f});
std::vector<std::array<float, kFftLengthBy2Plus1>> S2(1);
std::vector<std::array<float, kFftLengthBy2Plus1>> N2(1, {0.f});
for (auto& S2_k : S2) {
S2_k.fill(.1f);
}
FftData E;
FftData Y;
E2.fill(0.f);
R2.fill(0.f);
S2.fill(0.1f);
N2.fill(0.f);
E.re.fill(0.f);
E.im.fill(0.f);
Y.re.fill(0.f);
@ -44,7 +43,7 @@ TEST(SuppressionGain, NullOutputGains) {
float high_bands_gain;
AecState aec_state(EchoCanceller3Config{}, 1);
EXPECT_DEATH(
SuppressionGain(EchoCanceller3Config{}, DetectOptimization(), 16000)
SuppressionGain(EchoCanceller3Config{}, DetectOptimization(), 16000, 1)
.GetGain(E2, S2, R2, N2,
RenderSignalAnalyzer((EchoCanceller3Config{})), aec_state,
std::vector<std::vector<std::vector<float>>>(
@ -59,46 +58,43 @@ TEST(SuppressionGain, NullOutputGains) {
// Does a sanity check that the gains are correctly computed.
TEST(SuppressionGain, BasicGainComputation) {
constexpr size_t kNumRenderChannels = 1;
constexpr size_t kNumCaptureChannels = 1;
constexpr size_t kNumCaptureChannels = 2;
constexpr int kSampleRateHz = 16000;
constexpr size_t kNumBands = NumBandsForRate(kSampleRateHz);
SuppressionGain suppression_gain(EchoCanceller3Config(), DetectOptimization(),
kSampleRateHz);
kSampleRateHz, kNumCaptureChannels);
RenderSignalAnalyzer analyzer(EchoCanceller3Config{});
float high_bands_gain;
std::vector<std::array<float, kFftLengthBy2Plus1>> E2(kNumCaptureChannels);
std::array<float, kFftLengthBy2Plus1> S2;
std::vector<std::array<float, kFftLengthBy2Plus1>> S2(kNumCaptureChannels,
{0.f});
std::vector<std::array<float, kFftLengthBy2Plus1>> Y2(kNumCaptureChannels);
std::array<float, kFftLengthBy2Plus1> R2;
std::array<float, kFftLengthBy2Plus1> N2;
std::vector<std::array<float, kFftLengthBy2Plus1>> R2(kNumCaptureChannels);
std::vector<std::array<float, kFftLengthBy2Plus1>> N2(kNumCaptureChannels);
std::array<float, kFftLengthBy2Plus1> g;
std::vector<SubtractorOutput> output(kNumCaptureChannels);
std::array<float, kBlockSize> y;
std::vector<std::vector<std::vector<float>>> x(
kNumBands, std::vector<std::vector<float>>(
kNumRenderChannels, std::vector<float>(kBlockSize, 0.f)));
EchoCanceller3Config config;
AecState aec_state(config, kNumCaptureChannels);
ApmDataDumper data_dumper(42);
Subtractor subtractor(config, 1, 1, &data_dumper, DetectOptimization());
Subtractor subtractor(config, kNumRenderChannels, kNumCaptureChannels,
&data_dumper, DetectOptimization());
std::unique_ptr<RenderDelayBuffer> render_delay_buffer(
RenderDelayBuffer::Create(config, kSampleRateHz, kNumRenderChannels));
absl::optional<DelayEstimate> delay_estimate;
// Ensure that a strong noise is detected to mask any echoes.
for (auto& E2_k : E2) {
E2_k.fill(10.f);
for (size_t ch = 0; ch < kNumCaptureChannels; ++ch) {
E2[ch].fill(10.f);
Y2[ch].fill(10.f);
R2[ch].fill(.1f);
N2[ch].fill(100.f);
}
for (auto& Y2_k : Y2) {
Y2_k.fill(10.f);
}
R2.fill(0.1f);
S2.fill(0.1f);
N2.fill(100.f);
for (auto& subtractor_output : output) {
subtractor_output.Reset();
}
y.fill(0.f);
// Ensure that the gain is no longer forced to zero.
for (int k = 0; k <= kNumBlocksPerSecond / 5 + 1; ++k) {
@ -111,41 +107,37 @@ TEST(SuppressionGain, BasicGainComputation) {
aec_state.Update(delay_estimate, subtractor.FilterFrequencyResponses(),
subtractor.FilterImpulseResponses(),
*render_delay_buffer->GetRenderBuffer(), E2, Y2, output);
suppression_gain.GetGain(E2[0], S2, R2, N2, analyzer, aec_state, x,
suppression_gain.GetGain(E2, S2, R2, N2, analyzer, aec_state, x,
&high_bands_gain, &g);
}
std::for_each(g.begin(), g.end(),
[](float a) { EXPECT_NEAR(1.f, a, 0.001); });
// Ensure that a strong nearend is detected to mask any echoes.
for (auto& E2_k : E2) {
E2_k.fill(100.f);
for (size_t ch = 0; ch < kNumCaptureChannels; ++ch) {
E2[ch].fill(100.f);
Y2[ch].fill(100.f);
R2[ch].fill(0.1f);
S2[ch].fill(0.1f);
N2[ch].fill(0.f);
}
for (auto& Y2_k : Y2) {
Y2_k.fill(100.f);
}
R2.fill(0.1f);
S2.fill(0.1f);
N2.fill(0.f);
for (int k = 0; k < 100; ++k) {
aec_state.Update(delay_estimate, subtractor.FilterFrequencyResponses(),
subtractor.FilterImpulseResponses(),
*render_delay_buffer->GetRenderBuffer(), E2, Y2, output);
suppression_gain.GetGain(E2[0], S2, R2, N2, analyzer, aec_state, x,
suppression_gain.GetGain(E2, S2, R2, N2, analyzer, aec_state, x,
&high_bands_gain, &g);
}
std::for_each(g.begin(), g.end(),
[](float a) { EXPECT_NEAR(1.f, a, 0.001); });
// Ensure that a strong echo is suppressed.
for (auto& E2_k : E2) {
E2_k.fill(1000000000.f);
}
R2.fill(10000000000000.f);
// Add a strong echo to one of the channels and ensure that it is suppressed.
E2[1].fill(1000000000.f);
R2[1].fill(10000000000000.f);
for (int k = 0; k < 10; ++k) {
suppression_gain.GetGain(E2[0], S2, R2, N2, analyzer, aec_state, x,
suppression_gain.GetGain(E2, S2, R2, N2, analyzer, aec_state, x,
&high_bands_gain, &g);
}
std::for_each(g.begin(), g.end(),