@article{Bellur2017,
abstract = {Parsing natural acoustic scenes using computational methodologies poses many challenges. Given the rich and complex nature of the acoustic environment, data mismatch between train and test conditions is a major hurdle in data-driven audio processing systems. In contrast, the brain exhibits a remarkable ability at segmenting acoustic scenes with relative ease. When tackling challenging listening conditions that are often faced in everyday life, the biological system relies on a number of principles that allow it to effortlessly parse its rich soundscape. In the current study, we leverage a key principle employed by the auditory system: its ability to adapt the neural representation of its sensory input in a high-dimensional space. We propose a framework that mimics this process in a computational model for robust speech activity detection. The system employs a 2-D Gabor filter bank whose parameters are retuned offline to improve the separability between the feature representation of speech and nonspeech sounds. This retuning process, driven by feedback from statistical models of speech and nonspeech classes, attempts to minimize the misclassification risk of mismatched data, with respect to the original statistical models. We hypothesize that this risk minimization procedure results in an emphasis of unique speech and nonspeech modulations in the high-dimensional space. We show that such an adapted system is indeed robust to other novel conditions, with a marked reduction in equal error rates for a variety of databases with additive and convolutive noise distortions. We discuss the lessons learned from biology with regard to adapting to an ever-changing acoustic environment and the impact on building truly intelligent audio processing systems.},
author = {Bellur, Ashwin and Elhilali, Mounya},
doi = {10.1109/TASLP.2016.2639322},
issn = {2329-9290},
journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
keywords = {Adaptation,Gabor filters,genetic algorithm,spectrotemporal filters,speech activity detection},
number = {3},
pages = {481--492},
title = {{Feedback-Driven Sensory Mapping Adaptation for Robust Speech Activity Detection}},
url = {http://ieeexplore.ieee.org/document/7782359/},
volume = {25},
year = {2017}
}