@inproceedings{Kaya2012CISS,
abstract = {The auditory system is flooded with information throughout our daily lives. Rather than processing all of this information, we selectively shift our attention to various auditory events - either events of interest (top-down attention) or events that capture our attention exogenously (bottom-up). In this work, we are concerned with aspects of human attention that are bottom-up and stimulus-driven. Saliency of an auditory event is measured by how much the event differs from the surrounding sounds that precede it in time. To calculate this, we propose a novel auditory saliency map that is defined only over time. The proposed model is contrasted against previously published auditory saliency maps, which treat the two-dimensional auditory time-frequency spectrogram as an image that can be analyzed using visual saliency models. Instead, our proposed model capitalizes on the rich high-dimensional feature space that defines auditory events, where each acoustic dimension is processed across multiple scales. These normalized feature maps are then combined over time into a single temporal saliency map. The peaks of the temporal saliency map indicate the locations of the salient events in the auditory scene. We validate the accuracy of the proposed model in simulated test scenarios of simple and complex sound clips. By exploiting the unique aspects of auditory processing that cannot be readily captured by visual processes, we are able to outperform other auditory saliency models, all while highlighting the commonalities and differences between the two modalities in processing salient events in everyday scenes.},
author = {Kaya, Emine Merve and Elhilali, Mounya},
booktitle = {46th Annual Conference on Information Sciences and Systems (CISS)},
doi = {10.1109/CISS.2012.6310945},
isbn = {978-1-4673-3140-1},
pages = {1--6},
title = {{A temporal saliency map for modeling auditory attention}},
url = {http://ieeexplore.ieee.org/document/6310945/},
year = {2012}
}
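
% The abstract above outlines the model's pipeline at a high level: per-feature temporal
% contrast computed at multiple scales, normalized and combined over time into a single
% temporal saliency map whose peaks mark salient events. The Python snippet below is a
% minimal illustrative sketch of that pipeline, not the authors' published implementation;
% the feature set, the moving-average contrast measure, the scales, the normalization,
% and the peak-picking threshold are all assumptions made for demonstration.

# Hypothetical sketch of a purely temporal auditory saliency map in the spirit of the
# abstract above. All modeling choices here (features, contrast measure, scales,
# normalization, peak threshold) are illustrative assumptions, not the paper's method.
import numpy as np
from scipy.signal import find_peaks


def temporal_contrast(feature, scale):
    """Contrast of each frame against the mean of the preceding `scale` frames."""
    padded = np.concatenate([np.full(scale, feature[0]), feature])
    # moving average of the preceding context for every frame
    context = np.convolve(padded, np.ones(scale) / scale, mode="valid")[:-1]
    return np.abs(feature - context)


def temporal_saliency_map(features, scales=(5, 20, 80)):
    """Combine normalized multi-scale contrast maps of several features over time.

    `features` is a dict of 1-D arrays (e.g. loudness, pitch, spectral centroid),
    all sampled on the same frame grid.
    """
    maps = []
    for feat in features.values():
        for s in scales:
            m = temporal_contrast(np.asarray(feat, dtype=float), s)
            rng = m.max() - m.min()
            maps.append((m - m.min()) / rng if rng > 0 else np.zeros_like(m))
    saliency = np.mean(maps, axis=0)          # single temporal saliency map
    peaks, _ = find_peaks(saliency, height=saliency.mean() + saliency.std())
    return saliency, peaks                    # peak frames = candidate salient events


if __name__ == "__main__":
    # toy example: a quiet background with one abrupt loud event
    loudness = 0.1 * np.random.rand(1000)
    loudness[400:420] += 1.0
    sal, events = temporal_saliency_map({"loudness": loudness})
    print("salient event frames:", events[:5])

% The moving-average contrast above is only a stand-in for whatever multi-scale temporal
% filtering the paper actually uses; the key point it illustrates is that saliency is
% computed along time alone rather than over a 2-D time-frequency image.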