@InProceedings{AlejandroCartas2019,
  author    = "Alejandro Cartas and Jordi Luque and Petia Radeva and Carlos Segura and Mariella Dimiccoli",
  title     = "Seeing and Hearing Egocentric Actions: How Much Can We Learn?",
  booktitle = "IEEE/CVF International Conference on Computer Vision Workshops (ICCVW)",
  year      = "2019",
  pages     = "4470--4480",
  abstract  = "Our interaction with the world is an inherently multimodal experience. However, the understanding of human-to-object interactions has historically been addressed by focusing on a single modality. In particular, only a limited number of works have considered integrating the visual and audio modalities for this purpose. In this work, we propose a multimodal approach for egocentric action recognition in a kitchen environment that relies on audio and visual information. Our model combines a sparse temporal sampling strategy with a late fusion of audio, spatial, and temporal streams. Experimental results on the EPIC-Kitchens dataset show that multimodal integration leads to better performance than unimodal approaches. In particular, we achieved a 5.18\% improvement over the state of the art on verb classification.",
  doi       = "10.1109/ICCVW.2019.00548",
  url       = "https://ieeexplore.ieee.org/document/9022020"
}