@Article{FatemehNoroozi2019,
  author   = "Fatemeh Noroozi and Marina Marjanovic and Angelina Njegus and Sergio Escalera and Gholamreza Anbarjafari",
  title    = "Audio-Visual Emotion Recognition in Video Clips",
  journal  = "IEEE Transactions on Affective Computing",
  year     = "2019",
  volume   = "10",
  number   = "1",
  pages    = "60--75",
  abstract = "This paper presents a multimodal emotion recognition system, which is based on the analysis of audio and visual cues. From the audio channel, Mel-Frequency Cepstral Coefficients, Filter Bank Energies and prosodic features are extracted. For the visual part, two strategies are considered. First, facial landmarks' geometric relations, i.e., distances and angles, are computed. Second, we summarize each emotional video into a reduced set of key-frames, which are taught to visually discriminate between the emotions. In order to do so, a convolutional neural network is applied to key-frames summarizing videos. Finally, confidence outputs of all the classifiers from all the modalities are used to define a new feature space to be learned for final emotion label prediction, in a late fusion/stacking fashion. The experiments conducted on the SAVEE, eNTERFACE'05, and RML databases show significant performance improvements by our proposed system in comparison to current alternatives, defining the current state-of-the-art in all three databases.",
  optnote  = "HUPBA; 602.143; 602.133",
  doi      = "10.1109/TAFFC.2017.2713783"
}