@Article{FatemehNoroozi2019,
  author   = "Fatemeh Noroozi and Marina Marjanovic and Angelina Njegus and Sergio Escalera and Gholamreza Anbarjafari",
  title    = "Audio-Visual Emotion Recognition in Video Clips",
  journal  = "IEEE Transactions on Affective Computing",
  year     = "2019",
  volume   = "10",
  number   = "1",
  pages    = "60--75",
  abstract = "This paper presents a multimodal emotion recognition system, which is based on the analysis of audio and visual cues. From the audio channel, Mel-Frequency Cepstral Coefficients, Filter Bank Energies and prosodic features are extracted. For the visual part, two strategies are considered. First, facial landmarks' geometric relations, i.e., distances and angles, are computed. Second, we summarize each emotional video into a reduced set of key-frames, which are taught to visually discriminate between the emotions. In order to do so, a convolutional neural network is applied to key-frames summarizing videos. Finally, confidence outputs of all the classifiers from all the modalities are used to define a new feature space to be learned for final emotion label prediction, in a late fusion/stacking fashion. The experiments conducted on the SAVEE, eNTERFACE'05, and RML databases show significant performance improvements by our proposed system in comparison to current alternatives, defining the current state-of-the-art in all three databases.",
  optnote  = "HUPBA; 602.143; 602.133",
  doi      = "10.1109/TAFFC.2017.2713783"
}