@InProceedings{XinhangSong2017,
  author      = "Xinhang Song and Shuqiang Jiang and Luis Herranz",
  title       = "Combining Models from Multiple Sources for RGB-D Scene Recognition",
  booktitle   = "26th International Joint Conference on Artificial Intelligence",
  year        = "2017",
  pages       = "4523--4529",
  optkeywords = "Robotics and Vision; Vision and Perception",
  abstract    = "Depth can complement RGB with useful cues about object volumes and scene layout. However, RGB-D image datasets are still too small for directly training deep convolutional neural networks (CNNs), in contrast to the massive monomodal RGB datasets. Previous works on RGB-D recognition typically combine two separate networks for RGB and depth data, pretrained on a large RGB dataset and then fine-tuned to the respective target RGB and depth datasets. These approaches have several limitations: 1) they only use low-level filters learned from RGB data and thus cannot properly exploit depth-specific patterns, and 2) RGB and depth features are only combined at high levels, rarely at lower levels. In this paper, we propose a framework that leverages both knowledge acquired from large RGB datasets and depth-specific cues learned from the limited depth data, obtaining more effective multi-source and multi-modal representations. We propose a multi-modal combination method that selects discriminative combinations of layers from the different source models and target modalities, capturing both high-level properties of the task and intrinsic low-level properties of both modalities.",
  optnote     = "LAMP; 600.120",
  doi         = "10.24963/ijcai.2017/631",
  file        = ":http://refbase.cvc.uab.es/files/SJH2017b.pdf:PDF"
}