@Article{SouhailBakkali2023,
  author   = "Souhail Bakkali and Zuheng Ming and Mickael Coustaty and Mar{\c{c}}al Rusi{\~n}ol and Oriol Ramos Terrades",
  title    = "{VLCDoC}: Vision-Language Contrastive Pre-Training Model for Cross-Modal Document Classification",
  journal  = "Pattern Recognition",
  year     = "2023",
  volume   = "139",
  pages    = "109419",
  abstract = "Multimodal learning from document data has achieved great success lately, as it allows semantically meaningful features to be pre-trained and used as a prior in a learnable downstream approach. In this paper, we approach the document classification problem by learning cross-modal representations through language and vision cues, considering intra- and inter-modality relationships. Instead of merging features from different modalities into a common representation space, the proposed method exploits high-level interactions and learns relevant semantic information from effective attention flows within and across modalities. The proposed learning objective is devised between intra- and inter-modality alignment tasks, where the similarity distribution per task is computed by contracting positive sample pairs while simultaneously contrasting negative ones in the common feature representation space. Extensive experiments on public document classification datasets demonstrate the effectiveness and the generalization capacity of our model on both low-scale and large-scale datasets.",
  optnote  = "DAG; 600.140; 600.121",
  issn     = "0031-3203",
  doi      = "10.1016/j.patcog.2023.109419",
  file     = ":http://refbase.cvc.uab.es/files/BMC2022.pdf:PDF"
}