% Journal article (Pattern Recognition Letters, vol. 150, 2021) exported from refbase.
% Fields prefixed with "opt" (optnote, opturl) and the "file" field are not standard
% BibTeX fields; they carry refbase bookkeeping data and are presumably ignored by
% standard bibliography styles.
% The two notes of the export (project codes and export provenance) are stored in a
% single optnote value, because BibTeX keeps only one value per field name.
@Article{LluisGomez2021,
  author   = {Gomez, Lluis
              and Biten, Ali Furkan
              and Tito, Ruben
              and Mafla, Andres
              and Rusi{\~n}ol, Mar{\c{c}}al
              and Valveny, Ernest
              and Karatzas, Dimosthenis},
  title    = {Multimodal grid features and cell pointers for scene text visual question answering},
  journal  = {Pattern Recognition Letters},
  year     = {2021},
  volume   = {150},
  pages    = {242--249},
  abstract = {This paper presents a new model for the task of scene text visual question answering. In this task questions about a given image can only be answered by reading and understanding scene text. Current state of the art models for this task make use of a dual attention mechanism in which one attention module attends to visual features while the other attends to textual features. A possible issue with this is that it makes difficult for the model to reason jointly about both modalities. To fix this problem we propose a new model that is based on an single attention mechanism that attends to multi-modal features conditioned to the question. The output weights of this attention module over a grid of multi-modal spatial features are interpreted as the probability that a certain spatial location of the image contains the answer text to the given question. Our experiments demonstrate competitive performance in two standard datasets with a model that is  faster than previous methods at inference time. Furthermore, we also provide a novel analysis of the ST-VQA dataset based on a human performance study. Supplementary material, code, and data is made available through this link.},
  optnote  = {DAG; 600.084; 600.121; exported from refbase (http://refbase.cvc.uab.es/show.php?record=3620), last updated on Fri, 28 Jan 2022 10:29:23 +0100},
  opturl   = {https://www.sciencedirect.com/science/article/pii/S0167865521002336?via\%3Dihub},
  file     = {:http://refbase.cvc.uab.es/files/GBT2021.pdf:PDF},
}