% Journal article record exported from refbase (see optnote for provenance).
% The opt* fields are not standard BibTeX fields; each is kept as a single
% field because repeated field names inside one entry are not retained
% reliably by BibTeX tooling.
@Article{JoakimBruslundHaurum2022,
  author      = {Haurum, Joakim Bruslund and Madadi, Meysam and Escalera, Sergio and Moeslund, Thomas B.},
  title       = {Multi-scale hybrid vision transformer and {Sinkhorn} tokenizer for sewer defect classification},
  journal     = {Automation in Construction},
  year        = {2022},
  volume      = {144},
  pages       = {104614},
  doi         = {10.1016/j.autcon.2022.104614},
  abstract    = {A crucial part of image classification consists of capturing non-local spatial semantics of image content. This paper describes the multi-scale hybrid vision transformer (MSHViT), an extension of the classical convolutional neural network (CNN) backbone, for multi-label sewer defect classification. To better model spatial semantics in the images, features are aggregated at different scales non-locally through the use of a lightweight vision transformer, and a smaller set of tokens was produced through a novel Sinkhorn clustering-based tokenizer using distinct cluster centers. The proposed MSHViT and Sinkhorn tokenizer were evaluated on the Sewer-ML multi-label sewer defect classification dataset, showing consistent performance improvements of up to 2.53 percentage points.},
  optkeywords = {Sewer Defect Classification, Vision Transformers, Sinkhorn-Knopp, Convolutional Neural Networks, Closed-Circuit Television, Sewer Inspection},
  optnote     = {HuPBA; exported from refbase (http://refbase.cvc.uab.es/show.php?record=3780), last updated on Tue, 25 Apr 2023 15:26:25 +0200},
}