@InProceedings{KaiWang2022,
  author    = "Kai Wang and Fei Yang and Joost van de Weijer",
  title     = "Attention Distillation: self-supervised vision transformer students need more guidance",
  booktitle = "33rd British Machine Vision Conference",
  year      = "2022",
  abstract  = "Self-supervised learning has been widely applied to train high-quality vision transformers. Unleashing their excellent performance on memory- and compute-constrained devices is therefore an important research topic. However, how to distill knowledge from one self-supervised ViT to another has not yet been explored. Moreover, existing self-supervised knowledge distillation (SSKD) methods, which focus on ConvNet-based architectures, are suboptimal for ViT knowledge distillation. In this paper, we study knowledge distillation of self-supervised vision transformers (ViT-SSKD). We show that directly distilling information from the crucial attention mechanism from teacher to student can significantly narrow the performance gap between the two. In experiments on ImageNet-Subset and ImageNet-1K, we show that our method AttnDistill outperforms existing SSKD methods and achieves state-of-the-art k-NN accuracy compared with self-supervised learning (SSL) methods trained from scratch (with the ViT-S model). We are also the first to apply the tiny ViT-T model to self-supervised learning. Moreover, AttnDistill is independent of the self-supervised learning algorithm and can be adapted to ViT-based SSL methods to improve performance in future research.",
  optnote   = "LAMP; 600.147",
  file      = ":http://refbase.cvc.uab.es/files/WYW2022.pdf:PDF"
}