% arXiv preprint: the identifier is stored in eprint/archiveprefix rather than
% in a journal field, so eprint-aware styles can format and link it themselves.
@misc{lin2024siamese,
  author        = {Lin, Yan-Bo and Bertasius, Gedas},
  title         = {Siamese Vision Transformers are Scalable Audio-visual Learners},
  year          = {2024},
  eprint        = {2403.19638},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2403.19638},
}