% Lin and Bertasius, ECCV 2024 conference paper (cite key: AVSIAM_ECCV24).
% Braces around "Siamese" and "ECCV" preserve their capitalization under
% bibliography styles that convert titles to sentence case.
@InProceedings{AVSIAM_ECCV24,
  author    = {Lin, Yan-Bo and Bertasius, Gedas},
  title     = {{Siamese} Vision Transformers are Scalable Audio-visual Learners},
  booktitle = {Proceedings of the European Conference on Computer Vision ({ECCV})},
  year      = {2024},
}