Source prompt:
machine gun
↓
Target prompt:
laser gun
Input Video
ControlVideo+ZEUS
TokenFlow+ZEUS
RAVE+ZEUS
AvED (Ours)
Source prompt:
a dog is howling
↓
Target prompt:
a lion is roaring
Source prompt:
a race car is driving on the road
↓
Target prompt:
a police car is driving on the road
Source prompt:
firework lit up in the night sky
↓
Target prompt:
a splash of water lit up in the night sky
% Citation entry for the AvED paper (conference paper, WACV 2026).
@inproceedings{lin2026aved,
  author    = {Lin, Yan-Bo and Lin, Kevin and Yang, Zhengyuan and Li, Linjie and Wang, Jianfeng and Lin, Chung-Ching and Wang, Xiaofei and Bertasius, Gedas and Wang, Lijuan},
  title     = {Zero-Shot Audio-Visual Editing via Cross-Modal Delta Denoising},
  booktitle = {2026 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)},
  year      = {2026},
}