@inproceedings{10.1145/3641519.3657509,
  author = {Bozic, Vukasin and Djelouah, Abdelaziz and Zhang, Yang and Timofte, Radu and Gross, Markus and Schroers, Christopher},
  title = {Versatile Vision Foundation Model for Image and Video Colorization},
  year = {2024},
  isbn = {9798400705250},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3641519.3657509},
  doi = {10.1145/3641519.3657509},
  abstract = {Image and video colorization are among the most common problems in image restoration. This is an ill-posed problem, and a wide variety of methods have been proposed, ranging from traditional computer vision strategies to recent transformer-based or generative neural network models. In this work we show how a latent diffusion model, pre-trained on text-to-image synthesis, can be finetuned for image colorization and provide a flexible solution for a wide variety of scenarios: high-quality direct colorization with diverse results; user-guided colorization through color hints, text prompts, or a reference image; and, finally, video colorization. Some works have already investigated using diffusion models for colorization; however, the proposed solutions are often more complex and require training a side model to guide the denoising process (\`{a} la ControlNet). Not only does this approach increase the number of parameters and the compute time, it also results in suboptimal colorization, as we show. By proposing a group of universal, architecture-agnostic mechanisms that can be applied to any pre-trained diffusion model, our evaluation demonstrates that our model is the only approach offering this flexibility while matching or outperforming existing methods specialized in each sub-task.},
  booktitle = {ACM SIGGRAPH 2024 Conference Papers},
  articleno = {94},
  numpages = {11},
  keywords = {Colorization, Image Restoration, Image and Video Colorization},
  location = {Denver, CO, USA},
  series = {SIGGRAPH '24}
}