import gradio as gr
import cv2
import numpy as np
from PIL import Image
from moviepy.editor import VideoFileClip, ImageSequenceClip

import torch
import random
from diffusers import StableDiffusionPipeline
from diffusers import DDIMScheduler
from utils import *
from inversion_utils import *
from torch import autocast, inference_mode

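# Overview: this demo splits an uploaded video into frames, applies
# edit-friendly DDPM inversion and prompt-guided sampling to each frame with
# Stable Diffusion v1.5, and reassembles the edited frames into a new video.
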
def get_frames(video_in):
    """Resize the input video to 512px height, cap its frame rate at 30 fps,
    and split it into individual JPEG frames.

    Returns the list of frame file paths and the fps of the resized video."""
    frames = []

    clip = VideoFileClip(video_in)

    # Resize to 512px height and cap the frame rate at 30 fps before splitting.
    clip_resized = clip.resize(height=512)
    if clip.fps > 30:
        print("video rate is over 30, resetting to 30")
        clip_resized.write_videofile("video_resized.mp4", fps=30)
    else:
        print("video rate is OK")
        clip_resized.write_videofile("video_resized.mp4", fps=clip.fps)

    print("video resized to 512 height")

    # Extract every frame of the resized video as a JPEG file.
    cap = cv2.VideoCapture("video_resized.mp4")

    fps = cap.get(cv2.CAP_PROP_FPS)
    print("video fps: " + str(fps))
    i = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        cv2.imwrite('kang' + str(i) + '.jpg', frame)
        frames.append('kang' + str(i) + '.jpg')
        i += 1

    cap.release()
    cv2.destroyAllWindows()
    print("broke the video into frames")

    return frames, fps

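# Example usage (hypothetical file name):
#   frames, fps = get_frames("input.mp4")
#   result_path = create_video(frames, fps)
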
def create_video(frames, fps):
    """Assemble a list of frame image paths back into an mp4 video."""
    print("building video result")
    clip = ImageSequenceClip(frames, fps=fps)
    clip.write_videofile("_result.mp4", fps=fps)

    return "_result.mp4"


def randomize_seed_fn(seed, randomize_seed):
    """Optionally draw a fresh random seed, then seed torch with it."""
    if randomize_seed:
        seed = random.randint(0, np.iinfo(np.int32).max)
    torch.manual_seed(seed)
    return seed


def invert(x0, prompt_src="", num_diffusion_steps=100, cfg_scale_src=3.5, eta=1):
    """Run the edit-friendly DDPM forward (inversion) process on an image.

    Returns the per-step noise maps `zs` and the latent trajectory `wts`
    computed while inverting `x0` under the source prompt."""
    sd_pipe.scheduler.set_timesteps(num_diffusion_steps)

    # Encode the image into the VAE latent space (0.18215 is the SD latent scaling factor).
    with autocast("cuda"), inference_mode():
        w0 = (sd_pipe.vae.encode(x0).latent_dist.mode() * 0.18215).float()

    wt, zs, wts = inversion_forward_process(sd_pipe, w0, etas=eta, prompt=prompt_src,
                                            cfg_scale=cfg_scale_src, prog_bar=False,
                                            num_inference_steps=num_diffusion_steps)
    return zs, wts

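# Per the edit-friendly DDPM inversion formulation cited in the intro below,
# `zs` holds the per-timestep noise maps and `wts` the corresponding noisy
# latents; sample() below reuses them with a *target* prompt to edit the image.
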
def sample(zs, wts, prompt_tar="", skip=36, cfg_scale_tar=15, eta=1):
    """Run the reverse (editing) process from an intermediate inverted latent,
    guided by the target prompt, and decode the result to a PIL image."""
    w0, _ = inversion_reverse_process(sd_pipe, xT=wts[skip], etas=eta, prompts=[prompt_tar],
                                      cfg_scales=[cfg_scale_tar], prog_bar=False, zs=zs[skip:])

    # Decode the edited latent back to pixel space.
    with autocast("cuda"), inference_mode():
        x0_dec = sd_pipe.vae.decode(1 / 0.18215 * w0).sample
    if x0_dec.dim() < 4:
        x0_dec = x0_dec[None, :, :, :]
    img = image_grid(x0_dec)
    return img

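# Minimal sketch of the per-frame flow used by edit() further below ("frame0.jpg"
# is a hypothetical path; load_512 comes from utils and is assumed to return a
# preprocessed image tensor on `device`):
#
#   frame = np.array(Image.open("frame0.jpg").convert("RGB"))
#   x0 = load_512(frame, device=device)
#   zs, wts = invert(x0, prompt_src="", num_diffusion_steps=100, cfg_scale_src=3.5)
#   edited = sample(zs, wts, prompt_tar="a watercolor painting", skip=36, cfg_scale_tar=15)
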
sd_model_id = "runwayml/stable-diffusion-v1-5"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sd_pipe = StableDiffusionPipeline.from_pretrained(sd_model_id).to(device)
sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)

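# The default scheduler is swapped for DDIM; invert() above calls
# sd_pipe.scheduler.set_timesteps(), so the inversion utilities operate on this
# DDIM timestep schedule.
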
def infer(video_in, do_inversion, wts, zs,
          src_prompt,
          tar_prompt,
          steps,
          cfg_scale_src,
          cfg_scale_tar,
          skip, seed, randomize_seed):

    # Split the uploaded video into frame image files and recover its frame rate.
    frames_list, fps = get_frames(video_in)

    # Process every extracted frame.
    n_frame = len(frames_list)

    result_frames = []
    print("set stop frames to: " + str(n_frame))

    for i, image in enumerate(frames_list):

        seed = randomize_seed_fn(seed, randomize_seed)
        # Every frame is a new source image, so it has to be inverted before editing.
        do_inversion = True

        image = Image.open(image).convert("RGB")
        image = np.array(image)
        output_frame = edit(image,
                            do_inversion, wts, zs,
                            src_prompt,
                            tar_prompt,
                            steps,
                            cfg_scale_src,
                            cfg_scale_tar,
                            skip,
                            seed,
                            randomize_seed)
        # edit() returns (edited image, wts, zs, do_inversion); only the image is
        # needed here because each frame is re-inverted from scratch.
        image = output_frame[0]

        image.save("_frame_" + str(i) + ".jpeg")
        result_frames.append("_frame_" + str(i) + ".jpeg")
        print("frame " + str(i) + "/" + str(n_frame) + ": done;")

    # Reassemble the edited frames into the final video.
    final_vid = create_video(result_frames, fps)

    return final_vid


def get_example():
    # Example cases: [input image, target prompt, example output, source prompt,
    # then numeric values matching the defaults for steps, source guidance,
    # skip steps, and target guidance]. Kept from the single-image demo; they
    # are not wired to a gr.Examples component below.
    case = [
        [
            'Examples/gnochi_mirror.jpeg',
            'Watercolor painting of a cat sitting next to a mirror',
            'Examples/gnochi_mirror_watercolor_painting.png',
            '',
            100,
            3.5,
            36,
            15,
        ],
        [
            'Examples/source_an_old_man.png',
            'A bronze statue of an old man',
            'Examples/ddpm_a_bronze_statue_of_an_old_man.png',
            '',
            100,
            3.5,
            36,
            15,
        ],
        [
            'Examples/source_a_ceramic_vase_with_yellow_flowers.jpeg',
            'A pink ceramic vase with a wheat bouquet',
            'Examples/ddpm_a_pink_ceramic_vase_with_a_wheat_bouquet.png',
            '',
            100,
            3.5,
            36,
            15,
        ],
        [
            'Examples/source_a_model_on_a_runway.jpeg',
            'A zebra on the runway',
            'Examples/ddpm_a_zebra_on_the_run_way.png',
            '',
            100,
            3.5,
            36,
            15,
        ],
    ]
    return case


intro = """
<h1 style="font-weight: 900; text-align: center; margin-bottom: 7px;">
   Edit Friendly DDPM Inversion
</h1>
<p style="font-size: 0.9rem; text-align: center; margin: 0rem; line-height: 1.2em; margin-top:1em">
Based on the work introduced in:
<a href="https://arxiv.org/abs/2304.06140" style="text-decoration: underline;" target="_blank">An Edit Friendly DDPM Noise Space:
Inversion and Manipulations</a>
</p>
<p style="font-size: 0.9rem; margin: 0rem; line-height: 1.2em; margin-top:1em">
For faster inference without waiting in the queue, you may duplicate the Space and upgrade to a GPU in the settings.
<a href="https://huggingface.co/spaces/LinoyTsaban/edit_friendly_ddpm_inversion?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</p>"""

with gr.Blocks(css='style.css') as demo:

    def reset_do_inversion():
        # Any change to the input video or source prompt invalidates the cached inversion.
        do_inversion = True
        return do_inversion

    def edit(input_image,
             do_inversion,
             wts, zs,
             src_prompt="",
             tar_prompt="",
             steps=100,
             cfg_scale_src=3.5,
             cfg_scale_tar=15,
             skip=36,
             seed=0,
             randomize_seed=True):

        x0 = load_512(input_image, device=device)

        # Invert the frame under the source prompt when needed, then sample with the target prompt.
        if do_inversion or randomize_seed:
            zs, wts = invert(x0=x0, prompt_src=src_prompt, num_diffusion_steps=steps, cfg_scale_src=cfg_scale_src)
            do_inversion = False

        output = sample(zs, wts, prompt_tar=tar_prompt, skip=skip, cfg_scale_tar=cfg_scale_tar)
        return output, wts, zs, do_inversion

    gr.HTML(intro)
    wts = gr.State()
    zs = gr.State()
    do_inversion = gr.State(value=True)

    with gr.Row():
        video_in = gr.Video(source="upload", type="filepath")
        final_vid = gr.Video()

    with gr.Row():
        tar_prompt = gr.Textbox(lines=1, label="Describe your desired edited output", interactive=True)

    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            edit_button = gr.Button("Run")

    with gr.Accordion("Advanced Options", open=False):
        with gr.Row():
            with gr.Column():
                src_prompt = gr.Textbox(lines=1, label="Source Prompt", interactive=True, placeholder="describe the original image")
                steps = gr.Number(value=100, precision=0, label="Num Diffusion Steps", interactive=True)
                cfg_scale_src = gr.Slider(minimum=1, maximum=15, value=3.5, label="Source Guidance Scale", interactive=True)
            with gr.Column():
                skip = gr.Slider(minimum=0, maximum=60, value=36, step=1, label="Skip Steps", interactive=True)
                cfg_scale_tar = gr.Slider(minimum=7, maximum=18, value=15, label="Target Guidance Scale", interactive=True)
                seed = gr.Number(value=0, precision=0, label="Seed", interactive=True)
                randomize_seed = gr.Checkbox(label='Randomize seed', value=False)

    edit_button.click(
        fn=infer,
        inputs=[video_in,
                do_inversion, wts, zs,
                src_prompt,
                tar_prompt,
                steps,
                cfg_scale_src,
                cfg_scale_tar,
                skip,
                seed, randomize_seed
                ],
        outputs=[final_vid],
    )

    video_in.change(
        fn=reset_do_inversion,
        outputs=[do_inversion]
    )

    src_prompt.change(
        fn=reset_do_inversion,
        outputs=[do_inversion]
    )

demo.queue()
demo.launch(share=False)