import gradio as gr

import numpy as np
import torch
from datetime import datetime
from fastai.vision.all import *
from PIL import Image, ImageDraw
from diffusers import AutoPipelineForInpainting

# Prefer GPU with half precision when available; fall back to fp32 on CPU.
preferred_device = "cuda" if torch.cuda.is_available() else "cpu"
preferred_dtype = torch.float32 if preferred_device == "cpu" else torch.float16

# Needed at import time so load_learner() can unpickle the fastai learner,
# which references this function by name (`path` comes from the training
# setup and is never actually called at inference).
def label_func(fn): return path/"labels"/f"{fn.stem}_P{fn.suffix}"

segmodel = load_learner("camvid-512.pkl")

# fastai loads in full precision by default; match the fp16 pipeline on GPU.
if preferred_device == "cuda":
    segmodel = segmodel.to_fp16()

# Stable Diffusion inpainting pipeline; the fp16 revision keeps the download
# small and matches preferred_dtype on GPU.
inpainting_pipeline = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
    revision="fp16",
    torch_dtype=preferred_dtype,
).to(preferred_device)
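
# A hedged sketch of swapping in the LCM scheduler for fewer inference steps
# (assumes the public LCM LoRA weights; untested here, left commented out):
#     from diffusers import LCMScheduler
#     inpainting_pipeline.scheduler = LCMScheduler.from_config(
#         inpainting_pipeline.scheduler.config)
#     inpainting_pipeline.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")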

working_size = (512, 512)

default_inpainting_prompt = "watercolor of a leafy pedestrian mall at golden hour with multiracial genderqueer joggers and bicyclists and wheelchair users talking and laughing"

# CamVid classes in the order the segmentation model predicts them.
seg_vocabulary = ['Animal', 'Archway', 'Bicyclist', 'Bridge', 'Building', 'Car',
       'CartLuggagePram', 'Child', 'Column_Pole', 'Fence', 'LaneMkgsDriv',
       'LaneMkgsNonDriv', 'Misc_Text', 'MotorcycleScooter', 'OtherMoving',
       'ParkingBlock', 'Pedestrian', 'Road', 'RoadShoulder', 'Sidewalk',
       'SignSymbol', 'Sky', 'SUVPickupTruck', 'TrafficCone',
       'TrafficLight', 'Train', 'Tree', 'Truck_Bus', 'Tunnel',
       'VegetationMisc', 'Void', 'Wall']

# 1 = class gets masked out and inpainted (cars, roads, signs, ...);
# rows line up with the rows of seg_vocabulary above.
ban_cars_mask = np.array([0, 0, 0, 0, 0, 1,
                 0, 0, 1, 0, 1,
                 1, 1, 0, 0,
                 1, 0, 1, 1, 1,
                 1, 0, 1, 1,
                 1, 0, 0, 0, 1,
                 0, 1, 0], dtype=np.uint8)
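
# Guard (an added assumption: the two lists are meant to be index-aligned,
# one flag per class).
assert len(ban_cars_mask) == len(seg_vocabulary), "mask/vocabulary length mismatch"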

def get_seg_mask(img):
    # predict() returns (decoded mask, raw tensor, probabilities);
    # keep only the per-pixel class-index mask.
    mask = segmodel.predict(img)[0]
    return mask


def app(img, prompt):
    start_time = datetime.now().timestamp()
    old_size = Image.fromarray(img).size
    # Both models expect 512x512 input.
    img = np.array(Image.fromarray(img).resize(working_size))
    # Per-pixel class indices -> binary 0/255 inpainting mask.
    mask = ban_cars_mask[get_seg_mask(img)] * 255
    mask_time = datetime.now().timestamp()
    print(prompt.__class__, img.__class__, mask.__class__, img.shape, mask.shape)  # debug
    # strength near 1.0 repaints the masked regions almost from scratch;
    # 20 steps trades quality for latency.
    overlay_img = inpainting_pipeline(
        prompt=prompt,
        image=img,
        mask_image=mask,
        strength=0.95,
        num_inference_steps=20,
    ).images[0]
    end_time = datetime.now().timestamp()
    draw = ImageDraw.Draw(overlay_img)
    # Insert a newline after every fifth word so the prompt wraps in the overlay.
    words = prompt.split(" ")
    prompt = " ".join(w if (i + 1) % 5 else w + "\n" for i, w in enumerate(words))

    draw.text(
        (50, 10),
        f"Old size: {old_size}\n"
        f"Total duration: {int(1000 * (end_time - start_time))}ms\n"
        f"Segmentation {int(1000 * (mask_time - start_time))}ms / "
        f"inpainting {int(1000 * (end_time - mask_time))}ms\n"
        f"<{prompt}>",
        fill=(255, 255, 255),
    )
    return overlay_img
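
# Minimal offline smoke test (a hypothetical sketch, not part of the app;
# assumes a local RGB image "street.jpg" exists):
#     test_img = np.array(Image.open("street.jpg").convert("RGB"))
#     app(test_img, default_inpainting_prompt).save("street_inpainted.png")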

# Ideally this would stream live from a webcam:
# iface = gr.Interface(app, gr.Image(sources=["webcam"], streaming=True), "image", live=True)
iface = gr.Interface(app, [gr.Image(), gr.Textbox(value=default_inpainting_prompt)], "image")
iface.launch()