File size: 6,282 Bytes
7db9ab2
 
39544af
 
7db9ab2
 
39544af
 
 
 
f85d443
 
 
 
 
 
 
 
39544af
 
 
 
f85d443
 
8ca38ac
 
39544af
f85d443
 
 
78392d4
 
39544af
1200575
 
 
 
7db9ab2
1200575
78392d4
39544af
78392d4
 
 
 
f85d443
78392d4
1200575
 
 
39544af
 
7db9ab2
 
 
 
39544af
78392d4
39544af
 
78392d4
 
39544af
7db9ab2
1966b50
ff90cd8
 
 
 
 
 
78392d4
ff90cd8
 
f93e041
78392d4
7db9ab2
ff90cd8
 
 
 
78392d4
7db9ab2
ff90cd8
 
 
1200575
ff90cd8
7db9ab2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import gradio
import torch
import numpy
from PIL import Image
from torchvision import transforms
from diffusers import StableDiffusionInpaintPipeline
from diffusers import DPMSolverMultistepScheduler

# Select CUDA when available; the string form is reused to pick fp16 vs fp32 paths below.
deviceStr = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(deviceStr)
# Module-global latent noise tensor; populated by GenerateNewLatentsForInference()
# and consumed by diffuse() so frames can share (or deliberately re-roll) the same noise.
latents = None

def GenerateNewLatentsForInference():
    """Sample a fresh (1, 4, 64, 64) latent noise tensor into the module-global `latents`.

    Called once at startup and again per frame when the "Static Latents" checkbox
    is disabled, which makes the generation walk the latent space between frames.
    """
    global latents
    # fp16 on CUDA to match the fp16 pipeline weights; default fp32 on CPU.
    dtype = torch.float16 if deviceStr == "cuda" else torch.float32
    latents = torch.randn((1, 4, 64, 64), device=device, dtype=dtype)

# Load the inpainting pipeline once at import time. On CUDA we use the fp16
# weight revision plus xformers attention to cut memory and latency for the
# live stream; on CPU we fall back to the default fp32 weights.
if deviceStr == "cuda":
    pipeline = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting",
                                                              revision="fp16",
                                                              torch_dtype=torch.float16)
                                                              #safety_checker=lambda images, **kwargs: (images, False))
    pipeline.to(device)
    pipeline.enable_xformers_memory_efficient_attention()
else:
    pipeline = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
                                                              #safety_checker=lambda images, **kwargs: (images, False))

# Fixed working resolution for the webcam feed and generated output.
imageSize = (512, 512)
# Placeholder returned by diffuse() before the first frame is generated,
# and whenever inference is paused or the mask is missing.
lastImage = Image.new(mode="RGB", size=imageSize)

# Seed/generator pair; diffuse() re-seeds only when the UI slider changes,
# so successive frames stay deterministic for a fixed seed.
lastSeed = 4096
generator = torch.Generator(device).manual_seed(lastSeed)

# Populate the initial latent noise tensor before the first inference call.
GenerateNewLatentsForInference()

def diffuse(staticLatents, generatorSeed, inputImage, mask, pauseInference, prompt, negativePrompt, guidanceScale, numInferenceSteps):
    """Run one frame of live outpainting inference and return the generated image.

    Parameters mirror the Gradio inputs declared below, in order:
    staticLatents (bool) reuse the current global latents between frames;
    generatorSeed (int) RNG seed, re-applied only when it changes;
    inputImage/mask (PIL images) camera frame and inpainting mask;
    pauseInference (bool) skip generation and return the previous frame;
    prompt/negativePrompt (str) conditioning text;
    guidanceScale (float) / numInferenceSteps (int) diffusion controls.

    Returns the newly generated PIL image, or the previous frame when paused
    or when no mask is available yet.
    """
    global latents, lastSeed, generator, lastImage

    # No mask yet (UI still initializing) or the user paused: reuse the last frame.
    if mask is None or pauseInference:
        return lastImage

    # Re-roll the noise each frame when static latents are disabled, so the
    # generation walks the latent space instead of staying deterministic.
    if not staticLatents:
        GenerateNewLatentsForInference()

    # Re-seed only on change, keeping frame-to-frame output stable otherwise.
    if lastSeed != generatorSeed:
        generator = torch.Generator(device).manual_seed(generatorSeed)
        lastSeed = generatorSeed

    newImage = pipeline(prompt=prompt,
                        negative_prompt=negativePrompt,
                        image=inputImage,
                        mask_image=mask,
                        guidance_scale=guidanceScale,
                        num_inference_steps=numInferenceSteps,
                        latents=latents,
                        generator=generator).images[0]

    # Cache for the paused / no-mask early return above.
    lastImage = newImage

    return newImage

# UI assets and helper text for the Gradio controls. Typos in the user-facing
# info strings are corrected here (determinism, inference, continuous, etc.).
defaultMask = Image.open("assets/masks/diamond.png")
numInfStepsDesc = "A higher value generally increases quality, but reduces the frames per second of the output stream."
staticLatentsDesc = "This setting increases the frame to frame determinism of the generation. If this is disabled, then the inference will take continuous large walks across the latent space between frames."
generatorSeedDesc = "Identical seeds allow for persistent scene generation between runs, and changing the seed will take a static large walk across the latent space to better control and alter the generation of scene content."
promptDesc = "This text will condition the generation of the scene to help guide the content creation."
negPromptDesc = "This text will help deter the generation from converging towards reconstructing the elements described in the text."
outputText = "This inferred imagery expands the field of view from the masked area of the input camera feed."

# Widgets wired to diffuse() via the Interface below; order must match its signature.
prompt = gradio.Textbox(label="Prompt", info=promptDesc, placeholder="A person in a room", lines=3)
negativePrompt = gradio.Textbox(label="Negative Prompt", info=negPromptDesc, placeholder="Facial hair", lines=3)
inputImage = gradio.Image(label="Input Feed", source="webcam", shape=[512,512], streaming=True)
mask = gradio.Image(label="Mask", type="pil", value=defaultMask)
outputImage = gradio.Image(label="Extrapolated Field of View")
guidanceScale = gradio.Slider(label="Guidance Scale", info="A higher value causes the generation to be more relative to the text prompt conditioning.", maximum=100, minimum=1, value=7.5)
numInferenceSteps = gradio.Slider(label="Number of Inference Steps", info=numInfStepsDesc, maximum=100, minimum=1, value=20)
generatorSeed = gradio.Slider(label="Generator Seed", info=generatorSeedDesc, maximum=10000, value=lastSeed)
staticLatents = gradio.Checkbox(label="Static Latents", info=staticLatentsDesc, value=True)
pauseInference = gradio.Checkbox(label="Pause Inference", value=False)

# Long-form copy shown on the demo page. Typos corrected (spatiotemporal,
# camera's, frame to frame, disabled, continuously, reinforce, prediction).
description = "This generative machine learning demonstration streams stable diffusion outpainting inference live from your camera on your computer or phone to expand your local reality and create an alternate world. High quality frame to frame determinism is a hard problem to solve for latent diffusion models as the generation is inherently relative to input noise distributions for the latents, and many factors such as the inherent Bayer noise from the camera images as well as anything that is altered between camera images (such as focus, white balance, etc). Some methods apply spatiotemporal attention, but this demonstration focuses on the control over the input latents to navigate the latent space. Increase the lighting of your physical scene to improve the quality and consistency."
article = "This demonstration should initialize automatically, and run relatively well, but if the output is not an ideal reconstruction of your physical local space from the camera's perspective, then you should adjust the generator seed to take larger walks across the latent space. In addition, the static latents can be disabled to continuously walk the latent space, but this will increase frame to frame non-determinism. You can also condition the generation using prompts to reinforce or change aspects of the scene. If you see a black image instead of a generated output image, then you are running into the safety checker. This can trigger inconsistently even when the generated content is purely PG. If this happens, then increase the lighting of the scene and also increase the number of inference steps to improve the generated prediction."

# live=True makes Gradio call diffuse() continuously as the webcam streams frames.
inputs=[staticLatents, generatorSeed, inputImage, mask, pauseInference, prompt, negativePrompt, guidanceScale, numInferenceSteps]
ux = gradio.Interface(fn=diffuse, title="View Diffusion", article=article, description=description, inputs=inputs, outputs=outputImage, live=True)
ux.launch()