import gradio as gr
import os
import torch
from torch import autocast
from diffusers import StableDiffusionPipeline
from PIL import Image
from styles import css, header_html, footer_html
from examples import examples
from transformers import pipeline

# Speech-to-text pipeline used to turn the recorded prompt audio into text.
asr_model = pipeline("automatic-speech-recognition")


model_id = "CompVis/stable-diffusion-v1-4"
device = "cuda" if torch.cuda.is_available() else "cpu"

# If you are running this code locally, either run `huggingface-cli login` or paste a User Access Token from https://huggingface.co/settings/tokens into the use_auth_token field below.
pipe = StableDiffusionPipeline.from_pretrained(
    model_id, use_auth_token=os.environ.get('auth_token'), revision="fp16", torch_dtype=torch.float16)
pipe = pipe.to(device)
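
# A minimal sketch of the local-auth alternative mentioned above (an assumption:
# you have already run `huggingface-cli login`, so the cached token can be used
# via use_auth_token=True instead of the environment variable):
#
#   pipe = StableDiffusionPipeline.from_pretrained(
#       model_id, use_auth_token=True, revision="fp16", torch_dtype=torch.float16
#   ).to(device)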


def transcribe(audio):
    text = asr_model(audio)["text"]
    return text
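
# Example of the expected behaviour (hypothetical file path and transcript):
#   transcribe("prompt.wav")  # -> "an oil painting of a lighthouse at dusk"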


def infer(audio, samples, steps, scale, seed):
    prompt = transcribe(audio)
    generator = torch.Generator(device=device).manual_seed(seed)

    # autocast only applies on CUDA; on CPU the pipeline is called directly below.
    if device == "cuda":
        with autocast("cuda"):
            images_list = pipe(
                [prompt] * samples,
                num_inference_steps=steps,
                guidance_scale=scale,
                generator=generator,
            )
    else:
        images_list = pipe(
            [prompt] * samples,
            num_inference_steps=steps,
            guidance_scale=scale,
            generator=generator,
        )
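    # Replace any output flagged by the safety checker with a placeholder image.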
    images = []
    safe_image = Image.open(r"unsafe.png")
    for i, image in enumerate(images_list["sample"]):
        if images_list["nsfw_content_detected"][i]:
            images.append(safe_image)
        else:
            images.append(image)
    return images
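
# Minimal sketch of a direct smoke test, bypassing the Gradio UI (hypothetical
# audio path; assumes "unsafe.png" sits next to this script):
#   infer("recording.wav", samples=1, steps=25, scale=7.5, seed=42)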


block = gr.Blocks(css=css)


with block:
    gr.HTML(header_html)
    with gr.Group():
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                audio = gr.Audio(
                    label="Describe a prompt",
                    source="microphone",
                    type="filepath"
                # ).style(
                #     border=(True, False, True, True),
                #     rounded=(True, False, False, True),
                #     container=False,
                )
                btn = gr.Button("Generate image").style(
                    margin=False,
                    rounded=(False, True, True, False),
                )

        gallery = gr.Gallery(
            label="Generated images", show_label=False, elem_id="gallery"
        ).style(grid=[2], height="auto")

        advanced_button = gr.Button("Advanced options", elem_id="advanced-btn")

        with gr.Row(elem_id="advanced-options"):
            samples = gr.Slider(label="Images", minimum=1,
                                maximum=4, value=4, step=1)
            steps = gr.Slider(label="Steps", minimum=1,
                              maximum=50, value=45, step=1)
            scale = gr.Slider(
                label="Guidance Scale", minimum=0, maximum=50, value=7.5, step=0.1
            )
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=2147483647,
                step=1,
                randomize=True,
            )

        # ex = gr.Examples(fn=infer, inputs=[
        #                 audio, samples, steps, scale, seed], outputs=gallery)
        # ex.dataset.headers = [""]

        # audio.submit(infer, inputs=[audio, samples,
        #                           steps, scale, seed], outputs=gallery)
        btn.click(infer, inputs=[audio, samples, steps,
                  scale, seed], outputs=gallery)
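        # The "Advanced options" button toggles the hidden options row purely in
        # the browser via the _js snippet below; no Python callback is involved.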
        advanced_button.click(
            None,
            [],
            audio,
            _js="""
            () => {
                const options = document.querySelector("body > gradio-app").querySelector("#advanced-options");
                options.style.display = ["none", ""].includes(options.style.display) ? "flex" : "none";
            }""",
        )
        gr.HTML(footer_html)

block.queue(max_size=25).launch()