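"""Whisper to Stable Diffusion.

A Gradio demo that records an audio description of an image, translates it to
English with OpenAI's Whisper, and sends the text as a prompt to Stable
Diffusion v1-4.
"""
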
import os

import gradio as gr
import torch
import whisper
from PIL import Image

from diffusers import StableDiffusionPipeline

# Auth token for the gated CompVis/stable-diffusion-v1-4 weights
MY_SECRET_TOKEN = os.environ.get('HF_TOKEN_SD')

whisper_model = whisper.load_model("small")

# Use the GPU when available; this Space runs on CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=MY_SECRET_TOKEN)
pipe.to(device)
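# Note: newer diffusers releases also expose pipe.enable_attention_slicing()
# to lower memory use at some speed cost; it is not assumed to exist in the
# early release this demo targets, so it is left off here.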

def get_transcribe(audio):
    # Load the recording and fit it to Whisper's 30-second context window
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language (for logging only; translation needs no hint)
    _, probs = whisper_model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # task="translate" always decodes to English; fp16=False since we run on CPU
    options = whisper.DecodingOptions(task="translate", fp16=False)
    result = whisper.decode(whisper_model, mel, options)

    print(result.text)
    return result.text

def get_images(audio):
    prompt = get_transcribe(audio)
    # Generate two images from the translated prompt. This targets an early
    # diffusers release in which the pipeline call returns a plain dict.
    images_list = pipe([prompt] * 2)
    images = []
    # Placeholder shown whenever the safety checker flags an output
    safe_image = Image.open(r"unsafe.png")
    for i, image in enumerate(images_list["sample"]):
        if images_list["nsfw_content_detected"][i]:
            images.append(safe_image)
        else:
            images.append(image)

    return prompt, images
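
# For reference, a minimal sketch of the same safety filtering against newer
# diffusers releases (assumption: >= 0.4, where the pipeline returns an output
# object rather than a dict); not used by this Space:
#
#     output = pipe([prompt] * 2)
#     images = [safe_image if flagged else img
#               for img, flagged in zip(output.images, output.nsfw_content_detected)]
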
# Inputs
audio = gr.Audio(label="Input Audio of an image description", show_label=True, source="microphone", type="filepath")

# Outputs
translated_prompt = gr.Textbox(label="Translated audio", lines=6)
gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery").style(grid=[1], height="auto")
title="Whisper to Stable Diffusion"
description="""
<p style='text-align: center;'>
This demo is running on CPU 🐢. Offered by Sylvain <a href='https://twitter.com/fffiloni' target='_blank'>@fffiloni</a> • <img id='visitor-badge' alt='visitor badge' src='https://visitor-badge.glitch.me/badge?page_id=gradio-blocks.whisper-to-stable-diffusion' style='display: inline-block' /><br />
Record an audio description of an image, stop recording, then hit the Submit button to get 2 images from Stable Diffusion.<br />
Your audio will be translated to English through OpenAI's Whisper, then sent as a prompt to Stable Diffusion.
Try it in French ! ;)<br /> 

</p>
"""

article="""
<p style='text-align: center;'>—<br />
Whisper is a general-purpose speech recognition model. <br />
It is trained on a large dataset of diverse audio and is also a multi-task model that can perform<br />multilingual speech recognition as well as speech translation and language identification.<br />
Model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a>
</p>
"""
# enable_queue is redundant once .queue() is called, but harmless on the
# Gradio 3.x release this demo targets
gr.Interface(fn=get_images, inputs=audio, outputs=[translated_prompt, gallery], title=title, description=description, article=article).queue(max_size=1000).launch(enable_queue=True)