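"""Whisper to Stable Diffusion.

A Gradio demo that records an audio description of an image, translates it to
English with OpenAI's Whisper, and sends the text as a prompt to Stable
Diffusion v1-4.
"""
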
import os

import gradio as gr
import torch
import whisper
from PIL import Image

from diffusers import StableDiffusionPipeline

# Auth token for the gated CompVis/stable-diffusion-v1-4 weights
MY_SECRET_TOKEN = os.environ.get('HF_TOKEN_SD')

whisper_model = whisper.load_model("small")

# Use the GPU when available; this Space runs on CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=MY_SECRET_TOKEN)
pipe.to(device)
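# Note: newer diffusers releases also expose pipe.enable_attention_slicing()
# to lower memory use at some speed cost; it is not assumed to exist in the
# early release this demo targets, so it is left off here.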

def get_transcribe(audio):
    # Load the recording and fit it to Whisper's 30-second context window
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language (for logging only; translation needs no hint)
    _, probs = whisper_model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # task="translate" always decodes to English; fp16=False since we run on CPU
    options = whisper.DecodingOptions(task="translate", fp16=False)
    result = whisper.decode(whisper_model, mel, options)

    print(result.text)
    return result.text

def get_images(audio):
    prompt = get_transcribe(audio)
    # Generate two images from the translated prompt. This targets an early
    # diffusers release in which the pipeline call returns a plain dict.
    images_list = pipe([prompt] * 2)
    images = []
    # Placeholder shown whenever the safety checker flags an output
    safe_image = Image.open(r"unsafe.png")
    for i, image in enumerate(images_list["sample"]):
        if images_list["nsfw_content_detected"][i]:
            images.append(safe_image)
        else:
            images.append(image)

    return prompt, images
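
# For reference, a minimal sketch of the same safety filtering against newer
# diffusers releases (assumption: >= 0.4, where the pipeline returns an output
# object rather than a dict); not used by this Space:
#
#     output = pipe([prompt] * 2)
#     images = [safe_image if flagged else img
#               for img, flagged in zip(output.images, output.nsfw_content_detected)]
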
# Inputs
audio = gr.Audio(label="Input Audio of an image description", show_label=True, source="microphone", type="filepath")

# Outputs
translated_prompt = gr.Textbox(label="Translated audio", lines=6)
gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery").style(grid=[1], height="auto")
title="Whisper to Stable Diffusion"
description="""
<p style='text-align: center;'>
This demo is running on CPU 🐢. Offered by Sylvain <a href='https://twitter.com/fffiloni' target='_blank'>@fffiloni</a> • <img id='visitor-badge' alt='visitor badge' src='https://visitor-badge.glitch.me/badge?page_id=gradio-blocks.whisper-to-stable-diffusion' style='display: inline-block' /><br />
Record an audio description of an image, stop recording, then hit the Submit button to get 2 images from Stable Diffusion.<br />
Your audio will be translated to English through OpenAI's Whisper, then sent as a prompt to Stable Diffusion.
Try it in French ! ;)<br /> 

</p>
"""

article="""
<p style='text-align: center;'>—<br />
Whisper is a general-purpose speech recognition model. <br />
It is trained on a large dataset of diverse audio and is also a multi-task model that can perform<br />multilingual speech recognition as well as speech translation and language identification.<br />
Model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a>
</p>
"""
# enable_queue is redundant once .queue() is called, but harmless on the
# Gradio 3.x release this demo targets
gr.Interface(fn=get_images, inputs=audio, outputs=[translated_prompt, gallery], title=title, description=description, article=article).queue(max_size=1000).launch(enable_queue=True)