import os

import gradio as gr
import torch
import whisper
from PIL import Image
from diffusers import StableDiffusionPipeline

MY_SECRET_TOKEN = os.environ.get('HF_TOKEN_SD')

# Load the Whisper speech model and the Stable Diffusion pipeline
whisper_model = whisper.load_model("small")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    use_auth_token=MY_SECRET_TOKEN,
)
pipe.to(device)


def get_transcribe(audio):
    # Load the recording and fit it to Whisper's 30-second input window
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    # Detect the spoken language, then decode with the "translate" task
    # so the returned text is always English
    _, probs = whisper_model.detect_language(mel)
    options = whisper.DecodingOptions(task="translate", fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    print(result)
    print(result.text)
    return result.text


def get_images(audio):
    prompt = get_transcribe(audio)
    #image = pipe(prompt, init_image=init_image)["sample"][0]
    images_list = pipe([prompt] * 2)
    images = []
    safe_image = Image.open(r"unsafe.png")
    # Replace any image flagged by the safety checker with a placeholder
    for i, image in enumerate(images_list["sample"]):
        if images_list["nsfw_content_detected"][i]:
            images.append(safe_image)
        else:
            images.append(image)
    return prompt, images


#inputs
audio = gr.Audio(label="Input Audio of an image description", show_label=True, source="microphone", type="filepath")

#outputs
translated_prompt = gr.Textbox(label="Translated audio", lines=6)
gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery").style(grid=[1], height="auto")

title = "Whisper to Stable Diffusion"

description = """

This demo is running on CPU 🐢. Offered by Sylvain @fffiloni
Record an audio description of an image, stop recording, then hit the Submit button to get 2 images from Stable Diffusion.
Your audio will be translated to English through OpenAI's Whisper, then sent as a prompt to Stable Diffusion. Try it in French! ;)

""" article="""


Whisper is a general-purpose speech recognition model.
It is trained on a large dataset of diverse audio and is also a multi-task model that can perform
multilingual speech recognition as well as speech translation and language identification.
Model by OpenAI

""" gr.Interface(fn=get_images, inputs=audio, outputs=[translated_prompt, gallery], title=title, description=description, article=article).queue(max_size=1000).launch(enable_queue=True)