# Whisper2Image / app.py
# calebaryee321's picture
# Create app.py
# 3c53ea4
# raw
# history blame
# 1.58 kB
import time
from functools import lru_cache

import gradio as gr
import sounddevice as sd
import soundfile as sf
import torch
import whisper
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
@lru_cache(maxsize=1)
def _load_whisper_model(name="base"):
    """Load a Whisper checkpoint once and reuse it for later requests."""
    return whisper.load_model(name)


def SpeechToText(audio):
    """Transcribe a recorded audio clip to text with Whisper.

    Args:
        audio: Path to the recorded audio file, or ``None`` when nothing
            was recorded.

    Returns:
        The decoded text, or an empty string when ``audio`` is ``None``.
    """
    if audio is None:
        return ""

    model = _load_whisper_model()

    # pad_or_trim fits the waveform to the fixed window the decoder expects,
    # so only the start of a long recording is transcribed.
    samples = whisper.pad_or_trim(whisper.load_audio(audio))

    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(samples).to(model.device)

    # Pick the most probable language and hand it to the decoder so that it
    # does not have to run language detection a second time.
    _, probs = model.detect_language(mel)
    language = max(probs, key=probs.get)

    # fp16 is disabled so decoding also works on CPU-only hosts.
    options = whisper.DecodingOptions(language=language, fp16=False)
    return whisper.decode(model, mel, options).text
@lru_cache(maxsize=1)
def _load_pipeline(model_id="stabilityai/stable-diffusion-2"):
    """Build the Stable Diffusion pipeline once and keep it on the GPU."""
    # Use the Euler scheduler here instead of the checkpoint's default one.
    scheduler = EulerDiscreteScheduler.from_pretrained(
        model_id, subfolder="scheduler"
    )
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id,
        scheduler=scheduler,
        revision="fp16",
        torch_dtype=torch.float16,
    )
    return pipe.to("cuda")


def img_Generation(text):
    """Generate an image from a text prompt with Stable Diffusion 2.

    The prompt is echoed to stdout and the generated image is also written
    to ``img_1.png`` in the current working directory.

    Args:
        text: Prompt passed to the diffusion pipeline.

    Returns:
        The first image produced by the pipeline.
    """
    print(text)
    pipe = _load_pipeline()
    image = pipe(text, num_inference_steps=150).images[0]
    image.save("img_1.png")
    return image
def transcribe(audio):
    """Turn a recorded audio clip into a generated image.

    Args:
        audio: Path to the recorded audio file, or ``None`` when nothing
            was recorded.

    Returns:
        The image generated from the transcribed speech, or ``None`` when
        the transcription is empty.
    """
    text = SpeechToText(audio)
    # Skip image generation when there is nothing to draw; running the
    # diffusion model on an empty prompt only wastes time.
    if not text.strip():
        return None
    return img_Generation(text)
# Build the web UI: microphone audio in, generated image out.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="image",
    title="Whisper2IMG",
    description="A Speech to Image Generation App Using OpenAI's Whisper",
)

# `share` expects a boolean; the string "True" only worked because any
# non-empty string is truthy.
demo.launch(share=True)