"""Speech-to-image Gradio app: Whisper transcribes audio, Stable Diffusion draws it."""

import functools
import time

import gradio as gr
import sounddevice as sd
import soundfile as sf
import torch
import whisper
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler


@functools.lru_cache(maxsize=1)
def _load_whisper_model():
    """Load the Whisper "base" model once and reuse it for later requests."""
    return whisper.load_model("base")


@functools.lru_cache(maxsize=1)
def _load_pipeline():
    """Build the Stable Diffusion 2 pipeline once and reuse it for later requests."""
    model_id = "stabilityai/stable-diffusion-2"
    # Use the Euler scheduler here instead of the pipeline's default one.
    scheduler = EulerDiscreteScheduler.from_pretrained(
        model_id, subfolder="scheduler"
    )
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id,
        scheduler=scheduler,
        revision="fp16",
        torch_dtype=torch.float16,
    )
    return pipe.to("cuda")


def SpeechToText(audio):
    """Transcribe an audio file to text with Whisper.

    Args:
        audio: Path to the recorded audio file, or ``None`` when nothing
            was recorded.

    Returns:
        The decoded text, or an empty string when ``audio`` is ``None``.
    """
    if audio is None:
        return ""

    model = _load_whisper_model()

    samples = whisper.load_audio(audio)
    samples = whisper.pad_or_trim(samples)

    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(samples).to(model.device)

    # Decode audio to text. No explicit language-detection step is needed:
    # the original computed one but never used its result, and decode()
    # handles the language itself when none is set in the options.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text


def img_Generation(text):
    """Generate an image from a text prompt with Stable Diffusion 2.

    The prompt is printed to stdout and the generated image is also written
    to ``img_1.png`` in the current working directory.

    Args:
        text: Prompt passed to the diffusion pipeline.

    Returns:
        The first image produced by the pipeline.
    """
    print(text)
    pipe = _load_pipeline()
    image = pipe(text, num_inference_steps=150).images[0]
    image.save("img_1.png")
    return image


def transcribe(audio):
    """Turn recorded speech into an image: transcribe it, then render the text.

    Args:
        audio: Path to the recorded audio file (or ``None``), as supplied by
            the Gradio audio input.

    Returns:
        The image generated from the transcribed text.
    """
    text = SpeechToText(audio)
    return img_Generation(text)


demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="image",
    description="A Speech to Image Generation App Using OpenAI's Whisper",
    title="Whisper2IMG",
)
# `share` is a boolean flag; the original passed the string "True".
demo.launch(share=True)