# !pip install -q gradio
# Voice-to-image demo: Whisper transcribes speech, ChatGPT turns the
# transcript into a prompt, and Stable Diffusion renders the image.
import os

os.system("pip install -q pyChatGPT")
os.system("pip install -q --upgrade git+https://github.com/huggingface/diffusers.git transformers accelerate scipy")
os.system("pip install git+https://github.com/openai/whisper.git")

import warnings

import gradio as gr
import torch
import whisper
from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline
from pyChatGPT import ChatGPT

warnings.filterwarnings("ignore")

# Never hard-code credentials: pyChatGPT authenticates with an OpenAI session
# token, read here from the environment instead of being committed to source.
secret_token = os.environ["OPENAI_SESSION_TOKEN"]

model = whisper.load_model("base")

model_id = "stabilityai/stable-diffusion-2-1"
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
pipeline = StableDiffusionPipeline.from_pretrained(
    model_id, scheduler=scheduler, torch_dtype=torch.float16
)
# fp16 weights require a GPU, so the pipeline must be moved to CUDA.
pipeline = pipeline.to("cuda")


def transcribe(audio):
    # Load the recording and pad/trim it to Whisper's 30-second window.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Make a log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language (per-language probabilities land in `probs`).
    _, probs = model.detect_language(mel)

    # Decode the audio to text.
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text

    # Send the transcript to ChatGPT and use its reply as the image prompt.
    chatgpt_api = ChatGPT(secret_token)
    resp = chatgpt_api.send_message(result_text)
    out_result = resp["message"]

    # Render ChatGPT's reply with Stable Diffusion.
    out_image = pipeline(out_result, height=768, width=768).images[0]

    return [result_text, out_result, out_image]


output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Image(label="Diffusion Output")

gr.Interface(
    title="Stable Diffusion with ChatGPT",
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[output_1, output_2, output_3],
    live=True,
).launch()