import os

# Install dependencies (intended to run once, e.g. in a fresh Colab session).
os.system("pip install -q pyChatGPT")
os.system("pip install -q --upgrade git+https://github.com/huggingface/diffusers.git transformers accelerate scipy")
os.system("pip install -q git+https://github.com/openai/whisper.git")
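# Note: pyChatGPT is an unofficial client for the ChatGPT web app; it
# authenticates with a browser session token rather than an OpenAI API key.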
|
import time
import warnings

import torch
import whisper
import gradio as gr

from pyChatGPT import ChatGPT
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler

warnings.filterwarnings("ignore")
|
|
|
# ChatGPT session token. Never hard-code credentials in source; read the token
# from an environment variable (the variable name here is an arbitrary choice).
secret_token = os.environ["CHATGPT_SESSION_TOKEN"]

# Load the Whisper "base" speech-recognition model.
model = whisper.load_model("base")
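# Whisper also ships "tiny", "small", "medium", and "large" checkpoints that
# trade speed for accuracy, e.g.:
# model = whisper.load_model("medium")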
|
# Load Stable Diffusion 2.1 with the Euler discrete scheduler in half precision.
model_id = "stabilityai/stable-diffusion-2-1"
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
pipeline = StableDiffusionPipeline.from_pretrained(
    model_id,
    scheduler=scheduler,
    revision="fp16",
    torch_dtype=torch.float16,
)
pipeline = pipeline.to("cuda")  # the fp16 weights need a CUDA device to run
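# Optional: on GPUs with limited VRAM, attention slicing trades some speed for
# a lower peak memory footprint.
# pipeline.enable_attention_slicing()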
|
|
|
|
|
def transcribe(audio):
    # Load the recording and pad/trim it to Whisper's 30-second window.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Compute the log-Mel spectrogram on the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language (unused below, but handy for debugging).
    _, probs = model.detect_language(mel)

    # Decode the spectrogram into text.
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text

    # Send the transcript to ChatGPT and keep its text reply.
    chatgpt_api = ChatGPT(secret_token)
    resp = chatgpt_api.send_message(result_text)
    out_result = resp['message']

    # Render ChatGPT's reply as a 768x768 image with Stable Diffusion.
    out_image = pipeline(out_result, height=768, width=768).images[0]

    return [result_text, out_result, out_image]
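# Example usage (hypothetical file name): run the chain on a saved recording
# instead of the microphone input.
# text, reply, image = transcribe("sample.wav")
# image.save("output.png")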
|
|
|
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Image(label="Diffusion Output")

gr.Interface(
    title="Stable Diffusion with ChatGPT",
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[output_1, output_2, output_3],
    live=True,
).launch()
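# When hosting remotely (e.g. on Colab), pass share=True to launch() to get a
# temporary public URL for the interface.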