# dipesh1701's picture
# change title
# 07b58c6
# raw
# history blame
# No virus
# 2.37 kB
# Bootstrap for the speech -> ChatGPT -> Stable Diffusion demo.
# gradio is expected to be installed already (e.g. `pip install -q gradio`).
import os
import subprocess
import sys
import time
import warnings


def _pip_install(*args):
    """Run ``pip install`` with *args* using the current interpreter.

    Going through ``sys.executable -m pip`` with an argument list makes sure
    the packages land in the environment this script runs in and avoids
    building a shell command string. Installation stays best-effort: a
    failure is reported on stderr and the imports below decide whether the
    script can continue.
    """
    completed = subprocess.run([sys.executable, "-m", "pip", "install", *args])
    if completed.returncode != 0:
        print(
            f"pip install {' '.join(args)} failed "
            f"with exit code {completed.returncode}",
            file=sys.stderr,
        )


# Third-party dependencies are installed at start-up, so these calls must run
# before the imports that follow.
_pip_install("-q", "pyChatGPT")
_pip_install(
    "-q",
    "--upgrade",
    "git+https://github.com/huggingface/diffusers.git",
    "transformers",
    "accelerate",
    "scipy",
)
_pip_install("git+https://github.com/openai/whisper.git")

import whisper  # noqa: E402  (must follow the installs above)
import gradio as gr  # noqa: E402
import torch  # noqa: E402
from pyChatGPT import ChatGPT  # noqa: E402
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler  # noqa: E402

warnings.filterwarnings("ignore")

# SECURITY: the credential used to be hard-coded in this file. Any value that
# was ever committed must be treated as leaked and revoked. The token is now
# supplied through the environment (e.g. a deployment secret).
secret_token = os.environ.get("CHATGPT_SESSION_TOKEN")
if not secret_token:
    raise RuntimeError(
        "Set the CHATGPT_SESSION_TOKEN environment variable to the token "
        "that should be passed to pyChatGPT."
    )

# Speech-to-text model.
model = whisper.load_model("base")

# Text-to-image pipeline.
model_id = "stabilityai/stable-diffusion-2-1"
scheduler = EulerDiscreteScheduler.from_pretrained(model_id,
                                                   subfolder="scheduler")

# Half precision is only used together with a GPU; on CPU the pipeline is
# loaded in float32, because float16 pipelines are not meant to run on CPU.
_device = "cuda" if torch.cuda.is_available() else "cpu"
if _device == "cuda":
    _precision_kwargs = {"revision": "fp16", "torch_dtype": torch.float16}
else:
    _precision_kwargs = {"torch_dtype": torch.float32}

pipeline = StableDiffusionPipeline.from_pretrained(model_id,
                                                   scheduler=scheduler,
                                                   **_precision_kwargs)
pipeline = pipeline.to(_device)
def transcribe(audio):
    """Turn a recorded utterance into text, a ChatGPT reply and an image.

    Args:
        audio: Path to the recorded audio file, or ``None`` when no
            recording is available (a live interface can fire without one).

    Returns:
        A three-element list ``[transcript, chatgpt_reply, image]``. When
        there is no audio, or nothing was transcribed, the reply is ``""``
        and the image is ``None`` so no request is sent downstream.
    """
    # Nothing recorded yet (or the recording was cleared): return empty outputs
    # instead of failing while loading a non-existent file.
    if audio is None:
        return ["", "", None]

    # Load the audio and pad/trim it to fit 30 seconds.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))

    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # Decode the audio. Half precision is only requested when the model is not
    # on the CPU. No language is given, so decoding determines it itself; the
    # previous separate detect_language() call had its result discarded.
    options = whisper.DecodingOptions(fp16=model.device.type != "cpu")
    result = whisper.decode(model, mel, options)
    result_text = result.text

    # Silence or noise can produce an empty transcript; skip the ChatGPT and
    # image-generation round trips in that case.
    if not result_text.strip():
        return [result_text, "", None]

    # Pass the transcript to ChatGPT and use its reply as the image prompt.
    chatgpt_api = ChatGPT(secret_token)
    resp = chatgpt_api.send_message(result_text)
    out_result = resp['message']

    out_image = pipeline(out_result, height=768, width=768).images[0]
    return [result_text, out_result, out_image]
# Output widgets, listed in the order transcribe() returns its values.
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Image(label="Diffusion Output")

# The microphone input uses the top-level gr.Audio component, matching the
# top-level components used for the outputs, instead of the deprecated
# gr.inputs namespace.
demo = gr.Interface(
    title='Stable Diffusion with ChatGPT',
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
    ],
    outputs=[
        output_1, output_2, output_3,
    ],
    # live=True re-runs transcribe whenever the input changes, with no submit
    # button.
    live=True,
)
demo.launch()