# Runtime installs (Colab/Spaces style); gradio is assumed to be preinstalled.
# !pip install -q gradio
import os
os.system("pip install -q pyChatGPT")
os.system("pip install -q --upgrade git+https://github.com/huggingface/diffusers.git transformers accelerate scipy")
os.system("pip install -q git+https://github.com/openai/whisper.git")
import whisper
import gradio as gr
import time
import warnings
import torch
from pyChatGPT import ChatGPT
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
warnings.filterwarnings("ignore")
# pyChatGPT drives the ChatGPT web UI and expects a ChatGPT session token,
# not an OpenAI API key. Read it from the environment rather than hardcoding
# a secret in the source (the variable name here is arbitrary).
secret_token = os.environ["CHATGPT_SESSION_TOKEN"]
# Whisper "base" model for speech-to-text.
model = whisper.load_model("base")
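# Note: whisper also offers a one-call high-level API; a minimal sketch,
# assuming a local file "sample.wav" exists:
#   text = model.transcribe("sample.wav")["text"]
# The transcribe() function below uses the lower-level API instead so it can
# also report the detected language.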
# from diffusers import DiffusionPipeline
# pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
model_id = "stabilityai/stable-diffusion-2-1"
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
pipeline = StableDiffusionPipeline.from_pretrained(
    model_id,
    scheduler=scheduler,
    revision="fp16",
    torch_dtype=torch.float16,
)
# The fp16 weights need a CUDA device; running this pipeline on CPU would fail.
pipeline = pipeline.to("cuda")
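# Quick standalone smoke test of the image pipeline (a hedged sketch; the
# prompt string is arbitrary):
#   test_image = pipeline("a watercolor fox in a forest", height=768, width=768).images[0]
#   test_image.save("test.png")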
def transcribe(audio):
    # Load the recording and pad/trim it to Whisper's 30-second window.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    # Make a log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # Detect the spoken language.
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
    # Decode the audio to text.
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text
    # Pass the transcription to ChatGPT.
    chatgpt_api = ChatGPT(secret_token)
    resp = chatgpt_api.send_message(result_text)
    out_result = resp['message']
    # Use ChatGPT's reply as the Stable Diffusion prompt.
    out_image = pipeline(out_result, height=768, width=768).images[0]
    return [result_text, out_result, out_image]
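# Example (local test without the UI), assuming a file "sample.wav" on disk:
#   text, reply, image = transcribe("sample.wav")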
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Image(label="Diffusion Output")
gr.Interface(
    title='Stable Diffusion with ChatGPT',
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
    ],
    outputs=[
        output_1, output_2, output_3,
    ],
    live=True,  # re-runs on every input change; costly with a diffusion model
).launch()
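# When running from a notebook (e.g. Colab), .launch(share=True) can be used
# instead to get a temporary public URL for the interface.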