# !pip install -q gradio

import os
os.system("pip install -q pyChatGPT")
os.system("pip install -q --upgrade git+https://github.com/huggingface/diffusers.git transformers accelerate scipy")
os.system("pip install git+https://github.com/openai/whisper.git")
import whisper
import gradio as gr 
import time
import warnings
import torch

from pyChatGPT import ChatGPT
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler

warnings.filterwarnings("ignore")

# ChatGPT session token, read from the environment (variable name is arbitrary) rather than hard-coded
secret_token = os.environ.get("CHATGPT_SESSION_TOKEN", "")

# Load the multilingual Whisper "base" checkpoint for speech-to-text
model = whisper.load_model("base")

# from diffusers import DiffusionPipeline

# pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
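# Instead of the default pipeline above, the pipeline below is built with an explicit Euler discrete scheduler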

model_id = "stabilityai/stable-diffusion-2-1"
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, 
                                                   subfolder="scheduler")

pipeline = StableDiffusionPipeline.from_pretrained(model_id,
                                                   scheduler=scheduler,
                                                   revision="fp16",
                                                   torch_dtype=torch.float16)
# The fp16 weights need a GPU; move the pipeline there when one is available
if torch.cuda.is_available():
    pipeline = pipeline.to("cuda")

def transcribe(audio):
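    # End-to-end pipeline: speech -> Whisper transcript -> ChatGPT reply -> Stable Diffusion image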

    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language (the probabilities are not used further here)
    _, probs = model.detect_language(mel)

    # decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text

    # Pass the transcribed text to ChatGPT
    chatgpt_api = ChatGPT(secret_token)
    resp = chatgpt_api.send_message(result_text)
    out_result = resp['message']

    # Use the ChatGPT reply as the prompt for Stable Diffusion
    out_image = pipeline(out_result, height=768, width=768).images[0]

    return [result_text, out_result, out_image]

output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Image(label="Diffusion Output")

gr.Interface(
    title = 'Stable Diffusion with ChatGPT', 
    fn=transcribe, 
    inputs=[
        gr.Audio(source="microphone", type="filepath")
    ],

    outputs=[
        output_1, output_2, output_3
    ],
    live=True).launch()
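
# Note: when running in a hosted notebook such as Colab, launch(share=True)
# can be used to expose the interface through a temporary public URL.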