nomnomnonono committed · Commit eb90ab7 · Parent(s): 18d25ba

initial

Browse files:
- .gitignore        +3  -0
- README.md         +6  -8
- app.py            +62 -0
- requirements.txt  +68 -0
- utils.py          +56 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+.venv
+*.ipynb
+__pycache__
README.md CHANGED
@@ -1,12 +1,10 @@
-
-
-
-
-
+title: Siri via Whisper and ChatGPT
+emoji: π
+colorFrom: red
+colorTo: purple
+python: 3.9.7
 sdk: gradio
 sdk_version: 3.23.0
 app_file: app.py
-pinned:
+pinned: true
----
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
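The new header is Hugging Face Spaces front matter, which the Hub parses as YAML to configure the Space. A quick sanity-check sketch (PyYAML is already pinned in requirements.txt; the emoji line is left out here because its character did not survive encoding):

import yaml

front_matter = """\
title: Siri via Whisper and ChatGPT
colorFrom: red
colorTo: purple
python: 3.9.7
sdk: gradio
sdk_version: 3.23.0
app_file: app.py
pinned: true
"""
config = yaml.safe_load(front_matter)
assert config["sdk"] == "gradio" and config["pinned"] is True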
app.py ADDED
@@ -0,0 +1,62 @@
import gradio as gr
from utils import answer_by_chat, transcribe

with gr.Blocks() as demo:
    gr.Markdown("Siri-like application via Whisper and ChatGPT")
    with gr.Tabs():
        # General tab: record speech, transcribe it, then send it to ChatGPT.
        with gr.TabItem(label="General"):
            with gr.Row():
                with gr.Column(scale=1):
                    api_key = gr.Textbox(label="Paste your own openai-api-key")
                    with gr.Row():
                        audio_input = gr.Audio(
                            source="microphone",
                            type="filepath",
                            label="Record from microphone",
                        )
                        audio_button = gr.Button("Transcribe")
                    audio_output = gr.Textbox()
                with gr.Column(scale=1):
                    chat_button = gr.Button("Questions to ChatGPT")
                    chat_audio_output = gr.Audio()
                    chat_text_output = gr.Textbox()
        # Setting tab: up to five (role, content) pairs prepended to the chat.
        with gr.TabItem(label="Setting"):
            gr.Markdown("Prompt Setting")
            with gr.Row():
                role1 = gr.Dropdown(["system", "user", "assistant"], value="system")
                content1 = gr.Textbox(value="あなたは役に立つアシスタントです。")  # "You are a helpful assistant."
            with gr.Row():
                role2 = gr.Dropdown(["system", "user", "assistant"])
                content2 = gr.Textbox()
            with gr.Row():
                role3 = gr.Dropdown(["system", "user", "assistant"])
                content3 = gr.Textbox()
            with gr.Row():
                role4 = gr.Dropdown(["system", "user", "assistant"])
                content4 = gr.Textbox()
            with gr.Row():
                role5 = gr.Dropdown(["system", "user", "assistant"])
                content5 = gr.Textbox()
    audio_button.click(
        transcribe, inputs=[audio_input], outputs=[audio_output], api_name="transcribe"
    )
    chat_button.click(
        answer_by_chat,
        inputs=[
            audio_output,
            role1,
            content1,
            role2,
            content2,
            role3,
            content3,
            role4,
            content4,
            role5,
            content5,
            api_key,
        ],
        outputs=[chat_text_output, chat_audio_output],
    )

demo.launch()
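Because the Transcribe handler is registered with api_name="transcribe", the running Space also exposes it as a named API endpoint. A minimal client sketch, assuming a recent gradio_client and a hypothetical Space id and audio file:

from gradio_client import Client

# Hypothetical Space id; point this at wherever the app is actually hosted.
client = Client("your-username/siri-via-whisper-and-chatgpt")

# Named endpoints are addressed as "/<api_name>"; the audio input is sent
# as a local file path because the Audio component uses type="filepath".
text = client.predict("sample.wav", api_name="/transcribe")
print(text)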
requirements.txt ADDED
@@ -0,0 +1,68 @@
aiofiles==23.1.0
aiohttp==3.8.4
aiosignal==1.3.1
altair==4.2.2
anyio==3.6.2
async-timeout==4.0.2
attrs==22.2.0
certifi==2022.12.7
charset-normalizer==3.1.0
click==8.1.3
contourpy==1.0.7
cycler==0.11.0
entrypoints==0.4
fastapi==0.95.0
ffmpy==0.3.0
filelock==3.10.7
fonttools==4.39.3
frozenlist==1.3.3
fsspec==2023.3.0
gradio==3.24.1
gradio_client==0.0.5
gTTS==2.3.1
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
huggingface-hub==0.13.3
idna==3.4
importlib-resources==5.12.0
Jinja2==3.1.2
jsonschema==4.17.3
kiwisolver==1.4.4
linkify-it-py==2.0.0
markdown-it-py==2.2.0
MarkupSafe==2.1.2
matplotlib==3.7.1
mdit-py-plugins==0.3.3
mdurl==0.1.2
multidict==6.0.4
numpy==1.24.2
openai==0.27.2
orjson==3.8.9
packaging==23.0
pandas==1.5.3
Pillow==9.5.0
pydantic==1.10.7
pydub==0.25.1
pyparsing==3.0.9
pyrsistent==0.19.3
python-dateutil==2.8.2
python-multipart==0.0.6
pytz==2023.3
PyYAML==6.0
requests==2.28.2
rfc3986==1.5.0
semantic-version==2.10.0
six==1.16.0
sniffio==1.3.0
starlette==0.26.1
toolz==0.12.0
tqdm==4.65.0
typing_extensions==4.5.0
uc-micro-py==1.0.1
urllib3==1.26.15
uvicorn==0.21.1
websockets==11.0
whisper==1.1.10  # NOTE: on PyPI "whisper" is Graphite's database library; utils.py needs "openai-whisper"
yarl==1.8.2
zipp==3.15.0
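One caution on the list above: on PyPI, "whisper" is Graphite's time-series database library, while the model code that utils.py imports ships as "openai-whisper". Both install a top-level module named whisper, so the import alone succeeds either way. A small sketch to fail fast if the wrong distribution got installed (the error message is mine):

import whisper

# OpenAI's Whisper exposes load_model(); Graphite's "whisper" package does not.
assert hasattr(whisper, "load_model"), "Wrong package: pip install openai-whisper"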
utils.py ADDED
@@ -0,0 +1,56 @@
import openai
import whisper
from gtts import gTTS

# The Whisper "small" checkpoint is loaded once at import time.
model = whisper.load_model("small")

# Default TTS language; transcribe() overwrites it with the detected language.
# (Without a default, speech_synthesis() would raise NameError if the chat
# button were pressed before anything had been transcribed.)
language = "en"


def transcribe(filepath):
    # Load the recording and pad/trim it to Whisper's 30-second window.
    audio = whisper.load_audio(filepath)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # Remember the detected language so the answer is spoken in the same one.
    _, probs = model.detect_language(mel)
    global language
    language = max(probs, key=probs.get)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text


def answer_by_chat(
    question,
    role1,
    content1,
    role2,
    content2,
    role3,
    content3,
    role4,
    content4,
    role5,
    content5,
    api_key,
):
    openai.api_key = api_key
    # Keep only the (role, content) pairs that were actually filled in.
    messages = [
        {"role": role, "content": content}
        for role, content in [
            [role1, content1],
            [role2, content2],
            [role3, content3],
            [role4, content4],
            [role5, content5],
        ]
        if role != "" and content != ""
    ]
    messages.append({"role": "user", "content": question})
    response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    response_text = response["choices"][0]["message"]["content"]
    response_audio = speech_synthesis(response_text)
    return response_text, response_audio


def speech_synthesis(sentence):
    # gTTS writes an MP3; Gradio's Audio output accepts the file path.
    tts = gTTS(sentence, lang=language)
    tts.save("tmp.mp3")
    return "tmp.mp3"