nomnomnonono committed
Commit · 9ecdb48
1 Parent(s): a06e71d
udpate
app.py
CHANGED
@@ -1,5 +1,8 @@
 import gradio as gr
-from
+from chat import CahtBOT
+
+chat = CahtBOT()
+
 
 with gr.Blocks() as demo:
     gr.Markdown("Siri-like application via Whisper and ChatGPT")
@@ -8,23 +11,23 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=1):
             api_key = gr.Textbox(label="Paste your own openai-api-key")
+            api_button = gr.Button("SetUp")
             with gr.Row():
                 audio_input = gr.Audio(
                     source="microphone",
-                    type="filepath",
                     label="Record from microphone",
                 )
                 audio_button = gr.Button("Transcribe")
                 audio_output = gr.Textbox()
-        with gr.Column(scale=1):
             chat_button = gr.Button("Questions to ChatGPT")
-
-
+        with gr.Column(scale=1):
+            chatbot = gr.Chatbot([], elem_id="chatbot").style(height=750)
     with gr.TabItem(label="Setting"):
         gr.Markdown("Prompt Setting")
+        language = gr.Dropdown(["Japanese", "English"], value="English")
         with gr.Row():
             role1 = gr.Dropdown(["system", "user", "assistant"], value="system")
-            content1 = gr.Textbox(value="
+            content1 = gr.Textbox(value="You're helpful assistant.")
         with gr.Row():
             role2 = gr.Dropdown(["system", "user", "assistant"])
             content2 = gr.Textbox()
@@ -37,13 +40,10 @@ with gr.Blocks() as demo:
         with gr.Row():
             role5 = gr.Dropdown(["system", "user", "assistant"])
             content5 = gr.Textbox()
-
-
-
-    chat_button.click(
-        answer_by_chat,
+
+    api_button.click(
+        chat.setup,
         inputs=[
-            audio_output,
             role1,
             content1,
             role2,
@@ -55,8 +55,20 @@ with gr.Blocks() as demo:
             role5,
             content5,
             api_key,
+            language,
         ],
-        outputs=
+        outputs=None,
+    )
+    audio_button.click(
+        chat.transcribe,
+        inputs=[audio_input],
+        outputs=[audio_output],
+        api_name="transcribe",
+    )
+    chat_button.click(
+        chat.answer_by_chat,
+        inputs=[chatbot, audio_output],
+        outputs=[chatbot],
     )
 
 demo.launch()
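For context on the new wiring: in Gradio, Button.click(fn, inputs=..., outputs=...) registers fn as the button's callback; the values of the components in inputs become fn's arguments, and fn's return value is written back to the components in outputs. A minimal self-contained sketch of the same pattern, with a hypothetical echo callback that is not part of this commit:

import gradio as gr

def echo(text):
    # Hypothetical stand-in for chat.setup / chat.transcribe /
    # chat.answer_by_chat: receives the input components' values and
    # returns what should be written to the output components.
    return text

with gr.Blocks() as demo:
    box_in = gr.Textbox(label="Input")
    btn = gr.Button("Run")
    box_out = gr.Textbox(label="Output")
    # Same wiring pattern as api_button / audio_button / chat_button above.
    btn.click(echo, inputs=[box_in], outputs=[box_out])

demo.launch()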
chat.py
ADDED
@@ -0,0 +1,67 @@
+import openai
+import soundfile
+
+# import whisper
+from gtts import gTTS
+
+dic = {"Japanese": "ja", "English": "en"}
+
+
+class CahtBOT:
+    def __init__(self):
+        self.messages = None
+
+    def setup(
+        self,
+        role1,
+        content1,
+        role2,
+        content2,
+        role3,
+        content3,
+        role4,
+        content4,
+        role5,
+        content5,
+        api_key,
+        language,
+    ):
+        openai.api_key = api_key
+        self.language = dic[language]
+        self.messages = [
+            {"role": role, "content": content}
+            for role, content in [
+                [role1, content1],
+                [role2, content2],
+                [role3, content3],
+                [role4, content4],
+                [role5, content5],
+            ]
+            if role != "" and content != ""
+        ]
+
+    def transcribe(self, audio):
+        sample_rate, data = audio
+        soundfile.write(file="tmp.wav", data=data, samplerate=sample_rate)
+        audio_file = open("tmp.wav", "rb")
+        transcript = openai.Audio.transcribe("whisper-1", audio_file)
+        return transcript.text
+
+    def answer_by_chat(self, history, question):
+        self.messages.append({"role": "user", "content": question})
+        history += [(question, None)]
+        response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo", messages=self.messages
+        )
+        response_text = response["choices"][0]["message"]["content"]
+        response_role = response["choices"][0]["message"]["role"]
+        response_audio = self.speech_synthesis(response_text)
+        self.messages.append({"role": response_role, "content": response_text})
+        # history += [(None, response_text)]
+        history += [(None, (response_audio,))]
+        return history
+
+    def speech_synthesis(self, sentence):
+        tts = gTTS(sentence, lang=self.language)
+        tts.save("tmp.wav")
+        return "tmp.wav"
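Two details of the new CahtBOT class worth noting. First, setup builds self.messages with a list comprehension that keeps only the (role, content) pairs where both fields are non-empty, so blank prompt rows in the Setting tab are skipped. A standalone sketch of that filtering logic (sample values are illustrative, not from the commit):

# Reproduction of the message-building logic in CahtBOT.setup.
pairs = [
    ["system", "You're helpful assistant."],
    ["user", ""],       # dropped: empty content
    ["", "orphaned"],   # dropped: empty role
]
messages = [
    {"role": role, "content": content}
    for role, content in pairs
    if role != "" and content != ""
]
print(messages)
# [{'role': 'system', 'content': "You're helpful assistant."}]

Second, answer_by_chat appends the bot reply to the history as a one-element tuple, (response_audio,); in Gradio's Chatbot, a (filepath,) tuple in place of a string is rendered as playable media rather than text, which is how the synthesized speech ends up in the chat window.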
requirements.txt
CHANGED
@@ -6,6 +6,7 @@ anyio==3.6.2
 async-timeout==4.0.2
 attrs==22.2.0
 certifi==2022.12.7
+cffi==1.15.1
 charset-normalizer==3.1.0
 click==8.1.3
 contourpy==1.0.7
@@ -51,6 +52,7 @@ orjson==3.8.9
 packaging==23.0
 pandas==1.5.3
 Pillow==9.5.0
+pycparser==2.21
 pydantic==1.10.7
 pydub==0.25.1
 pyparsing==3.0.9
@@ -65,6 +67,7 @@ rfc3986==1.5.0
 semantic-version==2.10.0
 six==1.16.0
 sniffio==1.3.0
+soundfile==0.12.1
 starlette==0.26.1
 sympy==1.11.1
 tiktoken==0.3.1
@@ -76,6 +79,5 @@ uc-micro-py==1.0.1
 urllib3==1.26.15
 uvicorn==0.21.1
 websockets==11.0
-whisper==1.1.10
 yarl==1.8.2
 zipp==3.15.0
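The three added pins are consistent with the switch to soundfile for writing the microphone buffer: soundfile binds libsndfile through cffi, which in turn requires pycparser. The whisper pin is dropped because transcription now goes through the OpenAI API (openai.Audio.transcribe) rather than a local model.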
utils.py
DELETED
@@ -1,56 +0,0 @@
-import openai
-import whisper
-from gtts import gTTS
-
-model = whisper.load_model("small")
-
-
-def transcribe(filepath):
-    audio = whisper.load_audio(filepath)
-    audio = whisper.pad_or_trim(audio)
-    mel = whisper.log_mel_spectrogram(audio).to(model.device)
-    _, probs = model.detect_language(mel)
-    global language
-    language = max(probs, key=probs.get)
-    options = whisper.DecodingOptions(fp16=False)
-    result = whisper.decode(model, mel, options)
-    return result.text
-
-
-def answer_by_chat(
-    question,
-    role1,
-    content1,
-    role2,
-    content2,
-    role3,
-    content3,
-    role4,
-    content4,
-    role5,
-    content5,
-    api_key,
-):
-    openai.api_key = api_key
-    messages = [
-        {"role": role, "content": content}
-        for role, content in [
-            [role1, content1],
-            [role2, content2],
-            [role3, content3],
-            [role4, content4],
-            [role5, content5],
-        ]
-        if role != "" and content != ""
-    ]
-    messages.append({"role": "user", "content": question})
-    response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
-    response_text = response["choices"][0]["message"]["content"]
-    response_audio = speech_synthesis(response_text)
-    return response_text, response_audio
-
-
-def speech_synthesis(sentence):
-    tts = gTTS(sentence, lang=language)
-    tts.save("tmp.mp3")
-    return "tmp.mp3"