mipbkhn committed on
Commit
cbe6c2e
1 Parent(s): e3aee65
Files changed (5) hide show
  1. .gitignore +2 -0
  2. app.ipynb +92 -0
  3. app.py +58 -0
  4. gradio_article.md +16 -0
  5. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ tmp.ipynb
2
+ tmp.mp3
app.ipynb ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "playsound is relying on another python subprocess. Please use `pip install pygobject` if you want playsound to run more efficiently.\n"
13
+ ]
14
+ }
15
+ ],
16
+ "source": [
17
+ "import openai\n",
18
+ "from playsound import playsound\n",
19
+ "from gtts import gTTS\n",
20
+ "import speech_recognition as sr\n",
21
+ "import gradio as gr\n",
22
+ "\n",
23
+ "openai.api_key = \"<YOUR_OPENAI_API_KEY>\"  # redacted: never commit a real key; load it from an environment variable or secret store\n",
24
+ "# will hide the api key:\n",
25
+ "# import openai_secret_manager\n",
26
+ "# assert \"openai\" in openai_secret_manager.get_services()\n",
27
+ "# secrets = openai_secret_manager.get_secret(\"openai\")\n",
28
+ "# openai.api_key = secrets[\"api_key\"]"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 2,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "def generate_response(prompt):\n",
38
+ " prompt = (f\"{prompt}\")\n",
39
+ "\n",
40
+ " response = openai.ChatCompletion.create(\n",
41
+ " model=\"gpt-3.5-turbo\",\n",
42
+ " messages=[\n",
43
+ " {\"role\": \"user\", \"content\": f\"{prompt}\"},\n",
44
+ " ])\n",
45
+ "\n",
46
+ " message = response.choices[0]['message']['content']\n",
47
+ " return message"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 8,
53
+ "metadata": {},
54
+ "outputs": [
55
+ {
56
+ "data": {
57
+ "text/plain": [
58
+ "\"I apologize, but as a language model AI, I don't have access to real-time information. Could you please check the time on your device or ask a nearby clock?\""
59
+ ]
60
+ },
61
+ "execution_count": 8,
62
+ "metadata": {},
63
+ "output_type": "execute_result"
64
+ }
65
+ ],
66
+ "source": [
67
+ "generate_response(\"What time is it?\")"
68
+ ]
69
+ }
70
+ ],
71
+ "metadata": {
72
+ "kernelspec": {
73
+ "display_name": "base",
74
+ "language": "python",
75
+ "name": "python3"
76
+ },
77
+ "language_info": {
78
+ "codemirror_mode": {
79
+ "name": "ipython",
80
+ "version": 3
81
+ },
82
+ "file_extension": ".py",
83
+ "mimetype": "text/x-python",
84
+ "name": "python",
85
+ "nbconvert_exporter": "python",
86
+ "pygments_lexer": "ipython3",
87
+ "version": "3.8.13"
88
+ }
89
+ },
90
+ "nbformat": 4,
91
+ "nbformat_minor": 2
92
+ }
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
import openai
import speech_recognition as sr
from gtts import gTTS
from playsound import playsound
6
+
7
# Take the key from the environment: the bare name `api_key` was never defined
# in this module (NameError at import), and a literal key must not be committed.
# If the variable is unset this is None and the failure surfaces on the first
# API call instead of at import time.
openai.api_key = os.environ.get("OPENAI_API_KEY")
8
+
9
def generate_response(prompt):
    """Ask the chat model a single question and return its reply text.

    ``prompt`` is rendered through an f-string and sent as the only message,
    with role "user", to the "gpt-3.5-turbo" model. The content of the first
    returned choice is handed back to the caller.
    """
    user_text = f"{prompt}"
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": user_text}],
    )
    first_choice = completion.choices[0]
    return first_choice['message']['content']
20
+
21
+ r = sr.Recognizer()
22
+ from pydub import AudioSegment
23
def transcribe(audio, lang):
    """Answer a spoken question with synthesized speech.

    The recording is transcribed with ``r.recognize_google``, the transcript
    is sent to ``generate_response``, and the reply is rendered to an MP3
    file with gTTS.

    Args:
        audio: Path to the recorded audio file, or ``None``/empty when
            nothing has been recorded.
        lang: Language code passed both to the recognizer and to gTTS.

    Returns:
        Path to an MP3 file containing the spoken reply, or ``None`` when no
        audio was supplied.
    """
    # NOTE(review): the interface is created with live=True, so this handler
    # can presumably fire without a recording (e.g. after the input is
    # cleared). Bail out instead of failing inside sr.AudioFile(None).
    if not audio:
        return None

    import tempfile

    # Keep the recorded data under its own name rather than rebinding the
    # `audio` parameter.
    with sr.AudioFile(audio) as source:
        recording = r.record(source)
    question = r.recognize_google(recording, language=lang)
    answer = generate_response(question)

    # One unique file per call: a shared "tmp.mp3" would be overwritten when
    # two requests overlap. The handle is closed before gTTS writes to the
    # path so this also works on platforms that lock open files.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
        out = handle.name
    gTTS(text=answer, lang=lang).save(out)
    return out
31
+
32
# Long-form description shown with the interface, loaded from the markdown file.
with open('gradio_article.md') as article_file:
    article = article_file.read()

# Keyword arguments forwarded unchanged to gr.Interface.
interface_options = dict(
    title="Smart GPT",
    description="Let's have a chat! Talk to me, and I'll respond in a jiffy",
    article=article,
    layout="horizontal",
    theme="default",
)

# Input components: a microphone recording delivered as a file path, plus a
# dropdown for the language code. The output is a plain audio component.
inputs = gr.Audio(source="microphone", type="filepath")
lang = gr.Dropdown(choices=["en", "vi", "nl"], value="en")
outputs = "audio"

# Build the live interface around transcribe() and start serving it.
demo = gr.Interface(
    fn=transcribe,
    inputs=[inputs, lang],
    outputs=outputs,
    live=True,
    **interface_options,
)
demo.launch()
50
+
51
+ # TODO
52
+ # Custom voice
53
+ # VALL-E
54
+ # https://cloud.google.com/text-to-speech/custom-voice/docs/quickstart
55
+ # Mozilla TTS
56
+ # OpenSeq2Seq
57
+ # Best VN: Vbee, FPT
58
+ # Elevenlabs for English
gradio_article.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Description
2
+ Hey there,
3
+
4
+ I made this cool app, mainly for my little son. It's a fun voice-controlled thing where he can chat with it and get spoken answers. For example, he can ask it to tell a story or find out stuff he's curious about.
5
+
6
+ Here's how it works:
7
+
8
+ I used a bunch of libraries like openai, playsound, gtts, speech_recognition, and gradio. The app records what you say into the microphone, transcribes it to text, and sends that text to OpenAI for an answer. Then it turns the answer into speech and plays it back to you.
9
+
10
+ There's a "transcribe" function that takes audio and a language code, listens to what you say, asks OpenAI for answers, turns those answers into speech, and stores it temporarily. Then it gives you the path to the spoken response.
11
+
12
+ The app also has a simple user interface made with gradio. You can pick a language and talk into the microphone. The app transcribes your words, gets answers, and plays them back to you.
13
+
14
+ I'm thinking of adding even more fun stuff, like custom voices from VALL-E, Mozilla TTS, OpenSeq2Seq, Vbee, FPT, or Elevenlabs, so that I can train my own voice and use it for the generated responses. Isn't that interesting?
15
+
16
+ This is a fun and easy way to have voice conversations and learn new things. So, it's not just for my son; anyone can enjoy it!
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio==3.39.0
2
+ gTTS==2.3.1
3
+ openai==0.27.4
4
+ playsound==1.3.0
5
+ SpeechRecognition==3.9.0
6
+ transformers
7
+ torch
8
+ # pygobject
9
+ ffmpeg