asiffarhankhan committed
Commit 66eba67
1 Parent(s): 78e7c24

Add update to buzz user on being idle

Files changed (5)
  1. .gitignore +1 -0
  2. app.py +38 -94
  3. app_utils.py +90 -0
  4. assets/char_poses_base64.py +0 -0
  5. assets/timeout_audio.mp3 +0 -0
.gitignore CHANGED
@@ -3,3 +3,4 @@ __pycache__
 .chroma
 initialize.sh
 conversations.log
+custom_gpt_voice assistant_demo.mp4
app.py CHANGED
@@ -1,100 +1,45 @@
-
 import os
-import boto3
+import nltk
 import openai
-import whisper
-import logging
-import base64
+import time
 import gradio as gr
-from io import BytesIO
+from threading import Thread
 
-from langchain import OpenAI
-from langchain.chains import RetrievalQA
-from langchain.vectorstores import Chroma
-from langchain.document_loaders import DirectoryLoader
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.text_splitter import CharacterTextSplitter
-from assets.char_poses_base64 import idle_html_base_64, thinking_html_base_64, talking_html_base64
+from assets.char_poses_base64 import (
+    CHAR_IDLE_HTML, CHAR_THINKING_HTML, CHAR_TALKING_HTML)
 
-logging.basicConfig(level="INFO",
-                    filename='conversations.log',
-                    filemode='a',
-                    format='%(asctime)s %(message)s',
-                    datefmt='%H:%M:%S')
-
-logger = logging.getLogger('voice_agent')
-
+from app_utils import (
+    get_chat_history, initialize_knowledge_base,
+    text_to_speech_gen, logging, buzz_user)
 
 global FUNC_CALL
 FUNC_CALL = 0
 
-OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
-AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
-AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
-AWS_REGION_NAME = 'ap-south-1'
+global BUZZ_TIMEOUT
+BUZZ_TIMEOUT = 60
 
 GENERAL_RSPONSE_TRIGGERS = ["I don't understand the question.", "I don't know", "Hello, my name is", "mentioned in the context provided"]
 MESSAGES = [{"role": "system", "content": "You are a helpful assistant.."}]
 
-CHAR_IDLE = f'<img src="{idle_html_base_64}"></img>'
-CHAR_TALKING = f'<img src="{talking_html_base64}"></img>'
-CHAR_THINKING = f'<img src="{thinking_html_base_64}"></img>'
+LOGGER = logging.getLogger('voice_agent')
 AUDIO_HTML = ''
 
 # Uncomment If this is your first Run:
-import nltk
 nltk.download('averaged_perceptron_tagger')
+conv_model, voice_model = initialize_knowledge_base()
 
 
-def initialize_knowledge_base():
-
-    loader = DirectoryLoader('profiles', glob='**/*.txt')
-    docs = loader.load()
-
-    char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    doc_texts = char_text_splitter.split_documents(docs)
-
-    openAI_embeddings = OpenAIEmbeddings()
-    vStore = Chroma.from_documents(doc_texts, openAI_embeddings)
-
-    conv_model = RetrievalQA.from_chain_type(
-        llm=OpenAI(),
-        chain_type="stuff",
-        retriever=vStore.as_retriever(
-            search_kwargs={"k": 1}
-        )
-    )
-    voice_model = whisper.load_model("tiny")
-
-    return conv_model, voice_model
-
-
-def text_to_speech_gen(answer):
-
-    polly = boto3.client('polly',
-                         aws_access_key_id=AWS_ACCESS_KEY_ID,
-                         aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
-                         region_name=AWS_REGION_NAME)
-
-    response = polly.synthesize_speech(
-        Text=answer,
-        VoiceId='Matthew',
-        OutputFormat='mp3',
-        Engine = "neural")
-
-    audio_stream = response['AudioStream'].read()
-    audio_html = audio_to_html(audio_stream)
-
-    return audio_html
-
-
-def audio_to_html(audio_bytes):
-    audio_io = BytesIO(audio_bytes)
-    audio_io.seek(0)
-    audio_base64 = base64.b64encode(audio_io.read()).decode("utf-8")
-    audio_html = f'<audio src="data:audio/mpeg;base64,{audio_base64}" controls autoplay></audio>'
-
-    return audio_html
+def idle_timer():
+    global BUZZ_TIMEOUT
+
+    while True:
+        print('started countdown')
+        time.sleep(BUZZ_TIMEOUT)
+        buzz_user()
+
+        if BUZZ_TIMEOUT == 80:
+            time.sleep(BUZZ_TIMEOUT)
+            BUZZ_TIMEOUT = 60
 
 
 def update_img():
@@ -102,17 +47,9 @@ def update_img():
     FUNC_CALL += 1
 
     if FUNC_CALL % 2== 0:
-        CHARACTER_STATE = CHAR_TALKING
+        return CHAR_TALKING_HTML
     else:
-        CHARACTER_STATE = CHAR_THINKING
-
-    return CHARACTER_STATE
-
-
-def user(user_message, history):
-    return "", history + [[user_message, None]]
-
-conv_model, voice_model = initialize_knowledge_base()
+        return CHAR_THINKING_HTML
 
 
 def get_response(history, audio_input):
@@ -120,6 +57,9 @@ def get_response(history, audio_input):
     query_type = 'text'
     question =history[-1][0]
 
+    global BUZZ_TIMEOUT
+    BUZZ_TIMEOUT = 80
+
     if not question:
        if audio_input:
            query_type = 'audio'
@@ -130,8 +70,8 @@ def get_response(history, audio_input):
        else:
            return None, None
 
-    logger.info("\nquery_type: %s", query_type)
-    logger.info("query_text: %s", question)
+    LOGGER.info("\nquery_type: %s", query_type)
+    LOGGER.info("query_text: %s", question)
     print('\nquery_type:', query_type)
     print('\nquery_text:', question)
 
@@ -139,7 +79,7 @@ def get_response(history, audio_input):
        question = 'hello'
 
     answer = conv_model.run(question)
-    logger.info("\ndocument_response: %s", answer)
+    LOGGER.info("\ndocument_response: %s", answer)
     print('\ndocument_response:', answer)
 
     for trigger in GENERAL_RSPONSE_TRIGGERS:
@@ -154,7 +94,7 @@ def get_response(history, audio_input):
            )
            answer = chat.choices[0].message.content
            MESSAGES.append({"role": "assistant", "content": answer})
-           logger.info("general_response: %s", answer)
+           LOGGER.info("general_response: %s", answer)
            print('\ngeneral_response:', answer)
 
     AUDIO_HTML = text_to_speech_gen(answer)
@@ -162,12 +102,14 @@ def get_response(history, audio_input):
 
     return history, AUDIO_HTML
 
+buzz_usr_proc = Thread(target=idle_timer)
 
 with gr.Blocks(title="Your Assistance Pal!") as demo:
+
     with gr.Row():
        output_html = gr.HTML(label="Felix's Voice", value=AUDIO_HTML)
        output_html.visible = False
-       assistant_character = gr.HTML(label=None, value=CHAR_IDLE, show_label=False)
+       assistant_character = gr.HTML(label=None, value=CHAR_IDLE_HTML, show_label=False)
        with gr.Column(scale=0.1):
            chatbot = gr.Chatbot(label='Send a text or a voice input').style(height=285)
            with gr.Row():
@@ -176,14 +118,16 @@ with gr.Blocks(title="Your Assistance Pal!") as demo:
                audio_input = gr.Audio(source="microphone", type='filepath', show_label=False).style(container=False)
                button = gr.Button(value="Send")
 
-    msg.submit(user, [msg, chatbot], [msg, chatbot]
+    msg.submit(get_chat_history, [msg, chatbot], [msg, chatbot]
               ).then(update_img, outputs=[assistant_character]
               ).then(get_response, [chatbot, audio_input], [chatbot, output_html]
               ).then(update_img, outputs=[assistant_character])
 
-    button.click(user, [msg, chatbot], [msg, chatbot]
+    button.click(get_chat_history, [msg, chatbot], [msg, chatbot]
               ).then(update_img, outputs=[assistant_character]
               ).then(get_response, [chatbot, audio_input], [chatbot, output_html]
               ).then(update_img, outputs=[assistant_character])
 
-demo.launch(debug=False, favicon_path='assets/favicon.png', show_api=False, share=False)
+buzz_usr_proc.start()
+
+demo.launch(debug=False, favicon_path='assets/favicon.png', show_api=False, share=True)
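The idle-nudge flow added above comes down to one background thread: sleep for BUZZ_TIMEOUT seconds, play a reminder, and allow a longer grace period (80 s) right after the user has been active. Below is a minimal, self-contained sketch of the same pattern; the print stand-in for buzz_user and the trailing sleep are illustrative only (the real helper plays assets/timeout_audio.mp3, and in app.py demo.launch() keeps the process alive instead).

import time
from threading import Thread

BUZZ_TIMEOUT = 60  # seconds of inactivity before nudging the user


def buzz_user():
    # Stand-in for app_utils.buzz_user(), which plays assets/timeout_audio.mp3.
    print("Still there? Send a message or a voice note.")


def idle_timer():
    # Background loop: sleep through the timeout, nudge the user, repeat.
    # get_response() bumps BUZZ_TIMEOUT to 80 after each query, so the extra
    # sleep below grants a longer grace period before falling back to 60 s.
    global BUZZ_TIMEOUT
    while True:
        time.sleep(BUZZ_TIMEOUT)
        buzz_user()
        if BUZZ_TIMEOUT == 80:
            time.sleep(BUZZ_TIMEOUT)
            BUZZ_TIMEOUT = 60


buzz_usr_proc = Thread(target=idle_timer, daemon=True)
buzz_usr_proc.start()
time.sleep(130)  # keep this standalone sketch running long enough to see one nudge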
app_utils.py ADDED
@@ -0,0 +1,90 @@
+import os
+import whisper
+from io import BytesIO
+import base64
+import boto3
+from pydub import AudioSegment
+from pydub.playback import play
+import logging
+
+from langchain import OpenAI
+from langchain.chains import RetrievalQA
+from langchain.vectorstores import Chroma
+from langchain.document_loaders import DirectoryLoader
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+
+
+OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
+AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
+AWS_REGION_NAME = 'ap-south-1'
+
+
+logging.basicConfig(level="INFO",
+                    filename='conversations.log',
+                    filemode='a',
+                    format='%(asctime)s %(message)s',
+                    datefmt='%H:%M:%S')
+
+
+def buzz_user():
+    input_prompt = AudioSegment.from_mp3('assets/timeout_audio.mp3')
+    play(input_prompt)
+
+
+def initialize_knowledge_base():
+
+    loader = DirectoryLoader('profiles', glob='**/*.txt')
+    docs = loader.load()
+
+    char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    doc_texts = char_text_splitter.split_documents(docs)
+
+    openAI_embeddings = OpenAIEmbeddings()
+    vStore = Chroma.from_documents(doc_texts, openAI_embeddings)
+
+    conv_model = RetrievalQA.from_chain_type(
+        llm=OpenAI(),
+        chain_type="stuff",
+        retriever=vStore.as_retriever(
+            search_kwargs={"k": 1}
+        )
+    )
+    voice_model = whisper.load_model("tiny")
+
+    return conv_model, voice_model
+
+
+def text_to_speech_gen(answer):
+
+    polly = boto3.client('polly',
+                         aws_access_key_id=AWS_ACCESS_KEY_ID,
+                         aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+                         region_name=AWS_REGION_NAME)
+
+    response = polly.synthesize_speech(
+        Text=answer,
+        VoiceId='Matthew',
+        OutputFormat='mp3',
+        Engine = "neural")
+
+    audio_stream = response['AudioStream'].read()
+    audio_html = audio_to_html(audio_stream)
+
+    return audio_html
+
+
+def audio_to_html(audio_bytes):
+    audio_io = BytesIO(audio_bytes)
+    audio_io.seek(0)
+    audio_base64 = base64.b64encode(audio_io.read()).decode("utf-8")
+    audio_html = f'<audio src="data:audio/mpeg;base64,{audio_base64}" controls autoplay></audio>'
+
+    return audio_html
+
+
+def get_chat_history(user_message, history):
+    return "", history + [[user_message, None]]
+
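For a quick manual check of the new helpers, a small sketch (assuming the module's dependencies are installed and it is run from the repo root, where the MP3 added in this commit lives):

from app_utils import audio_to_html, buzz_user

# Build the autoplaying <audio> tag that app.py routes into its hidden gr.HTML output.
with open('assets/timeout_audio.mp3', 'rb') as f:
    tag = audio_to_html(f.read())
assert tag.startswith('<audio src="data:audio/mpeg;base64,')

# Play the idle-timeout prompt locally; pydub needs ffmpeg on PATH to decode MP3.
buzz_user()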
assets/char_poses_base64.py CHANGED
The diff for this file is too large to render. See raw diff
 
assets/timeout_audio.mp3 ADDED
Binary file (21.9 kB).