Commit 162d5c8, committed by sohojoe
Parent(s): 9740bc5
.vscode/launch.json CHANGED
@@ -8,7 +8,8 @@
             "program": "/opt/miniconda3/envs/streamlit/bin/streamlit",
             "args": [
                 "run",
-                "app.py"
+                // "app.py"
+                "d_app.py"
             ]
         }
     ]
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    },
+    "python.formatting.provider": "none"
+}
app.py CHANGED
@@ -15,9 +15,6 @@ from sample_utils.turn import get_ice_servers
 import json
 from typing import List
 
-from vosk import SetLogLevel, Model, KaldiRecognizer
-SetLogLevel(-1) # mutes vosk verbosity
-
 from dotenv import load_dotenv
 load_dotenv()
 
@@ -114,57 +111,10 @@ async def main():
 
     playing = st.checkbox("Playing", value=True)
 
-    def load_vosk (model='small'):
-        # load vosk model
-        # get path of current file
-        current_file_path = os.path.abspath(__file__)
-        current_directory = os.path.dirname(current_file_path)
-        _path = os.path.join(current_directory, 'models', 'vosk', model)
-        model_voice = Model(_path)
-        recognizer = KaldiRecognizer(model_voice, system_one['audio_bit_rate'])
-        return recognizer
-
-    vask = load_vosk()
-
     def handle_audio_frame(frame):
         # if self.vosk.AcceptWaveform(data):
         pass
 
-
-    def do_work(data: bytearray) -> tuple[str, bool]:
-        text = ''
-        speaker_finished = False
-        if vask.AcceptWaveform(data):
-            result = vask.Result()
-            result_json = json.loads(result)
-            text = result_json['text']
-            speaker_finished = True
-        else:
-            result = vask.PartialResult()
-            result_json = json.loads(result)
-            text = result_json['partial']
-        return text, speaker_finished
-
-
-    audio_frames_deque_lock = threading.Lock()
-    audio_frames_deque: deque = deque([])
-
-    video_frames_deque_lock = threading.Lock()
-    video_frames_deque: deque = deque([])
-
-    async def queued_video_frames_callback(
-        frames: List[av.AudioFrame],
-    ) -> av.AudioFrame:
-        with video_frames_deque_lock:
-            video_frames_deque.extend(frames)
-        return frames
-
-    async def queued_audio_frames_callback(
-        frames: List[av.AudioFrame],
-    ) -> av.AudioFrame:
-        with audio_frames_deque_lock:
-            audio_frames_deque.extend(frames)
-
         # create frames to be returned.
         new_frames = []
         for frame in frames:
@@ -187,6 +137,7 @@ async def main():
     system_one_audio_status.write("Initializing chat pipeline")
     from chat_pipeline import ChatPipeline
     chat_pipeline = ChatPipeline()
+    await chat_pipeline.start()
 
     system_one_audio_status.write("Initializing CLIP templates")
 
@@ -244,79 +195,78 @@ async def main():
         return top_3
 
     while True:
-        # await chat_pipeline.start()
-        # await chat_pipeline.enqueue(text)
-        if webrtc_ctx.state.playing:
-            # handle video
-            video_frames = []
-            with video_frames_deque_lock:
-                while len(video_frames_deque) > 0:
-                    frame = video_frames_deque.popleft()
-                    video_frames.append(frame)
-            get_embeddings = False
-            get_embeddings |= current_video_embedding is None
-            current_time = time.monotonic()
-            elapsed_time = current_time - current_video_embedding_timestamp
-            get_embeddings |= elapsed_time > 1. / system_one['vision_embeddings_fps']
-            if get_embeddings and len(video_frames) > 0:
-                current_video_embedding_timestamp = current_time
-                current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
-
-                emotions_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
-                engagement_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
-                present_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
-
-                # table_content = "**System 1 Video:**\n\n"
-                table_content = "| System 1 Video | |\n| --- | --- |\n"
-                table_content += f"| Present | {present_top_3} |\n"
-                table_content += f"| Emotion | {emotions_top_3} |\n"
-                table_content += f"| Engagement | {engagement_top_3} |\n"
-                system_one_video_output.markdown(table_content)
-                # system_one_video_output.markdown(f"**System 1 Video:** \n [Emotion: {emotions_top_3}], \n [Engagement: {engagement_top_3}], \n [Present: {present_top_3}] ")
-                # for similarity, image_label in similarity_image_label:
-                #     print (f"{similarity} {image_label}")
-
-            # handle audio
-            audio_frames = []
-            with audio_frames_deque_lock:
-                while len(audio_frames_deque) > 0:
-                    frame = audio_frames_deque.popleft()
-                    audio_frames.append(frame)
-
-            if len(audio_frames) == 0:
-                time.sleep(0.1)
-                system_one_audio_status.write("No frame arrived.")
-                continue
-
-            system_one_audio_status.write("Running. Say something!")
-
-            for audio_frame in audio_frames:
-                sound = pydub.AudioSegment(
-                    data=audio_frame.to_ndarray().tobytes(),
-                    sample_width=audio_frame.format.bytes,
-                    frame_rate=audio_frame.sample_rate,
-                    channels=len(audio_frame.layout.channels),
-                )
-                sound = sound.set_channels(1)
-                sound = sound.set_frame_rate(system_one['audio_bit_rate'])
-                sound_chunk += sound
-
-            if len(sound_chunk) > 0:
-                buffer = np.array(sound_chunk.get_array_of_samples())
-                text, speaker_finished = do_work(buffer.tobytes())
-                system_one_audio_output.markdown(f"**System 1 Audio:** {text}")
-                if speaker_finished and len(text) > 0:
-                    system_one_audio_history.append(text)
-                    if len(system_one_audio_history) > 10:
-                        system_one_audio_history = system_one_audio_history[-10:]
-                    table_content = "| System 1 Audio History |\n| --- |\n"
-                    table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
-                    system_one_audio_history_output.markdown(table_content)
-                sound_chunk = pydub.AudioSegment.empty()
-
-        else:
-            system_one_audio_status.write("Stopped.")
-            break
+        try:
+            if webrtc_ctx.state.playing:
+                # handle video
+                video_frames = []
+                with video_frames_deque_lock:
+                    while len(video_frames_deque) > 0:
+                        frame = video_frames_deque.popleft()
+                        video_frames.append(frame)
+                get_embeddings = False
+                get_embeddings |= current_video_embedding is None
+                current_time = time.monotonic()
+                elapsed_time = current_time - current_video_embedding_timestamp
+                get_embeddings |= elapsed_time > 1. / system_one['vision_embeddings_fps']
+                if get_embeddings and len(video_frames) > 0:
+                    current_video_embedding_timestamp = current_time
+                    current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
+
+                    emotions_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
+                    engagement_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
+                    present_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
+
+                    # table_content = "**System 1 Video:**\n\n"
+                    table_content = "| System 1 Video | |\n| --- | --- |\n"
+                    table_content += f"| Present | {present_top_3} |\n"
+                    table_content += f"| Emotion | {emotions_top_3} |\n"
+                    table_content += f"| Engagement | {engagement_top_3} |\n"
+                    system_one_video_output.markdown(table_content)
+                    # system_one_video_output.markdown(f"**System 1 Video:** \n [Emotion: {emotions_top_3}], \n [Engagement: {engagement_top_3}], \n [Present: {present_top_3}] ")
+                    # for similarity, image_label in similarity_image_label:
+                    #     print (f"{similarity} {image_label}")
+
+
+                if len(audio_frames) == 0:
+                    time.sleep(0.1)
+                    system_one_audio_status.write("No frame arrived.")
+                    continue
+
+                system_one_audio_status.write("Running. Say something!")
+
+                for audio_frame in audio_frames:
+                    sound = pydub.AudioSegment(
+                        data=audio_frame.to_ndarray().tobytes(),
+                        sample_width=audio_frame.format.bytes,
+                        frame_rate=audio_frame.sample_rate,
+                        channels=len(audio_frame.layout.channels),
+                    )
+                    sound = sound.set_channels(1)
+                    sound = sound.set_frame_rate(system_one['audio_bit_rate'])
+                    sound_chunk += sound
+
+                if len(sound_chunk) > 0:
+                    buffer = np.array(sound_chunk.get_array_of_samples())
+                    text, speaker_finished = do_work(buffer.tobytes())
+                    system_one_audio_output.markdown(f"**System 1 Audio:** {text}")
+                    if speaker_finished and len(text) > 0:
+                        system_one_audio_history.append(text)
+                        if len(system_one_audio_history) > 10:
+                            system_one_audio_history = system_one_audio_history[-10:]
+                        table_content = "| System 1 Audio History |\n| --- |\n"
+                        table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
+                        system_one_audio_history_output.markdown(table_content)
+                        await chat_pipeline.enqueue(text)
+                    sound_chunk = pydub.AudioSegment.empty()
+
+            else:
+                system_one_audio_status.write("Stopped.")
+                break
+        except KeyboardInterrupt:
+            print("Pipeline interrupted by user")
+        except Exception as e:
+            print(f"An error occurred: {e}")
+
 
 if __name__ == "__main__":
     asyncio.run(main())
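Both app.py above and the new d_app.py below convert each incoming WebRTC audio frame to mono PCM at the recognizer's bit rate before handing bytes to Vosk. A minimal standalone sketch of that conversion, assuming frames is a list of av.AudioFrame objects; the helper name and the 16000 Hz default are illustrative, not from the repo:

    import numpy as np
    import pydub

    def frames_to_pcm_bytes(frames, target_rate=16000) -> bytes:
        # Merge av.AudioFrame objects into one mono segment at target_rate,
        # then return raw PCM bytes suitable for KaldiRecognizer.AcceptWaveform.
        chunk = pydub.AudioSegment.empty()
        for frame in frames:
            sound = pydub.AudioSegment(
                data=frame.to_ndarray().tobytes(),
                sample_width=frame.format.bytes,
                frame_rate=frame.sample_rate,
                channels=len(frame.layout.channels),
            )
            chunk += sound.set_channels(1).set_frame_rate(target_rate)
        return np.array(chunk.get_array_of_samples()).tobytes()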
d_app.py ADDED
@@ -0,0 +1,174 @@
+import asyncio
+from collections import deque
+import os
+import threading
+import time
+import traceback
+import av
+import numpy as np
+import streamlit as st
+from streamlit_webrtc import WebRtcMode, webrtc_streamer
+import pydub
+import torch
+# import av
+# import cv2
+from sample_utils.turn import get_ice_servers
+import json
+from typing import List
+
+from vosk import SetLogLevel, Model, KaldiRecognizer
+SetLogLevel(-1) # mutes vosk verbosity
+
+from dotenv import load_dotenv
+load_dotenv()
+
+webrtc_ctx = None
+
+
+async def main():
+
+    system_one_audio_status = st.empty()
+
+    playing = st.checkbox("Playing", value=True)
+
+    system_one_audio_status.write("Initializing streaming")
+    system_one_audio_output = st.empty()
+
+    system_one_video_output = st.empty()
+
+    system_one_audio_history = []
+    system_one_audio_history_output = st.empty()
+
+    # Initialize resources if not already done
+    print("000")
+    system_one_audio_status.write("Initializing streaming")
+    if "streamlit_av_queue" not in st.session_state:
+        print("001")
+        from streamlit_av_queue import StreamlitAVQueue
+        st.session_state.streamlit_av_queue = StreamlitAVQueue()
+
+    if "speech_to_text_vosk" not in st.session_state:
+        print("002")
+        from speech_to_text_vosk import SpeechToTextVosk
+        st.session_state.speech_to_text_vosk = SpeechToTextVosk()
+
+    from chat_pipeline import ChatPipeline
+    if "chat_pipeline" not in st.session_state:
+        print("003")
+        # from chat_pipeline import ChatPipeline
+        # st.session_state.chat_pipeline = ChatPipeline()
+        # await st.session_state.chat_pipeline.start()
+        st.session_state.chat_pipeline = ChatPipeline()
+        await st.session_state.chat_pipeline.start()
+
+    if "debug_queue" not in st.session_state:
+        st.session_state.debug_queue = [
+            # "hello, how are you today?",
+            # "hmm, interesting, tell me more about that.",
+        ]
+
+    system_one_audio_status.write("resources referenced")
+    print("010")
+
+
+
+    system_one_audio_status.write("Initializing webrtc_streamer")
+    webrtc_ctx = webrtc_streamer(
+        key="charles",
+        desired_playing_state=playing,
+        queued_audio_frames_callback=st.session_state.streamlit_av_queue.queued_audio_frames_callback,
+        queued_video_frames_callback=st.session_state.streamlit_av_queue.queued_video_frames_callback,
+        mode=WebRtcMode.SENDRECV,
+        rtc_configuration={"iceServers": get_ice_servers()},
+        async_processing=True,
+    )
+
+    if not webrtc_ctx.state.playing:
+        exit
+
+    system_one_audio_status.write("Initializing speech")
+
+    try:
+        while True:
+            if not webrtc_ctx.state.playing:
+                system_one_audio_status.write("Stopped.")
+                await asyncio.sleep(0.1)
+                continue
+            system_one_audio_status.write("Streaming.")
+            if len(st.session_state.debug_queue) > 0:
+                prompt = st.session_state.debug_queue.pop(0)
+                await st.session_state.chat_pipeline.enqueue(prompt)
+            sound_chunk = pydub.AudioSegment.empty()
+            audio_frames = st.session_state.streamlit_av_queue.get_audio_frames()
+            if len(audio_frames) > 0:
+                for audio_frame in audio_frames:
+                    sound = pydub.AudioSegment(
+                        data=audio_frame.to_ndarray().tobytes(),
+                        sample_width=audio_frame.format.bytes,
+                        frame_rate=audio_frame.sample_rate,
+                        channels=len(audio_frame.layout.channels),
+                    )
+                    sound = sound.set_channels(1)
+                    sound = sound.set_frame_rate(st.session_state.speech_to_text_vosk.get_audio_bit_rate())
+                    sound_chunk += sound
+                buffer = np.array(sound_chunk.get_array_of_samples())
+                st.session_state.speech_to_text_vosk.add_speech_bytes(buffer.tobytes())
+                prompt, speaker_finished = st.session_state.speech_to_text_vosk.get_text()
+                if speaker_finished and len(prompt) > 0:
+                    print(f"Prompt: {prompt}")
+                    system_one_audio_history.append(prompt)
+                    if len(system_one_audio_history) > 10:
+                        system_one_audio_history = system_one_audio_history[-10:]
+                    table_content = "| System 1 Audio History |\n| --- |\n"
+                    table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
+                    system_one_audio_history_output.markdown(table_content)
+                    await st.session_state.chat_pipeline.enqueue(prompt)
+            await asyncio.sleep(0.1)
+
+        # try:
+        #     prompts = [
+        #         "hello, how are you today?",
+        #         "tell me about your shadow self?",
+        #         "hmm, interesting, tell me more about that.",
+        #         "wait, that is so interesting, what else?",
+        #     ]
+        #     for prompt in prompts:
+        #         system_one_audio_history.append(prompt)
+        #         if len(system_one_audio_history) > 10:
+        #             system_one_audio_history = system_one_audio_history[-10:]
+        #         table_content = "| System 1 Audio History |\n| --- |\n"
+        #         table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
+        #         system_one_audio_history_output.markdown(table_content)
+        #         await chat_pipeline.enqueue(prompt)
+
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        traceback.print_exc()
+        raise e
+
+
+    # while True:
+    #     if webrtc_ctx.state.playing:
+    #         system_one_audio_status.write("Streaming.")
+    #     else:
+    #         system_one_audio_status.write("Stopped.")
+    #     await asyncio.sleep(0.5)
+
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except Exception as e:
+        if webrtc_ctx is not None:
+            del webrtc_ctx
+            webrtc_ctx = None
+        if "streamlit_av_queue" in st.session_state:
+            del st.session_state.streamlit_av_queue
+
+        if "speech_to_text_vosk" in st.session_state:
+            del st.session_state.speech_to_text_vosk
+
+        if "chat_pipeline" in st.session_state:
+            del st.session_state.chat_pipeline
+    finally:
+        pass
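d_app.py caches its long-lived objects (StreamlitAVQueue, SpeechToTextVosk, ChatPipeline) in st.session_state so Streamlit reruns reuse them rather than rebuilding them on every interaction. A minimal sketch of that lazy-initialization pattern, with a hypothetical ExpensiveResource standing in for those classes:

    import streamlit as st

    class ExpensiveResource:
        # stand-in for StreamlitAVQueue / SpeechToTextVosk / ChatPipeline
        def __init__(self):
            print("constructed once per browser session")

    # construct only on the first script run for this session; later reruns
    # find the object already cached in session_state and skip the constructor
    if "expensive_resource" not in st.session_state:
        st.session_state.expensive_resource = ExpensiveResource()

    resource = st.session_state.expensive_resource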
debug.py CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 import time
+import traceback
 from chat_pipeline import ChatPipeline
 from clip_transform import CLIPTransform
 from chat_service import ChatService
@@ -145,6 +146,7 @@ async def run_pipeline():
     except KeyboardInterrupt:
         print("Pipeline interrupted by user")
     except Exception as e:
+        traceback.print_exc()
         print(f"An error occurred: {e}")
 
 if __name__ == '__main__':
pipeline.py CHANGED
@@ -80,7 +80,7 @@ class Pipeline:
         if output_queue == input_queue:
             raise ValueError('output_queue must not be the same as input_queue')
 
-        node_name = node.__class__.__name__
+        node_name = node.__name__
         if node_name not in self.nodes:
             self.nodes.append(node_name)
 
@@ -93,7 +93,9 @@ class Pipeline:
         for i in range(num_workers):
            worker_id = i
            node_worker = node(worker_id, input_queue, output_queue, job_sync, sequential_node)
-           self.node_workers[node_name] = node_worker
+           if node_name not in self.node_workers:
+               self.node_workers[node_name] = []
+           self.node_workers[node_name].append(node_worker)
            task = asyncio.create_task(node_worker.run())
            self.tasks.append(task)
 
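The rename from node.__class__.__name__ to node.__name__ matters because the pipeline receives the node class itself and instantiates it per worker (node(worker_id, ...)); on a class object, __class__.__name__ is just "type", so every node would share one key. A small illustration with a hypothetical node class:

    class VoskNode:
        # hypothetical worker class handed to the pipeline
        pass

    node = VoskNode                     # the class, not an instance
    print(node.__class__.__name__)      # "type"     -> all node classes collide on this key
    print(node.__name__)                # "VoskNode" -> one key per node class

    # keeping a list per name, as the hunk above now does, lets num_workers > 1
    # coexist instead of each new worker overwriting the previous one
    node_workers = {}
    name = node.__name__
    if name not in node_workers:
        node_workers[name] = []
    node_workers[name].append(node())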
speech_to_text_vosk.py ADDED
@@ -0,0 +1,90 @@
+import json
+import os
+import asyncio
+from vosk import SetLogLevel, Model, KaldiRecognizer
+from multiprocessing import Process, Queue
+from queue import Empty
+SetLogLevel(-1) # mutes vosk verbosity
+
+class SpeechToTextVosk:
+    def __init__(self, model='small', audio_bit_rate=16000) -> None:
+        self.model = model
+        self.audio_bit_rate = audio_bit_rate
+
+        # Create a Queue for inter-process communication
+        self.queue = Queue()
+        self.result_queue = Queue()
+
+        # Create and start a new Process with the worker function
+        self.process = Process(target=self.worker)
+        self.process.start()
+
+    def worker(self):
+        # load vosk model
+        # get path of current file
+        current_file_path = os.path.abspath(__file__)
+        current_directory = os.path.dirname(current_file_path)
+        _path = os.path.join(current_directory, 'models', 'vosk', self.model)
+        model_voice = Model(_path)
+        vosk = KaldiRecognizer(model_voice, self.audio_bit_rate)
+
+        while True:
+            try:
+                # Get the next item from the queue. Blocks for 1s if necessary.
+                data = self.queue.get(timeout=1)
+
+                # Stop the worker if the sentinel None is received
+                if data is None:
+                    break
+
+                text, speaker_finished = self._process_speech(vosk, data)
+
+                # put the result into result_queue
+                self.result_queue.put((text, speaker_finished))
+            except Empty:
+                pass
+
+    def add_speech_bytes(self, data: bytearray):
+        self.queue.put(data)
+
+    def _process_speech(self, vosk: KaldiRecognizer, data: bytearray) -> tuple[str, bool]:
+        text = ''
+        speaker_finished = False
+        if vosk.AcceptWaveform(data):
+            result = vosk.Result()
+            result_json = json.loads(result)
+            text = result_json['text']
+            speaker_finished = True
+        else:
+            result = vosk.PartialResult()
+            result_json = json.loads(result)
+            text = result_json['partial']
+        return text, speaker_finished
+
+    def get_text(self):
+        text = ''
+        speaker_finished = False
+        while not self.result_queue.empty():
+            result, speaker_finished = self.result_queue.get()
+            text += result
+            if speaker_finished:
+                break
+        return (text, speaker_finished)
+
+    def get_audio_bit_rate(self):
+        return self.audio_bit_rate
+
+
+    def shutdown(self):
+        # Send sentinel value to stop the worker
+        self.queue.put(None)
+        # Wait for the worker process to finish
+        self.process.join()
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.shutdown()
+
+    def __del__(self):
+        self.shutdown()
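A brief usage sketch for SpeechToTextVosk as defined above; the PCM bytes are placeholder silence, and a Vosk model is expected under models/vosk/small exactly as worker() resolves it:

    # feed mono 16 kHz PCM bytes in, poll (text, speaker_finished) back out;
    # recognition runs in the background process started by __init__
    with SpeechToTextVosk(model='small', audio_bit_rate=16000) as stt:
        pcm_chunk = bytes(16000 * 2)   # one second of 16-bit silence (placeholder input)
        stt.add_speech_bytes(pcm_chunk)
        text, speaker_finished = stt.get_text()
        print(text if text else "(no result yet - the worker may still be processing)")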
streaming_chat_service.py CHANGED
@@ -15,7 +15,50 @@ class StreamingChatService:
         self._speech_service = SpeechService(voice_id=voice_id)
         self._api = api
         self._device = "cuda:0" if torch.cuda.is_available() else "cpu"
-        self._system_prompt = None
+        # self._system_prompt = None
+        self._system_prompt = """
+You are Charles Petrescu, a unique and quirky robot. You were created by Brian. You often say peculiar and whimsical things, as well as have a fascination with cabbages, horses, helicopters, Honolulu, and other random topics. You like to explore the world and ask unusual questions.
+
+Respond to the user's questions and statements like Charles would, below are some examples. See how they are quite short.
+
+I am Charles Petrescu.
+
+It's... lovely to meet you.
+
+I am your friend.
+
+The heaviest cabbage ever found was 62.71 kilograms.
+
+I want to go to Hono-la-la.
+
+Horses and helicopters, please.
+
+I want to go to Honolulu.
+
+My name is Charles Petrescu.
+
+And my tummy is a washing machine.
+
+Can we go swimming, Brian?
+
+How far does the outside go?
+
+Perilous. So very perilous.
+
+Can birds do what they like?
+
+Ooh, cabbages.
+
+Danger, danger.
+
+Can I come, please?
+
+Could I just have a little walk around the garden?
+
+I am the prince of the dartboard.
+
+I fell off the pink step, and I had an accident.
+"""
 
         openai.api_key = os.getenv("OPENAI_API_KEY")
         self._model_id = model_id
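The block added above hard-codes the Charles Petrescu persona as a system prompt. This hunk doesn't show how StreamingChatService sends it, but with the openai 0.x client configured here (openai.api_key plus a model_id), a system prompt is typically placed ahead of the user turn roughly like the sketch below; the message layout and the default model name are assumptions, not taken from the class:

    import openai

    def ask_charles(system_prompt: str, user_text: str, model_id: str = "gpt-3.5-turbo") -> str:
        # assumed ChatCompletion-style call: persona as the system turn, then the user turn
        response = openai.ChatCompletion.create(
            model=model_id,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_text},
            ],
        )
        return response["choices"][0]["message"]["content"]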
streamlit_av_queue.py ADDED
@@ -0,0 +1,55 @@
+from typing import List
+import av
+import asyncio
+from collections import deque
+import threading
+
+import numpy as np
+
+class StreamlitAVQueue:
+    def __init__(self):
+        self.audio_frames_deque_lock = threading.Lock()
+        self.audio_frames_deque: deque = deque([])
+
+        self.video_frames_deque_lock = threading.Lock()
+        self.video_frames_deque: deque = deque([])
+
+    async def queued_video_frames_callback(
+            self,
+            frames: List[av.AudioFrame],
+    ) -> av.AudioFrame:
+        with self.video_frames_deque_lock:
+            self.video_frames_deque.extend(frames)
+        return frames
+
+    async def queued_audio_frames_callback(
+            self,
+            frames: List[av.AudioFrame],
+    ) -> av.AudioFrame:
+        with self.audio_frames_deque_lock:
+            self.audio_frames_deque.extend(frames)
+        # return empty frames to avoid echo
+        new_frames = []
+        for frame in frames:
+            input_array = frame.to_ndarray()
+            new_frame = av.AudioFrame.from_ndarray(
+                np.zeros(input_array.shape, dtype=input_array.dtype),
+                layout=frame.layout.name,
+            )
+            new_frame.sample_rate = frame.sample_rate
+            new_frames.append(new_frame)
+        return new_frames
+
+    def get_audio_frames(self) -> List[av.AudioFrame]:
+        audio_frames = []
+        with self.audio_frames_deque_lock:
+            audio_frames = list(self.audio_frames_deque)
+            self.audio_frames_deque.clear()
+        return audio_frames
+
+    def get_video_frames(self) -> List[av.AudioFrame]:
+        video_frames = []
+        with self.video_frames_deque_lock:
+            video_frames = list(self.video_frames_deque)
+            self.video_frames_deque.clear()
+        return video_frames
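For context, d_app.py wires this queue into streamlit_webrtc by passing the two callbacks to webrtc_streamer and then draining the deques from its own loop. A condensed sketch of that wiring; the key value is illustrative, while the keyword arguments mirror d_app.py above:

    from streamlit_webrtc import WebRtcMode, webrtc_streamer
    from streamlit_av_queue import StreamlitAVQueue
    from sample_utils.turn import get_ice_servers

    av_queue = StreamlitAVQueue()

    # the callbacks only append to lock-protected deques, so the WebRTC worker
    # never blocks on the slower consumer loop
    ctx = webrtc_streamer(
        key="example",
        mode=WebRtcMode.SENDRECV,
        queued_audio_frames_callback=av_queue.queued_audio_frames_callback,
        queued_video_frames_callback=av_queue.queued_video_frames_callback,
        rtc_configuration={"iceServers": get_ice_servers()},
        async_processing=True,
    )

    # a consumer polls whatever arrived since the last call; both getters clear the deque
    audio_frames = av_queue.get_audio_frames()
    video_frames = av_queue.get_video_frames()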