sohojoe commited on
Commit
98ec0ec
1 Parent(s): 768e92a

migrated speech to text to an actor

Browse files
charles_actor.py CHANGED
@@ -24,8 +24,8 @@ class CharlesActor:
24
  self._streamlit_av_queue = StreamlitAVQueue()
25
 
26
  print("002")
27
- from speech_to_text_vosk import SpeechToTextVosk
28
- self._speech_to_text_vosk = SpeechToTextVosk()
29
 
30
  from chat_pipeline import ChatPipeline
31
  self._chat_pipeline = ChatPipeline()
@@ -49,38 +49,49 @@ class CharlesActor:
49
  total_video_frames = 0
50
  total_audio_frames = 0
51
  loops = 0
 
 
52
 
53
  while True:
54
  if len(self._debug_queue) > 0:
55
  prompt = self._debug_queue.pop(0)
56
  await self._chat_pipeline.enqueue(prompt)
57
- audio_frames = await self._streamlit_av_queue.get_audio_frames_async()
58
  if len(audio_frames) > 0:
59
  total_audio_frames += len(audio_frames)
60
  # Concatenate all audio frames into a single buffer
61
  audio_buffer = b"".join([buffer.tobytes() for buffer in audio_frames])
62
- self._speech_to_text_vosk.add_speech_bytes(audio_buffer)
63
- prompt, speaker_finished = self._speech_to_text_vosk.get_text()
64
- if speaker_finished and len(prompt) > 0:
65
- print(f"Prompt: {prompt}")
66
- system_one_audio_history.append(prompt)
67
- if len(system_one_audio_history) > 10:
68
- system_one_audio_history = system_one_audio_history[-10:]
69
- table_content = "| System 1 Audio History |\n| --- |\n"
70
- table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
71
- self._system_one_audio_history_output = table_content
72
- await self._chat_pipeline.enqueue(prompt)
73
- video_frames = await self._streamlit_av_queue.get_video_frames_async()
74
- if len(video_frames) > 0:
75
- total_video_frames += len(video_frames)
76
- # for video_frame in video_frames:
77
- # system_one_video_output.image(video_frame.to_ndarray())
78
- # pass
 
 
 
 
 
 
 
 
 
79
 
80
  # update debug output
81
  if (total_video_frames >0 or total_audio_frames > 0):
82
  self._state = f"Processed {total_video_frames} video frames and {total_audio_frames} audio frames"
83
- await asyncio.sleep(0.1)
84
  loops+=1
85
  self._state = f"Processed {total_video_frames} video frames and {total_audio_frames} audio frames, loops: {loops}"
86
 
@@ -115,6 +126,6 @@ if __name__ == "__main__":
115
  # The start method is still running. You can poll for debug information here.
116
  time.sleep(1)
117
  state = charles_actor.get_state.remote()
118
- # print(f"Charles is in state: {ray.get(state)}")
119
  except KeyboardInterrupt:
120
  print("Script was manually terminated")
 
24
  self._streamlit_av_queue = StreamlitAVQueue()
25
 
26
  print("002")
27
+ from speech_to_text_vosk_actor import SpeechToTextVoskActor
28
+ self._speech_to_text_actor = SpeechToTextVoskActor.remote()
29
 
30
  from chat_pipeline import ChatPipeline
31
  self._chat_pipeline = ChatPipeline()
 
49
  total_video_frames = 0
50
  total_audio_frames = 0
51
  loops = 0
52
+
53
+ process_speech_to_text_future = []
54
 
55
  while True:
56
  if len(self._debug_queue) > 0:
57
  prompt = self._debug_queue.pop(0)
58
  await self._chat_pipeline.enqueue(prompt)
59
+ audio_frames = await self._streamlit_av_queue.get_audio_frames_async()
60
  if len(audio_frames) > 0:
61
  total_audio_frames += len(audio_frames)
62
  # Concatenate all audio frames into a single buffer
63
  audio_buffer = b"".join([buffer.tobytes() for buffer in audio_frames])
64
+ future = self._speech_to_text_actor.process_speech.remote(audio_buffer)
65
+ process_speech_to_text_future.append(future)
66
+ # audio_frames_task = None
67
+
68
+ if len(process_speech_to_text_future) > 0:
69
+ ready, _ = ray.wait([process_speech_to_text_future[0]], timeout=0)
70
+ if ready:
71
+ prompt, speaker_finished = ray.get(process_speech_to_text_future[0])
72
+ del process_speech_to_text_future[0]
73
+
74
+ if speaker_finished and len(prompt) > 0:
75
+ print(f"Prompt: {prompt}")
76
+ system_one_audio_history.append(prompt)
77
+ if len(system_one_audio_history) > 10:
78
+ system_one_audio_history = system_one_audio_history[-10:]
79
+ table_content = "| System 1 Audio History |\n| --- |\n"
80
+ table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
81
+ self._system_one_audio_history_output = table_content
82
+ await self._chat_pipeline.enqueue(prompt)
83
+
84
+ # video_frames = await self._streamlit_av_queue.get_video_frames_async()
85
+ # if len(video_frames) > 0:
86
+ # total_video_frames += len(video_frames)
87
+ # # for video_frame in video_frames:
88
+ # # system_one_video_output.image(video_frame.to_ndarray())
89
+ # # pass
90
 
91
  # update debug output
92
  if (total_video_frames >0 or total_audio_frames > 0):
93
  self._state = f"Processed {total_video_frames} video frames and {total_audio_frames} audio frames"
94
+ await asyncio.sleep(0.01)
95
  loops+=1
96
  self._state = f"Processed {total_video_frames} video frames and {total_audio_frames} audio frames, loops: {loops}"
97
 
 
126
  # The start method is still running. You can poll for debug information here.
127
  time.sleep(1)
128
  state = charles_actor.get_state.remote()
129
+ print(f"Charles is in state: {ray.get(state)}")
130
  except KeyboardInterrupt:
131
  print("Script was manually terminated")
speech_to_text_vosk.py → speech_to_text_vosk_actor.py RENAMED
@@ -1,62 +1,57 @@
1
  import json
2
  import os
3
- import asyncio
4
  from vosk import SetLogLevel, Model, KaldiRecognizer
5
- from multiprocessing import Process, Queue
6
- from queue import Empty
7
- SetLogLevel(-1) # mutes vosk verbosity
8
 
9
- class SpeechToTextVosk:
 
10
  def __init__(self, model='small', audio_bit_rate=16000) -> None:
11
  self.model = model
12
  self.audio_bit_rate = audio_bit_rate
13
 
14
- # Create a Queue for inter-process communication
15
- self.queue = Queue()
16
- self.result_queue = Queue()
17
-
18
- # Create and start a new Process with the worker function
19
- self.process = Process(target=self.worker)
20
- self.process.start()
21
-
22
- def worker(self):
23
  # load vosk model
24
  # get path of current file
25
  current_file_path = os.path.abspath(__file__)
26
  current_directory = os.path.dirname(current_file_path)
27
  _path = os.path.join(current_directory, 'models', 'vosk', self.model)
28
- model_voice = Model(_path)
29
- vosk = KaldiRecognizer(model_voice, self.audio_bit_rate)
30
-
31
- while True:
32
- try:
33
- # Get the next item from the queue. Blocks for 1s if necessary.
34
- data = self.queue.get(timeout=1)
35
 
36
- # Stop the worker if the sentinel None is received
37
- if data is None:
38
- break
39
-
40
- text, speaker_finished = self._process_speech(vosk, data)
41
-
42
- # put the result into result_queue
43
- self.result_queue.put((text, speaker_finished))
44
- except Empty:
45
- pass
 
 
 
 
 
 
 
46
 
47
  def add_speech_bytes(self, data: bytearray):
48
- self.queue.put(data)
 
 
 
49
 
50
- def _process_speech(self, vosk: KaldiRecognizer, data: bytearray) -> tuple[str, bool]:
51
  text = ''
52
  speaker_finished = False
53
- if vosk.AcceptWaveform(data):
54
- result = vosk.Result()
55
  result_json = json.loads(result)
56
  text = result_json['text']
57
  speaker_finished = True
58
  else:
59
- result = vosk.PartialResult()
60
  result_json = json.loads(result)
61
  text = result_json['partial']
62
  return text, speaker_finished
@@ -64,27 +59,13 @@ class SpeechToTextVosk:
64
  def get_text(self):
65
  text = ''
66
  speaker_finished = False
67
- while not self.result_queue.empty():
68
- result, speaker_finished = self.result_queue.get()
69
  text += result
70
- if speaker_finished:
 
71
  break
72
- return (text, speaker_finished)
73
-
74
  def get_audio_bit_rate(self):
75
  return self.audio_bit_rate
76
-
77
-
78
- def shutdown(self):
79
- # Send sentinel value to stop the worker
80
- self.queue.put(None)
81
- # Wait for the worker process to finish
82
- self.process.join()
83
- def __enter__(self):
84
- return self
85
-
86
- def __exit__(self, exc_type, exc_value, traceback):
87
- self.shutdown()
88
-
89
- def __del__(self):
90
- self.shutdown()
 
1
  import json
2
  import os
 
3
  from vosk import SetLogLevel, Model, KaldiRecognizer
4
+ import ray
5
+ SetLogLevel(-1) # mutes vosk verbosity
 
6
 
7
+ @ray.remote
8
+ class SpeechToTextVoskActor:
9
  def __init__(self, model='small', audio_bit_rate=16000) -> None:
10
  self.model = model
11
  self.audio_bit_rate = audio_bit_rate
12
 
 
 
 
 
 
 
 
 
 
13
  # load vosk model
14
  # get path of current file
15
  current_file_path = os.path.abspath(__file__)
16
  current_directory = os.path.dirname(current_file_path)
17
  _path = os.path.join(current_directory, 'models', 'vosk', self.model)
18
+ self.model_voice = Model(_path)
19
+ self.vosk = KaldiRecognizer(self.model_voice, self.audio_bit_rate)
 
 
 
 
 
20
 
21
+ self.text_queue = []
22
+ self.finished_queue = []
23
+
24
+ def process_speech(self, data: bytearray) -> tuple[str, bool]:
25
+ text = ''
26
+ speaker_finished = False
27
+ if self.vosk.AcceptWaveform(data):
28
+ result = self.vosk.Result()
29
+ result_json = json.loads(result)
30
+ text = result_json['text']
31
+ speaker_finished = True
32
+ else:
33
+ result = self.vosk.PartialResult()
34
+ result_json = json.loads(result)
35
+ text = result_json['partial']
36
+ return text, speaker_finished
37
+
38
 
39
  def add_speech_bytes(self, data: bytearray):
40
+ text, speaker_finished = self._process_speech(data)
41
+ self.text_queue.append(text)
42
+ if speaker_finished:
43
+ self.finished_queue.append(speaker_finished)
44
 
45
+ def _process_speech(self, data: bytearray) -> tuple[str, bool]:
46
  text = ''
47
  speaker_finished = False
48
+ if self.vosk.AcceptWaveform(data):
49
+ result = self.vosk.Result()
50
  result_json = json.loads(result)
51
  text = result_json['text']
52
  speaker_finished = True
53
  else:
54
+ result = self.vosk.PartialResult()
55
  result_json = json.loads(result)
56
  text = result_json['partial']
57
  return text, speaker_finished
 
59
  def get_text(self):
60
  text = ''
61
  speaker_finished = False
62
+ while self.text_queue:
63
+ result = self.text_queue.pop(0)
64
  text += result
65
+ if self.finished_queue:
66
+ speaker_finished = self.finished_queue.pop(0)
67
  break
68
+ return text, speaker_finished
69
+
70
  def get_audio_bit_rate(self):
71
  return self.audio_bit_rate