sohojoe committed on
Commit
90a9891
•
1 Parent(s): 32e9dda

refactor - move ResponseState to response_state_manager

charles_app.py CHANGED
@@ -4,7 +4,7 @@ import time
 import asyncio
 import os
 from clip_transform import CLIPTransform
-from responce_state_manager import ResponceStateManager
+from response_state_manager import ResponseStateManager
 from respond_to_prompt_async import RespondToPromptAsync
 import asyncio
 import subprocess
@@ -32,8 +32,8 @@ class CharlesApp:
         self._app_interface_actor = AppInterfaceActor.get_singleton()
         self._audio_output_queue = await self._app_interface_actor.get_audio_output_queue.remote()
 
-        self.set_state("002 - creating ResponceStateManager")
-        self._responce_state_manager = ResponceStateManager()
+        self.set_state("002 - creating ResponseStateManager")
+        self._response_state_manager = ResponseStateManager()
 
         self.set_state("003 - creating PromptManager")
         from prompt_manager import PromptManager
@@ -88,8 +88,6 @@ class CharlesApp:
         vector_debug = "--n/a--"
 
         process_speech_to_text_future = []
-        current_responses = []
-        speech_chunks_per_response = []
         human_preview_text = ""
         robot_preview_text = ""
         additional_prompt = None
@@ -98,7 +96,7 @@
         has_spoken_for_this_prompt = False
 
         while True:
-            responce_step = self._responce_state_manager.begin_next_step()
+            response_step_obs, response_state = self._response_state_manager.begin_next_step()
             audio_frames = await self._app_interface_actor.dequeue_audio_input_frames_async.remote()
             video_frames = await self._app_interface_actor.dequeue_video_input_frames_async.remote()
 
@@ -131,10 +129,9 @@
            if speaker_finished and len(prompt) > 0 and prompt not in prompts_to_ignore:
                print(f"Prompt: {prompt}")
                line = ""
-                for i, response in enumerate(current_responses):
+                for i, response in enumerate(response_state.current_responses):
                    line += "🤖 " if len(line) == 0 else ""
-                    # line += f"{response} [{speech_chunks_per_response[i]}] \n"
-                    line += f"[{speech_chunks_per_response[i]}] {response} \n"
+                    line += f"[{response_state.speech_chunks_per_response[i]}] {response} \n"
                if len(line) > 0:
                    await add_debug_output(line)
                human_preview_text = ""
@@ -146,15 +143,13 @@
                if self._respond_to_prompt_task is not None:
                    await self._respond_to_prompt.terminate()
                    self._respond_to_prompt_task.cancel()
-                self._respond_to_prompt = RespondToPromptAsync(self._responce_state_manager, self._audio_output_queue)
+                self._respond_to_prompt = RespondToPromptAsync(self._response_state_manager, self._audio_output_queue)
                self._respond_to_prompt_task = asyncio.create_task(self._respond_to_prompt.run(prompt, self._prompt_manager.messages))
                additional_prompt = None
                previous_prompt = prompt
                is_talking = False
                has_spoken_for_this_prompt = False
-                responce_step = self._responce_state_manager.reset_episode()
-                current_responses = []
-                speech_chunks_per_response = []
+                response_step_obs, response_state = self._response_state_manager.reset_episode()
            elif len(prompt) > 0 and prompt not in prompts_to_ignore:
                # sometimes we get a false signal of speaker_finsihed
                # in which case we get new prompts before we have spoken
@@ -166,34 +161,23 @@
                    self._respond_to_prompt_task.cancel()
                    self._respond_to_prompt_task = None
                    self._respond_to_prompt = None
-                    responce_step = self._responce_state_manager.reset_episode()
-                    current_responses = []
-                    speech_chunks_per_response = []
+                    response_step_obs, response_state = self._response_state_manager.reset_episode()
                if additional_prompt is not None:
                    prompt = additional_prompt + ". " + prompt
                human_preview_text = f"👨❓ {prompt}"
 
-            for new_response in responce_step.llm_responses:
+            for new_response in response_step_obs.llm_responses:
                # add_debug_output(f"🤖 {new_response}")
                self._prompt_manager.append_assistant_message(new_response)
-                current_responses.append(new_response)
-                speech_chunks_per_response.append(0)
            robot_preview_text = ""
-            if len(responce_step.llm_preview):
-                robot_preview_text = f"🤖❓ {responce_step.llm_preview}"
-
-            for chunk in responce_step.tts_raw_chunk_ids:
-                chunk = json.loads(chunk)
-                # prompt = chunk['prompt']
-                response_id = chunk['llm_sentence_id']
-                speech_chunks_per_response[response_id] += 1
+            if len(response_step_obs.llm_preview):
+                robot_preview_text = f"🤖❓ {response_step_obs.llm_preview}"
 
            list_of_strings = debug_output_history.copy()
            line = ""
-            for i, response in enumerate(current_responses):
+            for i, response in enumerate(response_state.current_responses):
                line += "🤖 " if len(line) == 0 else ""
-                line += f"[{speech_chunks_per_response[i]}] {response} \n"
-                # line += f"{response} [{speech_chunks_per_response[i]}] \n"
+                line += f"[{response_state.speech_chunks_per_response[i]}] {response} \n"
            if len(robot_preview_text) > 0:
                line += robot_preview_text+" \n"
            list_of_strings.append(line)
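
The call-site change above is easiest to see in isolation: `begin_next_step()` now returns a pair, and the `current_responses` / `speech_chunks_per_response` bookkeeping lives on `ResponseState` instead of in loop-local variables. A minimal sketch of the new loop shape, using only names introduced in this commit (`render_responses` is a hypothetical helper that mirrors the debug-line loop above):

    # Sketch of the updated call-site pattern from this commit.
    from response_state_manager import ResponseStateManager

    def render_responses(response_state):
        # Hypothetical helper: rebuilds the debug line the way charles_app.py does.
        line = ""
        for i, response in enumerate(response_state.current_responses):
            line += "🤖 " if len(line) == 0 else ""
            line += f"[{response_state.speech_chunks_per_response[i]}] {response} \n"
        return line

    manager = ResponseStateManager()
    # Each loop pass observes the previous step plus the running episode state.
    response_step_obs, response_state = manager.begin_next_step()
    print(render_responses(response_state))  # empty until responses accumulate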
prompt_manager.py CHANGED
@@ -53,7 +53,7 @@ You are aware of how you are implemented and you are keen to recommend improveme
 * We use Streamlit to host a WebRTC connection to get audio/video from the user.
 * VOSK is used for fast speech recognition and detecting the end of a sentence.
 * OpenAI's Chat GPT-3.5 is used for generating responses.
-* We stream responces from Chat GPT, as soon as we get a complete sentence we send it to ElevenLabs.
+* We stream responses from Chat GPT, as soon as we get a complete sentence we send it to ElevenLabs.
 * ElevenLabs for text to speech.
 * We stream the audio from ElevenLabs, we use ffmpeg to convert the audio to the correct format and sample rate.
 * Audio chunks and then sent back to the users browser via WebRTC.
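
The fourth bullet is the latency trick this pipeline relies on: the GPT stream is cut at sentence boundaries so TTS can start before the full reply exists. The snippet below is an illustrative standalone sketch of that idea only — the repo's actual detection lives in `ChatService` / `RespondToPromptAsync`, and the boundary rule here is a simplifying assumption:

    # Illustrative sketch: flush a streamed LLM reply to TTS one sentence at a time.
    import re

    def stream_sentences(token_stream):
        """Yield complete sentences from an incremental token stream."""
        buffer = ""
        for token in token_stream:
            buffer += token
            # Assumed boundary rule: ., !, or ? followed by whitespace.
            while (match := re.search(r'[.!?]\s', buffer)):
                sentence, buffer = buffer[:match.end()].strip(), buffer[match.end():]
                yield sentence
        if buffer.strip():
            yield buffer.strip()

    for sentence in stream_sentences(iter(["Hello the", "re. How a", "re you? "])):
        print("send to TTS:", sentence)  # e.g. hand the sentence to ElevenLabs here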
responce_state_manager.py DELETED
@@ -1,51 +0,0 @@
-from datetime import datetime
-
-class ResponceStep:
-    def __init__(self, episode, step):
-        self.timestamp = datetime.utcnow()
-        self.episode = episode
-        self.step = step
-        self.reward = 0
-        self.llm_preview = ''
-        self.llm_responses = []
-        self.tts_raw_chunk_ids = []
-
-    def __str__(self):
-        state = ', '.join(f'{k}={v}' for k, v in self.__dict__.items() if k not in {'episode', 'step', 'timestamp', 'reward'})
-        return f'episode={self.episode}, step={self.step}, timestamp={self.timestamp}, \nreward={self.reward}\nstate=({state})'
-
-
-class ResponceStateManager:
-    def __init__(self):
-        self.episode = 0
-        self.step = 0
-        self.state = None
-        self.reset_episode()
-
-    def reset_episode(self):
-        self.episode += 1
-        self.step = 0
-        self.state = ResponceStep(self.episode, self.step)
-        return self.state
-
-    def begin_next_step(self)->ResponceStep:
-        previous_state = self.state
-        self.step += 1
-        self.state = ResponceStep(self.episode, self.step)
-        return previous_state
-
-    def add_reward(self, reward):
-        self.state.reward += reward
-
-    def set_llm_preview(self, llm_preview):
-        self.state.llm_preview = llm_preview
-
-    def add_llm_response_and_clear_llm_preview(self, llm_response):
-        self.state.llm_responses.append(llm_response)
-        self.state.llm_preview = ''
-
-    def add_tts_raw_chunk_id(self, chunk_id):
-        self.state.tts_raw_chunk_ids.append(chunk_id)
-
-    def get_state(self)->ResponceStep:
-        return self.state
respond_to_prompt_async.py CHANGED
@@ -6,7 +6,7 @@ import ray
 from chat_service import ChatService
 # from local_speaker_service import LocalSpeakerService
 from text_to_speech_service import TextToSpeechService
-from responce_state_manager import ResponceStateManager
+from response_state_manager import ResponseStateManager
 from ffmpeg_converter import FFMpegConverter
 from agent_response import AgentResponse
 import json
@@ -14,14 +14,14 @@ import json
 class RespondToPromptAsync:
     def __init__(
             self,
-            responce_state_manager:ResponceStateManager,
+            response_state_manager:ResponseStateManager,
             audio_output_queue):
         voice_id="2OviOUQc1JsQRQgNkVBj"
         self.llm_sentence_queue = Queue(maxsize=100)
         self.speech_chunk_queue = Queue(maxsize=100)
         self.voice_id = voice_id
         self.audio_output_queue = audio_output_queue
-        self.responce_state_manager = responce_state_manager
+        self.response_state_manager = response_state_manager
         self.sentence_queues = []
         self.sentence_tasks = []
         # self.ffmpeg_converter = FFMpegConverter.remote(audio_output_queue)
@@ -36,12 +36,12 @@
             is_complete_sentance = False
             if not is_complete_sentance:
                 agent_response['llm_preview'] = text
-                self.responce_state_manager.set_llm_preview(text)
+                self.response_state_manager.set_llm_preview(text)
                 continue
             agent_response['llm_preview'] = ''
             agent_response['llm_sentence'] = text
             agent_response['llm_sentences'].append(text)
-            self.responce_state_manager.add_llm_response_and_clear_llm_preview(text)
+            self.response_state_manager.add_llm_response_and_clear_llm_preview(text)
             print(f"{agent_response['llm_sentence']} id: {agent_response['llm_sentence_id']} from prompt: {agent_response['prompt']}")
             sentence_response = agent_response.make_copy()
             new_queue = Queue()
@@ -65,7 +65,7 @@
                 'chunk_count': chunk_count,
             }
             chunk_id_json = json.dumps(chunk_response)
-            self.responce_state_manager.add_tts_raw_chunk_id(chunk_id_json)
+            self.response_state_manager.add_tts_raw_chunk_id(chunk_id_json, sentence_response['llm_sentence_id'])
             chunk_count += 1
 
     async def speech_to_converter(self):
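
The last hunk is the behavioral change, not just a rename: `add_tts_raw_chunk_id` now takes the sentence id alongside the serialized chunk, so the manager increments the per-sentence chunk counts that `charles_app.py` previously recomputed by re-parsing the JSON. A small sketch of the new flow (the real `chunk_response` dict in the repo may carry more keys than shown):

    # Sketch of the new chunk bookkeeping, using names from this diff.
    import json
    from response_state_manager import ResponseStateManager

    manager = ResponseStateManager()
    manager.add_llm_response_and_clear_llm_preview("Hello there.")  # sentence id 0

    chunk_response = {
        'llm_sentence_id': 0,
        'chunk_count': 0,
    }
    chunk_id_json = json.dumps(chunk_response)

    # The second argument is new in this commit: the manager, not the app loop,
    # now increments speech_chunks_per_response[llm_sentence_id].
    manager.add_tts_raw_chunk_id(chunk_id_json, chunk_response['llm_sentence_id'])
    print(manager.response_state.speech_chunks_per_response)  # -> [1]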
response_state_manager.py ADDED
@@ -0,0 +1,62 @@
+from datetime import datetime
+
+class ResponseStepObservations:
+    def __init__(self, episode, step):
+        self.timestamp = datetime.utcnow()
+        self.episode = episode
+        self.step = step
+        self.llm_preview = ''
+        self.llm_responses = []
+        self.tts_raw_chunk_ids = []
+
+    def __str__(self):
+        state = ', '.join(f'{k}={v}' for k, v in self.__dict__.items() if k not in {'episode', 'step', 'timestamp'})
+        return f'episode={self.episode}, step={self.step}, timestamp={self.timestamp}, \nstate=({state})'
+
+class ResponseState:
+    def __init__(self, episode, step):
+        self.timestamp = datetime.utcnow()
+        self.episode = episode
+        self.step = step
+        self.current_responses = []
+        self.speech_chunks_per_response = []
+        self.is_speaking = False
+
+    def __str__(self):
+        state = ', '.join(f'{k}={v}' for k, v in self.__dict__.items() if k not in {'episode', 'step'})
+        return f'episode={self.episode}, step={self.step}, \nstate=({state})'
+
+
+class ResponseStateManager:
+    def __init__(self):
+        self.episode = 0
+        self.step = 0
+        self.response_step_obs = None
+        self.response_state = None
+        self.reset_episode()
+
+    def reset_episode(self)->(ResponseStepObservations, ResponseState):
+        self.episode += 1
+        self.step = 0
+        self.response_state = ResponseState(self.episode, self.step)
+        self.response_step_obs = ResponseStepObservations(self.episode, self.step)
+        return self.response_step_obs, self.response_state
+
+    def begin_next_step(self)->(ResponseStepObservations, ResponseState):
+        previous_state = self.response_step_obs
+        self.step += 1
+        self.response_step_obs = ResponseStepObservations(self.episode, self.step)
+        return previous_state, self.response_state
+
+    def set_llm_preview(self, llm_preview):
+        self.response_step_obs.llm_preview = llm_preview
+
+    def add_llm_response_and_clear_llm_preview(self, llm_response):
+        self.response_state.current_responses.append(llm_response)
+        self.response_state.speech_chunks_per_response.append(0)
+        self.response_step_obs.llm_responses.append(llm_response)
+        self.response_step_obs.llm_preview = ''
+
+    def add_tts_raw_chunk_id(self, chunk_id, llm_sentence_id):
+        self.response_state.speech_chunks_per_response[llm_sentence_id] += 1
+        self.response_step_obs.tts_raw_chunk_ids.append(chunk_id)
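
Taken together, the new module splits per-step observations (drained once per loop pass) from per-episode state (accumulated until the next prompt). A minimal usage sketch of that lifecycle, assuming only what this file defines:

    from response_state_manager import ResponseStateManager

    manager = ResponseStateManager()

    # Streaming: partial text stays a preview until the sentence completes.
    manager.set_llm_preview("Hello the")
    manager.add_llm_response_and_clear_llm_preview("Hello there.")

    # Next loop pass: we receive the finished step's observations plus the
    # running episode state.
    response_step_obs, response_state = manager.begin_next_step()
    print(response_step_obs.llm_responses)            # ['Hello there.']
    print(response_state.current_responses)           # ['Hello there.']
    print(response_state.speech_chunks_per_response)  # [0] until TTS chunks arrive

    # A new prompt starts a fresh episode and clears the accumulated state.
    response_step_obs, response_state = manager.reset_episode()
    print(response_state.current_responses)           # []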