sohojoe committed
Commit 9740bc5
Parent: 730fe87

refactor app.py to run as async
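
A minimal sketch of the pattern this commit applies (only asyncio, async def main(), and the asyncio.run(main()) entry point are taken from the diff below; the placeholder body is illustrative, not the real app code):

    import asyncio

    async def main():
        # The former module-level Streamlit/WebRTC body moves in here, indented
        # one level, so it can later await coroutines such as the chat_pipeline
        # calls that are still commented out in this commit.
        ...

    if __name__ == "__main__":
        asyncio.run(main())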

Files changed (1)
  1. app.py +297 -285
app.py CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 from collections import deque
 import os
 import threading
@@ -20,291 +21,302 @@ SetLogLevel(-1) # mutes vosk verbosity
 from dotenv import load_dotenv
 load_dotenv()

-system_one = {
-    "audio_bit_rate": 16000,
-    # "audio_bit_rate": 32000,
-    # "audio_bit_rate": 48000,
-
-    # "vision_embeddings_fps": 5,
-    "vision_embeddings_fps": 2,
-}
-
-
-system_one["video_detection_emotions"] = [
-    "a happy person",
-    "the person is happy",
-    "the person's emotional state is happy",
-    "a sad person",
-    "a scared person",
-    "a disgusted person",
-    "an angry person",
-    "a suprised person",
-    "a bored person",
-    "an interested person",
-    "a guilty person",
-    "an indiffert person",
-    "a distracted person",
-]
-
-
-# system_one["video_detection_emotions"] = [
-# "Happiness",
-# "Sadness",
-# "Fear",
-# "Disgust",
-# "Anger",
-# "Surprise",
-# "Boredom",
-# "Interest",
-# "Excitement",
-# "Guilt",
-# "Shame",
-# "Relief",
-# "Love",
-# "Embarrassment",
-# "Pride",
-# "Envy",
-# "Jealousy",
-# "Anxiety",
-# "Hope",
-# "Despair",
-# "Frustration",
-# "Confusion",
-# "Curiosity",
-# "Contentment",
-# "Indifference",
-# "Anticipation",
-# "Gratitude",
-# "Bitterness"
-# ]
-system_one["video_detection_engement"] = [
-    "the person is engaged in the conversation",
-    "the person is not engaged in the conversation",
-    "the person is looking at me",
-    "the person is not looking at me",
-    "the person is talking to me",
-    "the person is not talking to me",
-    "the person is engaged",
-    "the person is talking",
-    "the person is listening",
-]
-system_one["video_detection_present"] = [
-    "the view from a webcam",
-    "the view from a webcam we see a person",
-    # "the view from a webcam. I see a person",
-    # "the view from a webcam. The person is looking at the camera",
-    # "i am a webcam",
-    # "i am a webcam and i see a person",
-    # "i am a webcam and i see a person. The person is looking at me",
-    # "a person",
-    # "a person on a Zoom call",
-    # "a person on a FaceTime call",
-    # "a person on a WebCam call",
-    # "no one",
-    # " ",
-    # "multiple people",
-    # "a group of people",
-]
-
-system_one_audio_status = st.empty()
-
-
-playing = st.checkbox("Playing", value=True)
-
-def load_vosk (model='small'):
-    # load vosk model
-    # get path of current file
-    current_file_path = os.path.abspath(__file__)
-    current_directory = os.path.dirname(current_file_path)
-    _path = os.path.join(current_directory, 'models', 'vosk', model)
-    model_voice = Model(_path)
-    recognizer = KaldiRecognizer(model_voice, system_one['audio_bit_rate'])
-    return recognizer
-
-vask = load_vosk()
-
-def handle_audio_frame(frame):
-    # if self.vosk.AcceptWaveform(data):
-    pass
-
-
-def do_work(data: bytearray) -> tuple[str, bool]:
-    text = ''
-    speaker_finished = False
-    if vask.AcceptWaveform(data):
-        result = vask.Result()
-        result_json = json.loads(result)
-        text = result_json['text']
-        speaker_finished = True
-    else:
-        result = vask.PartialResult()
-        result_json = json.loads(result)
-        text = result_json['partial']
-    return text, speaker_finished
-
-
-audio_frames_deque_lock = threading.Lock()
-audio_frames_deque: deque = deque([])
-
-video_frames_deque_lock = threading.Lock()
-video_frames_deque: deque = deque([])
-
-async def queued_video_frames_callback(
-    frames: List[av.AudioFrame],
-) -> av.AudioFrame:
-    with video_frames_deque_lock:
-        video_frames_deque.extend(frames)
-    return frames
-
-async def queued_audio_frames_callback(
-    frames: List[av.AudioFrame],
-) -> av.AudioFrame:
-    with audio_frames_deque_lock:
-        audio_frames_deque.extend(frames)
-
-    # create frames to be returned.
-    new_frames = []
-    for frame in frames:
-        input_array = frame.to_ndarray()
-        new_frame = av.AudioFrame.from_ndarray(
-            np.zeros(input_array.shape, dtype=input_array.dtype),
-            layout=frame.layout.name,
-        )
-        new_frame.sample_rate = frame.sample_rate
-        new_frames.append(new_frame)
-
-    # TODO: replace with the audio we want to send to the other side.
-
-    return new_frames
-
-system_one_audio_status.write("Initializing CLIP model")
-from clip_transform import CLIPTransform
-clip_transform = CLIPTransform()
-
-system_one_audio_status.write("Initializing CLIP templates")
-
-embeddings = clip_transform.text_to_embeddings(system_one["video_detection_emotions"])
-system_one["video_detection_emotions_embeddings"] = embeddings
-
-embeddings = clip_transform.text_to_embeddings(system_one["video_detection_engement"])
-system_one["video_detection_engement_embeddings"] = embeddings
-
-embeddings = clip_transform.text_to_embeddings(system_one["video_detection_present"])
-system_one["video_detection_present_embeddings"] = embeddings
-
-system_one_audio_status.write("Initializing webrtc_streamer")
-webrtc_ctx = webrtc_streamer(
-    key="charles",
-    desired_playing_state=playing,
-    # audio_receiver_size=4096,
-    queued_audio_frames_callback=queued_audio_frames_callback,
-    queued_video_frames_callback=queued_video_frames_callback,
-    mode=WebRtcMode.SENDRECV,
-    rtc_configuration={"iceServers": get_ice_servers()},
-    async_processing=True,
-)
-
-
-if not webrtc_ctx.state.playing:
-    exit
-
-system_one_audio_status.write("Initializing streaming")
-system_one_audio_output = st.empty()
-
-system_one_video_output = st.empty()
-
-system_one_audio_history = []
-system_one_audio_history_output = st.empty()
-
-
-sound_chunk = pydub.AudioSegment.empty()
-current_video_embedding = None
-current_video_embedding_timestamp = time.monotonic()
-
-
-def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
-    dot_product = torch.mm(embeddings, video_embedding.T)
-    similarity_image_label = [(float("{:.4f}".format(dot_product[i][0])), embeddings_labels[i]) for i in range(len(embeddings_labels))]
-    similarity_image_label.sort(reverse=True)
-    return similarity_image_label
-
-def get_top_3_similarities_as_a_string(video_embedding, embeddings, embeddings_labels):
-    similarities = get_dot_similarities(video_embedding, embeddings, embeddings_labels)
-    top_3 = ""
-    range_len = 3 if len(similarities) > 3 else len(similarities)
-    for i in range(range_len):
-        top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
-    return top_3
-
-while True:
-    if webrtc_ctx.state.playing:
-        # handle video
-        video_frames = []
+async def main():
+
+    system_one = {
+        "audio_bit_rate": 16000,
+        # "audio_bit_rate": 32000,
+        # "audio_bit_rate": 48000,
+
+        # "vision_embeddings_fps": 5,
+        "vision_embeddings_fps": 2,
+    }
+
+
+    system_one["video_detection_emotions"] = [
+        "a happy person",
+        "the person is happy",
+        "the person's emotional state is happy",
+        "a sad person",
+        "a scared person",
+        "a disgusted person",
+        "an angry person",
+        "a suprised person",
+        "a bored person",
+        "an interested person",
+        "a guilty person",
+        "an indiffert person",
+        "a distracted person",
+    ]
+
+
+    # system_one["video_detection_emotions"] = [
+    # "Happiness",
+    # "Sadness",
+    # "Fear",
+    # "Disgust",
+    # "Anger",
+    # "Surprise",
+    # "Boredom",
+    # "Interest",
+    # "Excitement",
+    # "Guilt",
+    # "Shame",
+    # "Relief",
+    # "Love",
+    # "Embarrassment",
+    # "Pride",
+    # "Envy",
+    # "Jealousy",
+    # "Anxiety",
+    # "Hope",
+    # "Despair",
+    # "Frustration",
+    # "Confusion",
+    # "Curiosity",
+    # "Contentment",
+    # "Indifference",
+    # "Anticipation",
+    # "Gratitude",
+    # "Bitterness"
+    # ]
+    system_one["video_detection_engement"] = [
+        "the person is engaged in the conversation",
+        "the person is not engaged in the conversation",
+        "the person is looking at me",
+        "the person is not looking at me",
+        "the person is talking to me",
+        "the person is not talking to me",
+        "the person is engaged",
+        "the person is talking",
+        "the person is listening",
+    ]
+    system_one["video_detection_present"] = [
+        "the view from a webcam",
+        "the view from a webcam we see a person",
+        # "the view from a webcam. I see a person",
+        # "the view from a webcam. The person is looking at the camera",
+        # "i am a webcam",
+        # "i am a webcam and i see a person",
+        # "i am a webcam and i see a person. The person is looking at me",
+        # "a person",
+        # "a person on a Zoom call",
+        # "a person on a FaceTime call",
+        # "a person on a WebCam call",
+        # "no one",
+        # " ",
+        # "multiple people",
+        # "a group of people",
+    ]
+
+    system_one_audio_status = st.empty()
+
+
+    playing = st.checkbox("Playing", value=True)
+
+    def load_vosk (model='small'):
+        # load vosk model
+        # get path of current file
+        current_file_path = os.path.abspath(__file__)
+        current_directory = os.path.dirname(current_file_path)
+        _path = os.path.join(current_directory, 'models', 'vosk', model)
+        model_voice = Model(_path)
+        recognizer = KaldiRecognizer(model_voice, system_one['audio_bit_rate'])
+        return recognizer
+
+    vask = load_vosk()
+
+    def handle_audio_frame(frame):
+        # if self.vosk.AcceptWaveform(data):
+        pass
+
+
+    def do_work(data: bytearray) -> tuple[str, bool]:
+        text = ''
+        speaker_finished = False
+        if vask.AcceptWaveform(data):
+            result = vask.Result()
+            result_json = json.loads(result)
+            text = result_json['text']
+            speaker_finished = True
+        else:
+            result = vask.PartialResult()
+            result_json = json.loads(result)
+            text = result_json['partial']
+        return text, speaker_finished
+
+
+    audio_frames_deque_lock = threading.Lock()
+    audio_frames_deque: deque = deque([])
+
+    video_frames_deque_lock = threading.Lock()
+    video_frames_deque: deque = deque([])
+
+    async def queued_video_frames_callback(
+        frames: List[av.AudioFrame],
+    ) -> av.AudioFrame:
         with video_frames_deque_lock:
-            while len(video_frames_deque) > 0:
-                frame = video_frames_deque.popleft()
-                video_frames.append(frame)
-        get_embeddings = False
-        get_embeddings |= current_video_embedding is None
-        current_time = time.monotonic()
-        elapsed_time = current_time - current_video_embedding_timestamp
-        get_embeddings |= elapsed_time > 1. / system_one['vision_embeddings_fps']
-        if get_embeddings and len(video_frames) > 0:
-            current_video_embedding_timestamp = current_time
-            current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
-
-            emotions_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
-            engagement_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
-            present_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
-
-            # table_content = "**System 1 Video:**\n\n"
-            table_content = "| System 1 Video | |\n| --- | --- |\n"
-            table_content += f"| Present | {present_top_3} |\n"
-            table_content += f"| Emotion | {emotions_top_3} |\n"
-            table_content += f"| Engagement | {engagement_top_3} |\n"
-            system_one_video_output.markdown(table_content)
-            # system_one_video_output.markdown(f"**System 1 Video:** \n [Emotion: {emotions_top_3}], \n [Engagement: {engagement_top_3}], \n [Present: {present_top_3}] ")
-            # for similarity, image_label in similarity_image_label:
-            # print (f"{similarity} {image_label}")
-
-        # handle audio
-        audio_frames = []
+            video_frames_deque.extend(frames)
+        return frames
+
+    async def queued_audio_frames_callback(
+        frames: List[av.AudioFrame],
+    ) -> av.AudioFrame:
         with audio_frames_deque_lock:
-            while len(audio_frames_deque) > 0:
-                frame = audio_frames_deque.popleft()
-                audio_frames.append(frame)
-
-        if len(audio_frames) == 0:
-            time.sleep(0.1)
-            system_one_audio_status.write("No frame arrived.")
-            continue
-
-        system_one_audio_status.write("Running. Say something!")
-
-        for audio_frame in audio_frames:
-            sound = pydub.AudioSegment(
-                data=audio_frame.to_ndarray().tobytes(),
-                sample_width=audio_frame.format.bytes,
-                frame_rate=audio_frame.sample_rate,
-                channels=len(audio_frame.layout.channels),
+            audio_frames_deque.extend(frames)
+
+        # create frames to be returned.
+        new_frames = []
+        for frame in frames:
+            input_array = frame.to_ndarray()
+            new_frame = av.AudioFrame.from_ndarray(
+                np.zeros(input_array.shape, dtype=input_array.dtype),
+                layout=frame.layout.name,
             )
-            sound = sound.set_channels(1)
-            sound = sound.set_frame_rate(system_one['audio_bit_rate'])
-            sound_chunk += sound
-
-        if len(sound_chunk) > 0:
-            buffer = np.array(sound_chunk.get_array_of_samples())
-            text, speaker_finished = do_work(buffer.tobytes())
-            system_one_audio_output.markdown(f"**System 1 Audio:** {text}")
-            if speaker_finished and len(text) > 0:
-                system_one_audio_history.append(text)
-                if len(system_one_audio_history) > 10:
-                    system_one_audio_history = system_one_audio_history[-10:]
-                table_content = "| System 1 Audio History |\n| --- |\n"
-                table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
-                system_one_audio_history_output.markdown(table_content)
-            sound_chunk = pydub.AudioSegment.empty()
-
-    else:
-        system_one_audio_status.write("Stopped.")
-        break
+            new_frame.sample_rate = frame.sample_rate
+            new_frames.append(new_frame)
+
+        # TODO: replace with the audio we want to send to the other side.
+
+        return new_frames
+
+    system_one_audio_status.write("Initializing CLIP model")
+    from clip_transform import CLIPTransform
+    clip_transform = CLIPTransform()
+
+    system_one_audio_status.write("Initializing chat pipeline")
+    from chat_pipeline import ChatPipeline
+    chat_pipeline = ChatPipeline()
+
+    system_one_audio_status.write("Initializing CLIP templates")
+
+    embeddings = clip_transform.text_to_embeddings(system_one["video_detection_emotions"])
+    system_one["video_detection_emotions_embeddings"] = embeddings
+
+    embeddings = clip_transform.text_to_embeddings(system_one["video_detection_engement"])
+    system_one["video_detection_engement_embeddings"] = embeddings
+
+    embeddings = clip_transform.text_to_embeddings(system_one["video_detection_present"])
+    system_one["video_detection_present_embeddings"] = embeddings
+
+    system_one_audio_status.write("Initializing webrtc_streamer")
+    webrtc_ctx = webrtc_streamer(
+        key="charles",
+        desired_playing_state=playing,
+        # audio_receiver_size=4096,
+        queued_audio_frames_callback=queued_audio_frames_callback,
+        queued_video_frames_callback=queued_video_frames_callback,
+        mode=WebRtcMode.SENDRECV,
+        rtc_configuration={"iceServers": get_ice_servers()},
+        async_processing=True,
+    )
+
+
+    if not webrtc_ctx.state.playing:
+        exit
+
+    system_one_audio_status.write("Initializing streaming")
+    system_one_audio_output = st.empty()
+
+    system_one_video_output = st.empty()
+
+    system_one_audio_history = []
+    system_one_audio_history_output = st.empty()
+
+
+    sound_chunk = pydub.AudioSegment.empty()
+    current_video_embedding = None
+    current_video_embedding_timestamp = time.monotonic()
+
+
+    def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
+        dot_product = torch.mm(embeddings, video_embedding.T)
+        similarity_image_label = [(float("{:.4f}".format(dot_product[i][0])), embeddings_labels[i]) for i in range(len(embeddings_labels))]
+        similarity_image_label.sort(reverse=True)
+        return similarity_image_label
+
+    def get_top_3_similarities_as_a_string(video_embedding, embeddings, embeddings_labels):
+        similarities = get_dot_similarities(video_embedding, embeddings, embeddings_labels)
+        top_3 = ""
+        range_len = 3 if len(similarities) > 3 else len(similarities)
+        for i in range(range_len):
+            top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+        return top_3
+
+    while True:
+        # await chat_pipeline.start()
+        # await chat_pipeline.enqueue(text)
+        if webrtc_ctx.state.playing:
+            # handle video
+            video_frames = []
+            with video_frames_deque_lock:
+                while len(video_frames_deque) > 0:
+                    frame = video_frames_deque.popleft()
+                    video_frames.append(frame)
+            get_embeddings = False
+            get_embeddings |= current_video_embedding is None
+            current_time = time.monotonic()
+            elapsed_time = current_time - current_video_embedding_timestamp
+            get_embeddings |= elapsed_time > 1. / system_one['vision_embeddings_fps']
+            if get_embeddings and len(video_frames) > 0:
+                current_video_embedding_timestamp = current_time
+                current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
+
+                emotions_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
+                engagement_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
+                present_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
+
+                # table_content = "**System 1 Video:**\n\n"
+                table_content = "| System 1 Video | |\n| --- | --- |\n"
+                table_content += f"| Present | {present_top_3} |\n"
+                table_content += f"| Emotion | {emotions_top_3} |\n"
+                table_content += f"| Engagement | {engagement_top_3} |\n"
+                system_one_video_output.markdown(table_content)
+                # system_one_video_output.markdown(f"**System 1 Video:** \n [Emotion: {emotions_top_3}], \n [Engagement: {engagement_top_3}], \n [Present: {present_top_3}] ")
+                # for similarity, image_label in similarity_image_label:
+                # print (f"{similarity} {image_label}")
+
+            # handle audio
+            audio_frames = []
+            with audio_frames_deque_lock:
+                while len(audio_frames_deque) > 0:
+                    frame = audio_frames_deque.popleft()
+                    audio_frames.append(frame)
+
+            if len(audio_frames) == 0:
+                time.sleep(0.1)
+                system_one_audio_status.write("No frame arrived.")
+                continue
+
+            system_one_audio_status.write("Running. Say something!")
+
+            for audio_frame in audio_frames:
+                sound = pydub.AudioSegment(
+                    data=audio_frame.to_ndarray().tobytes(),
+                    sample_width=audio_frame.format.bytes,
+                    frame_rate=audio_frame.sample_rate,
+                    channels=len(audio_frame.layout.channels),
+                )
+                sound = sound.set_channels(1)
+                sound = sound.set_frame_rate(system_one['audio_bit_rate'])
+                sound_chunk += sound
+
+            if len(sound_chunk) > 0:
+                buffer = np.array(sound_chunk.get_array_of_samples())
+                text, speaker_finished = do_work(buffer.tobytes())
+                system_one_audio_output.markdown(f"**System 1 Audio:** {text}")
+                if speaker_finished and len(text) > 0:
+                    system_one_audio_history.append(text)
+                    if len(system_one_audio_history) > 10:
+                        system_one_audio_history = system_one_audio_history[-10:]
+                    table_content = "| System 1 Audio History |\n| --- |\n"
+                    table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
+                    system_one_audio_history_output.markdown(table_content)
+                sound_chunk = pydub.AudioSegment.empty()
+
+        else:
+            system_one_audio_status.write("Stopped.")
+            break
+
+if __name__ == "__main__":
+    asyncio.run(main())