Anna Sun committed on
Commit
fd69a21
1 Parent(s): c1e0588

more fixes

Browse files
Files changed (2) hide show
  1. app.py +27 -10
  2. simuleval_transcoder.py +1 -0
app.py CHANGED
@@ -35,6 +35,7 @@ def build_agent(model_path, config_name=None):
35
 
36
  agent = build_agent("models", "vad_s2st_sc_24khz_main.yaml")
37
  transcoder = SimulevalTranscoder(
 
38
  sample_rate=48_000,
39
  debug=False,
40
  buffer_limit=1,
@@ -43,8 +44,8 @@ transcoder = SimulevalTranscoder(
43
  def start_recording():
44
  logger.debug(f"start_recording: starting transcoder")
45
  transcoder.reset_states()
46
- transcoder.start()
47
  transcoder.close = False
 
48
 
49
  def stop_recording():
50
  transcoder.close = True
@@ -87,11 +88,13 @@ def get_buffered_output():
87
 
88
  return speech, text, speech_and_text_output.final
89
 
 
90
  def streaming_input_callback():
91
  final = False
92
  max_wait_s = 15
93
  wait_s = 0
94
  translated_text_state = ""
 
95
  while not transcoder.close:
96
  translated_wav_segment, translated_text, final = get_buffered_output()
97
 
@@ -107,7 +110,7 @@ def streaming_input_callback():
107
  print("output sample rate", sample_rate)
108
  translated_wav_segment = sample_rate, np.array(audio_bytes)
109
  else:
110
- translated_wav_segment = bytes()
111
 
112
  if translated_text is not None:
113
  translated_text_state += " | " + str(translated_text)
@@ -123,16 +126,23 @@ def streaming_input_callback():
123
 
124
 
125
  def streaming_callback_dummy():
 
 
126
  while not transcoder.close:
127
  if s.queue.empty():
128
- print("empty")
129
- yield bytes()
 
130
  time.sleep(0.3)
131
  else:
132
- print("audio")
 
 
133
  audio = s.queue.get_nowait()
 
 
134
  s.queue.task_done()
135
- yield audio
136
 
137
  def clear():
138
  logger.debug(f"Clearing State")
@@ -175,21 +185,28 @@ def blocks():
175
  ).then(
176
  start_recording
177
  ).then(
178
- # streaming_callback_dummy, # TODO: autoplay works fine with streaming_callback_dummy
179
- # None,
180
- # output_translation_segment
 
 
 
181
  streaming_input_callback,
182
  None,
183
  [
184
  output_translation_segment,
185
  stream_output_text,
186
  translated_text_state,
187
- ],
188
  )
189
  input_audio.stop_recording(
190
  stop_recording
191
  )
192
  input_audio.stream(
 
 
 
 
193
  process_incoming_bytes, [input_audio], None
194
  )
195
 
 
35
 
36
  agent = build_agent("models", "vad_s2st_sc_24khz_main.yaml")
37
  transcoder = SimulevalTranscoder(
38
+ agent,
39
  sample_rate=48_000,
40
  debug=False,
41
  buffer_limit=1,
 
44
  def start_recording():
45
  logger.debug(f"start_recording: starting transcoder")
46
  transcoder.reset_states()
 
47
  transcoder.close = False
48
+ transcoder.start()
49
 
50
  def stop_recording():
51
  transcoder.close = True
 
88
 
89
  return speech, text, speech_and_text_output.final
90
 
91
+ from scipy.io.wavfile import write as scipy_write
92
  def streaming_input_callback():
93
  final = False
94
  max_wait_s = 15
95
  wait_s = 0
96
  translated_text_state = ""
97
+ sample_rate = 24000
98
  while not transcoder.close:
99
  translated_wav_segment, translated_text, final = get_buffered_output()
100
 
 
110
  print("output sample rate", sample_rate)
111
  translated_wav_segment = sample_rate, np.array(audio_bytes)
112
  else:
113
+ translated_wav_segment = sample_rate, np.empty(0, dtype=np.int16)
114
 
115
  if translated_text is not None:
116
  translated_text_state += " | " + str(translated_text)
 
126
 
127
 
128
  def streaming_callback_dummy():
129
+ i = 0
130
+ out_text = ""
131
  while not transcoder.close:
132
  if s.queue.empty():
133
+ yield (
134
+ (48000, np.empty(0, dtype=np.int16)), out_text, out_text
135
+ )
136
  time.sleep(0.3)
137
  else:
138
+ i += 1
139
+ out_text += " | " + str(i)
140
+ print(out_text)
141
  audio = s.queue.get_nowait()
142
+ if i == 0:
143
+ print(audio[0], type(audio[1]))
144
  s.queue.task_done()
145
+ yield audio, out_text, out_text
146
 
147
  def clear():
148
  logger.debug(f"Clearing State")
 
185
  ).then(
186
  start_recording
187
  ).then(
188
+ # TODO: streaming speech autoplay works fine with streaming_callback_dummy,
189
+ # but speech output from streaming_input_callback has a huge delay
190
+ # when comparing print/debugging logs vs. output speech
191
+ # TODO: text output works fine with one output, but is not
192
+ # updating when output is both text + speech
193
+ # streaming_callback_dummy,
194
  streaming_input_callback,
195
  None,
196
  [
197
  output_translation_segment,
198
  stream_output_text,
199
  translated_text_state,
200
+ ]
201
  )
202
  input_audio.stop_recording(
203
  stop_recording
204
  )
205
  input_audio.stream(
206
+ # TODO: *only when streaming speech output* about half the time
207
+ # there is some race condition in gradio where process_incoming_bytes
208
+ # stops getting called once the first speech chunk is yield-ed
209
+ # in streaming_input_callback (or streaming_callback_dummy)
210
  process_incoming_bytes, [input_audio], None
211
  )
212
 
simuleval_transcoder.py CHANGED
@@ -325,6 +325,7 @@ class SimulevalTranscoder:
325
 
326
  def process_pipeline_loop(self):
327
  if self.close:
 
328
  return # closes the thread
329
 
330
  print("processing_pipeline")
 
325
 
326
  def process_pipeline_loop(self):
327
  if self.close:
328
+ print("transcoder closed")
329
  return # closes the thread
330
 
331
  print("processing_pipeline")