AlexHung29629 committed on
Commit
2bd9327
1 Parent(s): d63f37c

Update ultravox_processing.py

Browse files
Files changed (1) hide show
  1. ultravox_processing.py +2 -4
ultravox_processing.py CHANGED
@@ -133,7 +133,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
133
  """
134
  # TODO: Add support for multiple audio and text inputs.
135
  data = {}
136
- audio_embed_frames = 0
137
  if audio is not None and len(audio) > 0:
138
  # Main audio processing. The processor is model-specific.
139
  x = self.audio_processor(
@@ -162,7 +161,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
162
  ), "Text must be a list."
163
  processed_text = []
164
  data["audio_token_start_idx"] = []
165
- for t in text:
166
  assert self.audio_placeholder in t
167
  if "audio_token_len" not in data:
168
  raise ValueError(
@@ -175,7 +174,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
175
  add_special_tokens=False,
176
  )
177
  )
178
- print(f"{start_idx=}, {t=}")
179
  data["audio_token_start_idx"].append(start_idx)
180
 
181
  # Replace the audio placeholder with the audio token.
@@ -183,7 +181,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
183
  # where the number of </s> is the number of audio frames.
184
  t = t.replace(
185
  self.audio_placeholder,
186
- self.audio_token_replacement * audio_embed_frames,
187
  )
188
  processed_text.append(t)
189
 
 
133
  """
134
  # TODO: Add support for multiple audio and text inputs.
135
  data = {}
 
136
  if audio is not None and len(audio) > 0:
137
  # Main audio processing. The processor is model-specific.
138
  x = self.audio_processor(
 
161
  ), "Text must be a list."
162
  processed_text = []
163
  data["audio_token_start_idx"] = []
164
+ for i, t in enumerate(text):
165
  assert self.audio_placeholder in t
166
  if "audio_token_len" not in data:
167
  raise ValueError(
 
174
  add_special_tokens=False,
175
  )
176
  )
 
177
  data["audio_token_start_idx"].append(start_idx)
178
 
179
  # Replace the audio placeholder with the audio token.
 
181
  # where the number of </s> is the number of audio frames.
182
  t = t.replace(
183
  self.audio_placeholder,
184
+ self.audio_token_replacement * data["audio_token_len"][i],
185
  )
186
  processed_text.append(t)
187