AlexHung29629
committed on
Commit
•
2bd9327
1
Parent(s):
d63f37c
Update ultravox_processing.py
Browse files
- ultravox_processing.py +2 -4
ultravox_processing.py
CHANGED
@@ -133,7 +133,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
|
|
133 |
"""
|
134 |
# TODO: Add support for multiple audio and text inputs.
|
135 |
data = {}
|
136 |
-
audio_embed_frames = 0
|
137 |
if audio is not None and len(audio) > 0:
|
138 |
# Main audio processing. The processor is model-specific.
|
139 |
x = self.audio_processor(
|
@@ -162,7 +161,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
|
|
162 |
), "Text must be a list."
|
163 |
processed_text = []
|
164 |
data["audio_token_start_idx"] = []
|
165 |
-
for t in text:
|
166 |
assert self.audio_placeholder in t
|
167 |
if "audio_token_len" not in data:
|
168 |
raise ValueError(
|
@@ -175,7 +174,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
|
|
175 |
add_special_tokens=False,
|
176 |
)
|
177 |
)
|
178 |
-
print(f"{start_idx=}, {t=}")
|
179 |
data["audio_token_start_idx"].append(start_idx)
|
180 |
|
181 |
# Replace the audio placeholder with the audio token.
|
@@ -183,7 +181,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
|
|
183 |
# where the number of </s> is the number of audio frames.
|
184 |
t = t.replace(
|
185 |
self.audio_placeholder,
|
186 |
-
self.audio_token_replacement *
|
187 |
)
|
188 |
processed_text.append(t)
|
189 |
|
|
|
133 |
"""
|
134 |
# TODO: Add support for multiple audio and text inputs.
|
135 |
data = {}
|
|
|
136 |
if audio is not None and len(audio) > 0:
|
137 |
# Main audio processing. The processor is model-specific.
|
138 |
x = self.audio_processor(
|
|
|
161 |
), "Text must be a list."
|
162 |
processed_text = []
|
163 |
data["audio_token_start_idx"] = []
|
164 |
+
for i, t in enumerate(text):
|
165 |
assert self.audio_placeholder in t
|
166 |
if "audio_token_len" not in data:
|
167 |
raise ValueError(
|
|
|
174 |
add_special_tokens=False,
|
175 |
)
|
176 |
)
|
|
|
177 |
data["audio_token_start_idx"].append(start_idx)
|
178 |
|
179 |
# Replace the audio placeholder with the audio token.
|
|
|
181 |
# where the number of </s> is the number of audio frames.
|
182 |
t = t.replace(
|
183 |
self.audio_placeholder,
|
184 |
+
self.audio_token_replacement * data["audio_token_len"][i],
|
185 |
)
|
186 |
processed_text.append(t)
|
187 |
|