AlexHung29629
/

test_mllama_v12

Feature Extraction

Model card Files Files and versions Community

AlexHung29629 committed on 29 days ago

Commit

2bd9327

•

1 Parent(s): d63f37c

Update ultravox_processing.py

Files changed (1) hide show

ultravox_processing.py +2 -4

ultravox_processing.py CHANGED Viewed

@@ -133,7 +133,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         """
         # TODO: Add support for multiple audio and text inputs.
         data = {}
-        audio_embed_frames = 0
         if audio is not None and len(audio) > 0:
             # Main audio processing. The processor is model-specific.
             x = self.audio_processor(
@@ -162,7 +161,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             ), "Text must be a list."
             processed_text = []
             data["audio_token_start_idx"] = []
-            for t in text:
                 assert self.audio_placeholder in t
                 if "audio_token_len" not in data:
                     raise ValueError(
@@ -175,7 +174,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
                         add_special_tokens=False,
                     )
                 )
-                print(f"{start_idx=}, {t=}")
                 data["audio_token_start_idx"].append(start_idx)
                 # Replace the audio placeholder with the audio token.
@@ -183,7 +181,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
                 #        where the number of </s> is the number of audio frames.
                 t = t.replace(
                     self.audio_placeholder,
-                    self.audio_token_replacement * audio_embed_frames,
                 )
                 processed_text.append(t)

         """
         # TODO: Add support for multiple audio and text inputs.
         data = {}
         if audio is not None and len(audio) > 0:
             # Main audio processing. The processor is model-specific.
             x = self.audio_processor(
             ), "Text must be a list."
             processed_text = []
             data["audio_token_start_idx"] = []
+            for i, t in enumerate(text):
                 assert self.audio_placeholder in t
                 if "audio_token_len" not in data:
                     raise ValueError(
                         add_special_tokens=False,
                     )
                 )
                 data["audio_token_start_idx"].append(start_idx)
                 # Replace the audio placeholder with the audio token.
                 #        where the number of </s> is the number of audio frames.
                 t = t.replace(
                     self.audio_placeholder,
+                    self.audio_token_replacement * data["audio_token_len"][i],
                 )
                 processed_text.append(t)