Spaces:

Loren
/

Voxtral_Mini_Evaluation

Running on Zero

App Files Files Community

Loren committed on Jul 24

Commit

469746c

verified ·

1 Parent(s): 0d84c3d

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -79

app.py CHANGED Viewed

@@ -1,79 +1,84 @@
-import gradio as gr
-import torch
-from transformers import AutoProcessor, VoxtralForConditionalGeneration
-MAX_TOKENS = 32000
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"*** Device: {device}")
-# List models
-dict_models = {'Voxtral-Mini-3B-2507': 'Loren/Voxtral-Mini-3B-2507-dup',
-               'Voxtral-Small-24B-2507': 'Loren/Voxtral-Small-24B-2507-dup'}
-# Load models
-list_processor = []
-list_model = []
-for model_name in dict_models.values():
-    list_processor.append(AutoProcessor.from_pretrained(model_name))
-    list_model.append(VoxtralForConditionalGeneration.from_pretrained(model_name,
-                                                                      torch_dtype=torch.bfloat16,
-                                                                      device_map=device))
-# Supported languages
-dict_languages = {"English": "en",
-                  "French": "fr",
-                  "German": "de",
-                  "Spanish": "es",
-                  "Italian": "it",
-                  "Portuguese": "pt",
-                  "Dutch": "nl",
-                  "Hindi": "hi"}
-@spaces.GPU
-def process_transcript(audio_path, model_name, language):
-    """Process audio with selected Voxtral model and return the generated response"""
-    inputs = processor.apply_transcrition_request(language=language, audio=audio_path, model_id=model_name)
-    inputs = inputs.to(device, dtype=torch.bfloat16)
-    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
-    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-    return decoded_outputs[0]
-# Define Gradio interface
-with gr.Blocks(title="Transcription") as transcript:
-    gr.Markdown("# Audio Transcription")
-    gr.Markdown("#### Choose the language of the audio and the model, then set an audio file to get its transcription.")
-    gr.Markdown("#### **(Voxtral handles audios up to 30 minutes for transcription)**")
-    with gr.Row():
-        with gr.Column():
-            sel_language = gr.Dropdown(
-                choices=list(dict_languages.keys()),
-                value="English",
-                label="Select the language of the audio file:"
-            )
-            sel_model = gr.Radio(dict_models.keys(), label="Select the model:")
-        with gr.Column():
-            sel_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload an audio file or record via microphone:")
-            submit_transcript = gr.Button("Extract Transcription", variant="primary")
-        with gr.Column():
-            text_transcript = gr.Textbox(label="Generated Response", lines=10)
-    submit_transcript.click(
-        fn=process_transcript,
-        inputs=[dict_languages[sel_language], dict_models[sel_model], sel_audio],
-        outputs=text_transcript
-    )
-# Launch the app
-if __name__ == "__main__":
-    transcript.launch(share=True)

+import gradio as gr
+import torch
+from transformers import AutoProcessor, VoxtralForConditionalGeneration
+MAX_TOKENS = 32000
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"*** Device: {device}")
+# List models
+dict_models = {'Voxtral-Mini-3B-2507': 'mistralai/Voxtral-Mini-3B-2507',
+               'Voxtral-Small-24B-2507': 'mistralai/Voxtral-Small-24B-2507'}
+# Load models
+list_processor = []
+list_model = []
+for model_name in dict_models.values():
+    list_processor.append(AutoProcessor.from_pretrained(model_name))
+    list_model.append(VoxtralForConditionalGeneration.from_pretrained(model_name,
+                                                                      torch_dtype=torch.bfloat16,
+                                                                      device_map=device))
+# Supported languages
+dict_languages = {"English": "en",
+                  "French": "fr",
+                  "German": "de",
+                  "Spanish": "es",
+                  "Italian": "it",
+                  "Portuguese": "pt",
+                  "Dutch": "nl",
+                  "Hindi": "hi"}
+@spaces.GPU
+def process_transcript(audio_path, model, processor, language):
+    """Process audio with selected Voxtral model and return the generated response"""
+    inputs = processor.apply_transcrition_request(language=language, audio=audio_path, model_id=model_name)
+    inputs = inputs.to(device, dtype=torch.bfloat16)
+    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
+    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+    return decoded_outputs[0]
+# Define Gradio interface
+with gr.Blocks(title="Transcription") as transcript:
+    gr.Markdown("# Audio Transcription")
+    gr.Markdown("#### Choose the language of the audio and the model, then set an audio file to get its transcription.")
+    gr.Markdown("#### **(Voxtral handles audios up to 30 minutes for transcription)**")
+    with gr.Row():
+        with gr.Column():
+            sel_language = gr.Dropdown(
+                choices=list(dict_languages.keys()),
+                value="English",
+                label="Select the language of the audio file:"
+            )
+            sel_model = gr.Radio(dict_models.keys(), label="Select the model:")
+        with gr.Column():
+            sel_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload an audio file or record via microphone:")
+            submit_transcript = gr.Button("Extract Transcription", variant="primary")
+        with gr.Column():
+            text_transcript = gr.Textbox(label="Generated Response", lines=10)
+    try:
+        model_index = list(dict_models.keys()).index(sel_model)
+        submit_transcript.click(
+            fn=process_transcript,
+            inputs=[dict_languages[sel_language], list_model[model_index],
+                    list_processor[model_index], sel_audio],
+            outputs=text_transcript
+        )
+    except:
+        text_transcript = 'Error'
+# Launch the app
+if __name__ == "__main__":
+    transcript.launch(share=True)