Loren committed on
Commit
469746c
·
verified ·
1 Parent(s): 0d84c3d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -79
app.py CHANGED
@@ -1,79 +1,84 @@
1
- import gradio as gr
2
- import torch
3
- from transformers import AutoProcessor, VoxtralForConditionalGeneration
4
-
5
- MAX_TOKENS = 32000
6
-
7
- device = "cuda" if torch.cuda.is_available() else "cpu"
8
- print(f"*** Device: {device}")
9
-
10
- # List models
11
- dict_models = {'Voxtral-Mini-3B-2507': 'Loren/Voxtral-Mini-3B-2507-dup',
12
- 'Voxtral-Small-24B-2507': 'Loren/Voxtral-Small-24B-2507-dup'}
13
-
14
- # Load models
15
- list_processor = []
16
- list_model = []
17
- for model_name in dict_models.values():
18
- list_processor.append(AutoProcessor.from_pretrained(model_name))
19
- list_model.append(VoxtralForConditionalGeneration.from_pretrained(model_name,
20
- torch_dtype=torch.bfloat16,
21
- device_map=device))
22
- # Supported languages
23
- dict_languages = {"English": "en",
24
- "French": "fr",
25
- "German": "de",
26
- "Spanish": "es",
27
- "Italian": "it",
28
- "Portuguese": "pt",
29
- "Dutch": "nl",
30
- "Hindi": "hi"}
31
-
32
- @spaces.GPU
33
- def process_transcript(audio_path, model_name, language):
34
- """Process audio with selected Voxtral model and return the generated response"""
35
-
36
- inputs = processor.apply_transcrition_request(language=language, audio=audio_path, model_id=model_name)
37
- inputs = inputs.to(device, dtype=torch.bfloat16)
38
-
39
- outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
40
- decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
41
-
42
- return decoded_outputs[0]
43
-
44
-
45
-
46
- # Define Gradio interface
47
- with gr.Blocks(title="Transcription") as transcript:
48
- gr.Markdown("# Audio Transcription")
49
- gr.Markdown("#### Choose the language of the audio and the model, then set an audio file to get its transcription.")
50
- gr.Markdown("#### **(Voxtral handles audios up to 30 minutes for transcription)**")
51
-
52
- with gr.Row():
53
- with gr.Column():
54
- sel_language = gr.Dropdown(
55
- choices=list(dict_languages.keys()),
56
- value="English",
57
- label="Select the language of the audio file:"
58
- )
59
-
60
- sel_model = gr.Radio(dict_models.keys(), label="Select the model:")
61
-
62
- with gr.Column():
63
- sel_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload an audio file or record via microphone:")
64
-
65
- submit_transcript = gr.Button("Extract Transcription", variant="primary")
66
-
67
- with gr.Column():
68
- text_transcript = gr.Textbox(label="Generated Response", lines=10)
69
-
70
- submit_transcript.click(
71
- fn=process_transcript,
72
- inputs=[dict_languages[sel_language], dict_models[sel_model], sel_audio],
73
- outputs=text_transcript
74
- )
75
-
76
-
77
- # Launch the app
78
- if __name__ == "__main__":
79
- transcript.launch(share=True)
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoProcessor, VoxtralForConditionalGeneration
4
+
5
+ MAX_TOKENS = 32000
6
+
7
+ device = "cuda" if torch.cuda.is_available() else "cpu"
8
+ print(f"*** Device: {device}")
9
+
10
+ # List models
11
+ dict_models = {'Voxtral-Mini-3B-2507': 'mistralai/Voxtral-Mini-3B-2507',
12
+ 'Voxtral-Small-24B-2507': 'mistralai/Voxtral-Small-24B-2507'}
13
+
14
+ # Load models
15
+ list_processor = []
16
+ list_model = []
17
+ for model_name in dict_models.values():
18
+ list_processor.append(AutoProcessor.from_pretrained(model_name))
19
+ list_model.append(VoxtralForConditionalGeneration.from_pretrained(model_name,
20
+ torch_dtype=torch.bfloat16,
21
+ device_map=device))
22
+ # Supported languages
23
+ dict_languages = {"English": "en",
24
+ "French": "fr",
25
+ "German": "de",
26
+ "Spanish": "es",
27
+ "Italian": "it",
28
+ "Portuguese": "pt",
29
+ "Dutch": "nl",
30
+ "Hindi": "hi"}
31
+
32
+ @spaces.GPU
33
+ def process_transcript(audio_path, model, processor, language):
34
+ """Process audio with selected Voxtral model and return the generated response"""
35
+
36
+ inputs = processor.apply_transcrition_request(language=language, audio=audio_path, model_id=model_name)
37
+ inputs = inputs.to(device, dtype=torch.bfloat16)
38
+
39
+ outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
40
+ decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
41
+
42
+ return decoded_outputs[0]
43
+
44
+
45
+
46
+ # Define Gradio interface
47
+ with gr.Blocks(title="Transcription") as transcript:
48
+ gr.Markdown("# Audio Transcription")
49
+ gr.Markdown("#### Choose the language of the audio and the model, then set an audio file to get its transcription.")
50
+ gr.Markdown("#### **(Voxtral handles audios up to 30 minutes for transcription)**")
51
+
52
+ with gr.Row():
53
+ with gr.Column():
54
+ sel_language = gr.Dropdown(
55
+ choices=list(dict_languages.keys()),
56
+ value="English",
57
+ label="Select the language of the audio file:"
58
+ )
59
+
60
+ sel_model = gr.Radio(dict_models.keys(), label="Select the model:")
61
+
62
+ with gr.Column():
63
+ sel_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload an audio file or record via microphone:")
64
+
65
+ submit_transcript = gr.Button("Extract Transcription", variant="primary")
66
+
67
+ with gr.Column():
68
+ text_transcript = gr.Textbox(label="Generated Response", lines=10)
69
+
70
+ try:
71
+ model_index = list(dict_models.keys()).index(sel_model)
72
+ submit_transcript.click(
73
+ fn=process_transcript,
74
+ inputs=[dict_languages[sel_language], list_model[model_index],
75
+ list_processor[model_index], sel_audio],
76
+ outputs=text_transcript
77
+ )
78
+ except:
79
+ text_transcript = 'Error'
80
+
81
+
82
+ # Launch the app
83
+ if __name__ == "__main__":
84
+ transcript.launch(share=True)