Kr08 committed on
Commit aaac499 · verified · 1 Parent(s): 159303c

Added ZeroGPU support, summarization with different language models, and a Q&A feature

Files changed (1): app.py +93 -13
app.py CHANGED
@@ -1,24 +1,104 @@
 import gradio as gr
 from audio_processing import process_audio, print_results
-def transcribe_audio(audio_file):
-    language_segments, final_segments = process_audio(audio_file)
+from transformers import pipeline
+import spaces
+import torch
+
+# Check if CUDA is available
+cuda_available = torch.cuda.is_available()
+
+# Initialize the summarization and question-answering models, placing them on
+# GPU when one is available (pipelines take a device index at construction)
+device = 0 if cuda_available else -1
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
+qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", device=device)
+
+@spaces.GPU
+def transcribe_audio(audio_file, translate, model_size):
+    language_segments, final_segments = process_audio(audio_file, translate=translate, model_size=model_size)
 
     output = "Detected language changes:\n\n"
     for segment in language_segments:
         output += f"Language: {segment['language']}\n"
         output += f"Time: {segment['start']:.2f}s - {segment['end']:.2f}s\n\n"
 
-    output += "Transcription with language detection and speaker diarization:\n\n"
+    output += f"Transcription with language detection and speaker diarization (using {model_size} model):\n\n"
+    full_text = ""
     for segment in final_segments:
-        output += f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}) Speaker {segment['speaker']}: {segment['text']}\n"
-        # output += f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}): {segment['text']}\n"
-    return output
-
-iface = gr.Interface(
-    fn=transcribe_audio,
-    inputs=gr.Audio(type="filepath"),
-    outputs="text",
-    title="WhisperX Audio Transcription"
-)
+        output += f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}) {segment['speaker']}:\n"
+        output += f"Original: {segment['text']}\n"
+        if translate:
+            output += f"Translated: {segment['translated']}\n"
+            full_text += segment['translated'] + " "
+        else:
+            full_text += segment['text'] + " "
+        output += "\n"
+
+    return output, full_text
+
+@spaces.GPU
+def summarize_text(text):
+    summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
+    return summary
+
+@spaces.GPU
+def answer_question(context, question):
+    result = qa_model(question=question, context=context)
+    return result['answer']
+
+@spaces.GPU
+def process_and_summarize(audio_file, translate, model_size):
+    transcription, full_text = transcribe_audio(audio_file, translate, model_size)
+    summary = summarize_text(full_text)
+    return transcription, summary
+
+@spaces.GPU
+def qa_interface(audio_file, translate, model_size, question):
+    _, full_text = transcribe_audio(audio_file, translate, model_size)
+    answer = answer_question(full_text, question)
+    return answer
+
+# Main interface
+with gr.Blocks() as iface:
+    gr.Markdown("# WhisperX Audio Transcription, Translation, Summarization, and QA (with ZeroGPU support)")
+
+    with gr.Tab("Transcribe and Summarize"):
+        audio_input = gr.Audio(type="filepath")
+        translate_checkbox = gr.Checkbox(label="Enable Translation")
+        model_dropdown = gr.Dropdown(choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"], label="Whisper Model Size", value="small")
+        transcribe_button = gr.Button("Transcribe and Summarize")
+        transcription_output = gr.Textbox(label="Transcription")
+        summary_output = gr.Textbox(label="Summary")
+
+        transcribe_button.click(
+            process_and_summarize,
+            inputs=[audio_input, translate_checkbox, model_dropdown],
+            outputs=[transcription_output, summary_output]
+        )
+
+    with gr.Tab("Question Answering"):
+        qa_audio_input = gr.Audio(type="filepath")
+        qa_translate_checkbox = gr.Checkbox(label="Enable Translation")
+        qa_model_dropdown = gr.Dropdown(choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"], label="Whisper Model Size", value="small")
+        question_input = gr.Textbox(label="Ask a question about the audio")
+        qa_button = gr.Button("Get Answer")
+        answer_output = gr.Textbox(label="Answer")
+
+        qa_button.click(
+            qa_interface,
+            inputs=[qa_audio_input, qa_translate_checkbox, qa_model_dropdown, question_input],
+            outputs=answer_output
+        )
+
+    gr.Markdown(
+        """
+        ## ZeroGPU Support
+        This application supports ZeroGPU for Hugging Face Spaces Pro users.
+        GPU-intensive tasks are automatically optimized for better performance.
+        """
+    )
 
 iface.launch()
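
For quick local verification, a minimal sketch (not part of the commit) of the two transformers pipelines the app now wires in; the model names match app.py, while the sample context and question are invented for illustration:

from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Hypothetical transcript standing in for WhisperX output
context = ("Speaker 1 proposed moving the product launch to May. "
           "Speaker 2 agreed and asked the team to update the roadmap.")

print(summarizer(context, max_length=50, min_length=10, do_sample=False)[0]["summary_text"])
print(qa_model(question="When is the launch?", context=context)["answer"])  # likely answer: "May"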