kadirnar commited on
Commit
0c8b1e1
1 Parent(s): f367093

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -3
app.py CHANGED
@@ -1,7 +1,194 @@
1
- from whisperplus.app import youtube_url_to_text_app, speaker_diarization_app
2
-
3
  import gradio as gr
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
 
7
  gradio_app = gr.Blocks()
@@ -27,4 +214,4 @@ with gradio_app:
27
  speaker_diarization_app()
28
 
29
  gradio_app.queue()
30
- gradio_app.launch(debug=True)
 
 
 
1
  import gradio as gr
2
 
3
+ from whisperplus.pipelines.whisper import SpeechToTextPipeline
4
+ from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
5
+ from whisperplus.utils.download_utils import download_and_convert_to_mp3
6
+ from whisperplus.utils.text_utils import format_speech_to_dialogue
7
+
8
+
9
def youtube_url_to_text(url, model_id, language_choice):
    """
    Download a video, convert it to MP3, and run speech-to-text on it.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the speech-to-text model to use.
        language_choice (str): The language choice for the speech-to-text conversion.

    Returns:
        transcript (str): The transcript produced by the speech-to-text pipeline.
        video_path (str): The path of the downloaded/converted audio file.
    """
    audio_file = download_and_convert_to_mp3(url)
    stt_pipeline = SpeechToTextPipeline(model_id)
    # model_id is passed both to the constructor and the call, mirroring the
    # pipeline's expected invocation.
    text = stt_pipeline(audio_path=audio_file, model_id=model_id, language=language_choice)

    return text, audio_file
28
+
29
+
30
def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
    """
    Download a video, convert it to MP3, and run speaker-diarized speech-to-text.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the ASR (whisper) model to use.
        device (str): Compute device for the pipeline ("cpu", "cuda" or "mps").
        num_speakers (int): Expected number of speakers in the audio.
        min_speaker (int): Minimum number of speakers to consider.
        max_speaker (int): Maximum number of speakers to consider.

    Returns:
        dialogue (str): The diarized transcript formatted as a dialogue.
        audio_path (str): The path of the downloaded/converted audio file.
    """
    # NOTE(review): pyannote/speaker-diarization is a gated model that normally
    # needs an auth token; confirm use_auth_token=False works in deployment.
    pipeline = ASRDiarizationPipeline.from_pretrained(
        asr_model=model_id,
        diarizer_model="pyannote/speaker-diarization",
        use_auth_token=False,
        chunk_length_s=30,
        device=device,
    )

    audio_path = download_and_convert_to_mp3(url)
    output_text = pipeline(
        audio_path, num_speakers=num_speakers, min_speaker=min_speaker, max_speaker=max_speaker)
    dialogue = format_speech_to_dialogue(output_text)
    return dialogue, audio_path
58
+
59
+
60
def youtube_url_to_text_app():
    """Build the Gradio UI for YouTube-URL speech-to-text transcription."""
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                url_box = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

                language_selector = gr.Dropdown(
                    choices=[
                        "English",
                        "Turkish",
                        "Spanish",
                        "French",
                        "Chinese",
                        "Japanese",
                        "Korean",
                    ],
                    value="Turkish",
                    label="Language",
                )
                model_selector = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "openai/whisper-large",
                        "openai/whisper-medium",
                        "openai/whisper-base",
                        "openai/whisper-small",
                        "openai/whisper-tiny",
                    ],
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )
                run_button = gr.Button(value="Generator")

            with gr.Column():
                transcript_box = gr.Textbox(label="Output Text")
                audio_player = gr.Audio(label="Output Audio")

        # Wire the button to the transcription function.
        run_button.click(
            fn=youtube_url_to_text,
            inputs=[
                url_box,
                model_selector,
                language_selector,
            ],
            outputs=[transcript_box, audio_player],
        )
        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/watch?v=di3rHkEZuUw",
                    "openai/whisper-large-v3",
                    "English",
                ],
            ],
            fn=youtube_url_to_text,
            inputs=[
                url_box,
                model_selector,
                language_selector,
            ],
            outputs=[transcript_box, audio_player],
            cache_examples=True,
        )
123
+
124
+
125
def speaker_diarization_app():
    """Build the Gradio UI for speaker-diarized transcription of a YouTube video."""
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                url_box = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

                model_selector = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "openai/whisper-large",
                        "openai/whisper-medium",
                        "openai/whisper-base",
                        "openai/whisper-small",
                        "openai/whisper-tiny",
                    ],
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )
                device_selector = gr.Dropdown(
                    choices=["cpu", "cuda", "mps"],
                    value="cuda",
                    label="Device",
                )
                speaker_count = gr.Number(value=2, label="Number of Speakers")
                speaker_min = gr.Number(value=1, label="Minimum Number of Speakers")
                speaker_max = gr.Number(value=2, label="Maximum Number of Speakers")
                run_button = gr.Button(value="Generator")

            with gr.Column():
                dialogue_box = gr.Textbox(label="Output Text")
                audio_player = gr.Audio(label="Output Audio")

        # Wire the button to the diarization function.
        run_button.click(
            fn=speaker_diarization,
            inputs=[
                url_box,
                model_selector,
                device_selector,
                speaker_count,
                speaker_min,
                speaker_max,
            ],
            outputs=[dialogue_box, audio_player],
        )
        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/shorts/o8PgLUgte2k",
                    "openai/whisper-large-v3",
                    "cuda",
                    2,
                    1,
                    2,
                ],
            ],
            fn=speaker_diarization,
            inputs=[
                url_box,
                model_selector,
                device_selector,
                speaker_count,
                speaker_min,
                speaker_max,
            ],
            outputs=[dialogue_box, audio_player],
            cache_examples=True,
        )
192
 
193
 
194
  gradio_app = gr.Blocks()
 
214
  speaker_diarization_app()
215
 
216
  gradio_app.queue()
217
+ gradio_app.launch(debug=True)