kadirnar committed
Commit 70814d8
1 parent: 2ec3c3e

Update app.py

Files changed (1)
  1. app.py +122 -65
app.py CHANGED
@@ -1,71 +1,10 @@
 import gradio as gr
 
+from whisperplus.pipelines.whisper import SpeechToTextPipeline
+from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
 from whisperplus.utils.download_utils import download_and_convert_to_mp3
+from whisperplus.utils.text_utils import format_speech_to_dialogue
 
-import logging
-
-import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
-
-class SpeechToTextPipeline:
-    """Class for converting audio to text using a pre-trained speech recognition model."""
-
-    def __init__(self, model_id: str = "openai/whisper-large-v3"):
-        self.model = None
-        self.device = None
-
-        if self.model is None:
-            self.load_model(model_id)
-        else:
-            logging.info("Model already loaded.")
-
-    def load_model(self, model_id: str = "openai/whisper-large-v3"):
-        """
-        Loads the pre-trained speech recognition model and moves it to the specified device.
-
-        Args:
-            model_id (str): Identifier of the pre-trained model to be loaded.
-        """
-        logging.info("Loading model...")
-        model = AutoModelForSpeechSeq2Seq.from_pretrained(
-            model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)
-        model.to(self.device)
-        logging.info("Model loaded successfully.")
-
-        self.model = model
-
-    def __call__(self, audio_path: str, model_id: str = "openai/whisper-large-v3", language: str = "turkish"):
-        """
-        Converts audio to text using the pre-trained speech recognition model.
-
-        Args:
-            audio_path (str): Path to the audio file to be transcribed.
-            model_id (str): Identifier of the pre-trained model to be used for transcription.
-
-        Returns:
-            str: Transcribed text from the audio.
-        """
-        processor = AutoProcessor.from_pretrained(model_id)
-        pipe = pipeline(
-            "automatic-speech-recognition",
-            model=self.model,
-            torch_dtype=torch.float16,
-            chunk_length_s=30,
-            max_new_tokens=128,
-            batch_size=24,
-            return_timestamps=True,
-            device="cuda",
-            tokenizer=processor.tokenizer,
-            feature_extractor=processor.feature_extractor,
-            model_kwargs={"use_flash_attention_2": True},
-            generate_kwargs={"language": language},
-        )
-        logging.info("Transcribing audio...")
-        result = pipe(audio_path)["text"]
-        return result
 
 def youtube_url_to_text(url, model_id, language_choice):
     """
@@ -88,6 +27,36 @@ def youtube_url_to_text(url, model_id, language_choice):
     return transcript, video_path
 
 
+def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
+    """
+    Main function that downloads and converts a video to MP3 format, performs speech-to-text conversion using
+    a specified model, and returns the transcript along with the video path.
+
+    Args:
+        url (str): The URL of the video to download and convert.
+        model_id (str): The ID of the speech-to-text model to use.
+        language_choice (str): The language choice for the speech-to-text conversion.
+
+    Returns:
+        transcript (str): The transcript of the speech-to-text conversion.
+        video_path (str): The path of the downloaded video.
+    """
+
+    pipeline = ASRDiarizationPipeline.from_pretrained(
+        asr_model=model_id,
+        diarizer_model="pyannote/speaker-diarization",
+        use_auth_token="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",
+        chunk_length_s=30,
+        device=device,
+    )
+
+    audio_path = download_and_convert_to_mp3(url)
+    output_text = pipeline(
+        audio_path, num_speakers=num_speakers, min_speaker=min_speaker, max_speaker=max_speaker)
+    dialogue = format_speech_to_dialogue(output_text)
+    return dialogue, audio_path
+
+
 def youtube_url_to_text_app():
     with gr.Blocks():
         with gr.Row():
@@ -134,6 +103,92 @@ def youtube_url_to_text_app():
             ],
             outputs=[output_text, output_audio],
         )
+        gr.Examples(
+            examples=[
+                [
+                    "https://www.youtube.com/watch?v=di3rHkEZuUw",
+                    "openai/whisper-large-v3",
+                    "English",
+                ],
+            ],
+            fn=youtube_url_to_text,
+            inputs=[
+                youtube_url_path,
+                whisper_model_id,
+                language_choice,
+            ],
+            outputs=[output_text, output_audio],
+            cache_examples=True,
+        )
+
+
+def speaker_diarization_app():
+    with gr.Blocks():
+        with gr.Row():
+            with gr.Column():
+                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
+
+                whisper_model_id = gr.Dropdown(
+                    choices=[
+                        "openai/whisper-large-v3",
+                        "openai/whisper-large",
+                        "openai/whisper-medium",
+                        "openai/whisper-base",
+                        "openai/whisper-small",
+                        "openai/whisper-tiny",
+                    ],
+                    value="openai/whisper-large-v3",
+                    label="Whisper Model",
+                )
+                device = gr.Dropdown(
+                    choices=["cpu", "cuda", "mps"],
+                    value="cuda",
+                    label="Device",
+                )
+                num_speakers = gr.Number(value=2, label="Number of Speakers")
+                min_speaker = gr.Number(value=1, label="Minimum Number of Speakers")
+                max_speaker = gr.Number(value=2, label="Maximum Number of Speakers")
+                whisperplus_in_predict = gr.Button(value="Generator")
+
+            with gr.Column():
+                output_text = gr.Textbox(label="Output Text")
+                output_audio = gr.Audio(label="Output Audio")
+
+        whisperplus_in_predict.click(
+            fn=speaker_diarization,
+            inputs=[
+                youtube_url_path,
+                whisper_model_id,
+                device,
+                num_speakers,
+                min_speaker,
+                max_speaker,
+            ],
+            outputs=[output_text, output_audio],
+        )
+        gr.Examples(
+            examples=[
+                [
+                    "https://www.youtube.com/shorts/o8PgLUgte2k",
+                    "openai/whisper-large-v3",
+                    "mps",
+                    2,
+                    1,
+                    2,
+                ],
+            ],
+            fn=speaker_diarization,
+            inputs=[
+                youtube_url_path,
+                whisper_model_id,
+                device,
+                num_speakers,
+                min_speaker,
+                max_speaker,
+            ],
+            outputs=[output_text, output_audio],
+            cache_examples=True,
+        )
 
 
 gradio_app = gr.Blocks()
@@ -155,6 +210,8 @@ with gradio_app:
         with gr.Column():
             with gr.Tab(label="Youtube URL to Text"):
                 youtube_url_to_text_app()
+            with gr.Tab(label="Speaker Diarization"):
+                speaker_diarization_app()
 
 gradio_app.queue()
-gradio_app.launch(debug=True)
+gradio_app.launch(debug=True)
 
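For reference, the diarization path added in this commit can also be exercised outside the Gradio UI. The following is a minimal sketch reusing the same WhisperPlus calls introduced above; the "hf_..." token is a placeholder rather than the committed value (supply your own Hugging Face token with access to pyannote/speaker-diarization), and the URL is the example cached in the new tab:

from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
from whisperplus.utils.download_utils import download_and_convert_to_mp3
from whisperplus.utils.text_utils import format_speech_to_dialogue

# Build the combined ASR + diarization pipeline, as speaker_diarization() does.
pipeline = ASRDiarizationPipeline.from_pretrained(
    asr_model="openai/whisper-large-v3",
    diarizer_model="pyannote/speaker-diarization",
    use_auth_token="hf_...",  # placeholder: use your own token, not the committed one
    chunk_length_s=30,
    device="cuda",
)

# Download the audio, transcribe with speaker labels, and format as a dialogue.
audio_path = download_and_convert_to_mp3("https://www.youtube.com/shorts/o8PgLUgte2k")
output_text = pipeline(audio_path, num_speakers=2, min_speaker=1, max_speaker=2)
print(format_speech_to_dialogue(output_text))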