cstr committed on
Commit
acd8816
·
verified ·
1 Parent(s): 2e6cc31

try live=True

Browse files
Files changed (1) hide show
  1. app.py +32 -51
app.py CHANGED
@@ -1,28 +1,21 @@
1
  import gradio as gr
2
  import os
3
  import time
4
- import sys
5
- import subprocess
6
  import tempfile
7
  import requests
8
  from urllib.parse import urlparse
9
  from pydub import AudioSegment
 
 
 
 
10
 
11
- # Clone and install faster-whisper from GitHub
12
- # (we should be able to do this in build.sh in a hf space)
13
- try:
14
- subprocess.run(["git", "clone", "https://github.com/SYSTRAN/faster-whisper.git"], check=True)
15
- subprocess.run(["pip", "install", "-e", "./faster-whisper"], check=True)
16
- except subprocess.CalledProcessError as e:
17
- print(f"Error during faster-whisper installation: {e}")
18
- sys.exit(1)
19
 
20
- # Add the faster-whisper directory to the Python path
21
  sys.path.append("./faster-whisper")
 
22
 
23
- from faster_whisper import WhisperModel
24
- from faster_whisper.transcribe import BatchedInferencePipeline
25
- import yt_dlp
26
 
27
  def download_audio(url, method_choice):
28
  parsed_url = urlparse(url)
@@ -31,7 +24,6 @@ def download_audio(url, method_choice):
31
  else:
32
  return download_direct_audio(url, method_choice)
33
 
34
- # Additional YouTube download methods
35
  def download_youtube_audio(url, method_choice):
36
  methods = {
37
  'yt-dlp': youtube_dl_method,
@@ -41,13 +33,12 @@ def download_youtube_audio(url, method_choice):
41
  'ffmpeg': ffmpeg_method,
42
  'aria2': aria2_method
43
  }
44
-
45
  method = methods.get(method_choice, youtube_dl_method)
46
-
47
  try:
48
  return method(url)
49
  except Exception as e:
50
- return f"Error downloading using {method_choice}: {str(e)}"
 
51
 
52
  def youtube_dl_method(url):
53
  ydl_opts = {
@@ -74,7 +65,6 @@ def pytube_method(url):
74
  return new_file
75
 
76
  def youtube_dl_classic_method(url):
77
- # Classic youtube-dl method
78
  ydl_opts = {
79
  'format': 'bestaudio/best',
80
  'postprocessors': [{
@@ -131,8 +121,9 @@ def download_direct_audio(url, method_choice):
131
  else:
132
  raise Exception(f"Failed to download audio from {url}")
133
  except Exception as e:
134
- return f"Error downloading direct audio: {str(e)}"
135
-
 
136
  def wget_method(url):
137
  output_file = tempfile.mktemp(suffix='.mp3')
138
  command = ['wget', '-O', output_file, url]
@@ -140,44 +131,43 @@ def wget_method(url):
140
  return output_file
141
 
142
  def trim_audio(audio_path, start_time, end_time):
143
- audio = AudioSegment.from_mp3(audio_path)
144
  trimmed_audio = audio[start_time*1000:end_time*1000] if end_time else audio[start_time*1000:]
145
- trimmed_audio_path = tempfile.mktemp(suffix='.mp3')
146
- trimmed_audio.export(trimmed_audio_path, format="mp3")
147
  return trimmed_audio_path
148
 
 
 
 
 
 
 
149
  def transcribe_audio(input_source, batch_size, download_method, start_time=None, end_time=None, verbose=False):
150
  try:
151
- # Initialize the model
152
  model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
153
  batched_model = BatchedInferencePipeline(model=model)
154
 
155
- # Handle input source
156
  if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
157
- # It's a URL, download the audio
158
  audio_path = download_audio(input_source, download_method)
159
  if audio_path.startswith("Error"):
160
  yield f"Error: {audio_path}", "", None
161
  return
162
  else:
163
- # It's a local file path
164
  audio_path = input_source
165
 
166
- # Trim the audio if start_time or end_time is specified
167
  if start_time is not None or end_time is not None:
168
  trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
169
  audio_path = trimmed_audio_path
170
 
171
- # Benchmark transcription time
172
  start_time_perf = time.time()
173
  segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
174
  end_time_perf = time.time()
175
 
176
- # Show initial metrics as soon as possible
177
  transcription_time = end_time_perf - start_time_perf
178
  real_time_factor = info.duration / transcription_time
179
- audio_file_size = os.path.getsize(audio_path) / (1024 * 1024) # Size in MB
180
-
181
  metrics_output = (
182
  f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
183
  f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
@@ -191,15 +181,13 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
191
 
192
  transcription = ""
193
 
194
- # Stream transcription output gradually
195
  for segment in segments:
196
  transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
197
  transcription += transcription_segment
198
 
199
- if verbose:
200
  yield metrics_output, transcription, None
201
 
202
- # Final output with download option
203
  transcription_file = save_transcription(transcription)
204
  yield metrics_output, transcription, transcription_file
205
 
@@ -207,7 +195,6 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
207
  yield f"An error occurred: {str(e)}", "", None
208
 
209
  finally:
210
- # Clean up downloaded and trimmed files
211
  if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
212
  try:
213
  os.remove(audio_path)
@@ -219,17 +206,10 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
219
  except:
220
  pass
221
 
222
- def save_transcription(transcription):
223
- file_path = tempfile.mktemp(suffix='.txt')
224
- with open(file_path, 'w') as f:
225
- f.write(transcription)
226
- return file_path
227
-
228
- # Gradio interface
229
  iface = gr.Interface(
230
  fn=transcribe_audio,
231
  inputs=[
232
- gr.Textbox(label="Audio Source (Upload, MP3 URL, or YouTube URL)"),
233
  gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
234
  gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
235
  gr.Number(label="Start Time (seconds)", value=0),
@@ -237,18 +217,19 @@ iface = gr.Interface(
237
  gr.Checkbox(label="Verbose Output", value=False)
238
  ],
239
  outputs=[
240
- gr.Textbox(label="Transcription Metrics and Verbose Messages"),
241
- gr.Textbox(label="Transcription"),
242
- gr.File(label="Download Transcription")
243
  ],
244
- title="Faster Whisper Multi-Input Transcription",
245
- description="Enter an audio file path, MP3 URL, or YouTube URL to transcribe using Faster Whisper (GitHub version). Adjust the batch size and choose a download method.",
246
  examples=[
247
  ["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp", 0, None, False],
248
  ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg", 0, 300, True],
249
  ["path/to/local/audio.mp3", 16, "yt-dlp", 60, 180, False]
250
  ],
251
- cache_examples=False # Prevents automatic processing of examples
 
252
  )
253
 
254
  iface.launch()
 
1
  import gradio as gr
2
  import os
3
  import time
 
 
4
  import tempfile
5
  import requests
6
  from urllib.parse import urlparse
7
  from pydub import AudioSegment
8
+ import logging
9
+ import torch
10
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
11
+ import yt_dlp
12
 
13
+ logging.basicConfig(level=logging.INFO)
 
 
 
 
 
 
 
14
 
 
15
  sys.path.append("./faster-whisper")
16
+ from faster_whisper import WhisperModel, BatchedInferencePipeline
17
 
18
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 
19
 
20
  def download_audio(url, method_choice):
21
  parsed_url = urlparse(url)
 
24
  else:
25
  return download_direct_audio(url, method_choice)
26
 
 
27
  def download_youtube_audio(url, method_choice):
28
  methods = {
29
  'yt-dlp': youtube_dl_method,
 
33
  'ffmpeg': ffmpeg_method,
34
  'aria2': aria2_method
35
  }
 
36
  method = methods.get(method_choice, youtube_dl_method)
 
37
  try:
38
  return method(url)
39
  except Exception as e:
40
+ logging.error(f"Error downloading using {method_choice}: {str(e)}")
41
+ return None
42
 
43
  def youtube_dl_method(url):
44
  ydl_opts = {
 
65
  return new_file
66
 
67
  def youtube_dl_classic_method(url):
 
68
  ydl_opts = {
69
  'format': 'bestaudio/best',
70
  'postprocessors': [{
 
121
  else:
122
  raise Exception(f"Failed to download audio from {url}")
123
  except Exception as e:
124
+ logging.error(f"Error downloading direct audio: {str(e)}")
125
+ return None
126
+
127
  def wget_method(url):
128
  output_file = tempfile.mktemp(suffix='.mp3')
129
  command = ['wget', '-O', output_file, url]
 
131
  return output_file
132
 
133
  def trim_audio(audio_path, start_time, end_time):
134
+ audio = AudioSegment.from_file(audio_path)
135
  trimmed_audio = audio[start_time*1000:end_time*1000] if end_time else audio[start_time*1000:]
136
+ trimmed_audio_path = tempfile.mktemp(suffix='.wav')
137
+ trimmed_audio.export(trimmed_audio_path, format="wav")
138
  return trimmed_audio_path
139
 
140
+ def save_transcription(transcription):
141
+ file_path = tempfile.mktemp(suffix='.txt')
142
+ with open(file_path, 'w') as f:
143
+ f.write(transcription)
144
+ return file_path
145
+
146
  def transcribe_audio(input_source, batch_size, download_method, start_time=None, end_time=None, verbose=False):
147
  try:
 
148
  model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
149
  batched_model = BatchedInferencePipeline(model=model)
150
 
 
151
  if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
 
152
  audio_path = download_audio(input_source, download_method)
153
  if audio_path.startswith("Error"):
154
  yield f"Error: {audio_path}", "", None
155
  return
156
  else:
 
157
  audio_path = input_source
158
 
 
159
  if start_time is not None or end_time is not None:
160
  trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
161
  audio_path = trimmed_audio_path
162
 
 
163
  start_time_perf = time.time()
164
  segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
165
  end_time_perf = time.time()
166
 
 
167
  transcription_time = end_time_perf - start_time_perf
168
  real_time_factor = info.duration / transcription_time
169
+ audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
170
+
171
  metrics_output = (
172
  f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
173
  f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
 
181
 
182
  transcription = ""
183
 
 
184
  for segment in segments:
185
  transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
186
  transcription += transcription_segment
187
 
188
+ if verbose:
189
  yield metrics_output, transcription, None
190
 
 
191
  transcription_file = save_transcription(transcription)
192
  yield metrics_output, transcription, transcription_file
193
 
 
195
  yield f"An error occurred: {str(e)}", "", None
196
 
197
  finally:
 
198
  if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
199
  try:
200
  os.remove(audio_path)
 
206
  except:
207
  pass
208
 
 
 
 
 
 
 
 
209
  iface = gr.Interface(
210
  fn=transcribe_audio,
211
  inputs=[
212
+ gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
213
  gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
214
  gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
215
  gr.Number(label="Start Time (seconds)", value=0),
 
217
  gr.Checkbox(label="Verbose Output", value=False)
218
  ],
219
  outputs=[
220
+ gr.Textbox(label="Transcription Metrics and Verbose Messages", lines=10),
221
+ gr.Textbox(label="Transcription", lines=10),
222
+ gr.File(label="Download Transcription")
223
  ],
224
+ title="Multi-Model Transcription",
225
+ description="Transcribe audio using with Whisper.",
226
  examples=[
227
  ["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp", 0, None, False],
228
  ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg", 0, 300, True],
229
  ["path/to/local/audio.mp3", 16, "yt-dlp", 60, 180, False]
230
  ],
231
+ cache_examples=False,
232
+ live=True
233
  )
234
 
235
  iface.launch()