Spaces:
Running
Running
try live=True
Browse files
app.py
CHANGED
@@ -1,28 +1,21 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import time
|
4 |
-
import sys
|
5 |
-
import subprocess
|
6 |
import tempfile
|
7 |
import requests
|
8 |
from urllib.parse import urlparse
|
9 |
from pydub import AudioSegment
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
12 |
-
# (we should be able to do this in build.sh in a hf space)
|
13 |
-
try:
|
14 |
-
subprocess.run(["git", "clone", "https://github.com/SYSTRAN/faster-whisper.git"], check=True)
|
15 |
-
subprocess.run(["pip", "install", "-e", "./faster-whisper"], check=True)
|
16 |
-
except subprocess.CalledProcessError as e:
|
17 |
-
print(f"Error during faster-whisper installation: {e}")
|
18 |
-
sys.exit(1)
|
19 |
|
20 |
-
# Add the faster-whisper directory to the Python path
|
21 |
sys.path.append("./faster-whisper")
|
|
|
22 |
|
23 |
-
|
24 |
-
from faster_whisper.transcribe import BatchedInferencePipeline
|
25 |
-
import yt_dlp
|
26 |
|
27 |
def download_audio(url, method_choice):
|
28 |
parsed_url = urlparse(url)
|
@@ -31,7 +24,6 @@ def download_audio(url, method_choice):
|
|
31 |
else:
|
32 |
return download_direct_audio(url, method_choice)
|
33 |
|
34 |
-
# Additional YouTube download methods
|
35 |
def download_youtube_audio(url, method_choice):
|
36 |
methods = {
|
37 |
'yt-dlp': youtube_dl_method,
|
@@ -41,13 +33,12 @@ def download_youtube_audio(url, method_choice):
|
|
41 |
'ffmpeg': ffmpeg_method,
|
42 |
'aria2': aria2_method
|
43 |
}
|
44 |
-
|
45 |
method = methods.get(method_choice, youtube_dl_method)
|
46 |
-
|
47 |
try:
|
48 |
return method(url)
|
49 |
except Exception as e:
|
50 |
-
|
|
|
51 |
|
52 |
def youtube_dl_method(url):
|
53 |
ydl_opts = {
|
@@ -74,7 +65,6 @@ def pytube_method(url):
|
|
74 |
return new_file
|
75 |
|
76 |
def youtube_dl_classic_method(url):
|
77 |
-
# Classic youtube-dl method
|
78 |
ydl_opts = {
|
79 |
'format': 'bestaudio/best',
|
80 |
'postprocessors': [{
|
@@ -131,8 +121,9 @@ def download_direct_audio(url, method_choice):
|
|
131 |
else:
|
132 |
raise Exception(f"Failed to download audio from {url}")
|
133 |
except Exception as e:
|
134 |
-
|
135 |
-
|
|
|
136 |
def wget_method(url):
|
137 |
output_file = tempfile.mktemp(suffix='.mp3')
|
138 |
command = ['wget', '-O', output_file, url]
|
@@ -140,44 +131,43 @@ def wget_method(url):
|
|
140 |
return output_file
|
141 |
|
142 |
def trim_audio(audio_path, start_time, end_time):
|
143 |
-
audio = AudioSegment.
|
144 |
trimmed_audio = audio[start_time*1000:end_time*1000] if end_time else audio[start_time*1000:]
|
145 |
-
trimmed_audio_path = tempfile.mktemp(suffix='.
|
146 |
-
trimmed_audio.export(trimmed_audio_path, format="
|
147 |
return trimmed_audio_path
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
def transcribe_audio(input_source, batch_size, download_method, start_time=None, end_time=None, verbose=False):
|
150 |
try:
|
151 |
-
# Initialize the model
|
152 |
model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
|
153 |
batched_model = BatchedInferencePipeline(model=model)
|
154 |
|
155 |
-
# Handle input source
|
156 |
if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
|
157 |
-
# It's a URL, download the audio
|
158 |
audio_path = download_audio(input_source, download_method)
|
159 |
if audio_path.startswith("Error"):
|
160 |
yield f"Error: {audio_path}", "", None
|
161 |
return
|
162 |
else:
|
163 |
-
# It's a local file path
|
164 |
audio_path = input_source
|
165 |
|
166 |
-
# Trim the audio if start_time or end_time is specified
|
167 |
if start_time is not None or end_time is not None:
|
168 |
trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
|
169 |
audio_path = trimmed_audio_path
|
170 |
|
171 |
-
# Benchmark transcription time
|
172 |
start_time_perf = time.time()
|
173 |
segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
|
174 |
end_time_perf = time.time()
|
175 |
|
176 |
-
# Show initial metrics as soon as possible
|
177 |
transcription_time = end_time_perf - start_time_perf
|
178 |
real_time_factor = info.duration / transcription_time
|
179 |
-
audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
|
180 |
-
|
181 |
metrics_output = (
|
182 |
f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
|
183 |
f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
|
@@ -191,15 +181,13 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
|
|
191 |
|
192 |
transcription = ""
|
193 |
|
194 |
-
# Stream transcription output gradually
|
195 |
for segment in segments:
|
196 |
transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
|
197 |
transcription += transcription_segment
|
198 |
|
199 |
-
if verbose:
|
200 |
yield metrics_output, transcription, None
|
201 |
|
202 |
-
# Final output with download option
|
203 |
transcription_file = save_transcription(transcription)
|
204 |
yield metrics_output, transcription, transcription_file
|
205 |
|
@@ -207,7 +195,6 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
|
|
207 |
yield f"An error occurred: {str(e)}", "", None
|
208 |
|
209 |
finally:
|
210 |
-
# Clean up downloaded and trimmed files
|
211 |
if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
|
212 |
try:
|
213 |
os.remove(audio_path)
|
@@ -219,17 +206,10 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
|
|
219 |
except:
|
220 |
pass
|
221 |
|
222 |
-
def save_transcription(transcription):
|
223 |
-
file_path = tempfile.mktemp(suffix='.txt')
|
224 |
-
with open(file_path, 'w') as f:
|
225 |
-
f.write(transcription)
|
226 |
-
return file_path
|
227 |
-
|
228 |
-
# Gradio interface
|
229 |
iface = gr.Interface(
|
230 |
fn=transcribe_audio,
|
231 |
inputs=[
|
232 |
-
gr.Textbox(label="Audio Source (Upload,
|
233 |
gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
|
234 |
gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
|
235 |
gr.Number(label="Start Time (seconds)", value=0),
|
@@ -237,18 +217,19 @@ iface = gr.Interface(
|
|
237 |
gr.Checkbox(label="Verbose Output", value=False)
|
238 |
],
|
239 |
outputs=[
|
240 |
-
gr.Textbox(label="Transcription Metrics and Verbose Messages"),
|
241 |
-
gr.Textbox(label="Transcription"),
|
242 |
-
gr.File(label="Download Transcription")
|
243 |
],
|
244 |
-
title="
|
245 |
-
description="
|
246 |
examples=[
|
247 |
["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp", 0, None, False],
|
248 |
["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg", 0, 300, True],
|
249 |
["path/to/local/audio.mp3", 16, "yt-dlp", 60, 180, False]
|
250 |
],
|
251 |
-
cache_examples=False
|
|
|
252 |
)
|
253 |
|
254 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import time
|
|
|
|
|
4 |
import tempfile
|
5 |
import requests
|
6 |
from urllib.parse import urlparse
|
7 |
from pydub import AudioSegment
|
8 |
+
import logging
|
9 |
+
import torch
|
10 |
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
11 |
+
import yt_dlp
|
12 |
|
13 |
+
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
|
|
15 |
# The top-level `import sys` is no longer present in this revision, yet this
# statement still uses `sys`; import it here so module import does not fail
# with NameError.
import sys

# Make a local ./faster-whisper checkout importable (harmless if it is absent).
sys.path.append("./faster-whisper")
|
16 |
+
from faster_whisper import WhisperModel, BatchedInferencePipeline
|
17 |
|
18 |
+
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
19 |
|
20 |
def download_audio(url, method_choice):
|
21 |
parsed_url = urlparse(url)
|
|
|
24 |
else:
|
25 |
return download_direct_audio(url, method_choice)
|
26 |
|
|
|
27 |
def download_youtube_audio(url, method_choice):
|
28 |
methods = {
|
29 |
'yt-dlp': youtube_dl_method,
|
|
|
33 |
'ffmpeg': ffmpeg_method,
|
34 |
'aria2': aria2_method
|
35 |
}
|
|
|
36 |
method = methods.get(method_choice, youtube_dl_method)
|
|
|
37 |
try:
|
38 |
return method(url)
|
39 |
except Exception as e:
|
40 |
+
logging.error(f"Error downloading using {method_choice}: {str(e)}")
|
41 |
+
return None
|
42 |
|
43 |
def youtube_dl_method(url):
|
44 |
ydl_opts = {
|
|
|
65 |
return new_file
|
66 |
|
67 |
def youtube_dl_classic_method(url):
|
|
|
68 |
ydl_opts = {
|
69 |
'format': 'bestaudio/best',
|
70 |
'postprocessors': [{
|
|
|
121 |
else:
|
122 |
raise Exception(f"Failed to download audio from {url}")
|
123 |
except Exception as e:
|
124 |
+
logging.error(f"Error downloading direct audio: {str(e)}")
|
125 |
+
return None
|
126 |
+
|
127 |
def wget_method(url):
|
128 |
output_file = tempfile.mktemp(suffix='.mp3')
|
129 |
command = ['wget', '-O', output_file, url]
|
|
|
131 |
return output_file
|
132 |
|
133 |
def trim_audio(audio_path, start_time, end_time):
    """Cut a time window out of an audio file and save it as a temporary WAV.

    Args:
        audio_path: Path of the audio file to load.
        start_time: Window start, in seconds.
        end_time: Window end, in seconds. A falsy value (``None`` or ``0``)
            keeps everything from ``start_time`` to the end of the audio.

    Returns:
        Path of the newly written ``.wav`` file. The file is not removed
        automatically; the caller is responsible for deleting it.
    """
    audio = AudioSegment.from_file(audio_path)

    # Slice bounds are passed in milliseconds, hence the * 1000.
    start_ms = start_time * 1000
    if end_time:
        trimmed_audio = audio[start_ms:end_time * 1000]
    else:
        trimmed_audio = audio[start_ms:]

    # tempfile.mktemp() only invents a name and is deprecated because another
    # process can claim that name first. NamedTemporaryFile creates the file
    # atomically; it is closed before export so the path can be reopened on
    # every platform.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        trimmed_audio_path = tmp.name
    trimmed_audio.export(trimmed_audio_path, format="wav")
    return trimmed_audio_path
|
139 |
|
140 |
+
def save_transcription(transcription):
    """Write the transcription text to a new temporary ``.txt`` file.

    Args:
        transcription: Full transcript text to persist.

    Returns:
        Path of the created file. The file is not deleted automatically;
        the caller decides when to remove it.
    """
    # NamedTemporaryFile creates the file atomically, unlike the deprecated
    # tempfile.mktemp(), which only returns a name and is open to races.
    # UTF-8 is requested explicitly so non-ASCII transcripts do not depend on
    # the platform's default encoding.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', encoding='utf-8', delete=False
    ) as f:
        f.write(transcription)
        file_path = f.name
    return file_path
|
145 |
+
|
146 |
def transcribe_audio(input_source, batch_size, download_method, start_time=None, end_time=None, verbose=False):
|
147 |
try:
|
|
|
148 |
model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
|
149 |
batched_model = BatchedInferencePipeline(model=model)
|
150 |
|
|
|
151 |
if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
|
|
|
152 |
audio_path = download_audio(input_source, download_method)
|
153 |
if audio_path.startswith("Error"):
|
154 |
yield f"Error: {audio_path}", "", None
|
155 |
return
|
156 |
else:
|
|
|
157 |
audio_path = input_source
|
158 |
|
|
|
159 |
if start_time is not None or end_time is not None:
|
160 |
trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
|
161 |
audio_path = trimmed_audio_path
|
162 |
|
|
|
163 |
start_time_perf = time.time()
|
164 |
segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
|
165 |
end_time_perf = time.time()
|
166 |
|
|
|
167 |
transcription_time = end_time_perf - start_time_perf
|
168 |
real_time_factor = info.duration / transcription_time
|
169 |
+
audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
|
170 |
+
|
171 |
metrics_output = (
|
172 |
f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
|
173 |
f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
|
|
|
181 |
|
182 |
transcription = ""
|
183 |
|
|
|
184 |
for segment in segments:
|
185 |
transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
|
186 |
transcription += transcription_segment
|
187 |
|
188 |
+
if verbose:
|
189 |
yield metrics_output, transcription, None
|
190 |
|
|
|
191 |
transcription_file = save_transcription(transcription)
|
192 |
yield metrics_output, transcription, transcription_file
|
193 |
|
|
|
195 |
yield f"An error occurred: {str(e)}", "", None
|
196 |
|
197 |
finally:
|
|
|
198 |
if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
|
199 |
try:
|
200 |
os.remove(audio_path)
|
|
|
206 |
except:
|
207 |
pass
|
208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
iface = gr.Interface(
|
210 |
fn=transcribe_audio,
|
211 |
inputs=[
|
212 |
+
gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
|
213 |
gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
|
214 |
gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
|
215 |
gr.Number(label="Start Time (seconds)", value=0),
|
|
|
217 |
gr.Checkbox(label="Verbose Output", value=False)
|
218 |
],
|
219 |
outputs=[
|
220 |
+
gr.Textbox(label="Transcription Metrics and Verbose Messages", lines=10),
|
221 |
+
gr.Textbox(label="Transcription", lines=10),
|
222 |
+
gr.File(label="Download Transcription")
|
223 |
],
|
224 |
+
title="Multi-Model Transcription",
|
225 |
+
description="Transcribe audio using with Whisper.",
|
226 |
examples=[
|
227 |
["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp", 0, None, False],
|
228 |
["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg", 0, 300, True],
|
229 |
["path/to/local/audio.mp3", 16, "yt-dlp", 60, 180, False]
|
230 |
],
|
231 |
+
cache_examples=False,
|
232 |
+
live=True
|
233 |
)
|
234 |
|
235 |
iface.launch()
|