Spaces:
Sleeping
Sleeping
try live=True
Browse files
app.py
CHANGED
|
@@ -1,28 +1,21 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
import time
|
| 4 |
-
import sys
|
| 5 |
-
import subprocess
|
| 6 |
import tempfile
|
| 7 |
import requests
|
| 8 |
from urllib.parse import urlparse
|
| 9 |
from pydub import AudioSegment
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
| 12 |
-
# (we should be able to do this in build.sh in a hf space)
|
| 13 |
-
try:
|
| 14 |
-
subprocess.run(["git", "clone", "https://github.com/SYSTRAN/faster-whisper.git"], check=True)
|
| 15 |
-
subprocess.run(["pip", "install", "-e", "./faster-whisper"], check=True)
|
| 16 |
-
except subprocess.CalledProcessError as e:
|
| 17 |
-
print(f"Error during faster-whisper installation: {e}")
|
| 18 |
-
sys.exit(1)
|
| 19 |
|
| 20 |
-
# Add the faster-whisper directory to the Python path
|
| 21 |
sys.path.append("./faster-whisper")
|
|
|
|
| 22 |
|
| 23 |
-
|
| 24 |
-
from faster_whisper.transcribe import BatchedInferencePipeline
|
| 25 |
-
import yt_dlp
|
| 26 |
|
| 27 |
def download_audio(url, method_choice):
|
| 28 |
parsed_url = urlparse(url)
|
|
@@ -31,7 +24,6 @@ def download_audio(url, method_choice):
|
|
| 31 |
else:
|
| 32 |
return download_direct_audio(url, method_choice)
|
| 33 |
|
| 34 |
-
# Additional YouTube download methods
|
| 35 |
def download_youtube_audio(url, method_choice):
|
| 36 |
methods = {
|
| 37 |
'yt-dlp': youtube_dl_method,
|
|
@@ -41,13 +33,12 @@ def download_youtube_audio(url, method_choice):
|
|
| 41 |
'ffmpeg': ffmpeg_method,
|
| 42 |
'aria2': aria2_method
|
| 43 |
}
|
| 44 |
-
|
| 45 |
method = methods.get(method_choice, youtube_dl_method)
|
| 46 |
-
|
| 47 |
try:
|
| 48 |
return method(url)
|
| 49 |
except Exception as e:
|
| 50 |
-
|
|
|
|
| 51 |
|
| 52 |
def youtube_dl_method(url):
|
| 53 |
ydl_opts = {
|
|
@@ -74,7 +65,6 @@ def pytube_method(url):
|
|
| 74 |
return new_file
|
| 75 |
|
| 76 |
def youtube_dl_classic_method(url):
|
| 77 |
-
# Classic youtube-dl method
|
| 78 |
ydl_opts = {
|
| 79 |
'format': 'bestaudio/best',
|
| 80 |
'postprocessors': [{
|
|
@@ -131,8 +121,9 @@ def download_direct_audio(url, method_choice):
|
|
| 131 |
else:
|
| 132 |
raise Exception(f"Failed to download audio from {url}")
|
| 133 |
except Exception as e:
|
| 134 |
-
|
| 135 |
-
|
|
|
|
| 136 |
def wget_method(url):
|
| 137 |
output_file = tempfile.mktemp(suffix='.mp3')
|
| 138 |
command = ['wget', '-O', output_file, url]
|
|
@@ -140,44 +131,43 @@ def wget_method(url):
|
|
| 140 |
return output_file
|
| 141 |
|
| 142 |
def trim_audio(audio_path, start_time, end_time):
|
| 143 |
-
audio = AudioSegment.
|
| 144 |
trimmed_audio = audio[start_time*1000:end_time*1000] if end_time else audio[start_time*1000:]
|
| 145 |
-
trimmed_audio_path = tempfile.mktemp(suffix='.
|
| 146 |
-
trimmed_audio.export(trimmed_audio_path, format="
|
| 147 |
return trimmed_audio_path
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
def transcribe_audio(input_source, batch_size, download_method, start_time=None, end_time=None, verbose=False):
|
| 150 |
try:
|
| 151 |
-
# Initialize the model
|
| 152 |
model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
|
| 153 |
batched_model = BatchedInferencePipeline(model=model)
|
| 154 |
|
| 155 |
-
# Handle input source
|
| 156 |
if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
|
| 157 |
-
# It's a URL, download the audio
|
| 158 |
audio_path = download_audio(input_source, download_method)
|
| 159 |
if audio_path.startswith("Error"):
|
| 160 |
yield f"Error: {audio_path}", "", None
|
| 161 |
return
|
| 162 |
else:
|
| 163 |
-
# It's a local file path
|
| 164 |
audio_path = input_source
|
| 165 |
|
| 166 |
-
# Trim the audio if start_time or end_time is specified
|
| 167 |
if start_time is not None or end_time is not None:
|
| 168 |
trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
|
| 169 |
audio_path = trimmed_audio_path
|
| 170 |
|
| 171 |
-
# Benchmark transcription time
|
| 172 |
start_time_perf = time.time()
|
| 173 |
segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
|
| 174 |
end_time_perf = time.time()
|
| 175 |
|
| 176 |
-
# Show initial metrics as soon as possible
|
| 177 |
transcription_time = end_time_perf - start_time_perf
|
| 178 |
real_time_factor = info.duration / transcription_time
|
| 179 |
-
audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
|
| 180 |
-
|
| 181 |
metrics_output = (
|
| 182 |
f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
|
| 183 |
f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
|
|
@@ -191,15 +181,13 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
|
|
| 191 |
|
| 192 |
transcription = ""
|
| 193 |
|
| 194 |
-
# Stream transcription output gradually
|
| 195 |
for segment in segments:
|
| 196 |
transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
|
| 197 |
transcription += transcription_segment
|
| 198 |
|
| 199 |
-
if verbose:
|
| 200 |
yield metrics_output, transcription, None
|
| 201 |
|
| 202 |
-
# Final output with download option
|
| 203 |
transcription_file = save_transcription(transcription)
|
| 204 |
yield metrics_output, transcription, transcription_file
|
| 205 |
|
|
@@ -207,7 +195,6 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
|
|
| 207 |
yield f"An error occurred: {str(e)}", "", None
|
| 208 |
|
| 209 |
finally:
|
| 210 |
-
# Clean up downloaded and trimmed files
|
| 211 |
if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
|
| 212 |
try:
|
| 213 |
os.remove(audio_path)
|
|
@@ -219,17 +206,10 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
|
|
| 219 |
except:
|
| 220 |
pass
|
| 221 |
|
| 222 |
-
def save_transcription(transcription):
|
| 223 |
-
file_path = tempfile.mktemp(suffix='.txt')
|
| 224 |
-
with open(file_path, 'w') as f:
|
| 225 |
-
f.write(transcription)
|
| 226 |
-
return file_path
|
| 227 |
-
|
| 228 |
-
# Gradio interface
|
| 229 |
iface = gr.Interface(
|
| 230 |
fn=transcribe_audio,
|
| 231 |
inputs=[
|
| 232 |
-
gr.Textbox(label="Audio Source (Upload,
|
| 233 |
gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
|
| 234 |
gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
|
| 235 |
gr.Number(label="Start Time (seconds)", value=0),
|
|
@@ -237,18 +217,19 @@ iface = gr.Interface(
|
|
| 237 |
gr.Checkbox(label="Verbose Output", value=False)
|
| 238 |
],
|
| 239 |
outputs=[
|
| 240 |
-
gr.Textbox(label="Transcription Metrics and Verbose Messages"),
|
| 241 |
-
gr.Textbox(label="Transcription"),
|
| 242 |
-
gr.File(label="Download Transcription")
|
| 243 |
],
|
| 244 |
-
title="
|
| 245 |
-
description="
|
| 246 |
examples=[
|
| 247 |
["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp", 0, None, False],
|
| 248 |
["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg", 0, 300, True],
|
| 249 |
["path/to/local/audio.mp3", 16, "yt-dlp", 60, 180, False]
|
| 250 |
],
|
| 251 |
-
cache_examples=False
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
iface.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
import time
|
|
|
|
|
|
|
| 4 |
import tempfile
|
| 5 |
import requests
|
| 6 |
from urllib.parse import urlparse
|
| 7 |
from pydub import AudioSegment
|
| 8 |
+
import logging
|
| 9 |
+
import torch
|
| 10 |
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
| 11 |
+
import yt_dlp
|
| 12 |
|
| 13 |
+
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
|
|
|
| 15 |
# `sys` has to be imported here: the top-of-file imports no longer include it,
# so the statement below would otherwise raise NameError when the module loads.
import sys

# Add the faster-whisper directory to the Python path.
sys.path.append("./faster-whisper")
|
| 16 |
+
from faster_whisper import WhisperModel, BatchedInferencePipeline
|
| 17 |
|
| 18 |
+
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
| 19 |
|
| 20 |
def download_audio(url, method_choice):
|
| 21 |
parsed_url = urlparse(url)
|
|
|
|
| 24 |
else:
|
| 25 |
return download_direct_audio(url, method_choice)
|
| 26 |
|
|
|
|
| 27 |
def download_youtube_audio(url, method_choice):
|
| 28 |
methods = {
|
| 29 |
'yt-dlp': youtube_dl_method,
|
|
|
|
| 33 |
'ffmpeg': ffmpeg_method,
|
| 34 |
'aria2': aria2_method
|
| 35 |
}
|
|
|
|
| 36 |
method = methods.get(method_choice, youtube_dl_method)
|
|
|
|
| 37 |
try:
|
| 38 |
return method(url)
|
| 39 |
except Exception as e:
|
| 40 |
+
logging.error(f"Error downloading using {method_choice}: {str(e)}")
|
| 41 |
+
return None
|
| 42 |
|
| 43 |
def youtube_dl_method(url):
|
| 44 |
ydl_opts = {
|
|
|
|
| 65 |
return new_file
|
| 66 |
|
| 67 |
def youtube_dl_classic_method(url):
|
|
|
|
| 68 |
ydl_opts = {
|
| 69 |
'format': 'bestaudio/best',
|
| 70 |
'postprocessors': [{
|
|
|
|
| 121 |
else:
|
| 122 |
raise Exception(f"Failed to download audio from {url}")
|
| 123 |
except Exception as e:
|
| 124 |
+
logging.error(f"Error downloading direct audio: {str(e)}")
|
| 125 |
+
return None
|
| 126 |
+
|
| 127 |
def wget_method(url):
|
| 128 |
output_file = tempfile.mktemp(suffix='.mp3')
|
| 129 |
command = ['wget', '-O', output_file, url]
|
|
|
|
| 131 |
return output_file
|
| 132 |
|
| 133 |
def trim_audio(audio_path, start_time, end_time):
    """Cut a time window out of an audio file and save it as a temporary WAV.

    Args:
        audio_path: Path of the audio file to load.
        start_time: Window start in seconds.
        end_time: Window end in seconds. A falsy value (``None`` or ``0``)
            keeps everything from ``start_time`` to the end of the audio.

    Returns:
        Path of the newly written ``.wav`` file. This function does not
        delete the file.
    """
    audio = AudioSegment.from_file(audio_path)
    # Times in seconds are scaled by 1000 to form the slice bounds.
    trimmed_audio = audio[start_time*1000:end_time*1000] if end_time else audio[start_time*1000:]
    # Create the output file atomically instead of using tempfile.mktemp,
    # which only returns a name and leaves a window for another process to
    # create that path first. delete=False keeps the file after closing.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        trimmed_audio_path = tmp.name
    trimmed_audio.export(trimmed_audio_path, format="wav")
    return trimmed_audio_path
|
| 139 |
|
| 140 |
+
def save_transcription(transcription):
    """Write the transcription text to a new temporary ``.txt`` file.

    Args:
        transcription: Transcript text to persist.

    Returns:
        Path of the created file. This function does not delete the file.
    """
    # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
    # returns a name, so another process could create that path first.
    # UTF-8 is set explicitly so non-ASCII transcripts do not depend on the
    # platform's default encoding.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', delete=False, encoding='utf-8'
    ) as f:
        f.write(transcription)
        file_path = f.name
    return file_path
|
| 145 |
+
|
| 146 |
def transcribe_audio(input_source, batch_size, download_method, start_time=None, end_time=None, verbose=False):
|
| 147 |
try:
|
|
|
|
| 148 |
model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
|
| 149 |
batched_model = BatchedInferencePipeline(model=model)
|
| 150 |
|
|
|
|
| 151 |
if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
|
|
|
|
| 152 |
audio_path = download_audio(input_source, download_method)
|
| 153 |
if audio_path.startswith("Error"):
|
| 154 |
yield f"Error: {audio_path}", "", None
|
| 155 |
return
|
| 156 |
else:
|
|
|
|
| 157 |
audio_path = input_source
|
| 158 |
|
|
|
|
| 159 |
if start_time is not None or end_time is not None:
|
| 160 |
trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
|
| 161 |
audio_path = trimmed_audio_path
|
| 162 |
|
|
|
|
| 163 |
start_time_perf = time.time()
|
| 164 |
segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
|
| 165 |
end_time_perf = time.time()
|
| 166 |
|
|
|
|
| 167 |
transcription_time = end_time_perf - start_time_perf
|
| 168 |
real_time_factor = info.duration / transcription_time
|
| 169 |
+
audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
|
| 170 |
+
|
| 171 |
metrics_output = (
|
| 172 |
f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
|
| 173 |
f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
|
|
|
|
| 181 |
|
| 182 |
transcription = ""
|
| 183 |
|
|
|
|
| 184 |
for segment in segments:
|
| 185 |
transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
|
| 186 |
transcription += transcription_segment
|
| 187 |
|
| 188 |
+
if verbose:
|
| 189 |
yield metrics_output, transcription, None
|
| 190 |
|
|
|
|
| 191 |
transcription_file = save_transcription(transcription)
|
| 192 |
yield metrics_output, transcription, transcription_file
|
| 193 |
|
|
|
|
| 195 |
yield f"An error occurred: {str(e)}", "", None
|
| 196 |
|
| 197 |
finally:
|
|
|
|
| 198 |
if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
|
| 199 |
try:
|
| 200 |
os.remove(audio_path)
|
|
|
|
| 206 |
except:
|
| 207 |
pass
|
| 208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
iface = gr.Interface(
|
| 210 |
fn=transcribe_audio,
|
| 211 |
inputs=[
|
| 212 |
+
gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
|
| 213 |
gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
|
| 214 |
gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
|
| 215 |
gr.Number(label="Start Time (seconds)", value=0),
|
|
|
|
| 217 |
gr.Checkbox(label="Verbose Output", value=False)
|
| 218 |
],
|
| 219 |
outputs=[
|
| 220 |
+
gr.Textbox(label="Transcription Metrics and Verbose Messages", lines=10),
|
| 221 |
+
gr.Textbox(label="Transcription", lines=10),
|
| 222 |
+
gr.File(label="Download Transcription")
|
| 223 |
],
|
| 224 |
+
title="Multi-Model Transcription",
|
| 225 |
+
description="Transcribe audio using with Whisper.",
|
| 226 |
examples=[
|
| 227 |
["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp", 0, None, False],
|
| 228 |
["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg", 0, 300, True],
|
| 229 |
["path/to/local/audio.mp3", 16, "yt-dlp", 60, 180, False]
|
| 230 |
],
|
| 231 |
+
cache_examples=False,
|
| 232 |
+
live=True
|
| 233 |
)
|
| 234 |
|
| 235 |
iface.launch()
|