File size: 9,034 Bytes
a3199db 694f93a a3199db 694f93a c5093bb 694f93a ca1f426 694f93a 8f59a41 d8b06bf df609a3 007d6a1 c55c408 ca1f426 df609a3 007d6a1 e35f365 7fdbed5 2b71965 7fdbed5 2b71965 7fdbed5 1851c8f 7fdbed5 2b71965 007d6a1 d8b06bf 2b71965 e35f365 d8b06bf 9703cbc d8b06bf 9703cbc e35f365 86a050b 007d6a1 d8b06bf 55c7d23 9703cbc d8b06bf 9703cbc 694f93a e35f365 86a050b 007d6a1 694f93a d8b06bf 694f93a d2e9f55 fe4ae7f 694f93a d8b06bf 694f93a d8b06bf 694f93a d8b06bf 7fdbed5 694f93a 7fdbed5 d2e9f55 694f93a 8f59a41 694f93a 8f59a41 694f93a 8f59a41 d8b06bf 694f93a d2e9f55 d8b06bf 2b71965 d8b06bf 9703cbc 2b71965 d8b06bf 2b71965 007d6a1 d8b06bf df609a3 d2e9f55 d8b06bf d2e9f55 694f93a c55c408 d8b06bf fe4ae7f d8b06bf 694f93a 8f59a41 2b71965 694f93a d8b06bf 694f93a d8b06bf 2b71965 d8b06bf 694f93a a3199db 7fdbed5 694f93a 3b1a6b5 d8b06bf 694f93a 7fdbed5 c55c408 d2e9f55 694f93a 7fdbed5 694f93a d8b06bf df609a3 a3199db 007d6a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import subprocess
from langdetect import detect_langs
import os
import warnings
from transformers import logging as transformers_logging
import math
import json
import tempfile
import logging
import concurrent.futures
# Configure application-wide logging
logging.basicConfig(level=logging.INFO)
# Suppress generic Python warnings
warnings.filterwarnings("ignore")
# Silence transformers' informational output; only errors are shown
transformers_logging.set_verbosity_error()
# Candidate ASR checkpoints per detected language code.
# The first entry of each list is the suggested default (see combined_interface).
MODELS = {
    "es": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-xlsr-53-spanish",
        "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
    ],
    "en": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-960h",
        "microsoft/wav2vec2-base-960h"
    ],
    "pt": [
        "facebook/wav2vec2-large-xlsr-53-portuguese",
        "openai/whisper-medium",
        "jonatasgrosman/wav2vec2-xlsr-53-portuguese"
    ],
    "fr": [
        "jonatasgrosman/wav2vec2-large-xlsr-53-french"
    ]
}
# Model cache to avoid loading the same checkpoint multiple times
model_cache = {}
def get_model(model_name):
    """Load a checkpoint once and serve it from the module-level cache.

    NOTE(review): every name is loaded through WhisperForConditionalGeneration,
    but MODELS also lists wav2vec2 checkpoints (a CTC architecture, not
    seq2seq) — loading those through this helper will presumably fail.
    Confirm whether wav2vec2 entries need a separate loader/pipeline.
    """
    if model_name not in model_cache:
        model_cache[model_name] = WhisperForConditionalGeneration.from_pretrained(model_name)
    return model_cache[model_name]
# Helper that checks the ffmpeg binary is available before any conversion.
def verify_ffmpeg_installation():
    """Verify that ffmpeg is installed and runnable.

    Raises:
        FileNotFoundError: if the ffmpeg binary is not on PATH.
        subprocess.CalledProcessError: if ffmpeg exists but exits non-zero.
    """
    try:
        subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    except FileNotFoundError as e:
        # BUG FIX: a missing binary raises FileNotFoundError, which the
        # original `except CalledProcessError` never caught, so the error
        # was never logged before propagating.
        logging.error("ffmpeg no est谩 instalado o no se puede ejecutar correctamente.")
        raise e
    except subprocess.CalledProcessError as e:
        logging.error("ffmpeg no est谩 instalado o no se puede ejecutar correctamente.")
        raise e
def convert_audio_to_wav(audio_path):
    """Convert an audio file to 16 kHz mono WAV using ffmpeg.

    Args:
        audio_path: path to the source audio file.
    Returns:
        Path to a temporary .wav file; the caller is responsible for deleting it.
    Raises:
        ValueError: if audio_path is a directory or the conversion fails.
    """
    if os.path.isdir(audio_path):
        raise ValueError(f"La ruta proporcionada es un directorio, no un archivo: {audio_path}")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    # '-y' overwrites the pre-created temp file without prompting.
    command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
    process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stderr_text = process.stderr.decode()
    if process.returncode != 0:
        logging.error(stderr_text)
        # BUG FIX: delete the orphaned temp file before surfacing the error
        # (the original leaked it on every failed conversion).
        try:
            os.remove(wav_path)
        except OSError:
            pass
        raise ValueError(f"Error al convertir el archivo de audio a wav: {stderr_text}")
    # ffmpeg writes its banner/progress to stderr even on success; the
    # original logged that at ERROR level, polluting the log. Use INFO.
    logging.info(process.stdout.decode())
    logging.info(stderr_text)
    return wav_path
def detect_language(audio_path):
    """Detect the spoken language of an audio file.

    Transcribes (at most) the first 30 seconds with whisper-base, then runs
    text-based language detection on the transcript.

    Args:
        audio_path: path to a wav file readable by librosa.
    Returns:
        Two-letter language code (e.g. 'es', 'en', 'pt').
    Raises:
        ValueError: if librosa cannot load the file.
    """
    try:
        speech, _ = librosa.load(audio_path, sr=16000, duration=30)
    except Exception as e:
        raise ValueError(f"Error al cargar el archivo de audio con librosa: {e}")
    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
    model = get_model("openai/whisper-base")
    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    langs = detect_langs(transcription)
    es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
    pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
    # Spanish/Portuguese are easily confused, so prefer 'es' when their scores
    # are close. BUG FIX: only apply the tie-break when at least one of the
    # two actually has support — the original returned 'es' whenever both
    # probabilities were 0 (i.e. for ANY other language, such as English).
    if max(es_confidence, pt_confidence) > 0 and abs(es_confidence - pt_confidence) < 0.2:
        return 'es'
    return max(langs, key=lambda x: x.prob).lang
def transcribe_audio_stream(audio, model_name):
    """Transcribe an audio file in 30-second chunks, yielding progress.

    Args:
        audio: path to the source audio file (any ffmpeg-readable format).
        model_name: HF checkpoint name to transcribe with.
    Yields:
        (transcriptions, progress) where `transcriptions` is the cumulative
        list of {"start_time", "end_time", "text"} dicts and `progress` is a
        percentage in [0, 100].
    """
    wav_audio = convert_audio_to_wav(audio)
    try:
        speech, rate = librosa.load(wav_audio, sr=16000)
        duration = len(speech) / rate
        transcriptions = []
        processor = WhisperProcessor.from_pretrained(model_name)
        model = get_model(model_name)
        chunk_duration = 30  # seconds per chunk
        # BUG FIX: ceil instead of int() — truncation silently dropped a
        # trailing sub-chunk (e.g. the last 0.5 s of a 30.5 s file).
        for i in range(0, math.ceil(duration), chunk_duration):
            end = min(i + chunk_duration, duration)
            chunk = speech[int(i * rate):int(end * rate)]
            input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
            predicted_ids = model.generate(input_features)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            progress = min(100, (end / duration) * 100)
            transcriptions.append({
                "start_time": i,
                "end_time": end,
                "text": transcription
            })
            yield transcriptions, progress
    finally:
        # BUG FIX: remove the temporary wav file (the original leaked one
        # per call).
        try:
            os.remove(wav_audio)
        except OSError:
            pass
def detect_and_select_model(audio):
    """Detect the audio's language and return its candidate models.

    Args:
        audio: path to the source audio file.
    Returns:
        (language, model_options): detected 2-letter code and the list of
        checkpoint names from MODELS (falls back to the English list).
    """
    wav_audio = convert_audio_to_wav(audio)
    try:
        language = detect_language(wav_audio)
    finally:
        # BUG FIX: delete the temporary wav file (the original leaked it).
        try:
            os.remove(wav_audio)
        except OSError:
            pass
    model_options = MODELS.get(language, MODELS["en"])
    return language, model_options
def save_transcription(transcriptions, file_format):
    """Write transcription segments to a temporary file and return its path.

    Args:
        transcriptions: list of {"start_time", "end_time", "text"} dicts.
        file_format: "JSON" or "TXT".
    Returns:
        Path of the written temporary file (caller owns its lifetime).
    Raises:
        ValueError: if file_format is not one of the supported formats.
    """
    if file_format == "JSON":
        # BUG FIX: NamedTemporaryFile defaults to binary mode ('w+b') and
        # json.dump writes str, so the original raised TypeError here.
        # Open in text mode with UTF-8 (ensure_ascii=False emits raw Unicode).
        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8",
                                         delete=False, suffix=".json") as tmp:
            json.dump(transcriptions, tmp, ensure_ascii=False, indent=4)
            file_path = tmp.name
    elif file_format == "TXT":
        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
            for entry in transcriptions:
                tmp.write(f"{entry['start_time']:.2f},{entry['end_time']:.2f},{entry['text']}\n".encode())
            file_path = tmp.name
    else:
        # BUG FIX: the original fell through and raised NameError on
        # file_path for any other format; fail explicitly instead.
        raise ValueError(f"Formato de archivo no soportado: {file_format}")
    logging.info(f"Archivo de transcripci贸n guardado en: {file_path}")
    return file_path
def combined_interface(audio, file_format, confirmed_language, chosen_model):
    """Gradio handler: detect language, stream the transcription, save it.

    Generator whose yields match the 7 declared outputs:
    (language, model options, selected model, transcription text,
     progress %, status message, download file path).
    """
    try:
        logging.info(f"Ruta del archivo de audio subido: {audio}")
        verify_ffmpeg_installation()
        language, model_options = detect_and_select_model(audio)
        # Use the user's confirmed language if provided, else the detected one.
        if not confirmed_language:
            confirmed_language = language
        # Suggest the first model for the language, but honor an explicit choice.
        if not chosen_model:
            chosen_model = model_options[0]
        logging.info(f"Idioma detectado: {confirmed_language}")
        logging.info(f"Modelos disponibles: {model_options}")
        logging.info(f"Modelo seleccionado: {chosen_model}")
        # First yield: None for the seventh output (download file not ready yet).
        yield confirmed_language, model_options, chosen_model, "", 0, "Initializing...", None
        transcriptions = []
        # BUG FIX: initialize so the final yield cannot hit NameError when the
        # stream produces no chunks.
        full_transcription = ""
        for partial_transcriptions, progress in transcribe_audio_stream(audio, chosen_model):
            transcriptions = partial_transcriptions
            full_transcription = " ".join([t["text"] for t in transcriptions])
            progress_int = math.floor(progress)
            status = f"Transcribing... {progress_int}% complete"
            logging.info(f"Progreso: {progress_int}%")
            yield confirmed_language, model_options, chosen_model, full_transcription.strip(), progress_int, status, None
        logging.info("Guardando transcripci贸n.")
        file_path = save_transcription(transcriptions, file_format)
        if os.path.isdir(file_path):
            raise ValueError(f"El archivo de transcripci贸n deber铆a ser un archivo, pero es un directorio: {file_path}")
        if not os.path.isfile(file_path):
            raise ValueError(f"El archivo de transcripci贸n no existe: {file_path}")
        # BUG FIX: the original unconditionally ran
        # os.remove("converted_audio.wav"), but that file no longer exists
        # (temp files are used), so FileNotFoundError dropped execution into
        # the except branch and discarded the completed transcription.
        logging.info("Archivos temporales limpiados.")
        yield confirmed_language, model_options, chosen_model, full_transcription.strip(), 100, "Transcription complete! Download the file below.", file_path
    except Exception as e:
        logging.error(f"Error: {e}")
        yield str(e), [], "", "An error occurred during processing.", 0, "Error", ""
# Gradio UI definition: the 4 inputs map to combined_interface's parameters
# and the 7 outputs map, in order, to each element of its yielded tuples.
iface = gr.Interface(
    fn=combined_interface,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio File"),
        gr.Radio(choices=["JSON", "TXT"], label="Choose output format"),
        gr.Dropdown(choices=["", "es", "en", "pt", "fr"], label="Confirm detected language (optional)"),
        gr.Dropdown(choices=["", "openai/whisper-large-v3", "facebook/wav2vec2-large-xlsr-53-spanish",
                             "jonatasgrosman/wav2vec2-xls-r-1b-spanish", "microsoft/wav2vec2-base-960h"], label="Choose model (optional)")
    ],
    outputs=[
        gr.Textbox(label="Detected Language"),
        gr.Dropdown(label="Available Models", choices=[]),
        gr.Textbox(label="Selected Model"),
        gr.Textbox(label="Transcription", lines=10),
        gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
        gr.Textbox(label="Status"),
        gr.File(label="Download Transcription")
    ],
    title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
    description="Upload an audio file to detect the language, confirm the detection or choose a model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
    live=True
)
if __name__ == "__main__":
    # NOTE(review): .queue() is presumably enabled so the generator handler's
    # incremental yields stream to the UI — confirm against the Gradio docs
    # for the version in use.
    iface.queue().launch()
|