# Multilingual audio transcription app: language detection + streaming ASR
# (Gradio UI over Whisper / wav2vec2 models for Spanish, English, Portuguese).
import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import subprocess
from langdetect import detect_langs
import os
import warnings
from transformers import logging
import math
import json
# Suppress warnings
warnings.filterwarnings("ignore")
logging.set_verbosity_error()
# Updated models by language.
# Maps an ISO 639-1 language code to candidate ASR model checkpoints;
# the first entry of each list is the one auto-selected for transcription,
# and "en" doubles as the fallback set for unrecognized languages.
MODELS = {
    "es": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-xlsr-53-spanish",
        "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
    ],
    "en": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-960h",
        "microsoft/wav2vec2-base-960h"
    ],
    "pt": [
        "facebook/wav2vec2-large-xlsr-53-portuguese",
        "openai/whisper-medium",
        "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese"
    ]
}
def convert_audio_to_wav(audio_path):
    """Convert an audio file to 16 kHz mono WAV using ffmpeg.

    Args:
        audio_path: Path to the input audio file (any format ffmpeg reads).

    Returns:
        Path to the converted file, always "converted_audio.wav" in the
        current working directory.

    Raises:
        ValueError: If audio_path points to a directory.
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    if os.path.isdir(audio_path):
        raise ValueError(f"The path provided is a directory: {audio_path}")
    wav_path = "converted_audio.wav"
    # "-y": overwrite a leftover converted_audio.wav from a previous run;
    # without it ffmpeg waits on an interactive overwrite prompt and, with
    # stderr piped, the second conversion silently fails.
    command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        # Surface the conversion failure here instead of letting librosa
        # choke later on a missing/empty WAV file.
        raise RuntimeError(
            f"ffmpeg failed to convert {audio_path}: "
            f"{result.stderr.decode(errors='replace')}"
        )
    return wav_path
def detect_language(audio_path):
    """Detect the dominant language of an audio file.

    Transcribes the first 30 seconds with whisper-base, then runs langdetect
    on the transcription text.

    Args:
        audio_path: Path to a 16 kHz-compatible audio file.

    Returns:
        ISO 639-1 language code (e.g. 'es', 'en', 'pt').

    Raises:
        ValueError: If librosa cannot load the audio file.
    """
    try:
        # Only the first 30 s are needed for language identification.
        speech, _ = librosa.load(audio_path, sr=16000, duration=30)
    except Exception as e:
        raise ValueError(f"Error loading audio file with librosa: {e}")
    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
    # Inference only: disable autograd to avoid building a gradient graph.
    with torch.no_grad():
        predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    langs = detect_langs(transcription)
    es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
    pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
    # Spanish and Portuguese are frequently confused by langdetect; when
    # their confidences are close, prefer Spanish as the tie-break.
    if abs(es_confidence - pt_confidence) < 0.2:
        return 'es'
    return max(langs, key=lambda x: x.prob).lang
def transcribe_audio_stream(audio, model_name):
    """Transcribe an audio file in chunks, yielding incremental results.

    Args:
        audio: Path to the input audio file (converted to WAV internally).
        model_name: HF model id; names containing "whisper" use the
            seq2seq Whisper API, everything else uses the ASR pipeline.

    Yields:
        (transcriptions, progress) tuples, where transcriptions is the
        cumulative list of {"start_time", "end_time", "text"} segments and
        progress is a percentage in [0, 100].
    """
    wav_audio = convert_audio_to_wav(audio)
    speech, rate = librosa.load(wav_audio, sr=16000)
    duration = len(speech) / rate
    if duration == 0:
        # Empty audio: nothing to transcribe (also avoids division by zero
        # in the progress computation below).
        yield [], 100
        return
    transcriptions = []
    if "whisper" in model_name:
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        chunk_duration = 30  # seconds
        # ceil() so a fractional tail chunk (e.g. the last 0.5 s of a
        # 30.5 s file) is not silently dropped by int() truncation.
        for i in range(0, math.ceil(duration), chunk_duration):
            end = min(i + chunk_duration, duration)
            chunk = speech[int(i * rate):int(end * rate)]
            input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
            # Inference only: no autograd graph needed.
            with torch.no_grad():
                predicted_ids = model.generate(input_features)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            progress = min(100, (end / duration) * 100)
            transcriptions.append({
                "start_time": i,
                "end_time": end,
                "text": transcription
            })
            yield transcriptions, progress
    else:
        transcriber = pipeline("automatic-speech-recognition", model=model_name)
        chunk_duration = 10  # seconds
        for i in range(0, math.ceil(duration), chunk_duration):
            end = min(i + chunk_duration, duration)
            chunk = speech[int(i * rate):int(end * rate)]
            result = transcriber(chunk)
            progress = min(100, (end / duration) * 100)
            transcriptions.append({
                "start_time": i,
                "end_time": end,
                "text": result["text"]
            })
            yield transcriptions, progress
def detect_and_select_model(audio):
    """Identify the spoken language of *audio* and return candidate models.

    Returns:
        (language_code, model_names) — the detected ISO 639-1 code and the
        list of ASR checkpoints registered for it (English models serve as
        the fallback for languages not present in MODELS).
    """
    converted = convert_audio_to_wav(audio)
    detected = detect_language(converted)
    candidates = MODELS.get(detected, MODELS["en"])
    return detected, candidates
def save_transcription(transcriptions, file_format):
    """Persist transcription segments to a file in the requested format.

    Args:
        transcriptions: List of {"start_time", "end_time", "text"} dicts.
        file_format: "JSON" or "TXT".

    Returns:
        Path of the written file ("transcription.json" or "transcription.txt").

    Raises:
        ValueError: If file_format is not "JSON" or "TXT" (previously this
            fell through to an unbound file_path and raised NameError).
    """
    if file_format == "JSON":
        file_path = "transcription.json"
        # Explicit UTF-8: ensure_ascii=False emits raw accented characters,
        # which break under platform-default encodings (e.g. cp1252).
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(transcriptions, f, ensure_ascii=False, indent=4)
    elif file_format == "TXT":
        file_path = "transcription.txt"
        with open(file_path, 'w', encoding='utf-8') as f:
            for entry in transcriptions:
                f.write(f"{entry['start_time']},{entry['end_time']},{entry['text']}\n")
    else:
        raise ValueError(f"Unsupported file format: {file_format}")
    return file_path
def combined_interface(audio, file_format):
    """Gradio entry point: detect language, stream transcription, save file.

    Yields 7-tuples matching the interface's seven output components:
    (language, model_options, selected_model, transcription, progress,
    status, file_path). The original progress yields emitted only six
    values, mismatching the declared outputs; all yields now emit seven.
    """
    try:
        language, model_options = detect_and_select_model(audio)
        selected_model = model_options[0]
        yield language, model_options, selected_model, "", 0, "Initializing...", None
        transcriptions = []
        # Pre-initialize so the final yield works even if the stream
        # produces no chunks (e.g. empty audio).
        full_transcription = ""
        for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
            transcriptions = partial_transcriptions
            full_transcription = " ".join(t["text"] for t in transcriptions)
            progress_int = math.floor(progress)
            status = f"Transcribing... {progress_int}% complete"
            yield language, model_options, selected_model, full_transcription.strip(), progress_int, status, None
        # Save transcription file
        file_path = save_transcription(transcriptions, file_format)
        # Clean up temporary files; guard in case conversion never ran.
        if os.path.exists("converted_audio.wav"):
            os.remove("converted_audio.wav")
        yield language, model_options, selected_model, full_transcription.strip(), 100, f"Transcription complete! Download {file_path}", file_path
    except Exception as e:
        yield str(e), [], "", "An error occurred during processing.", 0, "Error", None
# Gradio interface wiring: combined_interface yields 7-tuples that map
# positionally onto the seven output components declared below.
iface = gr.Interface(
    fn=combined_interface,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Radio(choices=["JSON", "TXT"], label="Choose output format")
    ],
    outputs=[
        gr.Textbox(label="Detected Language"),
        # Choices are empty at build time; populated per-request from MODELS.
        gr.Dropdown(label="Available Models", choices=[]),
        gr.Textbox(label="Selected Model"),
        gr.Textbox(label="Transcription", lines=10),
        gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
        gr.Textbox(label="Status"),
        gr.File(label="Download Transcription")
    ],
    title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
    description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
    # live=True re-runs the function on input changes (streaming updates).
    live=True
)
if __name__ == "__main__":
    # queue() enables generator-based streaming output in Gradio.
    iface.queue().launch()