# Hugging Face Space: Whisper transcription with pyannote speaker diarization (Spanish UI).
import contextlib
import datetime
import os
import subprocess
import wave

import gradio as gr
import numpy as np
import pyannote.audio
import torch
import whisper
from pyannote.audio import Audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.core import Segment
from sklearn.cluster import AgglomerativeClustering
# Speaker-embedding model used to vectorize each transcript segment; runs on
# GPU when one is available, otherwise on CPU.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
)
# Transcribe a batch of audio files with the selected Whisper model.
def bulk_transcribe(files, model):
    """Transcribe every uploaded file and return (txt file path, full text).

    files: list of uploaded file objects (each exposes a ``.name`` path).
    model: Whisper model size to load, e.g. "large-v2".
    """
    chosen_model = whisper.load_model(model)
    # Collect pieces and join once instead of quadratic string +=.
    parts = []
    for f in files:
        parts.append(
            '--Archivo "'
            + get_file_name(f.name)
            + '"'
            + "\n\n"
            + transcribe(f.name, chosen_model)
            + "\n\n"
        )
    output = "".join(parts)
    # Explicit UTF-8 so the accented Spanish text survives on any platform.
    with open("Transcripción.txt", "w", encoding="utf-8") as file:
        file.write(output)
    return "Transcripción.txt", output
# Getting the file name from the path
def get_file_name(file):
    """Return the final path component of *file*.

    Uses os.path.basename instead of splitting on "/" so it also behaves
    correctly for the host OS's path separator.
    """
    return os.path.basename(file)
# The main function that transcribes each audio file
def transcribe(audio, model, num_speakers=3):
    """Transcribe one audio file and label its segments by speaker.

    audio: path to an audio file (converted to .wav when necessary).
    model: an already-loaded Whisper model.
    num_speakers: expected speaker count (default 3, kept for backward
        compatibility); clamped to [1, number of segments].

    Returns the formatted transcript, or an error message string.
    """
    path, error = convert_to_wav(audio)
    if error is not None:
        return error
    duration = get_duration(path)
    if duration > 4 * 60 * 60:  # refuse audio longer than 4 hours
        return "La duración del audio es muy larga"
    result = model.transcribe(path)
    segments = result["segments"]
    if not segments:
        # Nothing recognized: avoid clustering an empty embedding matrix.
        return ""
    num_speakers = min(max(round(num_speakers), 1), len(segments))
    if len(segments) == 1:
        segments[0]["speaker"] = "HABLANTE 1"
    else:
        embeddings = make_embeddings(path, segments, duration)
        add_speaker_labels(segments, embeddings, num_speakers)
    return get_output(segments)
def convert_to_wav(path):
    """Ensure *path* points to a .wav file, converting with ffmpeg if needed.

    Returns (wav_path, None) on success, or (original_path, error_message)
    when the conversion fails.
    """
    if path.endswith(".wav"):
        return path, None
    new_path = ".".join(path.split(".")[:-1]) + ".wav"
    try:
        # "-y" must come before the output file; trailing options after the
        # output may be ignored by ffmpeg.
        completed = subprocess.run(["ffmpeg", "-i", path, "-y", new_path])
    except OSError:
        # ffmpeg binary missing / not executable.
        return path, "Error: No se pudo convertir archivo a .wav"
    if completed.returncode != 0:
        # subprocess does not raise when ffmpeg fails; check the exit status.
        return path, "Error: No se pudo convertir archivo a .wav"
    return new_path, None
def get_duration(path):
    """Return the duration in seconds of the wav file at *path*."""
    with wave.open(path, "r") as wav_file:
        frame_count = wav_file.getnframes()
        sample_rate = wav_file.getframerate()
    return frame_count / float(sample_rate)
def make_embeddings(path, segments, duration):
    """Build one 192-dim speaker embedding per transcript segment.

    Returns an array of shape (len(segments), 192) with NaNs zeroed out.
    """
    matrix = np.zeros((len(segments), 192))
    for row, seg in enumerate(segments):
        matrix[row] = segment_embedding(path, seg, duration)
    return np.nan_to_num(matrix)
# Reusable pyannote audio reader shared by segment_embedding.
audio = Audio()


def segment_embedding(path, segment, duration):
    """Return the speaker embedding for a single transcript segment."""
    start = segment["start"]
    # Whisper sometimes overshoots the file end; clamp to the real duration.
    stop = min(duration, segment["end"])
    waveform, _sample_rate = audio.crop(path, Segment(start, stop))
    return embedding_model(waveform[None])
def add_speaker_labels(segments, embeddings, num_speakers):
    """Cluster segment embeddings and tag each segment with a speaker label."""
    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
    for seg, cluster_id in zip(segments, clustering.labels_):
        seg["speaker"] = f"HABLANTE {cluster_id + 1}"
def time(secs):
    """Format a float second count as an H:MM:SS timedelta."""
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)
def get_output(segments):
    """Render labeled segments as text, grouping consecutive same-speaker turns.

    A speaker header ("HABLANTE N H:MM:SS") is emitted whenever the speaker
    changes; segment texts of the same speaker are concatenated.
    """
    parts = []
    for i, segment in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            if i != 0:
                parts.append("\n\n")
            parts.append(segment["speaker"] + " " + str(time(segment["start"])) + "\n\n")
        # lstrip() instead of [1:]: Whisper usually emits a leading space,
        # but this no longer drops a real first character when it doesn't.
        parts.append(segment["text"].lstrip() + " ")
    return "".join(parts)
# Build and launch the Gradio UI (Spanish labels; mojibake in the original
# strings repaired to proper UTF-8 accented characters).
gr.Interface(
    title="Reconocimiento de hablantes con Whisper en Español",
    description="La interfaz permite la transcripción de audios individuales y en conjunto a texto a través de los modelos de Whisper, para archivos donde existen específicamente tres hablantes. Por defecto, está seleccionado el modelo 'large-v2' que presenta el mejor rendimiento y requiere mayor procesamiento. Sin embargo, es posible seleccionar el modelo a aplicar sobre los archivos a través del dropdown que ha sido desarrollado. De igual forma, se genera una transcripción directa y un archivo .txt descargable que contiene el texto correspondiente al grupo de archivos seleccionados.",
    fn=bulk_transcribe,
    inputs=[
        gr.File(file_count="multiple", file_types=["audio"], label="Archivos de audio"),
        gr.Dropdown(
            label="Modelo de Whisper",
            choices=["tiny", "base", "small", "medium", "large", "large-v2"],
            value="large-v2",
        ),
    ],
    outputs=[
        gr.File(label="Archivo TXT con transcripción"),
        gr.Textbox(label="Transcripción de archivos de audio"),
    ],
).launch()