Spaces:
Runtime error
Runtime error
File size: 4,739 Bytes
e985555 d00be44 e985555 d00be44 e985555 d00be44 e985555 d4736d8 e985555 d4736d8 e985555 6e3969a d72a3cf d4736d8 e985555 d4736d8 8befa65 d4736d8 6e3969a 697ae86 d4736d8 697ae86 d4736d8 6e3969a d72a3cf d4736d8 e985555 d4736d8 e985555 d4736d8 e985555 d4736d8 e985555 d4736d8 e985555 d4736d8 6e3969a d4736d8 e985555 d4736d8 e985555 d4736d8 e985555 d4736d8 e985555 d4736d8 a2a93c3 d4736d8 f504fd5 d4736d8 e18baf9 d4736d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import whisper
import gradio as gr
import datetime
import subprocess
import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import wave
import contextlib
from sklearn.cluster import AgglomerativeClustering
import numpy as np
# model = whisper.load_model("large-v2")
# Speaker-embedding model (ECAPA-TDNN from SpeechBrain) used to compute a
# vector per transcript segment so segments can be clustered by speaker.
# Runs on GPU when CUDA is available, otherwise falls back to CPU.
embedding_model = PretrainedSpeakerEmbedding(
"speechbrain/spkrec-ecapa-voxceleb",
device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
# Transcribe a batch of audio files into a single downloadable text file.
def bulk_transcribe(files, model):
    """Transcribe every uploaded audio file with the chosen Whisper model.

    Args:
        files: iterable of uploaded file objects; each exposes a ``.name``
            attribute holding the path on disk (gradio File uploads).
        model: Whisper model size name, e.g. "base" or "large-v2".

    Returns:
        Tuple of (path to the generated .txt file, full transcript text).
    """
    chosen_model = whisper.load_model(model)
    # Collect per-file sections and join once instead of quadratic +=.
    sections = []
    for uploaded in files:
        sections.append(
            '--Archivo "'
            + get_file_name(uploaded.name)
            + '"'
            + "\n\n"
            + transcribe(uploaded.name, chosen_model)
            + "\n\n"
        )
    output = "".join(sections)
    # Explicit utf-8 so accented Spanish text survives any platform locale.
    with open("Transcripci贸n.txt", "w", encoding="utf-8") as file:
        file.write(output)
    return "Transcripci贸n.txt", output
# Extract the base file name from a '/'-separated path.
def get_file_name(file):
    """Return the final path component of *file* (text after the last '/')."""
    return file.rsplit("/", 1)[-1]
# The main function that transcribes one audio file with speaker labels.
def transcribe(audio, model, num_speakers=3):
    """Transcribe a single audio file and tag each segment with a speaker.

    Args:
        audio: path to the input audio file (any ffmpeg-readable format).
        model: an already-loaded Whisper model instance.
        num_speakers: expected number of distinct speakers; default 3
            preserves the original hard-coded behavior.

    Returns:
        The formatted transcript string, or a Spanish error message when
        conversion fails or the audio is too long.
    """
    path, error = convert_to_wav(audio)
    if error is not None:
        return error
    duration = get_duration(path)
    if duration > 4 * 60 * 60:  # refuse audio longer than 4 hours
        return "La duraci贸n del audio es muy larga"
    result = model.transcribe(path)
    segments = result["segments"]
    if not segments:
        # Nothing was recognized; clustering zero segments would crash.
        return ""
    # Clamp the requested speaker count to [1, number of segments].
    num_speakers = min(max(round(num_speakers), 1), len(segments))
    if len(segments) == 1:
        segments[0]["speaker"] = "HABLANTE 1"
    else:
        embeddings = make_embeddings(path, segments, duration)
        add_speaker_labels(segments, embeddings, num_speakers)
    return get_output(segments)
def convert_to_wav(path):
    """Ensure *path* refers to a .wav file, converting via ffmpeg if needed.

    Args:
        path: path to the input audio file.

    Returns:
        Tuple of (wav_path, None) on success, or (original_path,
        error_message) when the conversion fails.
    """
    if path[-3:] != "wav":
        new_path = ".".join(path.split(".")[:-1]) + ".wav"
        try:
            # subprocess.call does NOT raise on a non-zero exit status, so
            # the return code must be checked explicitly — otherwise a
            # failed ffmpeg conversion went unnoticed and a missing .wav
            # was handed downstream.
            status = subprocess.call(["ffmpeg", "-i", path, new_path, "-y"])
        except OSError:  # ffmpeg binary missing or not executable
            return path, "Error: No se pudo convertir archivo a .wav"
        if status != 0:
            return path, "Error: No se pudo convertir archivo a .wav"
        path = new_path
    return path, None
def get_duration(path):
    """Return the length in seconds of the .wav file at *path*."""
    with wave.open(path, "r") as wav_file:
        frame_count = wav_file.getnframes()
        sample_rate = wav_file.getframerate()
    return frame_count / float(sample_rate)
def make_embeddings(path, segments, duration):
    """Build a (num_segments, 192) matrix of speaker embeddings.

    NaNs produced by the embedding model are replaced with zeros so the
    clustering step never sees invalid values.
    """
    matrix = np.zeros(shape=(len(segments), 192))
    for row, seg in enumerate(segments):
        matrix[row] = segment_embedding(path, seg, duration)
    return np.nan_to_num(matrix)
# Shared pyannote audio loader used below to crop waveform segments.
audio = Audio()
def segment_embedding(path, segment, duration):
    """Compute the speaker-embedding vector for one transcript segment."""
    # Clip the segment end to the real audio length; Whisper timestamps
    # can overshoot the end of the file slightly.
    clip = Segment(segment["start"], min(duration, segment["end"]))
    waveform, _sample_rate = audio.crop(path, clip)
    # waveform[None] adds the leading batch dimension the model expects.
    return embedding_model(waveform[None])
def add_speaker_labels(segments, embeddings, num_speakers):
    """Cluster segment embeddings and tag each segment with its speaker."""
    labels = AgglomerativeClustering(num_speakers).fit(embeddings).labels_
    # Labels are 0-based; displayed speaker numbers start at 1.
    for seg, label in zip(segments, labels):
        seg["speaker"] = "HABLANTE " + str(label + 1)
def time(secs):
    """Format *secs* (rounded to whole seconds) as an H:MM:SS timedelta."""
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)
def get_output(segments):
    """Render labeled segments as text, grouping consecutive same-speaker turns.

    Each speaker change emits a "SPEAKER H:MM:SS" header; the segment texts
    (with their leading space stripped) follow, separated by single spaces.
    """
    pieces = []
    previous_speaker = None
    for seg in segments:
        if previous_speaker is None or seg["speaker"] != previous_speaker:
            if previous_speaker is not None:
                pieces.append("\n\n")
            stamp = datetime.timedelta(seconds=round(seg["start"]))
            pieces.append(seg["speaker"] + " " + str(stamp) + "\n\n")
        previous_speaker = seg["speaker"]
        pieces.append(seg["text"][1:] + " ")
    return "".join(pieces)
# Build and launch the Gradio UI: a multi-file audio uploader plus a Whisper
# model-size dropdown feed bulk_transcribe; outputs are the downloadable
# .txt file and the full transcript text. (UI strings are in Spanish.)
gr.Interface(
title="Reconocimiento de hablantes con Whisper en Espa帽ol",
description="La interfaz permite la transcripci贸n de audios individuales y en conjunto a texto a trav茅s de los modelos de Whisper, para archivos donde existen espec铆ficamente tres hablantes. Por defecto, est谩 seleccionado el modelo 'large-v2' que presenta el mejor rendimiento y requiere mayor procesamiento. Sin embargo, es posible seleccionar el modelo a aplicar sobre los archivos a trav茅s del dropdown que ha sido desarrollado. De igual forma, se genera una transcripci贸n directa y un archivo .txt descargable que contiene el texto correspondiente al grupo de archivos seleccionados.",
fn=bulk_transcribe,
inputs=[
gr.File(file_count="multiple", file_types=["audio"], label="Archivos de audio"),
gr.Dropdown(
label="Modelo de Whisper",
choices=["tiny", "base", "small", "medium", "large", "large-v2"],
value="large-v2",
),
],
outputs=[gr.File(label="Archivo TXT con transcripci贸n"), gr.Textbox(label="Transcripci贸n de archivos de audio")],
).launch()
|