import whisper
import gradio as gr
import datetime

import subprocess

import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib

from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Speaker-embedding model used for diarization; runs on the GPU when available
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# Transcribe a batch of audio files and collect every result into one text output
def bulk_transcribe(files, model):
    chosen_model = whisper.load_model(model)
    output = ""
    for i in files:
        output += (
            '--Archivo "'
            + get_file_name(i.name)
            + '"'
            + "\n\n"
            + transcribe(i.name, chosen_model)
            + "\n\n"
        )

    with open("Transcripción.txt", "w", encoding="utf-8") as file:
        file.write(output)

    return "Transcripción.txt", output

# Get the file name from its path
def get_file_name(file):
    file_path = file.split("/")
    file_name = file_path[-1]
    return file_name

# The main function: transcribes one audio file and labels each speaker
def transcribe(audio, model):
    num_speakers = 3  # the app is designed for exactly three speakers per file
    path, error = convert_to_wav(audio)
    if error is not None:
        return error

    duration = get_duration(path)
    if duration > 4 * 60 * 60:
        return "La duraci贸n del audio es muy larga"

    result = model.transcribe(path)

    segments = result["segments"]

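    # Clamp the speaker count to at least 1 and at most the number of segments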
    num_speakers = min(max(round(num_speakers), 1), len(segments))
    if len(segments) == 1:
        segments[0]["speaker"] = "HABLANTE 1"
    else:
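        # Embed each segment and cluster the embeddings to assign speaker labels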
        embeddings = make_embeddings(path, segments, duration)
        add_speaker_labels(segments, embeddings, num_speakers)
    output = get_output(segments)
    return output


def convert_to_wav(path):
    # Convert non-wav inputs to .wav with ffmpeg so the wave module can read them
    if path[-3:] != "wav":
        new_path = ".".join(path.split(".")[:-1]) + ".wav"
        try:
            subprocess.run(["ffmpeg", "-i", path, new_path, "-y"], check=True)
        except (OSError, subprocess.CalledProcessError):
            return path, "Error: No se pudo convertir archivo a .wav"
        path = new_path
    return path, None


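# Read the .wav header to compute the audio duration in seconds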
def get_duration(path):
    with contextlib.closing(wave.open(path, "r")) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        return frames / float(rate)


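# Build a (num_segments, 192) matrix with one ECAPA speaker embedding per segment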
def make_embeddings(path, segments, duration):
    embeddings = np.zeros(shape=(len(segments), 192))
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(path, segment, duration)
    return np.nan_to_num(embeddings)


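# Shared pyannote Audio helper used to crop waveform segments from the file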
audio = Audio()


def segment_embedding(path, segment, duration):
    # Crop the waveform for this segment (Whisper can report an end time past the
    # end of the file, so cap it at the audio duration) and compute its embedding
    start = segment["start"]
    end = min(duration, segment["end"])
    clip = Segment(start, end)
    waveform, sample_rate = audio.crop(path, clip)
    return embedding_model(waveform[None])


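# Cluster the segment embeddings into num_speakers groups and tag each segment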
def add_speaker_labels(segments, embeddings, num_speakers):
    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)

    labels = clustering.labels_
    for i in range(len(segments)):
        segments[i]["speaker"] = "HABLANTE " + str(labels[i] + 1)


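# Format a start time in seconds as H:MM:SS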
def time(secs):
    return datetime.timedelta(seconds=round(secs))


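# Render the transcript, printing a speaker header whenever the speaker changes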
def get_output(segments):
    output = ""
    for i, segment in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            if i != 0:
                output += "\n\n"
            output += segment["speaker"] + " " + str(time(segment["start"])) + "\n\n"
        output += segment["text"][1:] + " "
    return output


gr.Interface(
    title="Reconocimiento de hablantes con Whisper en Espa帽ol",
    description="La interfaz permite la transcripci贸n de audios individuales y en conjunto a texto a trav茅s de los modelos de Whisper, para archivos donde existen espec铆ficamente tres hablantes. Por defecto, est谩 seleccionado el modelo 'large-v2' que presenta el mejor rendimiento y requiere mayor procesamiento. Sin embargo, es posible seleccionar el modelo a aplicar sobre los archivos a trav茅s del dropdown que ha sido desarrollado. De igual forma, se genera una transcripci贸n directa y un archivo .txt descargable que contiene el texto correspondiente al grupo de archivos seleccionados.",
    fn=bulk_transcribe,
    inputs=[
        gr.File(file_count="multiple", file_types=["audio"], label="Archivos de audio"),
        gr.Dropdown(
            label="Modelo de Whisper",
            choices=["tiny", "base", "small", "medium", "large", "large-v2"],
            value="large-v2",
        ),
    ],
    outputs=[gr.File(label="Archivo TXT con transcripci贸n"), gr.Textbox(label="Transcripci贸n de archivos de audio")],
).launch()