MetroBox committed on
Commit
e985555
1 Parent(s): d00be44

app implementation

Browse files
Files changed (2) hide show
  1. app.py +113 -4
  2. requirements.txt +3 -0
app.py CHANGED
@@ -1,7 +1,116 @@
 
1
  import gradio as gr
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisper
2
  import gradio as gr
3
+ import datetime
4
 
5
+ import subprocess
 
6
 
7
+ import torch
8
+ import pyannote.audio
9
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
10
+
11
+ from pyannote.audio import Audio
12
+ from pyannote.core import Segment
13
+
14
+ import wave
15
+ import contextlib
16
+
17
+ from sklearn.cluster import AgglomerativeClustering
18
+ import numpy as np
19
+
20
# Load the Whisper "large-v2" checkpoint once, at import time, so every
# request reuses the same model object.
model = whisper.load_model("large-v2")
# Speaker-embedding model used to tell speakers apart; it is placed on the
# GPU when CUDA is available and on the CPU otherwise.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)
25
+
26
def bulk_transcribe(files):
    """Transcribe every uploaded file and concatenate the results.

    Args:
        files: Iterable of uploaded files. Each item is either an object
            exposing its path as ``.name`` or a plain path string. ``None``
            is treated the same as an empty list.

    Returns:
        One string containing, for each file, an ``Archivo <n>`` header
        (numbered from 1), the file's transcript and a trailing newline.
        Empty when there are no files.
    """
    sections = []
    for number, item in enumerate(files or [], start=1):
        # Accept both file wrappers (path stored in ``.name``) and bare paths.
        path = getattr(item, 'name', item)
        sections.append('Archivo ' + str(number) + '\n' + transcribe(path) + '\n')
    return ''.join(sections)
33
+
34
+
35
+
36
def transcribe(audio, num_speakers=3):
    """Transcribe one audio file and tag every segment with a speaker label.

    Args:
        audio: Path of the audio file. It is converted to WAV first when it
            is not already one.
        num_speakers: Number of speakers to split the segments into. The
            value is rounded, raised to at least 1 and capped at the number
            of transcribed segments.

    Returns:
        The formatted transcript, the conversion error message when the file
        could not be converted, a "too long" message when the audio exceeds
        four hours, or an empty string when no segment was transcribed.
    """
    path, error = convert_to_wav(audio)
    if error is not None:
        return error

    duration = get_duration(path)
    max_duration_seconds = 4 * 60 * 60
    if duration > max_duration_seconds:
        return "La duración del audio es muy larga"

    result = model.transcribe(path)
    segments = result["segments"]
    if not segments:
        # No speech was transcribed. Without this guard num_speakers would be
        # capped to 0 and the clustering branch below would be entered with
        # nothing to cluster.
        return ''

    num_speakers = min(max(round(num_speakers), 1), len(segments))
    if len(segments) == 1:
        # A single segment needs no clustering: it belongs to one speaker.
        segments[0]['speaker'] = 'HABLANTE 1'
    else:
        embeddings = make_embeddings(path, segments, duration)
        add_speaker_labels(segments, embeddings, num_speakers)
    return get_output(segments)
57
+
58
def convert_to_wav(path):
    """Make sure *path* refers to a WAV file, converting it with ffmpeg if not.

    Args:
        path: Path of the input audio file.

    Returns:
        ``(wav_path, None)`` on success, where ``wav_path`` is *path* itself
        when it already has a ``.wav`` extension (any letter case) and
        otherwise the converted file written next to the input.
        ``(path, message)`` when ffmpeg could not be started or exited with a
        non-zero status.
    """
    # Local import: ``os`` is not among the module-level imports of this file.
    import os

    root, extension = os.path.splitext(path)
    if extension.lower() == '.wav':
        return path, None

    new_path = root + '.wav'
    try:
        # ``check=True`` turns a failing ffmpeg run into an exception instead
        # of silently returning a non-zero exit code. ``-y`` (overwrite the
        # output) is placed before the input/output arguments.
        subprocess.run(['ffmpeg', '-y', '-i', path, new_path], check=True)
    except (OSError, subprocess.CalledProcessError):
        return path, 'Error: No se pudo convertir archivo a .wav'
    return new_path, None
67
+
68
def get_duration(path):
    """Return the length, in seconds, of the WAV file stored at *path*."""
    wav_file = wave.open(path, 'r')
    with contextlib.closing(wav_file):
        frame_count = wav_file.getnframes()
        sample_rate = wav_file.getframerate()
    # Duration is the number of frames divided by frames per second.
    return frame_count / float(sample_rate)
73
+
74
def make_embeddings(path, segments, duration):
    """Compute one embedding row per segment.

    Args:
        path: Path of the WAV file the segments were transcribed from.
        segments: Sequence of segment dicts.
        duration: Total duration of the audio, in seconds.

    Returns:
        An array of shape ``(len(segments), 192)`` in which NaN values have
        been replaced by ``np.nan_to_num``.
    """
    matrix = np.zeros(shape=(len(segments), 192))
    for row, item in enumerate(segments):
        matrix[row] = segment_embedding(path, item, duration)
    cleaned = np.nan_to_num(matrix)
    return cleaned
79
+
80
# Shared audio reader used to crop individual segments out of a file.
audio = Audio()


def segment_embedding(path, segment, duration):
    """Return the embedding of the audio covered by one segment.

    Args:
        path: Path of the audio file.
        segment: Dict with ``"start"`` and ``"end"`` timestamps.
        duration: Total duration of the audio; used as an upper bound for
            the segment end.
    """
    clip_start = segment["start"]
    # Whisper overshoots the end timestamp in the last segment
    clip_end = min(duration, segment["end"])
    waveform, _sample_rate = audio.crop(path, Segment(clip_start, clip_end))
    # ``waveform[None]`` adds a leading axis before the model is called.
    return embedding_model(waveform[None])
89
+
90
def add_speaker_labels(segments, embeddings, num_speakers):
    """Cluster the embeddings and write a speaker name onto each segment.

    Every segment dict gets a ``"speaker"`` key set to ``HABLANTE <k>``,
    where ``k`` is the segment's cluster label plus one. The segments are
    modified in place; nothing is returned.
    """
    fitted = AgglomerativeClustering(num_speakers).fit(embeddings)
    for segment, label in zip(segments, fitted.labels_):
        segment["speaker"] = 'HABLANTE ' + str(label + 1)
95
+
96
def time(secs):
    """Convert *secs* into a ``timedelta`` rounded to whole seconds."""
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)
98
+
99
def get_output(segments):
    """Format labelled segments as a transcript grouped by speaker turn.

    Args:
        segments: Sequence of dicts with ``"speaker"``, ``"start"`` and
            ``"text"`` keys.

    Returns:
        The transcript text. Whenever the speaker changes (and for the first
        segment) a header line with the speaker label and the start time is
        emitted, followed by the text of the consecutive segments of that
        speaker separated by spaces. Empty when there are no segments.
    """
    parts = []
    previous_speaker = None
    for index, segment in enumerate(segments):
        speaker = segment["speaker"]
        if index == 0 or speaker != previous_speaker:
            if index != 0:
                parts.append('\n\n')
            parts.append(speaker + ' ' + str(time(segment["start"])) + '\n\n')
        text = segment["text"]
        # Drop a single leading space only when it is actually there, so text
        # that starts directly with a letter does not lose its first
        # character.
        if text.startswith(' '):
            text = text[1:]
        parts.append(text + ' ')
        previous_speaker = speaker
    return ''.join(parts)
108
+
109
# Build and start the web UI: several uploaded files in, one text box holding
# all transcripts out. Components are taken from the top-level ``gr``
# namespace rather than the deprecated ``gr.inputs`` / ``gr.outputs`` modules.
gr.Interface(
    title='Reconocimiento de hablantes con Whisper en Español',
    fn=bulk_transcribe,
    inputs=gr.File(file_count="multiple"),
    outputs=[
        gr.Textbox(label='Transcripción'),
    ],
).launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/pyannote/pyannote-audio
2
+ git+https://github.com/openai/whisper.git
3
+ gradio