MetroBox committed on
Commit d4736d8
1 Parent(s): fb3a431

update formatting

Files changed (2)
  1. .vscode/settings.json +6 -0
  2. app.py +101 -87
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "[python]": {
+         "editor.defaultFormatter": "ms-python.black-formatter"
+     },
+     "python.formatting.provider": "none"
+ }
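The new settings.json only tells VS Code to format Python files with the Black extension and to disable the legacy built-in formatter. For reference, the same formatting can be reproduced programmatically; the snippet below is an illustrative sketch (it assumes the black package is installed and is not part of this commit), showing Black applied to a small source string the way the "ms-python.black-formatter" setting would on save.

import black

# Hypothetical example, not part of the commit: format a snippet with Black's
# default mode, matching what the editor setting above enables.
source = 'x=1\ndef f( a,b ):\n    return a+b\n'
formatted = black.format_str(source, mode=black.Mode())
print(formatted)
# x = 1
#
#
# def f(a, b):
#     return a + b
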
app.py CHANGED
@@ -17,118 +17,132 @@ import contextlib
  from sklearn.cluster import AgglomerativeClustering
  import numpy as np

- #model = whisper.load_model("large-v2")
- embedding_model = PretrainedSpeakerEmbedding(
      "speechbrain/spkrec-ecapa-voxceleb",
-     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  )

  def bulk_transcribe(files, model):
-     chosen_model=whisper.load_model(model)
-     output=""
      for i in files:
-         output+='--Archivo "'+get_file_name(i.name)+'"'+'\n\n'+transcribe(i.name, chosen_model)+'\n\n'
-
-     with open('Transcripción.txt', 'w') as file:
          file.write(output)
-
-     return 'Transcripción.txt', output

  def get_file_name(file):
-     file_path=file.split("/")
-     file_name=file_path[-1]
      return file_name
-
  def transcribe(audio, model):
-     num_speakers=3
-     path, error = convert_to_wav(audio)
-     if error is not None:
-         return error
-
-     duration = get_duration(path)
-     if duration > 4 * 60 * 60:
-         return "La duración del audio es muy larga"
-
-     result = {}
-     result = model.transcribe(path)
-
-     segments = result["segments"]
-
-     num_speakers = min(max(round(num_speakers), 1), len(segments))
-     if len(segments) == 1:
-         segments[0]['speaker'] = 'HABLANTE 1'
-     else:
-         embeddings = make_embeddings(path, segments, duration)
-         add_speaker_labels(segments, embeddings, num_speakers)
-     output = get_output(segments)
-     return output

  def convert_to_wav(path):
-     if path[-3:] != 'wav':
-         new_path = '.'.join(path.split('.')[:-1]) + '.wav'
-         try:
-             subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
-         except:
-             return path, 'Error: No se pudo convertir archivo a .wav'
-         path = new_path
-     return path, None

  def get_duration(path):
-     with contextlib.closing(wave.open(path,'r')) as f:
-         frames = f.getnframes()
-         rate = f.getframerate()
-         return frames / float(rate)

  def make_embeddings(path, segments, duration):
-     embeddings = np.zeros(shape=(len(segments), 192))
-     for i, segment in enumerate(segments):
-         embeddings[i] = segment_embedding(path, segment, duration)
-     return np.nan_to_num(embeddings)

  audio = Audio()

  def segment_embedding(path, segment, duration):
-     start = segment["start"]
-     # Whisper overshoots the end timestamp in the last segment
-     end = min(duration, segment["end"])
-     clip = Segment(start, end)
-     waveform, sample_rate = audio.crop(path, clip)
-     return embedding_model(waveform[None])

  def add_speaker_labels(segments, embeddings, num_speakers):
-     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-
-     labels={}
-     labels = clustering.labels_
-     for i in range(len(segments)):
-         segments[i]["speaker"] = 'HABLANTE ' + str(labels[i] + 1)

  def time(secs):
-     return datetime.timedelta(seconds=round(secs))

  def get_output(segments):
-     output = ''
-     for (i, segment) in enumerate(segments):
-         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-             if i != 0:
-                 output += '\n\n'
-             output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
-         output += segment["text"][1:] + ' '
-     return output

  gr.Interface(
-     title = 'Reconocimiento de hablantes con Whisper en Español',
-     fn=bulk_transcribe,
-     inputs=[gr.File(file_count="multiple", file_types=["audio"], label='Archivos de audio'),
-             gr.Dropdown(label="Modelo",
-                         choices=[
-                             "tiny",
-                             "base",
-                             "small",
-                             "medium",
-                             "large",
-                             "large-v2"
-                         ],
-                         value="large-v2")],
-     outputs=[gr.File(label="Archivo TXT"), gr.Textbox(label='Transcripción')]
- ).launch()
  from sklearn.cluster import AgglomerativeClustering
  import numpy as np

+ # model = whisper.load_model("large-v2")
+ embedding_model = PretrainedSpeakerEmbedding(
      "speechbrain/spkrec-ecapa-voxceleb",
+     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
  )

+
  def bulk_transcribe(files, model):
+     chosen_model = whisper.load_model(model)
+     output = ""
      for i in files:
+         output += (
+             '--Archivo "'
+             + get_file_name(i.name)
+             + '"'
+             + "\n\n"
+             + transcribe(i.name, chosen_model)
+             + "\n\n"
+         )
+
+     with open("Transcripción.txt", "w") as file:
          file.write(output)
+
+     return "Transcripción.txt", output
+

  def get_file_name(file):
+     file_path = file.split("/")
+     file_name = file_path[-1]
      return file_name
+
+
  def transcribe(audio, model):
+     num_speakers = 3
+     path, error = convert_to_wav(audio)
+     if error is not None:
+         return error
+
+     duration = get_duration(path)
+     if duration > 4 * 60 * 60:
+         return "La duración del audio es muy larga"
+
+     result = model.transcribe(path)
+
+     segments = result["segments"]
+
+     num_speakers = min(max(round(num_speakers), 1), len(segments))
+     if len(segments) == 1:
+         segments[0]["speaker"] = "HABLANTE 1"
+     else:
+         embeddings = make_embeddings(path, segments, duration)
+         add_speaker_labels(segments, embeddings, num_speakers)
+     output = get_output(segments)
+     return output
+

  def convert_to_wav(path):
+     if path[-3:] != "wav":
+         new_path = ".".join(path.split(".")[:-1]) + ".wav"
+         try:
+             subprocess.call(["ffmpeg", "-i", path, new_path, "-y"])
+         except:
+             return path, "Error: No se pudo convertir archivo a .wav"
+         path = new_path
+     return path, None
+

  def get_duration(path):
+     with contextlib.closing(wave.open(path, "r")) as f:
+         frames = f.getnframes()
+         rate = f.getframerate()
+         return frames / float(rate)
+

  def make_embeddings(path, segments, duration):
+     embeddings = np.zeros(shape=(len(segments), 192))
+     for i, segment in enumerate(segments):
+         embeddings[i] = segment_embedding(path, segment, duration)
+     return np.nan_to_num(embeddings)
+

  audio = Audio()

+
  def segment_embedding(path, segment, duration):
+     start = segment["start"]
+     # Whisper overshoots the end timestamp in the last segment
+     end = min(duration, segment["end"])
+     clip = Segment(start, end)
+     waveform, sample_rate = audio.crop(path, clip)
+     return embedding_model(waveform[None])
+

  def add_speaker_labels(segments, embeddings, num_speakers):
+     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+
+     labels = clustering.labels_
+     for i in range(len(segments)):
+         segments[i]["speaker"] = "HABLANTE " + str(labels[i] + 1)
+

  def time(secs):
+     return datetime.timedelta(seconds=round(secs))
+

  def get_output(segments):
+     output = ""
+     for i, segment in enumerate(segments):
+         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+             if i != 0:
+                 output += "\n\n"
+             output += segment["speaker"] + " " + str(time(segment["start"])) + "\n\n"
+         output += segment["text"][1:] + " "
+     return output
+

  gr.Interface(
+     title="Reconocimiento de hablantes con Whisper en Español",
+     fn=bulk_transcribe,
+     inputs=[
+         gr.File(file_count="multiple", file_types=["audio"], label="Archivos de audio"),
+         gr.Dropdown(
+             label="Modelo",
+             choices=["tiny", "base", "small", "medium", "large", "large-v2"],
+             value="large-v2",
+         ),
+     ],
+     outputs=[gr.File(label="Archivo TXT"), gr.Textbox(label="Transcripción")],
+ ).launch()
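
The change to app.py is purely cosmetic (Black formatting), but the pipeline it reformats is easier to follow spelled out: Whisper returns timestamped segments, each segment is embedded with the speechbrain/spkrec-ecapa-voxceleb model as a 192-dimensional vector, and AgglomerativeClustering groups those vectors so each segment gets a "HABLANTE n" label. The sketch below isolates that labelling step with random stand-in embeddings; it is illustrative only, uses no Whisper or pyannote models, and is not part of the commit.

import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Stand-in data: in app.py the segments come from model.transcribe(path)["segments"]
# and each embedding comes from segment_embedding().
rng = np.random.default_rng(0)
segments = [
    {"start": 0.0, "text": " Hola, buenos días."},
    {"start": 4.2, "text": " Gracias por venir."},
    {"start": 9.7, "text": " Un placer."},
]
embeddings = rng.normal(size=(len(segments), 192))  # one 192-dim vector per segment

num_speakers = min(max(round(3), 1), len(segments))  # same clamping as transcribe()
labels = AgglomerativeClustering(num_speakers).fit(embeddings).labels_
for segment, label in zip(segments, labels):
    segment["speaker"] = "HABLANTE " + str(label + 1)

for segment in segments:
    print(segment["speaker"], segment["text"])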