MetroBox committed on
Commit d4736d8
1 Parent(s): fb3a431

update formatting

Files changed (2)
  1. .vscode/settings.json +6 -0
  2. app.py +101 -87
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "[python]": {
+         "editor.defaultFormatter": "ms-python.black-formatter"
+     },
+     "python.formatting.provider": "none"
+ }
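The new settings.json only tells VS Code to format Python files with the Black extension and to disable the legacy built-in formatter. For reference, the same formatting can be reproduced programmatically; the snippet below is an illustrative sketch (it assumes the black package is installed and is not part of this commit), showing Black applied to a small source string the way the "ms-python.black-formatter" setting would on save.

import black

# Hypothetical example, not part of the commit: format a snippet with Black's
# default mode, matching what the editor setting above enables.
source = 'x=1\ndef f( a,b ):\n    return a+b\n'
formatted = black.format_str(source, mode=black.Mode())
print(formatted)
# x = 1
#
#
# def f(a, b):
#     return a + b
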
app.py CHANGED
@@ -17,118 +17,132 @@ import contextlib
  from sklearn.cluster import AgglomerativeClustering
  import numpy as np

- #model = whisper.load_model("large-v2")
- embedding_model = PretrainedSpeakerEmbedding(
      "speechbrain/spkrec-ecapa-voxceleb",
-     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  )

  def bulk_transcribe(files, model):
-     chosen_model=whisper.load_model(model)
-     output=""
      for i in files:
-         output+='--Archivo "'+get_file_name(i.name)+'"'+'\n\n'+transcribe(i.name, chosen_model)+'\n\n'
-
-     with open('Transcripción.txt', 'w') as file:
          file.write(output)
-
-     return 'Transcripción.txt', output

  def get_file_name(file):
-     file_path=file.split("/")
-     file_name=file_path[-1]
      return file_name
-
  def transcribe(audio, model):
-     num_speakers=3
-     path, error = convert_to_wav(audio)
-     if error is not None:
-         return error
-
-     duration = get_duration(path)
-     if duration > 4 * 60 * 60:
-         return "La duración del audio es muy larga"
-
-     result = {}
-     result = model.transcribe(path)
-
-     segments = result["segments"]
-
-     num_speakers = min(max(round(num_speakers), 1), len(segments))
-     if len(segments) == 1:
-         segments[0]['speaker'] = 'HABLANTE 1'
-     else:
-         embeddings = make_embeddings(path, segments, duration)
-         add_speaker_labels(segments, embeddings, num_speakers)
-     output = get_output(segments)
-     return output

  def convert_to_wav(path):
-     if path[-3:] != 'wav':
-         new_path = '.'.join(path.split('.')[:-1]) + '.wav'
-         try:
-             subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
-         except:
-             return path, 'Error: No se pudo convertir archivo a .wav'
-         path = new_path
-     return path, None

  def get_duration(path):
-     with contextlib.closing(wave.open(path,'r')) as f:
-         frames = f.getnframes()
-         rate = f.getframerate()
-         return frames / float(rate)

  def make_embeddings(path, segments, duration):
-     embeddings = np.zeros(shape=(len(segments), 192))
-     for i, segment in enumerate(segments):
-         embeddings[i] = segment_embedding(path, segment, duration)
-     return np.nan_to_num(embeddings)

  audio = Audio()

  def segment_embedding(path, segment, duration):
-     start = segment["start"]
-     # Whisper overshoots the end timestamp in the last segment
-     end = min(duration, segment["end"])
-     clip = Segment(start, end)
-     waveform, sample_rate = audio.crop(path, clip)
-     return embedding_model(waveform[None])

  def add_speaker_labels(segments, embeddings, num_speakers):
-     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-
-     labels={}
-     labels = clustering.labels_
-     for i in range(len(segments)):
-         segments[i]["speaker"] = 'HABLANTE ' + str(labels[i] + 1)

  def time(secs):
-     return datetime.timedelta(seconds=round(secs))

  def get_output(segments):
-     output = ''
-     for (i, segment) in enumerate(segments):
-         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-             if i != 0:
-                 output += '\n\n'
-             output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
-         output += segment["text"][1:] + ' '
-     return output

  gr.Interface(
-     title = 'Reconocimiento de hablantes con Whisper en Español',
-     fn=bulk_transcribe,
-     inputs=[gr.File(file_count="multiple", file_types=["audio"], label='Archivos de audio'),
-             gr.Dropdown(label="Modelo",
-                         choices=[
-                             "tiny",
-                             "base",
-                             "small",
-                             "medium",
-                             "large",
-                             "large-v2"
-                         ],
-                         value="large-v2")],
-     outputs=[gr.File(label="Archivo TXT"), gr.Textbox(label='Transcripción')]
- ).launch()
  from sklearn.cluster import AgglomerativeClustering
  import numpy as np

+ # model = whisper.load_model("large-v2")
+ embedding_model = PretrainedSpeakerEmbedding(
      "speechbrain/spkrec-ecapa-voxceleb",
+     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
  )

+
  def bulk_transcribe(files, model):
+     chosen_model = whisper.load_model(model)
+     output = ""
      for i in files:
+         output += (
+             '--Archivo "'
+             + get_file_name(i.name)
+             + '"'
+             + "\n\n"
+             + transcribe(i.name, chosen_model)
+             + "\n\n"
+         )
+
+     with open("Transcripción.txt", "w") as file:
          file.write(output)
+
+     return "Transcripción.txt", output
+

  def get_file_name(file):
+     file_path = file.split("/")
+     file_name = file_path[-1]
      return file_name
+
+
  def transcribe(audio, model):
+     num_speakers = 3
+     path, error = convert_to_wav(audio)
+     if error is not None:
+         return error
+
+     duration = get_duration(path)
+     if duration > 4 * 60 * 60:
+         return "La duración del audio es muy larga"
+
+     result = model.transcribe(path)
+
+     segments = result["segments"]
+
+     num_speakers = min(max(round(num_speakers), 1), len(segments))
+     if len(segments) == 1:
+         segments[0]["speaker"] = "HABLANTE 1"
+     else:
+         embeddings = make_embeddings(path, segments, duration)
+         add_speaker_labels(segments, embeddings, num_speakers)
+     output = get_output(segments)
+     return output
+

  def convert_to_wav(path):
+     if path[-3:] != "wav":
+         new_path = ".".join(path.split(".")[:-1]) + ".wav"
+         try:
+             subprocess.call(["ffmpeg", "-i", path, new_path, "-y"])
+         except:
+             return path, "Error: No se pudo convertir archivo a .wav"
+         path = new_path
+     return path, None
+

  def get_duration(path):
+     with contextlib.closing(wave.open(path, "r")) as f:
+         frames = f.getnframes()
+         rate = f.getframerate()
+         return frames / float(rate)
+

  def make_embeddings(path, segments, duration):
+     embeddings = np.zeros(shape=(len(segments), 192))
+     for i, segment in enumerate(segments):
+         embeddings[i] = segment_embedding(path, segment, duration)
+     return np.nan_to_num(embeddings)
+

  audio = Audio()

+
  def segment_embedding(path, segment, duration):
+     start = segment["start"]
+     # Whisper overshoots the end timestamp in the last segment
+     end = min(duration, segment["end"])
+     clip = Segment(start, end)
+     waveform, sample_rate = audio.crop(path, clip)
+     return embedding_model(waveform[None])
+

  def add_speaker_labels(segments, embeddings, num_speakers):
+     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+
+     labels = clustering.labels_
+     for i in range(len(segments)):
+         segments[i]["speaker"] = "HABLANTE " + str(labels[i] + 1)
+

  def time(secs):
+     return datetime.timedelta(seconds=round(secs))
+

  def get_output(segments):
+     output = ""
+     for i, segment in enumerate(segments):
+         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+             if i != 0:
+                 output += "\n\n"
+             output += segment["speaker"] + " " + str(time(segment["start"])) + "\n\n"
+         output += segment["text"][1:] + " "
+     return output
+

  gr.Interface(
+     title="Reconocimiento de hablantes con Whisper en Español",
+     fn=bulk_transcribe,
+     inputs=[
+         gr.File(file_count="multiple", file_types=["audio"], label="Archivos de audio"),
+         gr.Dropdown(
+             label="Modelo",
+             choices=["tiny", "base", "small", "medium", "large", "large-v2"],
+             value="large-v2",
+         ),
+     ],
+     outputs=[gr.File(label="Archivo TXT"), gr.Textbox(label="Transcripción")],
+ ).launch()
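
The change to app.py is purely cosmetic (Black formatting), but the pipeline it reformats is easier to follow spelled out: Whisper returns timestamped segments, each segment is embedded with the speechbrain/spkrec-ecapa-voxceleb model as a 192-dimensional vector, and AgglomerativeClustering groups those vectors so each segment gets a "HABLANTE n" label. The sketch below isolates that labelling step with random stand-in embeddings; it is illustrative only, uses no Whisper or pyannote models, and is not part of the commit.

import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Stand-in data: in app.py the segments come from model.transcribe(path)["segments"]
# and each embedding comes from segment_embedding().
rng = np.random.default_rng(0)
segments = [
    {"start": 0.0, "text": " Hola, buenos días."},
    {"start": 4.2, "text": " Gracias por venir."},
    {"start": 9.7, "text": " Un placer."},
]
embeddings = rng.normal(size=(len(segments), 192))  # one 192-dim vector per segment

num_speakers = min(max(round(3), 1), len(segments))  # same clamping as transcribe()
labels = AgglomerativeClustering(num_speakers).fit(embeddings).labels_
for segment, label in zip(segments, labels):
    segment["speaker"] = "HABLANTE " + str(label + 1)

for segment in segments:
    print(segment["speaker"], segment["text"])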