Spaces:

salmanmapkar
/

audio-video-transcriber

Runtime error

App Files Files Community

salmanmapkar committed on Dec 22, 2022

Commit

4832cf7

•

1 Parent(s): 0278cb8

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -23

app.py CHANGED Viewed

@@ -13,6 +13,22 @@ import json
 # The Hugging Face access token is read from the environment (for example a Space
 # secret exposed as HF_TOKEN) instead of being hard-coded: a token committed to the
 # repository is public and must be revoked.
 pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.environ.get("HF_TOKEN"))
 from pydub.effects import speedup
 import moviepy.editor as mp
 __FILES = set()
@@ -131,6 +147,67 @@ def Transcribe(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
     RemoveAllFiles()
     return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
 def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
     if retries:
         # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
@@ -141,7 +218,7 @@ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5)
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
-        return Transcribe(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")
@@ -157,10 +234,10 @@ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5)
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
-        return Transcribe(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
-    return Transcribe(NumberOfSpeakers, SpeakerNames)
 def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
     if retries:
@@ -184,27 +261,10 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries =
             stream = ffmpeg.input('temp_audio.m4a')
             stream = ffmpeg.output(stream, 'temp_audio.wav')
             RemoveFile("temp_audio.m4a")
-            return Transcribe(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error(f"Unable to get video from {URL}")
-with gr.Blocks() as _block_ut:
-    ftxt, fjsonl, fcsv = True, False, False
-    def output_selection(_ftxt, _fjsonl, _fcsv):
-        global ftxt, fjsonl, fcsv
-        ftxt = _ftxt
-        fjsonl = _fjsonl
-        fcsv = _fcsv
-    with gr.Row():
-        nos =  gr.Number(label="Number of Speakers", value="0")
-        sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
-        url = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
-        ocb = gr.CheckboxGroup(["Text", "JSONL", "CSV"])
-        bt = gr.Button(fn=output_selection)
-        with gr.Column():
-            output_txt = gr.Textbox(label="Transcribed Text", lines=15, visible = ftxt)
-            output_jsonl = gr.JSON(label="Transcribed Text", visible = fjsonl)
 ut = gr.Interface(
     fn=YoutubeTranscribe,
     inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
@@ -221,5 +281,5 @@ at = gr.Interface(
     outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
 )
-demo = gr.TabbedInterface([_block_ut, vt, at], ["Youtube URL", "Video", "Audio"])
 demo.launch()

 # The Hugging Face access token is read from the environment (for example a Space
 # secret exposed as HF_TOKEN) instead of being hard-coded: a token committed to the
 # repository is public and must be revoked.
 pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.environ.get("HF_TOKEN"))
 from pydub.effects import speedup
 import moviepy.editor as mp
+import datetime
+import torch
+import pyannote.audio
+from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+from pyannote.audio import Audio
+from pyannote.core import Segment
+import wave
+import contextlib
+from sklearn.cluster import AgglomerativeClustering
+import numpy as np
+model = whisper.load_model("medium")
+embedding_model = PretrainedSpeakerEmbedding(
+	"speechbrain/spkrec-ecapa-voxceleb",
+	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+)
 __FILES = set()
     RemoveAllFiles()
     return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
+def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
+	audio = Audio()
+    GenerateSpeakerDict(speaker_names)
+	def get_output(segments):
+		# print(segments)
+		output = ''
+		for (i, segment) in enumerate(segments):
+    		if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+    			if i != 0:
+        			conversation.append([GetSpeaker(segment["speaker"]), segment["text"][1:]]) # segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
+    		conversation[-1][1] += segment["text"][1:]
+		# return output
+        return conversation, ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation]))
+	def get_duration(path):
+		with contextlib.closing(wave.open(path,'r')) as f:
+		frames = f.getnframes()
+		rate = f.getframerate()
+		return frames / float(rate)
+	def make_embeddings(path, segments, duration):
+		embeddings = np.zeros(shape=(len(segments), 192))
+		for i, segment in enumerate(segments):
+		embeddings[i] = segment_embedding(path, segment, duration)
+		return np.nan_to_num(embeddings)
+	def segment_embedding(path, segment, duration):
+		start = segment["start"]
+		# Whisper overshoots the end timestamp in the last segment
+		end = min(duration, segment["end"])
+		clip = Segment(start, end)
+		waveform, sample_rate = audio.crop(path, clip)
+		return embedding_model(waveform[None])
+	def add_speaker_labels(segments, embeddings, num_speakers):
+		clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+		labels = clustering.labels_
+		for i in range(len(segments)):
+		segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+	def time(secs):
+		return datetime.timedelta(seconds=round(secs))
+	duration = get_duration(audio)
+	if duration > 4 * 60 * 60:
+		return "Audio duration too long"
+	result = model.transcribe(audio)
+	segments = result["segments"]
+	num_speakers = min(max(round(num_speakers), 1), len(segments))
+	if len(segments) == 1:
+		segments[0]['speaker'] = 'SPEAKER 1'
+	else:
+		embeddings = make_embeddings(audio, segments, duration)
+		add_speaker_labels(segments, embeddings, num_speakers)
+	return get_output(segments)
+	# return output
 def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
     if retries:
         # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
+    return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
 def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
     if retries:
             stream = ffmpeg.input('temp_audio.m4a')
             stream = ffmpeg.output(stream, 'temp_audio.wav')
             RemoveFile("temp_audio.m4a")
+            return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error(f"Unable to get video from {URL}")
 ut = gr.Interface(
     fn=YoutubeTranscribe,
     inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
     outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
 )
+# Serve the three interfaces as tabs of a single app, labelled in the same order.
+demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
 demo.launch()