# Upload an audio file through the Colab file picker.
from google.colab import files

uploaded = files.upload()
path = next(iter(uploaded))

num_speakers = 2 #@param {type:"integer"}
language = 'English' #@param ['any', 'English']
model_size = 'large' #@param ['tiny', 'base', 'small', 'medium', 'large']

# Whisper has English-only ('.en') checkpoints for every size except 'large'.
model_name = model_size
if language == 'English' and model_size != 'large':
    model_name += '.en'

!pip install -q git+https://github.com/openai/whisper.git > /dev/null
!pip install -q git+https://github.com/pyannote/pyannote-audio > /dev/null

import whisper
import datetime
import subprocess
import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

# Speaker-embedding model (ECAPA-TDNN trained on VoxCeleb), on the Colab GPU.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda"))

from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib

from sklearn.cluster import AgglomerativeClustering
import numpy as np

# The wave module and the embedding model expect mono WAV; convert
# anything else with ffmpeg ('-ac 1' forces a single channel).
if path[-3:] != 'wav':
    subprocess.call(['ffmpeg', '-i', path, '-ac', '1', 'audio.wav', '-y'])
    path = 'audio.wav'

# Load the checkpoint picked above (model_name carries the '.en' suffix).
model = whisper.load_model(model_name)
result = model.transcribe(path)
segments = result["segments"]

# Total duration of the file, used to clip the final segment.
with contextlib.closing(wave.open(path, 'r')) as f:
    frames = f.getnframes()
    rate = f.getframerate()
    duration = frames / float(rate)

audio = Audio()

def segment_embedding(segment):
    start = segment["start"]
    # Whisper sometimes overshoots the end of the file.
    end = min(duration, segment["end"])
    clip = Segment(start, end)
    waveform, sample_rate = audio.crop(path, clip)
    return embedding_model(waveform[None])

# One 192-dimensional speaker embedding per transcript segment.
embeddings = np.zeros(shape=(len(segments), 192))
for i, segment in enumerate(segments):
    embeddings[i] = segment_embedding(segment)
embeddings = np.nan_to_num(embeddings)

# Group the segments into num_speakers clusters and label each one.
clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
labels = clustering.labels_
for i in range(len(segments)):
    segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

# Optional: swap the generic 'SPEAKER n' labels for real names detected
# from the transcript -- see the sketch after this script.

def time(secs):
    return datetime.timedelta(seconds=round(secs))

# Write the transcript, starting a new block whenever the speaker changes.
# segment["text"] begins with a space, hence the [1:].
f = open("transcript.txt", "w")
for (i, segment) in enumerate(segments):
    if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
        f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
    f.write(segment["text"][1:] + ' ')
f.close()
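
# A cleaned-up sketch of the commented-out renaming experiment: scan each
# speaker's lines in transcript.txt for a self-introduction of the form
# "my name is <Name>" and substitute that name for the generic label.
# The phrase pattern, the heading regex, and the rename_speakers helper
# are assumptions, not part of the pipeline above; a speaker who never
# introduces themselves keeps their 'SPEAKER n' label.
import re

def rename_speakers(transcript_path="transcript.txt"):
    with open(transcript_path, "r") as file:
        text = file.read()

    # Group the transcript text under its 'SPEAKER n <timestamp>' headings.
    spoken = {}
    current = None
    for line in text.splitlines():
        heading = re.match(r"(SPEAKER \d+) \d+:\d{2}:\d{2}", line)
        if heading:
            current = heading.group(1)
            spoken.setdefault(current, [])
        elif current:
            spoken[current].append(line)

    # Replace each label whose speaker introduces themselves by name.
    for label, lines in spoken.items():
        intro = re.search(r"[Mm]y name is (\w+)", " ".join(lines))
        if intro:
            text = text.replace(label, intro.group(1).capitalize())

    with open(transcript_path, "w") as file:
        file.write(text)

rename_speakers()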