Spaces:
Build error
Build error
from google.colab import files | |
uploaded = files.upload() | |
path = next(iter(uploaded)) | |
num_speakers = 2 #@param {type:"integer"} | |
language = 'English' #@param ['any', 'English'] | |
model_size = 'large' #@param ['tiny', 'base', 'small', 'medium', 'large'] | |
model_name = model_size | |
if language == 'English' and model_size != 'tiny': | |
model_name += '.en' | |
!pip install -q git+https://github.com/openai/whisper.git > /dev/null | |
!pip install -q git+https://github.com/pyannote/pyannote-audio > /dev/null | |
import whisper | |
import datetime | |
import subprocess | |
import torch | |
import pyannote.audio | |
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding | |
embedding_model = PretrainedSpeakerEmbedding( | |
"speechbrain/spkrec-ecapa-voxceleb", | |
device=torch.device("cuda")) | |
from pyannote.audio import Audio | |
from pyannote.core import Segment | |
import wave | |
import contextlib | |
from sklearn.cluster import AgglomerativeClustering | |
import numpy as np | |
if path[-3:] != 'wav': | |
subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y']) | |
path = 'audio.wav' | |
model = whisper.load_model(model_size) | |
result = model.transcribe(path) | |
segments = result["segments"] | |
with contextlib.closing(wave.open(path,'r')) as f: | |
frames = f.getnframes() | |
rate = f.getframerate() | |
duration = frames / float(rate) | |
audio = Audio() | |
def segment_embedding(segment): | |
start = segment["start"] | |
end = min(duration, segment["end"]) | |
clip = Segment(start, end) | |
waveform, sample_rate = audio.crop(path, clip) | |
return embedding_model(waveform[None]) | |
embeddings = np.zeros(shape=(len(segments), 192)) | |
for i, segment in enumerate(segments): | |
embeddings[i] = segment_embedding(segment) | |
embeddings = np.nan_to_num(embeddings) | |
clustering = AgglomerativeClustering(num_speakers).fit(embeddings) | |
labels = clustering.labels_ | |
for i in range(len(segments)): | |
segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1) | |
# speaker = 'Held' | |
# speaker = 'Heldisha' | |
# if segments[i]["speaker"]== 'SPEAKER 1': | |
# segments[i]["speaker"] = 'Held' | |
# elif segments[i]["speaker"]== 'SPEAKER 2': | |
# segments[i]["speaker"] = 'Heldisha' | |
# if segments[i]["speaker"]== 'SPEAKER 1': | |
# segments[i]["speaker"] = segments.index('n') | |
# k = list(segments) | |
# print(k[5]) | |
def time(secs): | |
return datetime.timedelta(seconds=round(secs)) | |
f = open("transcript.txt", "w") | |
for (i, segment) in enumerate(segments): | |
if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]: | |
f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n') | |
f.write(segment["text"][1:] + ' ') | |
f.close() | |
# with open('transcript.txt', 'r') as file: | |
# text = file.read() | |
# words = text.split() | |
# i = words.index('name') | |
# if (words[i-1] == 'My') or (words[i-1] == 'my') and (words[i+1] == 'is'): | |
# name1 = words[i+2] | |
# print(name1) | |
# with open('transcript.txt', 'r') as file: | |
# text = file.read() | |
# new_text = text.replace('SPEAKER 1', name1) | |
# with open('transcript.txt', 'w') as file: | |
# file.write(new_text) | |
# with open('transcript.txt', 'r') as file: | |
# text = file.read() | |
# words = text.split() | |
# i = words.index('name') | |
# if (words[i+3] == 'What') or (1<2) and (words[i+1] == 'is') or 1<2: | |
# name2 = words[i+22] | |
# print(name2) | |
# with open('transcript.txt', 'r') as file: | |
# text = file.read() | |
# new_text = text.replace('SPEAKER 2', name2) | |
# with open('transcript.txt', 'w') as file: | |
# file.write(new_text) | |