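# app.py — transcribe an uploaded recording with Whisper and attribute each
# segment to a speaker via speechbrain embeddings + agglomerative clustering.
# Written to run in Google Colab (file upload, shell installs, GPU runtime).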
# Upload an audio file through the Colab file picker.
from google.colab import files
uploaded = files.upload()
path = next(iter(uploaded))
num_speakers = 2 #@param {type:"integer"}
language = 'English' #@param ['any', 'English']
model_size = 'large' #@param ['tiny', 'base', 'small', 'medium', 'large']

# Every size except 'large' ships an English-only checkpoint (e.g. 'medium.en');
# there is no 'large.en', so only append the suffix for the smaller models.
model_name = model_size
if language == 'English' and model_size != 'large':
    model_name += '.en'
!pip install -q git+https://github.com/openai/whisper.git > /dev/null
!pip install -q git+https://github.com/pyannote/pyannote-audio > /dev/null
import contextlib
import datetime
import subprocess
import wave

import numpy as np
import torch
import whisper
from pyannote.audio import Audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.core import Segment
from sklearn.cluster import AgglomerativeClustering

# Speaker-embedding model used to characterise each transcript segment;
# fall back to CPU when no GPU runtime is attached.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# Re-encode to WAV if needed (ffmpeg ships with the Colab image).
if not path.lower().endswith('.wav'):
    subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
    path = 'audio.wav'
model = whisper.load_model(model_name)  # use the '.en' variant when selected above
result = model.transcribe(path)
segments = result["segments"]  # each segment carries start/end times and text
with contextlib.closing(wave.open(path, 'r')) as f:
    frames = f.getnframes()
    rate = f.getframerate()
    duration = frames / float(rate)
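# (pyannote's Audio helper can also report duration, e.g. Audio().get_duration(path)
# in pyannote.audio 2.x — handy if the input were not plain PCM WAV.)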
audio = Audio()

def segment_embedding(segment):
    """Return the 192-dim speaker embedding for one transcript segment."""
    start = segment["start"]
    # Whisper sometimes overshoots the end of the file; clamp to the real duration.
    end = min(duration, segment["end"])
    clip = Segment(start, end)
    waveform, sample_rate = audio.crop(path, clip)
    return embedding_model(waveform[None])
embeddings = np.zeros(shape=(len(segments), 192))
for i, segment in enumerate(segments):
    embeddings[i] = segment_embedding(segment)
embeddings = np.nan_to_num(embeddings)
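# If the true speaker count is unknown, one option (a sketch, not part of the
# original flow) is to pick the k in 2..8 with the best silhouette score:
#
# from sklearn.metrics import silhouette_score
# scores = {k: silhouette_score(embeddings,
#                               AgglomerativeClustering(k).fit_predict(embeddings))
#           for k in range(2, min(9, len(segments)))}
# num_speakers = max(scores, key=scores.get)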
clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
labels = clustering.labels_
for i in range(len(segments)):
    segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
# Optional: substitute real names for the generic labels once known, e.g.
# for i in range(len(segments)):
#     if segments[i]["speaker"] == 'SPEAKER 1':
#         segments[i]["speaker"] = 'Held'
#     elif segments[i]["speaker"] == 'SPEAKER 2':
#         segments[i]["speaker"] = 'Heldisha'
def time(secs):
    return datetime.timedelta(seconds=round(secs))
with open("transcript.txt", "w") as f:
    for i, segment in enumerate(segments):
        # Start a new block whenever the speaker changes.
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
        f.write(segment["text"].lstrip() + ' ')
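# Pull the finished transcript out of the Colab VM (optional; `files` was
# imported for the upload step above).
files.download('transcript.txt')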
# Optional post-processing: guess each speaker's name from the transcript
# itself ("My name is <X>") and substitute it for the generic label.
# Disabled because the word-offset heuristics are fragile and recording-specific.
# with open('transcript.txt', 'r') as file:
#     text = file.read()
# words = text.split()
# i = words.index('name')
# if words[i - 1].lower() == 'my' and words[i + 1] == 'is':
#     name1 = words[i + 2]
#     text = text.replace('SPEAKER 1', name1)
# # (A second pass with a different word offset did the same for 'SPEAKER 2'.)
# with open('transcript.txt', 'w') as file:
#     file.write(text)