dubbing / diarize.py
adastmin's picture
Upload 18 files
597a3c5
raw
history blame
2.9 kB
# This file contains all functions related to diarizing a video including optimization and processing a speech diary (rttm file)
# These functions use a functional approach, as I didn't want to group them into the video class and bloat it with such specific functions
# Perhaps going forward I should abstract diary entries as their own objects similar to dub_line, but I haven't decided yet, as diaries might be useful for voice cloning as well
import app_state
import utils
from Voice import Voice
from pyannote.audio import Pipeline
import torchaudio.transforms as T
import torchaudio
import random
pipeline = None
# Read RTTM files generated by Pyannote into an array of [speaker, start time, duration] entries for each speech segment in the audio
def load_diary(file):
    """Parse a Pyannote-generated RTTM file into a speech diary.

    Each diary entry is ``[speaker_label, start_seconds, duration_seconds]``
    (RTTM fields 7, 3, and 4 respectively). As a side effect, initializes
    ``app_state.speakers`` with one Voice per distinct speaker label found.

    :param file: path to the .rttm file to read
    :return: list of diary entries
    """
    diary = []
    with open(file, 'r', encoding='utf-8') as diary_file:
        # Stream line-by-line instead of read().split('\n'); the original
        # raised IndexError on a blank or truncated line.
        for line in diary_file:
            fields = line.split()
            if len(fields) < 8:  # skip blank/malformed lines
                continue
            diary.append([fields[7], float(fields[3]), float(fields[4])])
    total_speakers = len(set(entry[0] for entry in diary))
    app_state.speakers = initialize_speakers(total_speakers)
    return diary
# Time-shift the diary entries to align with the given start time, and convert 'SPEAKER_NN' labels to integer speaker indices
def update_diary_timing(diary, start_time):
    """Return a new diary with every entry's start shifted by start_time.

    Also converts the textual 'SPEAKER_NN' label of each entry into its
    integer index NN; durations are passed through unchanged.
    """
    shifted = []
    for speaker_label, start, duration in diary:
        speaker_index = int(speaker_label.split('_')[1])
        shifted.append([speaker_index, start + start_time, duration])
    return shifted
def initialize_speakers(speaker_count):
    """Build one Coqui TTS voice per diarized speaker.

    Each voice uses the VCTK VITS model with a speaker chosen at random
    from the sample speaker's available options.

    :param speaker_count: number of distinct speakers found in the diary
    :return: list of configured Voice objects
    """
    speaker_options = app_state.sample_speaker.list_speakers()
    speakers = []
    for index in range(speaker_count):
        voice = Voice(Voice.VoiceType.COQUI, f"Voice {index}")
        voice.set_voice_params('tts_models/en/vctk/vits', random.choice(speaker_options))
        speakers.append(voice)
    return speakers
def find_nearest_speaker(diary, sub):
    """Return the speaker of the diary entry starting closest to the subtitle.

    Compares the subtitle's start time against every diary entry's start
    time and returns the speaker field of the best match.
    """
    start_times = [entry[1] for entry in diary]
    nearest_index = utils.find_nearest(start_times, sub.start)
    return diary[nearest_index][0]
def optimize_audio_diarization(video):
    """Prepare the video's cropped audio track for diarization.

    Loads the cropped audio and trims leading silence with torchaudio's
    Vad transform so the diarization pipeline receives cleaner input.

    :param video: video object providing crop_audio()
    :return: (waveform, sample_rate) tuple
    """
    crop = video.crop_audio(True)
    waveform, sample_rate = torchaudio.load(crop)
    # T.Vad is voice-activity trimming (removes quiet leading audio);
    # it is not a general noise-reduction filter.
    trimmed_waveform = T.Vad(sample_rate=sample_rate)(waveform)
    # NOTE(review): the original also ran the audio through
    # T.Resample(orig_freq=sample_rate, new_freq=sample_rate), labeled
    # "Normalize audio" — resampling to the same rate is an identity
    # operation and performs no normalization, so it was removed.
    return trimmed_waveform, sample_rate
def run_diarization(video):
    """Run speaker diarization on a video and assign voices to its subtitles.

    Writes the diarization result to an .rttm file next to the video,
    loads it as a diary, shifts it by the video's start time, and tags
    each entry of video.subs_adjusted with the nearest speaker's voice.
    """
    global pipeline  # Probably should move this to app state?
    if not pipeline:
        import os
        # SECURITY: a real HuggingFace token was hard-coded here. It is now
        # read from the HF_TOKEN environment variable first (the committed
        # token remains only as a backward-compatible fallback and should
        # be revoked and removed).
        token = os.environ.get("HF_TOKEN", "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")
        pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0", use_auth_token=token)
        import torch
        # Guard the device move: the original unconditionally called
        # .to("cuda") and crashed on CPU-only machines.
        if torch.cuda.is_available():
            pipeline.to(torch.device("cuda"))
    output = utils.get_output_path(video.file, ".rttm")
    optimized, sample_rate = optimize_audio_diarization(video)
    diarization = pipeline({"waveform": optimized, "sample_rate": sample_rate})
    with open(output, "w") as rttm:
        diarization.write_rttm(rttm)
    diary = load_diary(output)
    diary = update_diary_timing(diary, video.start_time)
    for sub in video.subs_adjusted:
        sub.voice = find_nearest_speaker(diary, sub)