# This file contains all functions related to diarizing a video, including optimizing the audio and processing the resulting speech diary (RTTM file)
# These functions use a functional approach: I didn't want to group them into the video class and bloat it with such specific functions
# Perhaps going forward I should abstract diary entries as their own objects, similar to dub_line, but I haven't decided yet, as diaries might be useful for voice cloning as well

import os
import random

import app_state
import utils
from Voice import Voice
from pyannote.audio import Pipeline
import torch
import torchaudio
import torchaudio.transforms as T

pipeline = None # pyannote pipeline, lazily initialized in run_diarization

# Read an RTTM file generated by pyannote into a list of [speaker, start, duration] entries
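# An RTTM line has ten space-separated fields; the ones used here are
# start (index 3), duration (index 4), and the speaker label (index 7), e.g.:
# SPEAKER my_video 1 12.480 3.210 <NA> <NA> SPEAKER_00 <NA> <NA>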
def load_diary(file):
	diary = []
	with open(file, 'r', encoding='utf-8') as diary_file:
		for line in diary_file.read().strip().split('\n'):
			line_values = line.split(' ')
			diary.append([line_values[7], float(line_values[3]), float(line_values[4])])
	# Side effect: rebuild the global speaker list to match the number of diarized speakers
	total_speakers = len(set(entry[0] for entry in diary))
	app_state.speakers = initialize_speakers(total_speakers)
	return diary

# Shift the diary's start times by the video's start time (diarization ran on
# cropped audio) and convert speaker labels ("SPEAKER_NN") to integer indices
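# For example, ["SPEAKER_01", 12.48, 3.21] with start_time=4.0 becomes [1, 16.48, 3.21]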
def update_diary_timing(diary, start_time):
	return [[int(line[0].split('_')[1]), line[1] + start_time, line[2]] for line in diary]

# Create one Coqui TTS voice per detected speaker, each assigned a random
# speaker from the multi-speaker VCTK VITS model
def initialize_speakers(speaker_count):
	speakers = []
	speaker_options = app_state.sample_speaker.list_speakers()
	for i in range(speaker_count):
		speakers.append(Voice(Voice.VoiceType.COQUI, f"Voice {i}"))
		speakers[i].set_voice_params('tts_models/en/vctk/vits', random.choice(speaker_options))
	return speakers

# Return the speaker index of the diary entry whose start time is closest to
# the subtitle's start time
def find_nearest_speaker(diary, sub):
	return diary[
		utils.find_nearest(
			[diary_entry[1] for diary_entry in diary],
			sub.start
		)
	][0]
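
# utils.find_nearest is assumed to return the index of the list value closest
# to the target; a minimal sketch of that assumption (hypothetical, the real
# implementation lives in utils):
#
# def find_nearest(values, target):
# 	return min(range(len(values)), key=lambda i: abs(values[i] - target))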

# Prepare the video's cropped audio for diarization: trim non-speech with VAD
# and peak-normalize the waveform
def optimize_audio_diarization(video):
	crop = video.crop_audio(True)
	waveform, sample_rate = torchaudio.load(crop)

	# Trim silence with voice activity detection (T.Vad trims from the front;
	# it is not a denoiser)
	vad = T.Vad(sample_rate=sample_rate)
	clean_waveform = vad(waveform)

	# Peak-normalize so the loudest sample sits at +/-1.0
	peak = clean_waveform.abs().max().clamp(min=1e-8)
	normalized_waveform = clean_waveform / peak

	return normalized_waveform, sample_rate

# Run pyannote speaker diarization on the video's audio, write the result as an
# RTTM file, and assign each adjusted subtitle the nearest diary entry's speaker index
def run_diarization(video):
	global pipeline # Probably should move this to app state?
	if not pipeline:
		# pyannote's gated models need a Hugging Face access token; it is read
		# from the HF_TOKEN environment variable rather than stored in source
		pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0", use_auth_token=os.environ.get("HF_TOKEN"))
		pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
	output = utils.get_output_path(video.file, ".rttm")
	optimized, sample_rate = optimize_audio_diarization(video)
	# pyannote accepts an in-memory (channel, time) waveform plus its sample rate
	diarization = pipeline({"waveform": optimized, "sample_rate": sample_rate})
	with open(output, "w") as rttm:
		diarization.write_rttm(rttm)
	diary = load_diary(output)
	diary = update_diary_timing(diary, video.start_time)
	for sub in video.subs_adjusted:
		sub.voice = find_nearest_speaker(diary, sub)
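
# Example usage (hypothetical; assumes a Video object exposing file, start_time,
# crop_audio, and subs_adjusted as used above, and HF_TOKEN set in the environment):
#
# video = Video("episode_01.mkv")
# run_diarization(video)
# for sub in video.subs_adjusted:
# 	print(sub.start, sub.voice)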