Spaces:

Najeeb1
/

TextToSpeech

Running

App Files Files Community

TextToSpeech / inference.py

Najeeb1

Upload folder using huggingface_hub

acf2dde verified 11 months ago

raw

history blame contribute delete

9.93 kB

	import time
	import json
	import soundfile as sf
	import torch
	import torchaudio
	import librosa
	import yaml
	from munch import Munch
	from nltk.tokenize import word_tokenize
	from models import *
	from utils import *
	from text_utils import TextCleaner
	import phonemizer
	from Utils.PLBERT.util import load_plbert
	from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
	import pandas as pd
	# Setup
	# Turn off the cuDNN autotuner and ask cuDNN for deterministic kernels so
	# repeated runs on the same hardware pick the same convolution algorithms.
	# NOTE(review): this covers cuDNN only; the random seeds are not fixed here.
	torch.backends.cudnn.benchmark = False
	torch.backends.cudnn.deterministic = True
	import random
	import tqdm
	import argparse


	# Text front-end: maps a phoneme string to the token ids used by the model.
	text_cleaner = TextCleaner()

	# Mel-spectrogram extractor shared by every reference-audio computation.
	to_mel = torchaudio.transforms.MelSpectrogram(
	    n_fft=2048,
	    win_length=1200,
	    hop_length=300,
	    n_mels=80,
	)

	# Offset and scale applied to the log-mel values in preprocess().
	mean = -4
	std = 4


	# Load models and configurations
	device = 'cuda' if torch.cuda.is_available() else 'cpu'

	# LibriTTS model configuration. The file is opened in a `with` block so the
	# handle is closed as soon as the YAML has been parsed.
	with open("Models/LibriTTS/config.yml") as config_file:
	    config = yaml.safe_load(config_file)

	# LJSpeech alternative:
	# with open("Models/LJSpeech/Models_LJSpeech_config.yml") as config_file:
	#     config = yaml.safe_load(config_file)

	# load pretrained ASR model (used as the text aligner)
	ASR_config = config.get('ASR_config', False)
	ASR_path = config.get('ASR_path', False)
	text_aligner = load_ASR_models(ASR_path, ASR_config)

	# load pretrained F0 model (pitch extractor)
	F0_path = config.get('F0_path', False)
	pitch_extractor = load_F0_models(F0_path)

	# load BERT model
	BERT_path = config.get('PLBERT_dir', False)
	plbert = load_plbert(BERT_path)

	# Assemble the full model from the three pretrained components.
	model_params = recursive_munch(config['model_params'])
	model = build_model(model_params, text_aligner, pitch_extractor, plbert)

	# Put every sub-module in eval mode and move it to the selected device.
	for key in model:
	    model[key].eval()
	    model[key].to(device)

	# LibriTTS checkpoint.
	# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
	# code. Only load checkpoints that come from a trusted source.
	params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')

	# LJSpeech alternative:
	# params_whole = torch.load("Models/LJSpeech/epoch_2nd_00100.pth", map_location='cpu')

	# The per-module state dicts live under the 'net' key of the checkpoint.
	params = params_whole['net']


	def _strip_module_prefix(state_dict, prefix='module.'):
	    """Return a copy of ``state_dict`` with a leading ``prefix`` removed from its keys.

	    Keys that do not start with ``prefix`` are kept unchanged, so a state
	    dict that was never wrapped is passed through intact instead of having
	    its key names truncated.
	    """
	    from collections import OrderedDict

	    return OrderedDict(
	        (k[len(prefix):] if k.startswith(prefix) else k, v)
	        for k, v in state_dict.items()
	    )


	# Copy the checkpoint weights into every sub-module that has an entry.
	for key in model:
	    if key not in params:
	        continue
	    print('%s loaded' % key)
	    try:
	        model[key].load_state_dict(params[key])
	    except RuntimeError:
	        # load_state_dict raises RuntimeError when the key names do not match,
	        # e.g. when the weights were saved with a `module.` prefix. Retry with
	        # the prefix removed; strict=False tolerates remaining differences.
	        model[key].load_state_dict(_strip_module_prefix(params[key]), strict=False)

	# Make sure every sub-module is in eval mode after the weights are loaded.
	for key in model:
	    model[key].eval()

	# Diffusion sampler used by inference() to draw a style vector.
	sampler = DiffusionSampler(
	    model.diffusion.diffusion,
	    sampler=ADPM2Sampler(),
	    sigma_schedule=KarrasSchedule(
	        sigma_min=0.0001,
	        sigma_max=3.0,
	        rho=9.0,
	    ),  # empirical parameters
	    clamp=False,
	)

	# espeak-backed phonemizer for US English; punctuation and stress marks are kept.
	global_phonemizer = phonemizer.backend.EspeakBackend(
	    language='en-us',
	    preserve_punctuation=True,
	    with_stress=True,
	)


	# Preprocessing functions
	def length_to_mask(lengths):
	    """Build a boolean padding mask from a 1-D tensor of sequence lengths.

	    The result has shape ``(len(lengths), lengths.max())``; an entry is True
	    where the position lies at or beyond that row's length.
	    """
	    batch_size = lengths.shape[0]
	    positions = torch.arange(lengths.max()).unsqueeze(0)
	    positions = positions.expand(batch_size, -1).type_as(lengths)
	    # Position p (0-based) is padding when p + 1 > length.
	    return torch.gt(positions + 1, lengths.unsqueeze(1))


	def preprocess(wave):
	    """Convert a NumPy waveform into a normalised log-mel tensor.

	    The waveform is cast to float32, passed through the module-level
	    ``to_mel`` transform, given a leading axis, log-compressed (with a 1e-5
	    floor) and normalised with the module-level ``mean`` and ``std``.
	    """
	    mel = to_mel(torch.from_numpy(wave).float())
	    log_mel = torch.log(1e-5 + mel.unsqueeze(0))
	    return (log_mel - mean) / std


	def compute_style(path, device, model):
	    """Compute the style vector for a reference audio file.

	    Args:
	        path: Path of the reference audio file.
	        device: Torch device the mel tensor is moved to before encoding.
	        model: Model container exposing ``style_encoder`` and
	            ``predictor_encoder``.

	    Returns:
	        The outputs of ``style_encoder`` and ``predictor_encoder``
	        concatenated along dim 1 (style encoder output first).
	    """
	    target_sr = 24000
	    # librosa.load resamples to `sr` while reading, so the audio is already
	    # at the target rate and no separate resampling step is needed.
	    wave, _ = librosa.load(path, sr=target_sr)
	    # Drop leading/trailing audio more than 30 dB below the peak.
	    audio, _ = librosa.effects.trim(wave, top_db=30)
	    mel_tensor = preprocess(audio).to(device)

	    with torch.no_grad():
	        ref_s = model.style_encoder(mel_tensor.unsqueeze(1))
	        ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))

	    return torch.cat([ref_s, ref_p], dim=1)


	# Inference function
	def inference(text, ref_s, model, device, alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1):
	    """Synthesize speech for ``text`` in the style given by ``ref_s``.

	    Args:
	        text: Input sentence; double quotes are removed before phonemization.
	        ref_s: Reference style tensor as returned by ``compute_style``. The
	            first 128 columns are blended into the decoder style and the
	            remaining columns into the predictor style.
	        model: Model container (text encoder, BERT, predictor, decoder, ...).
	        device: Torch device on which inference runs.
	        alpha: Weight of the sampled style against the reference for the
	            first 128 columns (0 keeps the reference only).
	        beta: Same as ``alpha`` for the remaining columns.
	        diffusion_steps: Number of steps for the diffusion sampler.
	        embedding_scale: Passed through to the diffusion sampler.

	    Returns:
	        The generated waveform as a NumPy array with the last 50 samples
	        removed.
	    """
	    # NOTE(review): presumably the longest token sequence the text/BERT
	    # encoders accept - confirm against the model configuration.
	    max_length = 512

	    # Preprocess text: phonemize, re-tokenize, map to ids, prepend token 0.
	    text = text.replace('"', '')
	    ps = global_phonemizer.phonemize([text])
	    ps = ' '.join(word_tokenize(ps[0]))
	    tokens = text_cleaner(ps)
	    tokens.insert(0, 0)
	    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)

	    # `tokens` has shape (1, T), so the length check and the slice must act on
	    # the last axis; len(tokens) would always be 1 and never truncate.
	    if tokens.shape[-1] > max_length:
	        tokens = tokens[:, :max_length]
	    num_tokens = tokens.shape[-1]

	    with torch.no_grad():
	        # Process text
	        input_lengths = torch.LongTensor([num_tokens]).to(device)
	        text_mask = length_to_mask(input_lengths).to(device)
	        t_en = model.text_encoder(tokens, input_lengths, text_mask)
	        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
	        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

	        # Sample a style vector conditioned on the text embedding and the
	        # reference style.
	        s_pred = sampler(noise=torch.randn((1, 256)).unsqueeze(1).to(device),
	                         embedding=bert_dur,
	                         embedding_scale=embedding_scale,
	                         features=ref_s,  # reference from the same speaker as the embedding
	                         num_steps=diffusion_steps).squeeze(1)

	        s = s_pred[:, 128:]
	        ref = s_pred[:, :128]

	        # Blend the sampled style with the reference style.
	        ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
	        s = beta * s + (1 - beta) * ref_s[:, 128:]

	        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

	        # Predict a duration (in frames, at least 1) for every token.
	        x, _ = model.predictor.lstm(d)
	        duration = model.predictor.duration_proj(x)
	        duration = torch.sigmoid(duration).sum(axis=-1)
	        pred_dur = torch.round(duration.squeeze()).clamp(min=1)

	        # Hard alignment matrix: row i is 1 on the frames assigned to token i.
	        pred_aln_trg = torch.zeros(num_tokens, int(pred_dur.sum()))
	        c_frame = 0
	        for i in range(num_tokens):
	            n_frames = int(pred_dur[i])
	            pred_aln_trg[i, c_frame:c_frame + n_frames] = 1
	            c_frame += n_frames
	        alignment = pred_aln_trg.unsqueeze(0).to(device)

	        is_hifigan = model_params.decoder.type == "hifigan"

	        # encode prosody
	        en = d.transpose(-1, -2) @ alignment
	        if is_hifigan:
	            en = _shift_frames_right(en)

	        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

	        asr = t_en @ alignment
	        if is_hifigan:
	            asr = _shift_frames_right(asr)

	        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))

	    # Return synthesized speech without its last 50 samples.
	    # NOTE(review): presumably trims an artifact at the end of the output.
	    return out.squeeze().cpu().numpy()[..., :-50]


	def _shift_frames_right(features):
	    """Delay ``features`` by one step along the last axis, repeating index 0."""
	    shifted = torch.zeros_like(features)
	    shifted[:, :, 0] = features[:, :, 0]
	    shifted[:, :, 1:] = features[:, :, 0:-1]
	    return shifted


	# Function to generate synthetic voices for multiple texts
	def generate_synthetic_voice(transcription, ref_s, model, device, index, output_folder="Output", alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1):
	    """Synthesize one transcription and write it to a WAV file.

	    Args:
	        transcription: Text to synthesize.
	        ref_s: Reference style tensor passed on to ``inference``.
	        model: Model container passed on to ``inference``.
	        device: Torch device passed on to ``inference``.
	        index: Identifier used in the file name ``synthetic_voice_{index}.wav``.
	        output_folder: Directory the file is written to; it is created if it
	            does not exist yet.
	        alpha, beta, diffusion_steps, embedding_scale: Passed on to
	            ``inference`` unchanged.

	    Returns:
	        The path of the written file, ``"{output_folder}/synthetic_voice_{index}.wav"``.
	    """
	    # Local import: this file does not import `os` explicitly at the top.
	    import os

	    # Create the target directory first so a missing folder does not make
	    # sf.write fail after the (expensive) synthesis has already run.
	    if output_folder:
	        os.makedirs(output_folder, exist_ok=True)

	    wav = inference(transcription, ref_s, model, device, alpha, beta, diffusion_steps, embedding_scale)
	    audio_filename = f"synthetic_voice_{index}.wav"
	    output_path = f"{output_folder}/{audio_filename}"
	    sf.write(output_path, wav, 24000)

	    return output_path


	def main(max_files):
	    """Generate audio for CSV rows that do not have an audio file yet.

	    Reads ``transcriptions.csv``, and for each row whose ``audio_path`` is
	    missing or empty synthesizes the ``transcription`` column using a style
	    computed from a randomly chosen ``.wav`` file in ``reference_audio``.
	    The resulting path is written back to the row and the CSV is saved after
	    every generated file, so an interrupted run can be resumed.

	    Args:
	        max_files: Maximum number of new audio files to create. Infinity
	            (``float('inf')`` / ``np.inf``) means "process every pending row".
	    """
	    # Local import: this file does not import `os` explicitly at the top.
	    import os

	    csv_file = 'transcriptions.csv'
	    ref_audio_folder = 'reference_audio'

	    if not os.path.exists(csv_file):
	        print(f"CSV file {csv_file} does not exist.")
	        return

	    df = pd.read_csv(csv_file)

	    # Ensure the "audio_path" column exists
	    if 'audio_path' not in df.columns:
	        df['audio_path'] = ''
	    # An all-empty column is read back as float64 (NaN); use object dtype so
	    # string paths can be stored in it without a dtype conflict.
	    df['audio_path'] = df['audio_path'].astype(object)

	    # Rows that still need audio: path missing (NaN) or empty string.
	    pending = (df['audio_path'].isna() | (df['audio_path'] == '')).to_numpy()
	    total_pending = int(pending.sum())
	    # Never more than max_files, never negative; infinity collapses to
	    # "all pending rows" here.
	    num_rows_to_process = int(max(0, min(total_pending, max_files)))

	    new_audio_count = 0
	    if num_rows_to_process > 0:
	        # Get the list of reference audio files
	        if not os.path.isdir(ref_audio_folder):
	            print(f"Reference audio folder {ref_audio_folder} does not exist.")
	            return
	        ref_audio_files = [
	            os.path.join(ref_audio_folder, f)
	            for f in os.listdir(ref_audio_folder)
	            if f.endswith('.wav')
	        ]
	        if not ref_audio_files:
	            print(f"No .wav files found in {ref_audio_folder}.")
	            return

	        with tqdm.tqdm(total=num_rows_to_process, desc="Generating Audio Files") as progress_bar:
	            # Limiting the row list up front means max_files=0 creates nothing.
	            for index in df.index[pending][:num_rows_to_process]:
	                path = random.choice(ref_audio_files)
	                ref_s = compute_style(path, device, model)
	                transcription = df.at[index, 'transcription']
	                audio_path = generate_synthetic_voice(transcription, ref_s, model, device, index)

	                df.at[index, 'audio_path'] = audio_path  # Write audio path in dataframe
	                new_audio_count += 1
	                progress_bar.update(1)

	                # Save after every file so progress survives an interruption.
	                df.to_csv(csv_file, index=False)

	    # "All" is only reported when no pending row is left in the CSV.
	    if new_audio_count == total_pending:
	        print('All transcriptions have been created.')
	    else:
	        print(f'Finished creating {new_audio_count} new transcriptions.')


	if __name__ == "__main__":
	    # Command-line entry point: parse --max_files and run the batch job.
	    parser = argparse.ArgumentParser(description='Generate synthetic audio files.')
	    parser.add_argument('--max_files', type=str, default='5', help='Maximum number of new audio files to create or "all" to process all rows')
	    args = parser.parse_args()

	    if args.max_files.lower() == 'all':
	        # Infinity means "no limit". float('inf') compares equal to np.inf
	        # and does not rely on `np` being in scope.
	        max_files = float('inf')
	    else:
	        try:
	            max_files = int(args.max_files)
	        except ValueError:
	            # Report bad input as a usage error instead of a traceback.
	            parser.error(f'--max_files must be an integer or "all", got {args.max_files!r}')
	        if max_files < 0:
	            parser.error('--max_files must not be negative')

	    main(max_files=max_files)

	# Example: python inference.py --max_files=100
	# Creates up to 100 new audio files, or fewer if the CSV has fewer rows
	# without audio. Use --max_files=all to process every remaining row.