# GenerSpeech/data_gen/tts/wav_processors/common_processors.py
# Source page metadata: author Rongjiehuang, commit 222619b ("update"), 2.98 kB.
import os
import subprocess
import librosa
import numpy as np
from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
from data_gen.tts.data_gen_utils import trim_long_silences
from utils.audio import save_wav
from utils.rnnoise import rnnoise
from utils.hparams import hparams
@register_wav_processors(name='sox_to_wav')
class ConvertToWavProcessor(BaseWavProcessor):
    """Convert a non-WAV audio file to WAV by invoking the ``sox`` CLI."""

    @property
    def name(self):
        """Short identifier of this processor."""
        return 'ToWav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Return a WAV path for ``input_fn``, converting with sox if needed.

        Args:
            input_fn: Path of the input audio file.
            sr: Sample rate; passed through unchanged.
            tmp_dir, processed_dir, item_name, preprocess_args: Unused here;
                accepted so all processors share one call signature.

        Returns:
            ``(wav_path, sr)``. ``wav_path`` is ``input_fn`` itself when it
            already ends in ``.wav``; otherwise it is the path produced by
            ``self.output_fn(input_fn)`` (defined on the base class).

        Raises:
            subprocess.CalledProcessError: If sox exits with a non-zero status.
        """
        # The check is case-sensitive, so e.g. ``.WAV`` still goes through sox.
        if input_fn.endswith('.wav'):
            return input_fn, sr

        output_fn = self.output_fn(input_fn)
        # Pass an argv list instead of a shell string so that file names with
        # quotes, spaces, ``$`` or backticks are handed to sox verbatim and can
        # never be interpreted by a shell.
        # ``-v 0.95`` is sox's input volume factor.
        subprocess.check_call(
            ['sox', '-v', '0.95', input_fn, '-t', 'wav', output_fn])
        return output_fn, sr
@register_wav_processors(name='sox_resample')
class ResampleProcessor(BaseWavProcessor):
    """Resample an audio file to the requested sample rate when it differs."""

    @property
    def name(self):
        """Short identifier of this processor."""
        return 'Resample'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Resample ``input_fn`` to ``sr`` if its native rate is different.

        Args:
            input_fn: Path of the input audio file.
            sr: Target sample rate.
            tmp_dir, processed_dir, item_name, preprocess_args: Unused here;
                accepted so all processors share one call signature.

        Returns:
            ``(path, sr)``. ``path`` is ``input_fn`` unchanged when the file
            already has sample rate ``sr``; otherwise it is the path produced
            by ``self.output_fn(input_fn)`` (defined on the base class).

        Raises:
            subprocess.CalledProcessError: If sox exits with a non-zero status.
        """
        sr_file = librosa.core.get_samplerate(input_fn)
        if sr == sr_file:
            # Already at the target rate: nothing to do.
            return input_fn, sr

        output_fn = self.output_fn(input_fn)
        # Pass an argv list instead of a shell string so that file names with
        # quotes, spaces, ``$`` or backticks are handed to sox verbatim and can
        # never be interpreted by a shell.
        subprocess.check_call(
            ['sox', '-v', '0.95', input_fn, f'-r{sr}', output_fn])
        # NOTE(review): the audio is re-loaded from ``input_fn`` (not from the
        # sox result) and then written to ``output_fn`` again, so the sox
        # output above appears to be overwritten. Kept as-is to preserve the
        # produced audio; confirm which resampler is intended.
        y, _ = librosa.core.load(input_fn, sr=sr)
        # Also strips leading/trailing silence before saving.
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        return output_fn, sr
@register_wav_processors(name='trim_sil')
class TrimSILProcessor(BaseWavProcessor):
    """Trim leading and trailing silence from an audio file."""

    @property
    def name(self):
        """Short identifier of this processor."""
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Load ``input_fn`` at ``sr``, trim edge silence and save the result.

        Args:
            input_fn: Path of the input audio file.
            sr: Sample rate used both for loading and for saving.
            tmp_dir, processed_dir, item_name, preprocess_args: Unused here;
                accepted so all processors share one call signature.

        Returns:
            ``(output_fn, sr)`` where ``output_fn`` is the path produced by
            ``self.output_fn(input_fn)`` (defined on the base class).
        """
        output_fn = self.output_fn(input_fn)
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        # Return the same ``(path, sr)`` pair as every other processor in this
        # module; the original returned only the path, which breaks callers
        # that treat all processors uniformly.
        return output_fn, sr
@register_wav_processors(name='trim_all_sil')
class TrimAllSILProcessor(BaseWavProcessor):
    """Remove long silences via ``trim_long_silences`` and optionally save the mask."""

    @property
    def name(self):
        """Short identifier of this processor."""
        # NOTE(review): identical to TrimSILProcessor.name; possibly a
        # copy-paste leftover. Left unchanged because the value is a runtime
        # string that may be used elsewhere (e.g. in derived file names).
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Trim silences from ``input_fn`` and write the result as a wav.

        Args:
            input_fn: Path of the input audio file.
            sr: Sample rate used when saving the trimmed audio.
            tmp_dir: Unused here.
            processed_dir: Directory under which ``sil_mask/`` is created when
                the mask is saved.
            item_name: Base name (without extension) of the saved mask file.
            preprocess_args: Mapping of options. Reads
                ``vad_max_silence_length`` (default 12) and ``save_sil_mask``
                (default ``False``).

        Returns:
            ``(output_fn, sr)`` where ``output_fn`` is the path produced by
            ``self.output_fn(input_fn)`` (defined on the base class).
        """
        output_fn = self.output_fn(input_fn)
        max_silence = preprocess_args.get('vad_max_silence_length', 12)
        # The third value returned by trim_long_silences is not used here.
        y, audio_mask, _ = trim_long_silences(
            input_fn, vad_max_silence_length=max_silence)
        save_wav(y, output_fn, sr)

        # Treat a missing flag as "do not save", consistent with the optional
        # ``vad_max_silence_length`` above, instead of raising KeyError after
        # the wav has already been written.
        if preprocess_args.get('save_sil_mask', False):
            os.makedirs(f'{processed_dir}/sil_mask', exist_ok=True)
            np.save(f'{processed_dir}/sil_mask/{item_name}.npy', audio_mask)
        return output_fn, sr
@register_wav_processors(name='denoise')
class DenoiseProcessor(BaseWavProcessor):
    """Run the project's ``rnnoise`` helper on an audio file."""

    @property
    def name(self):
        """Short identifier of this processor."""
        return 'Denoise'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Denoise ``input_fn`` into a new file and return its path.

        Args:
            input_fn: Path of the input audio file.
            sr: Sample rate, forwarded to ``rnnoise`` as ``out_sample_rate``.
            tmp_dir, processed_dir, item_name, preprocess_args: Unused here;
                accepted so all processors share one call signature.

        Returns:
            ``(denoised_path, sr)`` where ``denoised_path`` is the path
            produced by ``self.output_fn(input_fn)`` (defined on the base
            class).
        """
        denoised_path = self.output_fn(input_fn)
        rnnoise(input_fn, denoised_path, out_sample_rate=sr)
        return denoised_path, sr