import os import glob import torch from glob import glob import numpy as np from pydub import AudioSegment from faster_whisper import WhisperModel from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments model_size = "medium" # Run on GPU with FP16 model = None def split_audio_whisper(audio_path, target_dir='processed'): global model if model is None: model = WhisperModel(model_size, device="cuda", compute_type="float16") audio = AudioSegment.from_file(audio_path) max_len = len(audio) audio_name = os.path.basename(audio_path).rsplit('.', 1)[0] target_folder = os.path.join(target_dir, audio_name) segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True) segments = list(segments) # create directory os.makedirs(target_folder, exist_ok=True) wavs_folder = os.path.join(target_folder, 'wavs') os.makedirs(wavs_folder, exist_ok=True) # segments s_ind = 0 start_time = None for k, w in enumerate(segments): # process with the time if k == 0: start_time = max(0, w.start) end_time = w.end # calculate confidence if len(w.words) > 0: confidence = sum([s.probability for s in w.words]) / len(w.words) else: confidence = 0. # clean text text = w.text.replace('...', '') # left 0.08s for each audios audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)] # segment file name fname = f"{audio_name}_seg{s_ind}.wav" # filter out the segment shorter than 1.5s and longer than 20s save = audio_seg.duration_seconds > 1.5 and \ audio_seg.duration_seconds < 20. and \ len(text) >= 2 and len(text) < 200 if save: output_file = os.path.join(wavs_folder, fname) audio_seg.export(output_file, format='wav') if k < len(segments) - 1: start_time = max(0, segments[k+1].start - 0.08) s_ind = s_ind + 1 return wavs_folder def split_audio_vad(audio_path, target_dir, split_seconds=10.0): SAMPLE_RATE = 16000 audio_vad = get_audio_tensor(audio_path) segments = get_vad_segments( audio_vad, output_sample=True, min_speech_duration=0.1, min_silence_duration=1, method="silero", ) segments = [(seg["start"], seg["end"]) for seg in segments] segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments] print(segments) audio_active = AudioSegment.silent(duration=0) audio = AudioSegment.from_file(audio_path) for start_time, end_time in segments: audio_active += audio[int( start_time * 1000) : int(end_time * 1000)] audio_dur = audio_active.duration_seconds print(f'after vad: dur = {audio_dur}') audio_name = os.path.basename(audio_path).rsplit('.', 1)[0] target_folder = os.path.join(target_dir, audio_name) wavs_folder = os.path.join(target_folder, 'wavs') os.makedirs(wavs_folder, exist_ok=True) start_time = 0. count = 0 num_splits = int(np.round(audio_dur / split_seconds)) assert num_splits > 0, 'input audio is too short' interval = audio_dur / num_splits for i in range(num_splits): end_time = min(start_time + interval, audio_dur) if i == num_splits - 1: end_time = audio_dur output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav" audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)] audio_seg.export(output_file, format='wav') start_time = end_time count += 1 return wavs_folder def get_se(audio_path, vc_model, target_dir='processed', vad=True): device = vc_model.device audio_name = os.path.basename(audio_path).rsplit('.', 1)[0] se_path = os.path.join(target_dir, audio_name, 'se.pth') if os.path.isfile(se_path): se = torch.load(se_path).to(device) return se, audio_name if os.path.isdir(audio_path): wavs_folder = audio_path elif vad: wavs_folder = split_audio_vad(audio_path, target_dir) else: wavs_folder = split_audio_whisper(audio_path, target_dir) audio_segs = glob(f'{wavs_folder}/*.wav') if len(audio_segs) == 0: raise NotImplementedError('No audio segments found!') return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name