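"""Extract speaker embeddings for a tree of audio files and cache them as
.pt files mirroring the input directory structure.

Silero VAD (via whisper_timestamped) splits each file into speech segments;
a pretrained SpeakerEncoder embeds each segment, and the per-segment
embeddings are averaged into one utterance-level embedding per file.
"""
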
import os
import torch
import librosa
from tqdm import tqdm
from speaker_encoder.voice_encoder import SpeakerEncoder
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments


@torch.no_grad()
def se_extractor(audio_path, smodel):
    # VAD: locate speech regions with Silero (via whisper_timestamped).
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    # Convert segment boundaries from sample indices to seconds.
    segments = [(seg["start"], seg["end"]) for seg in segments]
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]

    if len(segments) == 0:
        # No speech detected: fall back to treating the whole clip as one segment.
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
        print(segments)

    # Speaker embedding: embed each speech segment, then average the results.
    gs = []

    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE)

    for s, e in segments:
        y = audio[int(SAMPLE_RATE * s):int(SAMPLE_RATE * e)]
        g = smodel.embed_utterance(y)
        g = torch.from_numpy(g).unsqueeze(0)
        gs.append(g)

    # Average the per-segment embeddings into a single utterance-level embedding.
    gs = torch.stack(gs).mean(0)
    return gs.cpu()


def process_audio_folder(input_folder, output_folder, model, device):
    """
    Process all audio files in a folder and its subfolders, 
    save the extracted features as .pt files in the output folder with the same structure.

    Args:
        input_folder (str): Path to the input folder containing audio files.
        output_folder (str): Path to the output folder to save .pt files.
        model: Pre-trained model for feature extraction.
        device: Torch device (e.g., 'cpu' or 'cuda').
    """
    # Collect all audio file paths
    audio_files = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.endswith(('.wav', '.mp3', '.flac')):  # Adjust for the audio formats you want to process
                audio_files.append(os.path.join(root, file))

    # Process each audio file with tqdm for progress
    for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
        # Construct output path
        relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
        output_dir = os.path.join(output_folder, relative_path)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')

        # Check if the .pt file already exists
        if os.path.exists(output_path):
            # print(f"Skipped (already exists): {output_path}")
            continue  # Skip processing this file
        # Extract the speaker embedding and save it as a .pt file.
        # Note: the tensor is saved on `device`, so pass map_location to
        # torch.load when loading it on a machine without that device.
        target_se = se_extractor(audio_path, model).to(device)
        torch.save(target_se, output_path)
        # print(f"Processed and saved: {output_path}")


if __name__ == '__main__':
    smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
    device = 'cuda'
    # input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
    # output_folder = 'spk/LibriTTS-R/train-clean-360/'
    # process_audio_folder(input_folder, output_folder, smodel, device)

    input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/'
    output_folder = 'spk/VCTK/VCTK-Corpus/'
    process_audio_folder(input_folder, output_folder, smodel, device)
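
    # Downstream usage (a sketch; the .pt path below is hypothetical):
    #   se = torch.load('spk/VCTK/VCTK-Corpus/p225/p225_001.pt',
    #                   map_location='cpu')
    #   # `se` is a (1, embed_dim) float tensor, presumably consumed as the
    #   # target speaker embedding by the downstream TTS/VC model.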