File size: 2,442 Bytes
9fd672f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import os
import shutil
import torch
import torchaudio
from pathlib import Path
def _write_name2text(phoneme_path, name2text_path, language="Hindi"):
    """Convert a ``path|text`` listing into the tab-separated name2text file.

    Each input line is split on ``|``. The first field is reduced to its
    base name and the second field is taken as the text. Every kept entry
    is written as ``wav_name<TAB>text<TAB>0<TAB>language``.

    Lines with fewer than two fields, or with an empty wav name or empty
    text, are skipped: such a row cannot be matched to an audio file or
    carries nothing to train on.

    Args:
        phoneme_path: Path of the UTF-8 input listing.
        name2text_path: Path of the UTF-8 output file (overwritten).
        language: Language tag written in the last column.

    Returns:
        The number of entries written.
    """
    written = 0
    with open(phoneme_path, "r", encoding="utf8") as f_in, \
            open(name2text_path, "w", encoding="utf8") as f_out:
        for line in f_in:
            parts = line.strip().split("|")
            if len(parts) < 2:
                continue
            wav_name = os.path.basename(parts[0])
            text = parts[1]
            if not wav_name or not text:
                continue
            # Format: wav_name \t text \t speaker_id \t language_id
            f_out.write(f"{wav_name}\t{text}\t0\t{language}\n")
            written += 1
    return written


def _resample_wavs(wav_dir, wav32k_dir, target_sr=32000):
    """Write every ``.wav`` file in ``wav_dir`` to ``wav32k_dir`` at ``target_sr``.

    Files whose sample rate already equals ``target_sr`` are re-saved
    without resampling. Files are processed in sorted name order so that
    runs are reproducible.

    Args:
        wav_dir: Directory containing the source ``.wav`` files.
        wav32k_dir: Existing directory that receives the converted files.
        target_sr: Output sample rate in Hz.

    Returns:
        The number of files written.
    """
    # One Resample transform per distinct source rate; building it is
    # loop-invariant work for all files that share a sample rate.
    resamplers = {}
    converted = 0
    for wav_file in sorted(os.listdir(wav_dir)):
        if not wav_file.endswith(".wav"):
            continue
        src_path = os.path.join(wav_dir, wav_file)
        dst_path = os.path.join(wav32k_dir, wav_file)
        waveform, sr = torchaudio.load(src_path)
        if sr != target_sr:
            resampler = resamplers.get(sr)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(sr, target_sr)
                resamplers[sr] = resampler
            waveform = resampler(waveform)
        torchaudio.save(dst_path, waveform, target_sr)
        converted += 1
    return converted


def prepare_data_stage2(data_dir="data8", exp_dir="logs/s2", *, language="Hindi"):
    """Prepare data for stage 2 training.

    Creates the ``4-cnhubert`` and ``5-wav32k`` sub-directories under the
    experiment directory, converts ``<data_dir>/phoneme.txt`` into
    ``<exp_dir>/2-name2text.txt``, and writes a 32 kHz copy of every
    ``.wav`` file found in ``<data_dir>/wavs`` into ``5-wav32k``.
    ``4-cnhubert`` is only created here; it is not populated.

    Args:
        data_dir: Dataset directory. Relative paths are resolved against
            the directory two levels above this file; absolute paths are
            used as given.
        exp_dir: Experiment output directory, resolved the same way.
        language: Language tag written into the last column of
            ``2-name2text.txt``.

    Raises:
        FileNotFoundError: If ``phoneme.txt`` or the ``wavs`` directory
            is missing.
    """
    # Project root: two directory levels above this file.
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # os.path.join keeps an absolute second argument unchanged, so callers
    # may pass either relative or absolute paths.
    data_dir = os.path.join(root_dir, data_dir)
    exp_dir = os.path.join(root_dir, exp_dir)

    print(f"Data directory: {data_dir}")
    print(f"Experiment directory: {exp_dir}")

    # Creating the sub-directories also creates exp_dir itself.
    wav32k_dir = os.path.join(exp_dir, "5-wav32k")
    os.makedirs(os.path.join(exp_dir, "4-cnhubert"), exist_ok=True)
    os.makedirs(wav32k_dir, exist_ok=True)

    # Convert phoneme.txt to name2text.txt format
    phoneme_path = os.path.join(data_dir, "phoneme.txt")
    name2text_path = os.path.join(exp_dir, "2-name2text.txt")

    print(f"Reading phoneme data from: {phoneme_path}")
    print(f"Writing text data to: {name2text_path}")
    _write_name2text(phoneme_path, name2text_path, language)

    # Resample wav files into 5-wav32k
    wav_dir = os.path.join(data_dir, "wavs")

    print(f"Processing wav files from: {wav_dir}")
    print(f"Saving to: {wav32k_dir}")
    _resample_wavs(wav_dir, wav32k_dir, 32000)

    print("Data preparation complete. Please run the Hubert feature extraction before training.")
if __name__ == "__main__":
    # Run the preparation with the default data and experiment directories.
    prepare_data_stage2()