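"""Data preparation utilities: encode audio prompts with a neural codec and
phonemize their transcripts, then either pack the results into an HDF5 file
plus an annotation file, or build a training dataloader from processed data."""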
import h5py
import glob
import torch
import numpy as np
import os
import torchaudio
import soundfile as sf
from utils.g2p.symbols import symbols
from utils.g2p import PhonemeBpeTokenizer
from utils.prompt_making import make_prompt, make_transcript
from data.collation import get_text_token_collater
from data.dataset import create_dataloader
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
from data.tokenizer import (
    AudioTokenizer,
    tokenize_audio,
)
tokenizer_path = "./utils/g2p/bpe_175.json"
tokenizer = PhonemeBpeTokenizer(tokenizer_path)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def make_prompts(name, audio_prompt_path, transcript=None):
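    """Tokenize one audio prompt and its transcript.

    Loads the audio at ``audio_prompt_path`` (mixing stereo down to mono),
    encodes it with the neural codec, phonemizes the transcript (generating
    one via ``make_transcript`` when ``transcript`` is None), and returns
    ``(audio_tokens, text_tokens, langs, text_pr)``.
    """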
    text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_175.json")
    text_collater = get_text_token_collater()
    codec = AudioTokenizer(device)
    wav_pr, sr = torchaudio.load(audio_prompt_path)
    # check length
    if wav_pr.size(-1) / sr > 15:
raise ValueError(f"Prompt too long, expect length below 15 seconds, got {wav_pr / sr} seconds.")
    if wav_pr.size(0) == 2:
        wav_pr = wav_pr.mean(0, keepdim=True)
    text_pr, lang_pr = make_transcript(name, wav_pr, sr, transcript)

    # tokenize audio
    encoded_frames = tokenize_audio(codec, (wav_pr, sr))
    audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()

    # tokenize text
    phonemes, langs = text_tokenizer.tokenize(text=f"{text_pr}".strip())
    text_tokens, enroll_x_lens = text_collater(
        [
            phonemes
        ]
    )

    return audio_tokens, text_tokens, langs, text_pr
def create_dataset(data_dir, dataloader_process_only):
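    """Either pre-process a folder of .wav files or build a dataloader.

    With ``dataloader_process_only=True``, tokenizes every .wav file in
    ``data_dir``, stores the audio tokens in ``audio_sum.hdf5`` (one HDF5
    group per file stem), and appends ``stem|duration|language|transcript``
    lines to ``audio_ann_sum.txt``. Otherwise returns a dataloader built
    from the data directory.
    """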
    if dataloader_process_only:
        h5_output_path = f"{data_dir}/audio_sum.hdf5"
        ann_output_path = f"{data_dir}/audio_ann_sum.txt"
        # audio_folder = os.path.join(data_dir, 'audio')
        audio_paths = glob.glob(f"{data_dir}/*.wav")  # Change this to match your audio file extension

        # Create or open an HDF5 file
        with h5py.File(h5_output_path, 'w') as h5_file:
            # Loop through each audio and text file, assuming they have the same stem
            for audio_path in audio_paths:
                stem = os.path.splitext(os.path.basename(audio_path))[0]
                audio_tokens, text_tokens, langs, text = make_prompts(name=stem, audio_prompt_path=audio_path)
                text_tokens = text_tokens.squeeze(0)
                # Create a group for each stem
                grp = h5_file.create_group(stem)
                # Add the audio tokens as a dataset to the group (text dataset creation is disabled below)
                grp.create_dataset('audio', data=audio_tokens)
                # grp.create_dataset('text', data=text_tokens)

                with open(ann_output_path, 'a', encoding='utf-8') as ann_file:
                    try:
                        audio, sample_rate = sf.read(audio_path)
                        duration = len(audio) / sample_rate
                        ann_file.write(f'{stem}|{duration}|{langs[0]}|{text}\n')  # add a trailing newline
print(f"Successfully wrote to {ann_output_path}")
except Exception as e:
print(f"An error occurred: {e}")
    else:
        dataloader = create_dataloader(data_dir=data_dir, max_duration=20)
        return dataloader
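
# Usage sketch (not part of the original file; the "./data" path is illustrative):
# 1) pre-process a folder of .wav files into audio_sum.hdf5 + audio_ann_sum.txt,
# 2) call again with dataloader_process_only=False to obtain the dataloader.
if __name__ == "__main__":
    create_dataset("./data", dataloader_process_only=True)
    dataloader = create_dataset("./data", dataloader_process_only=False)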