|
from utils.argutils import print_args |
|
from pathlib import Path |
|
import argparse |
|
import sys |
|
import wave |
|
import os |
|
from itertools import chain |
|
from tqdm import tqdm |
|
import re |
|
|
|
def preprocess_kspon(input_dirs): |
|
folders = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs)) |
|
|
|
for folder in tqdm(folders, "folders", len(folders), unit="folders"): |
|
texts = list() |
|
symbol = ["o/", "b/", "l/", "n/", "u/", "+", "*", "(", "/"] |
|
punctuation = [" ", ".", "?", "!"] |
|
white = [" ", " ", ",,", ",,,"] |
|
|
|
existing_fnames = list() |
|
for file in folder.glob("*"): |
|
existing_fnames.append(file) |
|
|
|
|
|
if str(file).endswith(".txt") and not str(file).endswith("alignment.txt"): |
|
s = os.path.splitext(file) |
|
s = os.path.split(s[0]) |
|
|
|
with open(file, "r", encoding='cp949') as f: |
|
texts.append(s[1] + "$\"" + "|" + " ".join(f.read().splitlines()) + "|" + "\"\n") |
|
|
|
for i, text in enumerate(texts): |
|
text = re.sub('\)\/\([κ°-ν£\s\w]*\)', "", text) |
|
for sym in symbol: |
|
text = text.replace(sym, "") |
|
for pun in punctuation: |
|
text = text.replace(pun, " ") |
|
for wh in white: |
|
text = text.replace(wh, ",") |
|
text = text.replace("$", " ") |
|
text = text.replace("|", ",") |
|
text = text.replace(",,", ",") |
|
texts[i] = text |
|
with open(os.path.join(folder, os.path.basename(folder) + "_alignment.txt"), "w", encoding='cp949') as a: |
|
for text in texts: |
|
a.write(text) |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser( |
|
description="pcm, raw νμ₯μ νμΌμ wavνμ₯μλ‘ λ³ν", |
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter |
|
) |
|
|
|
parser.add_argument("path", type=str, help="μ²λ¦¬ν ν΄λ κ²½λ‘") |
|
args = parser.parse_args() |
|
|
|
dataset_root = Path(args.path) |
|
input_dirs = [dataset_root.joinpath("KsponSpeech_01"), |
|
dataset_root.joinpath("KsponSpeech_02"), |
|
dataset_root.joinpath("KsponSpeech_03"), |
|
dataset_root.joinpath("KsponSpeech_04"), |
|
dataset_root.joinpath("KsponSpeech_05")] |
|
|
|
preprocess_kspon(input_dirs) |
|
|