| import os | |
| import tempfile | |
| import re | |
| import argparse | |
| from tqdm import tqdm | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("--txt", type=str) | |
| parser.add_argument("--lid", type=str) | |
| parser.add_argument("--dst", type=str) | |
| parser.add_argument("--model", type=str) | |
| args = parser.parse_args() | |
| UROMAN_PL = args.model + "uroman/bin/uroman.pl" | |
| def norm_uroman(text): | |
| text = text.lower() | |
| text = text.replace("’", "'") | |
| text = re.sub("([^a-z' ])", " ", text) | |
| text = re.sub(" +", " ", text) | |
| return text.strip() | |
| def uromanize(words): | |
| iso = "xxx" | |
| with tempfile.NamedTemporaryFile() as tf, tempfile.NamedTemporaryFile() as tf2: | |
| with open(tf.name, "w") as f: | |
| f.write("\n".join(words)) | |
| cmd = f"perl " + UROMAN_PL | |
| cmd += f" -l {iso} " | |
| cmd += f" < {tf.name} > {tf2.name}" | |
| os.system(cmd) | |
| lexicon = {} | |
| with open(tf2.name) as f: | |
| for idx, line in enumerate(f): | |
| if not line.strip(): | |
| continue | |
| line = re.sub(r"\s+", "", norm_uroman(line)).strip() | |
| lexicon[words[idx]] = " ".join(line) + " |" | |
| return lexicon | |
| def convert_sent(txt, char_lang=False): | |
| if char_lang: | |
| words = txt | |
| else: | |
| words = txt.split(" ") | |
| lexicon = uromanize(words) | |
| pron = [] | |
| pron_no_sp = [] | |
| for w in words: | |
| if w in lexicon: | |
| pron.append(lexicon[w]) | |
| pron_no_sp.append(lexicon[w].replace(" |", "")) | |
| return " ".join(pron), " ".join(pron_no_sp) | |
| if __name__ == "__main__": | |
| if not os.path.exists(args.dst): | |
| os.makedirs(args.dst) | |
| txts = [x.strip() for x in open(args.txt, "r").readlines()] | |
| langs = [x.strip() for x in open(args.lid, "r").readlines()] | |
| assert len(txts) == len(langs) | |
| cer_langs = [x.strip() for x in open("cer_langs.txt", "r").readlines()] | |
| with open(args.dst + "/nbest_asr_hyp_uroman", "w", buffering=1) as f: | |
| for t, l in tqdm(zip(txts,langs), total=len(txts)): | |
| pron, _ = convert_sent(t, l in cer_langs) | |
| f.write(pron + "\n") | |