#!/usr/bin/env python3
# Copyright    2023  Xiaomi Corp.        (authors: Fangjun Kuang)

from phonemizer import phonemize
import re


def read_lexicon(in_file="./CMU.in.IPA.txt"):
    """Read the headwords from a comma-separated ``word,pronunciation`` file.

    A line is skipped when it does not contain exactly one comma, or when
    the text before the comma (after stripping whitespace) contains anything
    other than ASCII letters, apostrophes, hyphens, or periods.

    Args:
      in_file:
        Path of the lexicon file to read. It is decoded as UTF-8.

    Returns:
      A list of the accepted words, in the order they appear in the file.

    Raises:
      AssertionError: If an accepted word appears more than once.
    """
    # Raw string, with the hyphen placed last so it is a literal "-".
    # Writing it as "'-\." would create a range from "'" to "." that also
    # lets "(", ")", "*" and "+" through.
    pattern = re.compile(r"^[a-zA-Z'.-]+$")

    seen = set()  # fast duplicate detection
    words = []  # keeps file order so the result is reproducible
    with open(in_file, encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split(",")
            if len(fields) != 2:
                # Not a plain "word,pronunciation" entry.
                continue

            word = fields[0].strip()
            if not pattern.match(word):
                continue

            assert word not in seen, word
            seen.add(word)
            words.append(word)
    return words


def main():
    """Phonemize every lexicon word and write the result to ``lexicon.txt``.

    The words returned by ``read_lexicon()`` are sent to ``phonemize`` in
    chunks, with a progress line printed before each chunk. Each output line
    is the word followed by its phoneme string with a space inserted between
    every character.
    """
    lexicon = read_lexicon()
    total = len(lexicon)
    chunk_size = 5000
    pronunciations = {}

    for start in range(0, total, chunk_size):
        print(f"{start}/{total}, {start/total*100:.3f}%")
        chunk = lexicon[start : start + chunk_size]
        ipa_strings = phonemize(
            chunk,
            language="en-us",
            backend="espeak",
            strip=True,
            preserve_punctuation=True,
            with_stress=True,
        )
        # Joining a string on " " separates its individual characters.
        pronunciations.update(
            (word, " ".join(ipa)) for word, ipa in zip(chunk, ipa_strings)
        )

    with open("lexicon.txt", "w", encoding="utf-8") as out:
        out.writelines(
            f"{word} {ipa}\n" for word, ipa in pronunciations.items()
        )

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()