#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
"""Generate ``lexicon.txt`` from the words listed in ``CMU.in.IPA.txt``.

The words are read from the input file, phonemized in batches with the
espeak backend of ``phonemizer`` (language ``en-us``), and written to
``lexicon.txt`` as one ``word p1 p2 ...`` line per word, where every
character of the phonemizer output is separated by a single space.
"""

import re
from typing import List

from phonemizer import phonemize


def read_lexicon(in_file: str = "./CMU.in.IPA.txt") -> List[str]:
    """Return the sorted, unique words found in ``in_file``.

    A line is used only if it consists of exactly two comma-separated
    fields; the first field (stripped of surrounding whitespace) is the
    word and the second field is ignored. Words containing anything other
    than ASCII letters, ``'``, ``.`` and ``-`` are skipped.

    Args:
      in_file:
        Path to the input text file. It is decoded as UTF-8.

    Returns:
      The accepted words in sorted order, so that repeated runs produce
      the same output ordering.

    Raises:
      ValueError: If an accepted word appears more than once.
    """
    # Raw string, with "-" placed last so it is a literal hyphen. The
    # previous class "'-\." was a *range* from "'" to "." and therefore
    # also accepted "(", ")", "*", "+" and ",".
    pattern = re.compile(r"[a-zA-Z'.-]+")

    words = set()
    with open(in_file, encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split(",")
            if len(fields) != 2:
                # Not a "<word>,<something>" line; ignore it.
                continue

            word = fields[0].strip()
            if not pattern.fullmatch(word):
                continue

            # An explicit raise (instead of assert) keeps the check active
            # when Python runs with -O.
            if word in words:
                raise ValueError(f"Duplicate word in {in_file}: {word}")
            words.add(word)

    return sorted(words)


def main():
    """Phonemize every word from ``read_lexicon()`` and write ``lexicon.txt``."""
    words = read_lexicon()
    num_words = len(words)
    batch = 5000  # number of words passed to phonemize() per call

    word2ipa = {}
    for start in range(0, num_words, batch):
        print(f"{start}/{num_words}, {start/num_words*100:.3f}%")
        this_batch = words[start : start + batch]

        phonemes = phonemize(
            this_batch,
            language="en-us",
            backend="espeak",
            strip=True,
            preserve_punctuation=True,
            with_stress=True,
        )

        # zip() would silently truncate on a length mismatch and pair words
        # with the wrong pronunciations, so fail loudly instead.
        if len(phonemes) != len(this_batch):
            raise RuntimeError(
                f"phonemize() returned {len(phonemes)} results "
                f"for {len(this_batch)} words"
            )

        for w, p in zip(this_batch, phonemes):
            # Separate every character of the phonemizer output by a space.
            word2ipa[w] = " ".join(p)

    with open("lexicon.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{w} {p}\n" for w, p in word2ipa.items())


if __name__ == "__main__":
    main()