KLEA / khmer_phonemizer.py
seanghay's picture
init
d5ed1ca verified
raw
history blame
No virus
1.42 kB
r"""
Khmer Phonemizer - a free, standalone, open-source Khmer grapheme-to-phoneme (G2P) converter.
"""
import os
import csv
from g2p import PhonetisaurusGraph
def _read_lexicon_file(file):
    """Load a tab-separated pronunciation lexicon into a dictionary.

    Each non-blank row must have exactly two tab-separated columns: the word
    and its phonemes separated by whitespace.

    Args:
        file: Path of the lexicon file to read.

    Returns:
        A dict mapping each stripped word to its list of phoneme strings.
        If a word occurs more than once, the last row wins.

    Raises:
        ValueError: If a non-blank row does not have exactly two columns.
    """
    lexicon = {}
    # The lexicon holds non-ASCII text, so decode explicitly as UTF-8 rather
    # than relying on the platform default encoding (assumes the file is
    # UTF-8 encoded). newline="" is what the csv module expects.
    with open(file, encoding="utf-8", newline="") as infile:
        for row in csv.reader(infile, delimiter="\t"):
            # Skip blank / whitespace-only lines (e.g. a trailing empty line)
            # instead of failing on the two-value unpack below.
            if not any(cell.strip() for cell in row):
                continue
            word, phonemes = row
            lexicon[word.strip()] = phonemes.strip().split()
    return lexicon
# Both data files are looked up in the directory that contains this module.
_module_dir = os.path.dirname(__file__)
_graph_file = os.path.join(_module_dir, "km_phonemizer.npz")
_lexicon_file = os.path.join(_module_dir, "km_lexicon.tsv")

# Loaded once, at import time: the lexicon first, then the G2P graph.
_lexicon_dict = _read_lexicon_file(_lexicon_file)
_graph = PhonetisaurusGraph.load(_graph_file, preload=False)
def _phoneticize(word: str, beam: int, min_beam: int, beam_scale: float):
    """Return the first pronunciation the G2P graph produces for ``word``.

    Args:
        word: The word to convert.
        beam: Forwarded unchanged to ``_graph.g2p_one``.
        min_beam: Forwarded unchanged to ``_graph.g2p_one``.
        beam_scale: Forwarded unchanged to ``_graph.g2p_one``.

    Returns:
        The first candidate yielded by ``_graph.g2p_one``, or ``None`` when it
        yields nothing.
    """
    candidates = _graph.g2p_one(
        word, beam=beam, min_beam=min_beam, beam_scale=beam_scale
    )
    # Only the first candidate is ever used, so take it directly instead of
    # materializing every candidate into a list first.
    return next(iter(candidates), None)
def phonemize_single(
    word,
    beam: int = 500,
    min_beam: int = 100,
    beam_scale: float = 0.6,
    use_lexicon: bool = True,
):
    r"""
    Phonemize a single word. The word must match [a-zA-Z\u1780-\u17dd]+

    The word is lowercased first. When ``use_lexicon`` is true and the
    lowercased word is in the lexicon, the lexicon pronunciation is returned;
    otherwise the word is passed to the G2P graph via ``_phoneticize``.

    Args:
        word: The word to phonemize, or ``None``.
        beam: Forwarded to ``_phoneticize``.
        min_beam: Forwarded to ``_phoneticize``.
        beam_scale: Forwarded to ``_phoneticize``.
        use_lexicon: Whether to consult the lexicon before the G2P graph.

    Returns:
        ``None`` if ``word`` is ``None``; a new list of phonemes on a lexicon
        hit; otherwise whatever ``_phoneticize`` returns (``None`` when the
        graph produces no candidates).
    """
    if word is None:
        return None

    word = word.lower()

    if use_lexicon:
        phonemes = _lexicon_dict.get(word)
        if phonemes is not None:
            # Return a copy so callers that mutate the result cannot corrupt
            # the shared module-level lexicon.
            return list(phonemes)

    return _phoneticize(word, beam=beam, min_beam=min_beam, beam_scale=beam_scale)