fhieni committed on
Commit
ec4209c
1 Parent(s): a7c6d53

Upload en_to_ipa.py

Browse files
Files changed (1) hide show
  1. en_to_ipa.py +81 -0
en_to_ipa.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from typing import List
3
+
4
+ import en_to_ipa.utils as utils
5
+ from en_to_ipa.arpa_ipa_mappings import arpa_to_ipa_dict
6
+ from en_to_ipa.build_phone_dict import add_word_to_oov_file, cmu_dict, cmu_dict_keys
7
+
8
+ from .config import PERMITTED_PUNCTUATION
9
+
10
+ __all__ = ["convert_label_to_phones", "arpa_to_ipa", "is_label_convertible"]
11
+
12
+
13
def convert_label_to_phones(
    label: str,
    ipa: bool = True,
    as_list: bool = True,
    raise_oov: bool = True,
    warn_oov: bool = True,
):
    """Translate a whole grapheme label into its sequence of phones.

    Args:
        label: str - English grapheme label; it is cleaned with utils._clean_label before conversion
        ipa: bool - If True produce IPA symbols, else produce ARPA symbols
        as_list: bool - If True return a list of symbols, else return them joined into one string
        raise_oov: bool - Passed to the per-word converter: raise an error for a word missing from CMUDict
        warn_oov: bool - Passed to the per-word converter: issue a warning for a word missing from CMUDict

    Returns:
        The phones of every space-separated word, with a single " " entry
        placed between consecutive words (none after the last word). A list
        when as_list is True, otherwise the entries concatenated into a str.
    """
    cleaned = utils._clean_label(label, permitted_punctuation=PERMITTED_PUNCTUATION)
    sequence = []
    for position, token in enumerate(cleaned.split(" ")):
        if position:
            # Word-boundary marker: inserted before every word except the first,
            # so the result never ends with a separator.
            sequence.append(" ")
        sequence.extend(_convert_word_to_phones(token, ipa, raise_oov, warn_oov))
    if as_list:
        return sequence
    return "".join(sequence)
35
+
36
+
37
def is_label_convertible(label: str):
    """Report whether every word of a label is present in cmu_dict_keys.

    Args:
        label: str - A string containing English graphemes; it is cleaned with utils._clean_label first

    Returns:
        True when each space-separated, lower-cased word of the cleaned label
        is found in cmu_dict_keys, otherwise False.
    """
    cleaned = utils._clean_label(label, permitted_punctuation=PERMITTED_PUNCTUATION)
    for token in cleaned.split(" "):
        # A single unknown word is enough to make the label unconvertible.
        if token.lower() not in cmu_dict_keys:
            return False
    return True
46
+
47
+
48
def _convert_word_to_phones(
    word: str,
    ipa: bool = True,
    raise_oov: bool = True,
    warn_oov: bool = True,
):
    """Internal helper: look one word up in cmu_dict and return its phones.

    Args:
        word: str - The English word to be converted (looked up lower-cased)
        ipa: bool - If True convert the ARPA result to IPA, else return the ARPA list
        raise_oov: bool - If True raise an error when the word isn't found in CMUDict
        warn_oov: bool - If True issue a warning when the word isn't found in CMUDict

    Returns:
        The phones of the first pronunciation listed for the word. For a word
        that is not found, and when raise_oov is False, an empty string.

    Raises:
        ValueError: the word is not found and raise_oov is True.
    """
    pronunciations = cmu_dict.get(word.lower(), "")
    if pronunciations:
        # TODO: Find a way to handle returning multiple values, keep in mind this
        # is a helper to convert_label_to_phones and a label may have multiple
        # words with multiple pronunciations
        # OR... find a way to return the one preferred result (pos tagging?)
        first_entry = pronunciations[0]
        symbols = utils._clean_arpa_list(first_entry)
        if ipa:
            return arpa_to_ipa(symbols)
        return symbols

    # Out-of-vocabulary path: the word is always recorded, then the caller's
    # flags decide whether to warn and/or raise.
    add_word_to_oov_file(word)
    if warn_oov:
        warnings.warn(f"'{word}' not found in cmudict")
    if raise_oov:
        raise ValueError(f"{word} not found in vocabulary")
    return ""
77
+
78
+
79
def arpa_to_ipa(arpa_list: List[str]):
    """Convert the ARPA symbols of a single word to the International Phonetic Alphabet.

    Args:
        arpa_list: List[str] - ARPA symbols of one word; empty-string entries are skipped

    Returns:
        A list with the arpa_to_ipa_dict value of each non-empty symbol, in order.

    Raises:
        KeyError: a non-empty symbol has no entry in arpa_to_ipa_dict.
    """
    ipa_symbols = []
    for symbol in arpa_list:
        if symbol == "":
            # Blank placeholders carry no phone and are dropped.
            continue
        ipa_symbols.append(arpa_to_ipa_dict[symbol])
    return ipa_symbols