|
|
|
import re |
|
from os.path import join, abspath, dirname |
|
import eng_to_ipa.stress as stress |
|
from collections import defaultdict |
|
|
|
|
|
def mode_type(mode_in):
    """Open the pronunciation data source selected by *mode_in*.

    Args:
        mode_in: Backend name, compared case-insensitively. Either "sql" or
            "json".

    Returns:
        For "sql", a cursor on a new sqlite3 connection to
        ``resources/CMU_dict.db`` next to this module. The connection stays
        open; the caller can reach and close it through ``cursor.connection``.
        For "json", the parsed content of ``CMU_dict.json``.
        For any other value, ``None``.
    """
    mode = mode_in.lower()
    base_dir = abspath(dirname(__file__))
    if mode == "sql":
        import sqlite3
        conn = sqlite3.connect(join(base_dir, "./resources/CMU_dict.db"))
        return conn.cursor()
    if mode == "json":
        import json
        json_path = join(base_dir, "../eng_to_ipa/resources/CMU_dict.json")
        # The context manager closes the file handle as soon as parsing is
        # done instead of leaving it to the garbage collector.
        with open(json_path, encoding="UTF-8") as json_file:
            return json.load(json_file)
    # Unknown modes have always yielded None; keep that contract explicit.
    return None
|
|
|
|
|
def preprocess(words):
    """Lower-case each whitespace-separated token and trim edge punctuation.

    Punctuation is only removed from the start and end of a token, so inner
    characters such as the apostrophe in "don't" survive. A token made up
    entirely of punctuation becomes an empty string but still takes part in
    the join, which can leave consecutive spaces in the result.
    """
    punct_str = '!"#$%&\'()*+,-./:;<=>/?@[\\]^_`{|}~«» '
    cleaned = []
    for token in words.split():
        cleaned.append(token.strip(punct_str).lower())
    return ' '.join(cleaned)
|
|
|
|
|
def preserve_punc(words):
    """Split *words* into ``[leading, cleaned, trailing]`` triples.

    For every whitespace-separated token the result holds a three-item list:
    the run of non-alphanumeric characters found before the first letter,
    the token as returned by ``preprocess``, and the run of non-alphanumeric
    characters found after the last letter. The outer items are empty
    strings when the corresponding pattern does not match (for example when
    the punctuation is adjacent to a digit rather than a letter).
    """
    triples = []
    for token in words.split():
        leading = re.search("^([^A-Za-z0-9]+)[A-Za-z]", token)
        trailing = re.search("[A-Za-z]([^A-Za-z0-9]+)$", token)
        triples.append([
            str(leading.group(1)) if leading else "",
            preprocess(token),
            str(trailing.group(1)) if trailing else "",
        ])
    return triples
|
|
|
|
|
def apply_punct(triple, as_str=False):
    """Glue punctuation back onto the word(s) of one or more triples.

    Args:
        triple: Either a single sequence of strings (leading, word, trailing)
            or a list whose first element is a list, in which case every
            element is treated as such a sequence.
        as_str: When true, return a string instead of a list.

    Returns:
        For a list of triples: the same list object, with each element
        replaced in place by its joined string (or those strings joined with
        spaces when *as_str* is true). For a single triple: the joined string
        when *as_str* is true, otherwise a one-element list containing it.
    """
    if type(triple[0]) is list:
        # The caller's list is rewritten in place, element by element.
        for position, parts in enumerate(triple):
            triple[position] = str(''.join(parts))
        return ' '.join(triple) if as_str else triple
    glued = ''.join(triple)
    return str(glued) if as_str else [glued]
|
|
|
|
|
def _punct_replace_word(original, transcription):
    """Wrap every transcription in the punctuation of its source word.

    Args:
        original: Sequence of ``[leading, word, trailing]`` triples, aligned
            by index with *transcription*.
        transcription: List of lists of transcription strings, one inner
            list per word.

    Returns:
        *transcription* itself; its inner lists are updated in place so each
        entry becomes ``leading + entry + trailing``.
    """
    for word_idx, variants in enumerate(transcription):
        for variant_idx, ipa_word in enumerate(variants):
            source = original[word_idx]
            wrapped = apply_punct([source[0], ipa_word, source[2]], as_str=True)
            variants[variant_idx] = wrapped
    return transcription
|
|
|
|
|
def fetch_words(words_in, db_type="sql"):
    """Look up *words_in* in the pronunciation data source.

    Args:
        words_in: Sized collection of hashable words (normally a list of
            strings) to look up.
        db_type: "sql" or "json" (case-insensitive); forwarded to
            ``mode_type`` to obtain the data source.

    Returns:
        A list of ``(word, value)`` tuples for the words that were found.
        In "sql" mode the value is the list of all ``phonemes`` column values
        stored for that word. In "json" mode it is whatever the JSON document
        maps the word to, and tuples follow the document's key order.
        ``None`` for any other *db_type*.
    """
    backend = db_type.lower()
    asset = mode_type(db_type)
    if backend == "sql":
        # One "?" placeholder per word keeps the query fully parameterised.
        placeholders = ", ".join("?" * len(words_in))
        query = "SELECT word, phonemes FROM dictionary WHERE word IN ({0})".format(placeholders)
        try:
            asset.execute(query, words_in)
            rows = asset.fetchall()
        finally:
            # mode_type opens a fresh connection for every call; release it
            # once the rows are in memory so connections do not pile up.
            asset.connection.close()
        grouped = defaultdict(list)
        for word, phonemes in rows:
            grouped[word].append(phonemes)
        return list(grouped.items())
    if backend == "json":
        # Set membership avoids rescanning words_in for every dictionary key.
        wanted = set(words_in)
        return [(word, value) for word, value in asset.items() if word in wanted]
    return None
|
|
|
|
|
def get_cmu(tokens_in, db_type="sql"):
    """Return the pronunciation entry for each token, in input order.

    Args:
        tokens_in: Sequence of word strings to look up.
        db_type: Backend name forwarded to ``fetch_words``.

    Returns:
        A list with one item per token: the value ``fetch_words`` returned
        for that token, or ``["__IGNORE__" + token]`` when the token was not
        found.
    """
    result = fetch_words(tokens_in, db_type)
    # Index the fetched rows once instead of rescanning them for every token.
    # setdefault keeps the first entry seen for a word, matching the previous
    # first-match behaviour.
    first_match = {}
    for entry in result:
        first_match.setdefault(entry[0], entry[1])
    ordered = []
    for word in tokens_in:
        if word in first_match:
            ordered.append(first_match[word])
        else:
            ordered.append(["__IGNORE__" + word])
    return ordered
|
|
|
|
|
def cmu_to_ipa(cmu_list, mark=True, stress_marking='all'):
    """Convert CMU-style pronunciations into IPA strings.

    Args:
        cmu_list: Iterable of word lists. Each word list holds the candidate
            pronunciations of one word as space-separated phoneme strings,
            or a single ``"__IGNORE__<token>"`` placeholder for a word that
            has no pronunciation.
        mark: When true, append ``*`` to placeholder tokens that are not
            purely numeric.
        stress_marking: When truthy, each pronunciation is passed to
            ``stress.find_stress`` with this value as ``type``. When falsy,
            digits are simply removed from the pronunciation (presumably CMU
            stress numbers), except for tokens that consist only of digits.

    Returns:
        A list with one entry per word list: the sorted, de-duplicated IPA
        strings for that word.
    """
    symbols = {"a": "ə", "ey": "eɪ", "aa": "ɑ", "ae": "æ", "ah": "ə", "ao": "ɔ",
               "aw": "aʊ", "ay": "aɪ", "ch": "ʧ", "dh": "ð", "eh": "ɛ", "er": "ər",
               "hh": "h", "ih": "ɪ", "jh": "ʤ", "ng": "ŋ", "ow": "oʊ", "oy": "ɔɪ",
               "sh": "ʃ", "th": "θ", "uh": "ʊ", "uw": "u", "zh": "ʒ", "iy": "i", "y": "j"}
    stress_symbols = ("ˈ", "ˌ")
    swap_list = [["ˈər", "əˈr"], ["ˈie", "iˈe"]]
    converted = []
    for word_list in cmu_list:
        ipa_word_list = []
        for word in word_list:
            if stress_marking:
                word = stress.find_stress(word, type=stress_marking)
            elif re.sub(r"\d*", "", word.replace("__IGNORE__", "")) != "":
                # Strip digits, but leave all-numeric tokens untouched so
                # they are not erased entirely.
                word = re.sub("[0-9]", "", word)
            if word.startswith("__IGNORE__"):
                ipa_form = word.replace("__IGNORE__", "")
                # Flag untranscribed words; purely numeric ones stay bare.
                if mark and re.sub(r"\d*", "", ipa_form) != "":
                    ipa_form += "*"
            else:
                parts = []
                for piece in word.split(" "):
                    # A dedicated local holds the stress symbol so the `mark`
                    # argument is never overwritten. Slicing (piece[:1])
                    # tolerates empty pieces produced by repeated spaces.
                    stress_symbol, unmarked = "", piece
                    if piece[:1] in stress_symbols:
                        stress_symbol, unmarked = piece[0], piece[1:]
                    if unmarked in symbols:
                        parts.append(stress_symbol + symbols[unmarked])
                    else:
                        # Unknown pieces pass through unchanged, including
                        # any stress symbol they carried.
                        parts.append(piece)
                ipa_form = "".join(parts)
            # Move the stress symbol behind the vowel for these sequences,
            # unless the transcription starts with the sequence.
            for old, new in swap_list:
                if not ipa_form.startswith(old):
                    ipa_form = ipa_form.replace(old, new)
            ipa_word_list.append(ipa_form)
        converted.append(sorted(set(ipa_word_list)))
    return converted
|
|
|
|
|
def get_top(ipa_list):
    """Build a single transcription string, one candidate per word.

    For every word's candidate list the last element is selected, and the
    selections are joined with single spaces.
    """
    chosen = (candidates[-1] for candidates in ipa_list)
    return ' '.join(chosen)
|
|
|
|
|
def get_all(ipa_list):
    """Return every combination of the per-word transcription candidates.

    Args:
        ipa_list: Sequence of candidate lists, one per word.

    Returns:
        A sorted list of strings. Each string takes one candidate from every
        word, in word order, joined by single spaces. An empty *ipa_list*
        yields ``[""]``; if any word has no candidates there are no
        combinations and the result is ``[]``.
    """
    # Local import keeps this block self-contained.
    from itertools import product

    # The cartesian product replaces the manual index/"switch rate"
    # bookkeeping and needs no float arithmetic.
    return sorted(" ".join(combo) for combo in product(*ipa_list))
|
|
|
|
|
def ipa_list(words_in, keep_punct=True, stress_marks='both', db_type="sql"):
    """Return the IPA transcription candidates for each word.

    Args:
        words_in: A string (split on whitespace) or an iterable of word
            strings.
        keep_punct: When true, the punctuation found around each word is
            re-attached to every one of its transcriptions.
        stress_marks: Forwarded to ``cmu_to_ipa`` as ``stress_marking``.
        db_type: Backend name forwarded to ``get_cmu``.

    Returns:
        A list with one inner list of transcription strings per word.
    """
    # isinstance also accepts str subclasses, which the former exact type
    # comparison would have iterated character by character.
    tokens = words_in.split() if isinstance(words_in, str) else words_in
    # preserve_punc returns one triple per whitespace-separated token; only
    # the first triple of each item is used.
    words = [preserve_punc(w.lower())[0] for w in tokens]
    cmu = get_cmu([w[1] for w in words], db_type=db_type)
    ipa = cmu_to_ipa(cmu, stress_marking=stress_marks)
    if keep_punct:
        ipa = _punct_replace_word(words, ipa)
    return ipa
|
|
|
|
|
def isin_cmu(word, db_type="sql"):
    """Check whether every given word is present in the CMU dictionary.

    Args:
        word: A string or a collection of word strings. A string is split on
            whitespace and each token is run through ``preprocess``
            (lower-cased, surrounding punctuation stripped). A collection is
            looked up exactly as given, without any cleaning.
        db_type: Backend name forwarded to ``fetch_words``.

    Returns:
        True only if each distinct word was found by ``fetch_words``.
    """
    # isinstance also covers str subclasses, which the former exact type
    # comparison would have skipped.
    if isinstance(word, str):
        word = [preprocess(w) for w in word.split()]
    results = fetch_words(word, db_type)
    found = {entry[0] for entry in results}
    return len(found) == len(set(word))
|
|
|
|
|
def convert(text, retrieve_all=False, keep_punct=True, stress_marks='both', mode="sql"):
    """Transcribe English text (a string or a list of words) into IPA.

    Args:
        text: The words to transcribe; passed to ``ipa_list`` as ``words_in``.
        retrieve_all: When true, return the result of ``get_all`` (every
            combination of candidates); otherwise the result of ``get_top``
            (a single string).
        keep_punct: Forwarded to ``ipa_list``.
        stress_marks: Forwarded to ``ipa_list``.
        mode: Backend name, forwarded to ``ipa_list`` as ``db_type``.
    """
    transcriptions = ipa_list(
        words_in=text,
        keep_punct=keep_punct,
        stress_marks=stress_marks,
        db_type=mode,
    )
    combine = get_all if retrieve_all else get_top
    return combine(transcriptions)
|
|
|
|
|
def jonvert(text, retrieve_all=False, keep_punct=True, stress_marks='both'):
    """Same as ``convert``, but always reads phoneme data from the JSON backend."""
    return convert(
        text,
        retrieve_all=retrieve_all,
        keep_punct=keep_punct,
        stress_marks=stress_marks,
        mode="json",
    )
|
|