# Vietnamese syllable template: (C1)(w)V(G|C2)+T
# The symbol "'" is used for undefined symbols and as the marker for English.
'''
C1 = initial consonant onset
w  = labiovelar on-glide /w/
V  = vowel nucleus
G  = off-glide coda (/j/ or /w/)
C2 = final consonant coda
T  = tone
'''
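# Worked example (illustrative addition, not from the original comments): under this
# template a syllable like 'toán' decomposes as C1 = t, on-glide w (written 'o'),
# V = a, C2 = n, plus the sắc tone, which the pham tone numbers below write as 5.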
Cus_onsets = { u'b' : u'b', u't' : u't', u'th' : u'tʰ', u'đ' : u'd', u'ch' : u'c',
               u'kh' : u'x', u'g' : u'ɣ', u'l' : u'l', u'm' : u'm', u'n': u'n',
               u'ngh': u'ŋ', u'nh' : u'ɲ', u'ng' : u'ŋ', u'ph' : u'f', u'v' : u'v',
               u'x' : u's', u'd' : u'z', u'h' : u'h', u'p' : u'p', u'qu' : u'kw',
               u'gi' : u'j', u'tr' : u'ʈ', u'k' : u'k', u'c' : u'k', u'gh' : u'ɣ',
               u'r' : u'ʐ', u's' : u'ʂ', u'gi': u'j'}
Cus_nuclei = { u'a' : u'a', u'á' : u'a', u'à' : u'a', u'ả' : u'a', u'ã' : u'a', u'ạ' : u'a',
               u'â' : u'ɤ̆', u'ấ' : u'ɤ̆', u'ầ' : u'ɤ̆', u'ẩ' : u'ɤ̆', u'ẫ' : u'ɤ̆', u'ậ' : u'ɤ̆',
               u'ă' : u'ă', u'ắ' : u'ă', u'ằ' : u'ă', u'ẳ' : u'ă', u'ẵ' : u'ă', u'ặ' : u'ă',
               u'e' : u'ɛ', u'é' : u'ɛ', u'è' : u'ɛ', u'ẻ' : u'ɛ', u'ẽ' : u'ɛ', u'ẹ' : u'ɛ',
               u'ê' : u'e', u'ế' : u'e', u'ề' : u'e', u'ể' : u'e', u'ễ' : u'e', u'ệ' : u'e',
               u'i' : u'i', u'í' : u'i', u'ì' : u'i', u'ỉ' : u'i', u'ĩ' : u'i', u'ị' : u'i',
               u'o' : u'ɔ', u'ó' : u'ɔ', u'ò' : u'ɔ', u'ỏ' : u'ɔ', u'õ' : u'ɔ', u'ọ' : u'ɔ',
               u'ô' : u'o', u'ố' : u'o', u'ồ' : u'o', u'ổ' : u'o', u'ỗ' : u'o', u'ộ' : u'o',
               u'ơ' : u'ɤ', u'ớ' : u'ɤ', u'ờ' : u'ɤ', u'ở' : u'ɤ', u'ỡ' : u'ɤ', u'ợ' : u'ɤ',
               u'u' : u'u', u'ú' : u'u', u'ù' : u'u', u'ủ' : u'u', u'ũ' : u'u', u'ụ' : u'u',
               u'ư' : u'ɯ', u'ứ' : u'ɯ', u'ừ' : u'ɯ', u'ử' : u'ɯ', u'ữ' : u'ɯ', u'ự' : u'ɯ',
               u'y' : u'i', u'ý' : u'i', u'ỳ' : u'i', u'ỷ' : u'i', u'ỹ' : u'i', u'ỵ' : u'i',
               u'eo' : u'eo', u'éo' : u'eo', u'èo' : u'eo', u'ẻo' : u'eo', u'ẽo': u'eo', u'ẹo' : u'eo',
               u'êu' : u'ɛu', u'ếu' : u'ɛu', u'ều' : u'ɛu', u'ểu' : u'ɛu', u'ễu': u'ɛu', u'ệu' : u'ɛu',
               u'ia' : u'iə', u'ía' : u'iə', u'ìa' : u'iə', u'ỉa' : u'iə', u'ĩa' : u'iə', u'ịa' : u'iə',
               u'ia' : u'iə', u'iá' : u'iə', u'ià' : u'iə', u'iả' : u'iə', u'iã' : u'iə', u'iạ' : u'iə',
               u'iê' : u'iə', u'iế' : u'iə', u'iề' : u'iə', u'iể' : u'iə', u'iễ' : u'iə', u'iệ' : u'iə',
               u'oo' : u'ɔ', u'óo' : u'ɔ', u'òo' : u'ɔ', u'ỏo' : u'ɔ', u'õo' : u'ɔ', u'ọo' : u'ɔ',
               u'oo' : u'ɔ', u'oó' : u'ɔ', u'oò' : u'ɔ', u'oỏ' : u'ɔ', u'oõ' : u'ɔ', u'oọ' : u'ɔ',
               u'ôô' : u'o', u'ốô' : u'o', u'ồô' : u'o', u'ổô' : u'o', u'ỗô' : u'o', u'ộô' : u'o',
               u'ôô' : u'o', u'ôố' : u'o', u'ôồ' : u'o', u'ôổ' : u'o', u'ôỗ' : u'o', u'ôộ' : u'o',
               u'ua' : u'uə', u'úa' : u'uə', u'ùa' : u'uə', u'ủa' : u'uə', u'ũa' : u'uə', u'ụa' : u'uə',
               u'uô' : u'uə', u'uố' : u'uə', u'uồ' : u'uə', u'uổ' : u'uə', u'uỗ' : u'uə', u'uộ' : u'uə',
               u'ưa' : u'ɯə', u'ứa' : u'ɯə', u'ừa' : u'ɯə', u'ửa' : u'ɯə', u'ữa' : u'ɯə', u'ựa' : u'ɯə',
               u'ươ' : u'ɯə', u'ướ' : u'ɯə', u'ườ' : u'ɯə', u'ưở' : u'ɯə', u'ưỡ' : u'ɯə', u'ượ' : u'ɯə',
               u'yê' : u'iɛ', u'yế' : u'iɛ', u'yề' : u'iɛ', u'yể' : u'iɛ', u'yễ' : u'iɛ', u'yệ' : u'iɛ',
               u'uơ' : u'uə', u'uở' : u'uə', u'uờ': u'uə', u'uở' : u'uə', u'uỡ' : u'uə', u'uợ' : u'uə',
               }
Cus_offglides = { u'ai' : u'aj', u'ái' : u'aj', u'ài' : u'aj', u'ải' : u'aj', u'ãi' : u'aj', u'ại' : u'aj',
                  u'ay' : u'ăj', u'áy' : u'ăj', u'ày' : u'ăj', u'ảy' : u'ăj', u'ãy' : u'ăj', u'ạy' : u'ăj',
                  u'ao' : u'aw', u'áo' : u'aw', u'ào' : u'aw', u'ảo' : u'aw', u'ão' : u'aw', u'ạo' : u'aw',
                  u'au' : u'ăw', u'áu' : u'ăw', u'àu' : u'ăw', u'ảu' : u'ăw', u'ãu' : u'ăw', u'ạu' : u'ăw',
                  u'ây' : u'ɤ̆j', u'ấy' : u'ɤ̆j', u'ầy' : u'ɤ̆j', u'ẩy' : u'ɤ̆j', u'ẫy' : u'ɤ̆j', u'ậy' : u'ɤ̆j',
                  u'âu' : u'ɤ̆w', u'ấu' : u'ɤ̆w', u'ầu': u'ɤ̆w', u'ẩu' : u'ɤ̆w', u'ẫu' : u'ɤ̆w', u'ậu' : u'ɤ̆w',
                  u'eo' : u'ew', u'éo' : u'ew', u'èo' : u'ew', u'ẻo' : u'ew', u'ẽo' : u'ew', u'ẹo' : u'ew',
                  u'iu' : u'iw', u'íu' : u'iw', u'ìu' : u'iw', u'ỉu' : u'iw', u'ĩu' : u'iw', u'ịu' : u'iw',
                  u'oi' : u'ɔj', u'ói' : u'ɔj', u'òi' : u'ɔj', u'ỏi' : u'ɔj', u'õi' : u'ɔj', u'ọi' : u'ɔj',
                  u'ôi' : u'oj', u'ối' : u'oj', u'ồi' : u'oj', u'ổi' : u'oj', u'ỗi' : u'oj', u'ội' : u'oj',
                  u'ui' : u'uj', u'úi' : u'uj', u'ùi' : u'uj', u'ủi' : u'uj', u'ũi' : u'uj', u'ụi' : u'uj',
                  #u'uy' : u'uj', u'úy' : u'uj', u'ùy' : u'uj', u'ủy' : u'uj', u'ũy' : u'uj', u'ụy' : u'uj',
                  u'uy' : u'ʷi', u'úy' : u'uj', u'ùy' : u'uj', u'ủy' : u'uj', u'ũy' : u'uj', u'ụy' : u'uj',
                  # changed to reduce duplicated sound mappings
                  u'uy' : u'ʷi', u'uý' : u'ʷi', u'uỳ' : u'ʷi', u'uỷ' : u'ʷi', u'uỹ' : u'ʷi', u'uỵ' : u'ʷi',
                  u'ơi' : u'ɤj', u'ới' : u'ɤj', u'ời' : u'ɤj', u'ởi' : u'ɤj', u'ỡi' : u'ɤj', u'ợi' : u'ɤj',
                  u'ưi' : u'ɯj', u'ứi' : u'ɯj', u'ừi' : u'ɯj', u'ửi' : u'ɯj', u'ữi' : u'ɯj', u'ựi' : u'ɯj',
                  u'ưu' : u'ɯw', u'ứu' : u'ɯw', u'ừu' : u'ɯw', u'ửu' : u'ɯw', u'ữu' : u'ɯw', u'ựu' : u'ɯw',
                  u'iêu' : u'iəw', u'iếu' : u'iəw', u'iều' : u'iəw', u'iểu' : u'iəw', u'iễu' : u'iəw', u'iệu' : u'iəw',
                  u'yêu' : u'iəw', u'yếu' : u'iəw', u'yều' : u'iəw', u'yểu' : u'iəw', u'yễu' : u'iəw', u'yệu' : u'iəw',
                  u'uôi' : u'uəj', u'uối' : u'uəj', u'uồi' : u'uəj', u'uổi' : u'uəj', u'uỗi' : u'uəj', u'uội' : u'uəj',
                  u'ươi' : u'ɯəj', u'ưới' : u'ɯəj', u'ười' : u'ɯəj', u'ưởi' : u'ɯəj', u'ưỡi' : u'ɯəj', u'ượi' : u'ɯəj',
                  u'ươu' : u'ɯəw', u'ướu' : u'ɯəw', u'ườu' : u'ɯəw', u'ưởu' : u'ɯəw', u'ưỡu' : u'ɯəw', u'ượu' : u'ɯəw'
                  }
# The rounded sequences here look exactly like the plain ones (no w written in front) => try adding ʷ
Cus_onglides = { u'oa' : u'ʷa', u'oá' : u'ʷa', u'oà' : u'ʷa', u'oả' : u'ʷa', u'oã' : u'ʷa', u'oạ' : u'ʷa',
                 u'óa' : u'ʷa', u'òa' : u'ʷa', u'ỏa' : u'ʷa', u'õa' : u'ʷa', u'ọa' : u'ʷa',
                 u'oă' : u'ʷă', u'oắ' : u'ʷă', u'oằ' : u'ʷă', u'oẳ' : u'ʷă', u'oẵ' : u'ʷă', u'oặ' : u'ʷă',
                 u'oe' : u'ʷɛ', u'oé' : u'ʷɛ', u'oè' : u'ʷɛ', u'oẻ' : u'ʷɛ', u'oẽ' : u'ʷɛ', u'oẹ' : u'ʷɛ',
                 u'oe' : u'ʷɛ', u'óe' : u'ʷɛ', u'òe' : u'ʷɛ', u'ỏe' : u'ʷɛ', u'õe' : u'ʷɛ', u'ọe' : u'ʷɛ',
                 u'ua' : u'ʷa', u'uá' : u'ʷa', u'uà' : u'ʷa', u'uả' : u'ʷa', u'uã' : u'ʷa', u'uạ' : u'ʷa',
                 u'uă' : u'ʷă', u'uắ' : u'ʷă', u'uằ' : u'ʷă', u'uẳ' : u'ʷă', u'uẵ' : u'ʷă', u'uặ' : u'ʷă',
                 u'uâ' : u'ʷɤ̆', u'uấ' : u'ʷɤ̆', u'uầ' : u'ʷɤ̆', u'uẩ' : u'ʷɤ̆', u'uẫ' : u'ʷɤ̆', u'uậ' : u'ʷɤ̆',
                 u'ue' : u'ʷɛ', u'ué' : u'ʷɛ', u'uè' : u'ʷɛ', u'uẻ' : u'ʷɛ', u'uẽ' : u'ʷɛ', u'uẹ' : u'ʷɛ',
                 u'uê' : u'ʷe', u'uế' : u'ʷe', u'uề' : u'ʷe', u'uể' : u'ʷe', u'uễ' : u'ʷe', u'uệ' : u'ʷe',
                 u'uơ' : u'ʷɤ', u'uớ' : u'ʷɤ', u'uờ' : u'ʷɤ', u'uở' : u'ʷɤ', u'uỡ' : u'ʷɤ', u'uợ' : u'ʷɤ',
                 u'uy' : u'ʷi', u'uý' : u'ʷi', u'uỳ' : u'ʷi', u'uỷ' : u'ʷi', u'uỹ' : u'ʷi', u'uỵ' : u'ʷi',
                 u'uya' : u'ʷiə', u'uyá' : u'ʷiə', u'uyà' : u'ʷiə', u'uyả' : u'ʷiə', u'uyã' : u'ʷiə', u'uyạ' : u'ʷiə',
                 u'uyê' : u'ʷiə', u'uyế' : u'ʷiə', u'uyề' : u'ʷiə', u'uyể' : u'ʷiə', u'uyễ' : u'ʷiə', u'uyệ' : u'ʷiə',
                 u'uyu' : u'ʷiu', u'uyú' : u'ʷiu', u'uyù' : u'ʷiu', u'uyủ' : u'ʷiu', u'uyũ' : u'ʷiu', u'uyụ' : u'ʷiu',
                 u'uyu' : u'ʷiu', u'uýu' : u'ʷiu', u'uỳu' : u'ʷiu', u'uỷu' : u'ʷiu', u'uỹu' : u'ʷiu', u'uỵu' : u'ʷiu',
                 u'oen' : u'ʷen', u'oén' : u'ʷen', u'oèn' : u'ʷen', u'oẻn' : u'ʷen', u'oẽn' : u'ʷen', u'oẹn' : u'ʷen',
                 u'oet' : u'ʷet', u'oét' : u'ʷet', u'oèt' : u'ʷet', u'oẻt' : u'ʷet', u'oẽt' : u'ʷet', u'oẹt' : u'ʷet'
                 }
Cus_onoffglides = { u'oe' : u'ɛj', u'oé' : u'ɛj', u'oè' : u'ɛj', u'oẻ' : u'ɛj', u'oẽ' : u'ɛj', u'oẹ' : u'ɛj',
                    u'oai' : u'aj', u'oái' : u'aj', u'oài' : u'aj', u'oải' : u'aj', u'oãi' : u'aj', u'oại' : u'aj',
                    u'oay' : u'ăj', u'oáy' : u'ăj', u'oày' : u'ăj', u'oảy' : u'ăj', u'oãy' : u'ăj', u'oạy' : u'ăj',
                    u'oao' : u'aw', u'oáo' : u'aw', u'oào' : u'aw', u'oảo' : u'aw', u'oão' : u'aw', u'oạo' : u'aw',
                    u'oeo' : u'ew', u'oéo' : u'ew', u'oèo' : u'ew', u'oẻo' : u'ew', u'oẽo' : u'ew', u'oẹo' : u'ew',
                    u'oeo' : u'ew', u'óeo' : u'ew', u'òeo' : u'ew', u'ỏeo' : u'ew', u'õeo' : u'ew', u'ọeo' : u'ew',
                    u'ueo' : u'ew', u'uéo' : u'ew', u'uèo' : u'ew', u'uẻo' : u'ew', u'uẽo' : u'ew', u'uẹo' : u'ew',
                    u'uai' : u'aj', u'uái' : u'aj', u'uài' : u'aj', u'uải' : u'aj', u'uãi' : u'aj', u'uại' : u'aj',
                    u'uay' : u'ăj', u'uáy' : u'ăj', u'uày' : u'ăj', u'uảy' : u'ăj', u'uãy' : u'ăj', u'uạy' : u'ăj',
                    u'uây' : u'ɤ̆j', u'uấy' : u'ɤ̆j', u'uầy' : u'ɤ̆j', u'uẩy' : u'ɤ̆j', u'uẫy' : u'ɤ̆j', u'uậy' : u'ɤ̆j'
                    }
Cus_codas = { u'p' : u'p', u't' : u't', u'c' : u'k', u'm' : u'm', u'n' : u'n', u'ng' : u'ŋ', u'nh' : u'ɲ', u'ch' : u'tʃ' }

Cus_tones_p = { u'á' : 5, u'à' : 2, u'ả' : 4, u'ã' : 3, u'ạ' : 6,
                u'ấ' : 5, u'ầ' : 2, u'ẩ' : 4, u'ẫ' : 3, u'ậ' : 6,
                u'ắ' : 5, u'ằ' : 2, u'ẳ' : 4, u'ẵ' : 3, u'ặ' : 6,
                u'é' : 5, u'è' : 2, u'ẻ' : 4, u'ẽ' : 3, u'ẹ' : 6,
                u'ế' : 5, u'ề' : 2, u'ể' : 4, u'ễ' : 3, u'ệ' : 6,
                u'í' : 5, u'ì' : 2, u'ỉ' : 4, u'ĩ' : 3, u'ị' : 6,
                u'ó' : 5, u'ò' : 2, u'ỏ' : 4, u'õ' : 3, u'ọ' : 6,
                u'ố' : 5, u'ồ' : 2, u'ổ' : 4, u'ỗ' : 3, u'ộ' : 6,
                u'ớ' : 5, u'ờ' : 2, u'ở' : 4, u'ỡ' : 3, u'ợ' : 6,
                u'ú' : 5, u'ù' : 2, u'ủ' : 4, u'ũ' : 3, u'ụ' : 6,
                u'ứ' : 5, u'ừ' : 2, u'ử' : 4, u'ữ' : 3, u'ự' : 6,
                u'ý' : 5, u'ỳ' : 2, u'ỷ' : 4, u'ỹ' : 3, u'ỵ' : 6,
                }
Cus_gi = { u'gi' : u'zi', u'gí': u'zi', u'gì' : u'zi', u'gỉ' : u'zi', u'gĩ' : u'zi', u'gị' : u'zi'}
Cus_qu = {u'quy' : u'kwi', u'qúy' : u'kwi', u'qùy' : u'kwi', u'qủy' : u'kwi', u'qũy' : u'kwi', u'qụy' : u'kwi'}
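# Illustrative sketch only (the helper name is hypothetical, not part of the original code):
# the tables above are consulted longest orthographic string first, which is what trans()
# below does with its explicit 3-/2-/1-character slices, e.g. trying 'ngh' before 'ng' before 'n'.
def _demo_onset_lookup(word):
    # Return (IPA onset, number of orthographic characters consumed).
    for size in (3, 2, 1):
        if word[:size] in Cus_onsets:
            return Cus_onsets[word[:size]], size
    return '', 0

# For instance, _demo_onset_lookup(u'nghe') should give (u'ŋ', 3),
# while _demo_onset_lookup(u'ngang') should give (u'ŋ', 2).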
################################################
import sys, codecs, re
import imp   # needed for imp.find_module('viphoneme') below; note imp is deprecated and removed in Python 3.12+
from io import StringIO
from optparse import OptionParser
from string import punctuation
#import prosodic as p
def trans(word, dialect, glottal, pham, cao, palatals):
    #Custom
    onsets, nuclei, codas, onglides, offglides, onoffglides, qu, gi = Cus_onsets, Cus_nuclei, Cus_codas, Cus_onglides, Cus_offglides, Cus_onoffglides, Cus_qu, Cus_gi

    if pham or cao:
        #Custom
        tones_p = Cus_tones_p
        tones = tones_p
    # NOTE: the callers in this file always set pham or cao; if both were 0,
    # `tones` would be undefined when the tone list is built further down.

    ons = ''
    nuc = ''
    cod = ''
    ton = 0
    oOffset = 0
    cOffset = 0
    l = len(word)

    if l > 0:
        if word[0:3] in onsets:         # if onset is 'ngh'
            ons = onsets[word[0:3]]
            oOffset = 3
        elif word[0:2] in onsets:       # if onset is 'nh', 'gh', 'kʷ' etc
            ons = onsets[word[0:2]]
            oOffset = 2
        elif word[0] in onsets:         # if single onset
            ons = onsets[word[0]]
            oOffset = 1

        if word[l-2:l] in codas:        # if two-character coda
            cod = codas[word[l-2:l]]
            cOffset = 2
        elif word[l-1] in codas:        # if one-character coda
            cod = codas[word[l-1]]
            cOffset = 1

        #if word[0:2] == u'gi' and cod and len(word) == 3:  # if you just have 'gi' and a coda...
        if word[0:2] in gi and cod and len(word) == 3:      # if you just have 'gi' and a coda...
            nucl = u'i'
            ons = u'z'
        else:
            nucl = word[oOffset:l-cOffset]

        if nucl in nuclei:
            if oOffset == 0:
                if glottal == 1:
                    if word[0] not in onsets:    # if there isn't an onset....
                        ons = u'ʔ'+nuclei[nucl]  # add a glottal stop
                    else:                        # otherwise...
                        nuc = nuclei[nucl]       # there's your nucleus
                else:
                    nuc = nuclei[nucl]           # there's your nucleus
            else:                                # otherwise...
                nuc = nuclei[nucl]               # there's your nucleus

        elif nucl in onglides and ons != u'kw':  # if there is an onglide...
            nuc = onglides[nucl]                 # modify the nuc accordingly
            if ons:                              # if there is an onset...
                ons = ons+u'w'                   # labialize it, but...
            else:                                # if there is no onset...
                ons = u'w'                       # add a labiovelar onset

        elif nucl in onglides and ons == u'kw':
            nuc = onglides[nucl]

        elif nucl in onoffglides:
            cod = onoffglides[nucl][-1]
            nuc = onoffglides[nucl][0:-1]
            if ons != u'kw':
                if ons:
                    ons = ons+u'w'
                else:
                    ons = u'w'

        elif nucl in offglides:
            cod = offglides[nucl][-1]
            nuc = offglides[nucl][:-1]

        elif word in gi:    # if word == 'gi', 'gì',...
            ons = gi[word][0]
            nuc = gi[word][1]

        elif word in qu:    # if word == 'quy', 'qúy',...
            ons = qu[word][:-1]
            nuc = qu[word][-1]

        else:
            # Something is non-Viet
            return (None, None, None, None)

        # Velar Fronting (Northern dialect)
        if dialect == 'n':
            if nuc == u'a':
                if cod == u'k' and cOffset == 2: nuc = u'ɛ'
                if cod == u'ɲ' and nuc == u'a': nuc = u'ɛ'

            # Final palatals (Northern dialect)
            if nuc not in [u'i', u'e', u'ɛ']:
                if cod == u'ɲ':
                    cod = u'ɲ'  # u'ŋ'
            elif palatals != 1 and nuc in [u'i', u'e', u'ɛ']:
                if cod == u'ɲ':
                    cod = u'ɲ'  # u'ŋ'

            if palatals == 1:
                if cod == u'k' and nuc in [u'i', u'e', u'ɛ']:
                    cod = u'c'

        # Velar Fronting (Southern and Central dialects)
        else:
            if nuc in [u'i', u'e']:
                if cod == u'k': cod = u't'
                if cod == u'ŋ': cod = u'n'

            # There is also this reverse fronting, see Thompson 1965:94 ff.
            elif nuc in [u'iə', u'ɯə', u'uə', u'u', u'ɯ', u'ɤ', u'o', u'ɔ', u'ă', u'ɤ̆']:
                if cod == u't':
                    cod = u'k'
                if cod == u'n': cod = u'ŋ'

        # Monophthongization (Southern dialects: Thompson 1965: 86; Hoàng 1985: 181)
        if dialect == 's':
            if cod in [u'm', u'p']:
                if nuc == u'iə': nuc = u'i'
                if nuc == u'uə': nuc = u'u'
                if nuc == u'ɯə': nuc = u'ɯ'

        # Tones
        # Modified 20 Sep 2008 to fix aberrant 33 error
        tonelist = [tones[word[i]] for i in range(0, l) if word[i] in tones]
        if tonelist:
            ton = str(tonelist[len(tonelist)-1])
        else:
            if not (pham or cao):
                if dialect == 'c':
                    ton = str('35')
                else:
                    ton = str('33')
            else:
                ton = str('1')

        # Modifications for closed syllables
        if cOffset != 0:
            # Obstruent-final nang tones are modal voice
            if (dialect == 'n' or dialect == 's') and ton == u'21g' and cod in ['p', 't', 'k']:
                #if ton == u'21\u02C0' and cod in ['p', 't', 'k']:  # fixed 8 Nov 2016
                ton = u'21'

            # Modification for sắc in closed syllables (Northern and Central only)
            if ((dialect == 'n' and ton == u'24') or (dialect == 'c' and ton == u'13')) and cod in ['p', 't', 'k']:
                ton = u'45'

            # Modification for 8-tone system
            if cao == 1:
                if ton == u'5' and cod in ['p', 't', 'k']:
                    ton = u'5b'
                if ton == u'6' and cod in ['p', 't', 'k']:
                    ton = u'6b'

            # labialized allophony (added 17.09.08)
            if nuc in [u'u', u'o', u'ɔ']:
                if cod == u'ŋ':
                    cod = u'ŋ͡m'
                if cod == u'k':
                    cod = u'k͡p'

        return (ons, nuc, cod, ton)
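# Usage sketch (hedged): with the Northern dialect and the pham tone set,
# trans(u'toán', 'n', 0, 1, 0, 0) should come out roughly as (u'tw', u'ʷa', u'n', '5'):
# the onset 't' is labialized by the on-glide in 'oá', the nucleus maps to 'ʷa',
# the coda 'n' is kept, and sắc maps to tone 5.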
def convert(word, dialect, glottal, pham, cao, palatals, delimit):
    """Convert a single orthographic string to IPA."""
    ons = ''
    nuc = ''
    cod = ''
    ton = 0
    seq = ''

    try:
        (ons, nuc, cod, ton) = trans(word, dialect, glottal, pham, cao, palatals)
        if None in (ons, nuc, cod, ton):
            seq = u'['+word+u']'
        else:
            seq = delimit+delimit.join(filter(None, (ons, nuc, cod, ton)))+delimit
    except TypeError:
        pass

    return seq
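# Minimal usage sketch (the helper name is illustrative, not part of the original API):
def _demo_convert():
    # A Vietnamese syllable is returned as delimited onset/nucleus/coda/tone,
    # e.g. roughly '/tw/ʷa/n/5/' for the first call; anything non-Vietnamese
    # comes back bracketed, e.g. '[hello]'.
    print(convert(u'toán', 'n', 0, 1, 0, 0, '/'))
    print(convert(u'hello', 'n', 0, 1, 0, 0, '/'))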
################################
from vinorm import *
from underthesea import word_tokenize
import eng_to_ipa

# Full inventory of symbols the parser knows: Vietnamese IPA clusters, English IPA symbols, tone digits and punctuation
syms=['ɯəj', 'ɤ̆j', 'ʷiə', 'ɤ̆w', 'ɯəw', 'ʷet', 'iəw', 'uəj', 'ʷen', 'tʰw', 'ʷɤ̆', 'ʷiu', 'kwi', 'ŋ͡m', 'k͡p', 'cw', 'jw', 'uə', 'eə', 'bw', 'oj', 'ʷi', 'vw', 'ăw', 'ʈw', 'ʂw', 'aʊ', 'fw', 'ɛu', 'tʰ', 'tʃ', 'ɔɪ', 'xw', 'ʷɤ', 'ɤ̆', 'ŋw', 'ʊə', 'zi', 'ʷă', 'dw', 'eɪ', 'aɪ', 'ew', 'iə', 'ɣw', 'zw', 'ɯj', 'ʷɛ', 'ɯw', 'ɤj', 'ɔ:', 'əʊ', 'ʷa', 'mw', 'ɑ:', 'hw', 'ɔj', 'uj', 'lw', 'ɪə', 'ăj', 'u:', 'aw', 'ɛj', 'iw', 'aj', 'ɜ:', 'kw', 'nw', 't∫', 'ɲw', 'eo', 'sw', 'tw', 'ʐw', 'iɛ', 'ʷe', 'i:', 'ɯə', 'dʒ', 'ɲ', 'θ', 'ʌ', 'l', 'w', '1', 'ɪ', 'ɯ', 'd', '∫', 'p', 'ə', 'u', 'o', '3', 'ɣ', '!', 'ð', 'ʧ', '6', 'ʒ', 'ʐ', 'z', 'v', 'g', 'ă', '_', 'æ', 'ɤ', '2', 'ʤ', 'i', '.', 'ɒ', 'b', 'h', 'n', 'ʂ', 'ɔ', 'ɛ', 'k', 'm', '5', ' ', 'c', 'j', 'x', 'ʈ', ',', '4', 'ʊ', 's', 'ŋ', 'a', 'ʃ', '?', 'r', ':', 'η', 'f', ';', 'e', 't', "'"]
def normEng(eng, delemit):
    # Currently a stub: English words handled here are mapped to the empty string.
    # The prosodic-based syllabifier below is kept for reference but disabled.
    return ""

'''
x = p.Text(eng)
x.parse()
PAR = str(x.bestParses()[0]).split("|")
SYL = x.syllables()
if len(PAR) != len(SYL):
    print("check dif len: ", eng)
    result = "/"+"/".join(list(eng))
    return result
result = ""
for i, syl in enumerate(SYL):
    syllable = str(syl).replace("'", "").replace("ː", "").replace("ɑ", "a")
    if PAR[i].lower().upper() == PAR[i]:
        result += syllable+"'5"+" "
    else:
        result += syllable+"'1"+" "
result = result.rstrip(" ")
if delemit != "":
    takemore = ""
    for r in result:
        if r in syms:
            takemore += delemit+r
    result = takemore
return result
'''
def Parsing(listParse, text, delimit):
    undefine_symbol = "'"
    if listParse == "default":
        listParse = ['ɯəj', 'ɤ̆j', 'ʷiə', 'ɤ̆w', 'ɯəw', 'ʷet', 'iəw', 'uəj', 'ʷen', 'tʰw', 'ʷɤ̆', 'ʷiu', 'kwi', 'ŋ͡m', 'k͡p', 'cw', 'jw', 'uə', 'eə', 'bw', 'oj', 'ʷi', 'vw', 'ăw', 'ʈw', 'ʂw', 'aʊ', 'fw', 'ɛu', 'tʰ', 'tʃ', 'ɔɪ', 'xw', 'ʷɤ', 'ɤ̆', 'ŋw', 'ʊə', 'zi', 'ʷă', 'dw', 'eɪ', 'aɪ', 'ew', 'iə', 'ɣw', 'zw', 'ɯj', 'ʷɛ', 'ɯw', 'ɤj', 'ɔ:', 'əʊ', 'ʷa', 'mw', 'ɑ:', 'hw', 'ɔj', 'uj', 'lw', 'ɪə', 'ăj', 'u:', 'aw', 'ɛj', 'iw', 'aj', 'ɜ:', 'kw', 'nw', 't∫', 'ɲw', 'eo', 'sw', 'tw', 'ʐw', 'iɛ', 'ʷe', 'i:', 'ɯə', 'dʒ', 'ɲ', 'θ', 'ʌ', 'l', 'w', '1', 'ɪ', 'ɯ', 'd', '∫', 'p', 'ə', 'u', 'o', '3', 'ɣ', '!', 'ð', 'ʧ', '6', 'ʒ', 'ʐ', 'z', 'v', 'g', 'ă', '_', 'æ', 'ɤ', '2', 'ʤ', 'i', '.', 'ɒ', 'b', 'h', 'n', 'ʂ', 'ɔ', 'ɛ', 'k', 'm', '5', ' ', 'c', 'j', 'x', 'ʈ', ',', '4', 'ʊ', 's', 'ŋ', 'a', 'ʃ', '?', 'r', ':', 'η', 'f', ';', 'e', 't', "'"]
    listParse.sort(reverse=True, key=len)

    output = ""
    skip = 0
    for ic, char in enumerate(text):
        ##print(char,skip)
        check = 0
        if skip > 0:
            skip = skip-1
            continue
        for l in listParse:
            if len(l) <= len(text[ic:]) and l == text[ic:ic+len(l)]:
                output += delimit+l
                check = 1
                skip = len(l)-1
                break
        if check == 0:
            #Case symbol not in list
            if str(char) in ["ˈ", "ˌ", "*"]:
                continue
            #print("this is not in symbol :"+ char+":")
            output += delimit+undefine_symbol

    return output.rstrip()+delimit
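# Usage sketch (hedged): Parsing matches the longest known symbol at each position and
# prefixes every hit with the delimiter; unknown characters become the "'" placeholder.
def _demo_parsing():
    # 'leo' should come back roughly as '/l/eo/', since 'eo' is a known two-character symbol.
    return Parsing("default", "leo", "/")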
def T2IPA_split(text, delimit):
    sys.path.append('./Rules')  # make sure we can find the Rules files

    #Setup options
    glottal = 0
    pham = 0
    cao = 0
    palatals = 0
    tokenize = 0
    dialect = 'n'   # 'c', 's'
    tone_type = 0
    if tone_type == 0:
        pham = 1
    else:
        cao = 1

    #Input text
    line = text
    if line == '\n':
        return ""
    else:
        compound = u''
        ortho = u''
        words = line.split()

        ## toss len==0 junk
        words = [word for word in words if len(word) > 0]
        ## hack to get rid of single hyphens or underscores
        words = [word for word in words if word != u'-']
        words = [word for word in words if word != u'_']

        for i in range(0, len(words)):
            word = words[i].strip()
            ortho += word
            word = word.strip(punctuation).lower()

            ## 29.03.16: check if tokenize is true
            ## if true, call this routine for each substring
            ## and re-concatenate
            if (tokenize and '-' in word) or (tokenize and '_' in word):
                substrings = re.split(r'(_|-)', word)
                values = substrings[::2]
                delimiters = substrings[1::2] + ['']
                ipa = [convert(x, dialect, glottal, pham, cao, palatals, delimit).strip() for x in values]
                seq = ''.join(v+d for v, d in zip(ipa, delimiters))
            else:
                seq = convert(word, dialect, glottal, pham, cao, palatals, delimit).strip()

            # concatenate
            if len(words) >= 2:
                ortho += ' '
            if i < len(words)-1:
                seq = seq + u' '
            compound = compound + seq

        return compound
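# Usage sketch (hedged): T2IPA_split delimits each phoneme of every syllable, so
# T2IPA_split(u'toán', '/') should produce roughly '/tw/ʷa/n/5/'; multi-word input is
# processed word by word with spaces preserved between the words.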
def T2IPA(text):
    sys.path.append('./Rules')  # make sure we can find the Rules files

    #Setup options
    glottal = 0
    pham = 0
    cao = 0
    palatals = 0
    tokenize = 0
    delimit = ''
    dialect = 'n'   # 'c', 's'
    tone_type = 0
    if tone_type == 0:
        pham = 1
    else:
        cao = 1

    #Input text
    line = text
    if line == '\n':
        return ""
    else:
        compound = u''
        ortho = u''
        words = line.split()

        ## toss len==0 junk
        words = [word for word in words if len(word) > 0]
        ## hack to get rid of single hyphens or underscores
        words = [word for word in words if word != u'-']
        words = [word for word in words if word != u'_']

        for i in range(0, len(words)):
            word = words[i].strip()
            ortho += word
            word = word.strip(punctuation).lower()

            ## 29.03.16: check if tokenize is true
            ## if true, call this routine for each substring
            ## and re-concatenate
            if (tokenize and '-' in word) or (tokenize and '_' in word):
                substrings = re.split(r'(_|-)', word)
                values = substrings[::2]
                delimiters = substrings[1::2] + ['']
                ipa = [convert(x, dialect, glottal, pham, cao, palatals, delimit).strip() for x in values]
                seq = ''.join(v+d for v, d in zip(ipa, delimiters))
            else:
                seq = convert(word, dialect, glottal, pham, cao, palatals, delimit).strip()

            # concatenate
            if len(words) >= 2:
                ortho += ' '
            if i < len(words)-1:
                seq = seq + u' '
            compound = compound + seq

        return compound
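# Minimal sketch (illustrative helper, not part of the original API):
def _demo_T2IPA():
    # A Vietnamese syllable becomes an undelimited phoneme string (roughly 'twʷan5'),
    # while a non-Vietnamese token is returned bracketed (roughly '[xyz]').
    return T2IPA(u'toán'), T2IPA(u'xyz')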
# Vietnamese readings of the English letter names; used to spell out all-caps tokens letter by letter
EN={"a":"ây","ă":"á","â":"ớ","b":"bi","c":"si","d":"đi","đ":"đê","e":"i","ê":"ê","f":"ép","g":"giy","h":"ếch","i":"ai","j":"giây","k":"cây","l":"eo","m":"em","n":"en","o":"âu","ô":"ô","ơ":"ơ","p":"pi","q":"kiu","r":"a","s":"ét","t":"ti","u":"diu","ư":"ư","v":"vi","w":"đắp liu","x":"ít","y":"quai","z":"giét"}

import re
def vi2IPA_split(texts, delimit):
    content = []
    with open(imp.find_module('viphoneme')[1]+"/Popular.txt", encoding="utf-8") as f:
        content = f.read().splitlines()

    tess = texts.split(".")
    Results = ""
    for text in tess:
        #print("------------------------------------------------------")
        TN = TTSnorm(text)
        #TN=text
        #print("------------------------------------------------------")
        #print("Text normalize: ",TN)
        TK = word_tokenize(TN)
        #print("Vietnamese Tokenize: ",TK)

        # Split a multi-syllable token back into single syllables when any of its
        # syllables is not in the Popular list or is not valid Vietnamese.
        for iuv, under_valid in enumerate(TK):
            token_under = under_valid.split(" ")
            checkinvalid = 0
            ##print(token_under)
            if len(token_under) > 1:
                for tok in token_under:
                    if tok not in content or "[" in T2IPA(tok):
                        checkinvalid = 1
            if checkinvalid == 1:
                TK = TK[:iuv] + TK[iuv+1:]
                for tok in reversed(token_under):
                    TK.insert(iuv, tok)

        IPA = ""
        for tk in TK:
            ipa = T2IPA_split(tk, delimit).replace(" ", "_")
            if ipa == "":
                IPA += delimit+tk+delimit+" "
            elif ipa[0] == "[" and ipa[-1] == "]":
                eng = eng_to_ipa.convert(tk)
                if eng[-1] == "*":
                    if tk.lower().upper() == tk:
                        ##print("ENGLISH",tk)
                        # Read the English word out letter by letter
                        letter2sound = ""
                        for char in tk:
                            CHAR = str(char).lower()
                            if CHAR in list(EN.keys()):
                                letter2sound += EN[CHAR]+" "
                            else:
                                letter2sound += char+" "
                        IPA += T2IPA_split(letter2sound, delimit)+" "
                    else:
                        # Keep the token unchanged
                        # Future experiment: an unknown word could be converted with eng_norm
                        # instead of being kept unchanged like this
                        IPA += Parsing("default", tk.lower(), delimit)+" "
                else:
                    # This branch is for the version where English is not split into syllables
                    #IPA+=Parsing("default",eng,delimit)+" "
                    # This version splits English into syllables
                    IPA += normEng(tk, delimit) + delimit+" "
                # Check the English dictionary (Etrain)
                # If there is a mapping, use it
                # If not, check whether the token contains a vowel
                # If it does, keep it as-is
                # If not, spell it out letter by letter
                #print(" ..................Out of domain word: " ,ipa)
            else:
                IPA += ipa+" "

        IPA = re.sub(delimit+'+', delimit, IPA)
        IPA = re.sub(' +', ' ', IPA)
        #print("IPA Vietnamese: ",IPA)
        #print("------------------------------------------------------")
        Results += IPA.rstrip()+" "+delimit+"."+delimit+" "

    return Results.rstrip()
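# Usage sketch (hedged, requires vinorm, underthesea and eng_to_ipa to be installed):
# vi2IPA_split(u'xin chào', '/') first normalizes and tokenizes the text, then returns a
# delimited phoneme string in which '_' joins the syllables of multi-syllable tokens and a
# delimited '.' is appended after each sentence.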
def vi2IPA(text):
    #print("------------------------------------------------------")
    TN = TTSnorm(text)
    #print("------------------------------------------------------")
    #print("Text normalize: ",TN)
    TK = word_tokenize(TN)
    #print("Vietnamese Tokenize: ",TK)

    IPA = ""
    for tk in TK:
        ipa = T2IPA(tk).replace(" ", "_")
        if ipa == "":
            IPA += tk+" "
        elif ipa[0] == "[" and ipa[-1] == "]":
            eng = eng_to_ipa.convert(tk)
            if eng[-1] == "*":
                if tk.lower().upper() == tk:
                    # Read the English word out letter by letter
                    letter2sound = ""
                    for char in tk:
                        CHAR = str(char).lower()
                        if CHAR in list(EN.keys()):
                            letter2sound += EN[CHAR]+" "
                        else:
                            letter2sound += char+" "
                    IPA += T2IPA_split(letter2sound, "")+" "
                else:
                    # Keep the token unchanged
                    IPA += Parsing("default", tk, "")+" "
            else:
                IPA += eng+" "
            # Check the English dictionary (Etrain)
            # If there is a mapping, use it
            # If not, check whether the token contains a vowel
            # If it does, keep it as-is
            # If not, spell it out letter by letter
            #print(" ..................Out of domain word: " ,ipa)
        else:
            IPA += ipa+" "

    IPA = re.sub(' +', ' ', IPA)
    #print("IPA Vietnamese: ",IPA)
    #print("------------------------------------------------------")
    return IPA
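# Usage sketch (hedged, requires the same external packages): vi2IPA(u'xin chào') should
# return plain undelimited IPA per token, e.g. something like 'sin1_caw2 ' for this input
# depending on how underthesea tokenizes it, with English tokens passed through eng_to_ipa
# and all-caps tokens spelled out letter by letter using the EN table above.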