|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from itertools import chain |
|
|
|
|
|
# Vowel symbols carrying a stress digit of 1 or 2 (the name presumably
# abbreviates "stressed lax" -- confirm with the author).  `syllabify` uses
# this set for its "Alaska rule": when the preceding nucleus ends in one of
# these vowels, a leading "S" of the following consonant cluster is placed in
# the coda instead of the next onset.
SLAX = {
    "IH1",
    "IH2",
    "EH1",
    "EH2",
    "AE1",
    "AE2",
    "AH1",
    "AH2",
    "UH1",
    "UH2",
}
|
# Every segment that `syllabify` treats as a syllable nucleus: each vowel
# symbol with a stress digit (0, 1 or 2) and in its bare, digit-less form.
# The 1/2-stressed forms of IH, EH, AE, AH and UH are contributed by SLAX.
VOWELS = {
    # stress-marked vowels not already listed in SLAX
    "IY1", "IY2", "IY0",
    "EY1", "EY2", "EY0",
    "AA1", "AA2", "AA0",
    "ER1", "ER2", "ER0",
    "AW1", "AW2", "AW0",
    "AO1", "AO2", "AO0",
    "AY1", "AY2", "AY0",
    "OW1", "OW2", "OW0",
    "OY1", "OY2", "OY0",
    "UW1", "UW2", "UW0",
    # unstressed (0) forms of the vowels whose 1/2 forms live in SLAX
    "IH0", "EH0", "AE0", "AH0", "UH0",
    # bare symbols without a stress digit
    "UW", "IY", "EY", "AA", "ER",
    "AW", "AO", "AY", "OW", "OY",
    "IH", "EH", "AE", "AH", "UH",
} | SLAX
|
|
|
|
|
|
|
# Two-consonant sequences that `syllabify` accepts as a complex onset.  When
# the last two consonants before a vowel form one of these pairs, both are
# kept in the onset; otherwise only the final consonant is.
O2 = {
    # consonant + R
    ("P", "R"),
    ("T", "R"),
    ("K", "R"),
    ("B", "R"),
    ("D", "R"),
    ("G", "R"),
    ("F", "R"),
    ("TH", "R"),
    # consonant + L
    ("P", "L"),
    ("K", "L"),
    ("B", "L"),
    ("G", "L"),
    ("F", "L"),
    ("S", "L"),
    # consonant + W
    ("K", "W"),
    ("G", "W"),
    ("S", "W"),
    # S + stop
    ("S", "P"),
    ("S", "T"),
    ("S", "K"),
    # remaining pairs
    ("HH", "Y"),
    ("R", "W"),
}
|
# Three-consonant sequences that `syllabify` accepts as a complex onset.
# They are only consulted after the last two consonants have already matched
# an entry of O2.
O3 = {
    ("S", "T", "R"),
    ("S", "K", "L"),
    ("T", "R", "W"),
}
|
|
|
|
|
|
|
|
|
|
|
def syllabify(pron, alaska_rule=True):
    """
    Syllabifies a CMU dictionary (ARPABET) word string

    `pron` is an iterable of segment strings (e.g. the result of calling
    `.split()` on a dictionary entry).  The return value is a list with one
    `(onset, nucleus, coda)` tuple per vowel found in `pron`; each of the
    three members is a list of segments.  When `alaska_rule` is true, an "S"
    that starts a medial cluster of two or more consonants and follows a
    nucleus ending in a member of SLAX is placed in the coda.

    Raises ValueError if the flattened result does not reproduce the input
    exactly, which happens for a non-empty `pron` that contains no vowel.

    # Alaska rule:
    >>> pprint(syllabify('AH0 L AE1 S K AH0'.split())) # Alaska
    '-AH0-.L-AE1-S.K-AH0-'
    >>> pprint(syllabify('AH0 L AE1 S K AH0'.split(), 0)) # Alaska
    '-AH0-.L-AE1-.S K-AH0-'

    # huge medial onsets:
    >>> pprint(syllabify('M IH1 N S T R AH0 L'.split())) # minstrel
    'M-IH1-N.S T R-AH0-L'
    >>> pprint(syllabify('AA1 K T R W AA0 R'.split())) # octroi
    '-AA1-K.T R W-AA0-R'

    # destressing
    >>> pprint(destress(syllabify('M IH1 L AH0 T EH2 R IY0'.split())))
    'M-IH-.L-AH-.T-EH-.R-IY-'

    # normal treatment of 'j':
    >>> pprint(syllabify('M EH1 N Y UW0'.split())) # menu
    'M-EH1-N.Y-UW0-'
    >>> pprint(syllabify('S P AE1 N Y AH0 L'.split())) # spaniel
    'S P-AE1-N.Y-AH0-L'
    >>> pprint(syllabify('K AE1 N Y AH0 N'.split())) # canyon
    'K-AE1-N.Y-AH0-N'
    >>> pprint(syllabify('M IH0 N Y UW2 EH1 T'.split())) # minuet
    'M-IH0-N.Y-UW2-.-EH1-T'
    >>> pprint(syllabify('JH UW1 N Y ER0'.split())) # junior
    'JH-UW1-N.Y-ER0-'
    >>> pprint(syllabify('K L EH R IH HH Y UW'.split())) # clerihew
    'K L-EH-.R-IH-.HH Y-UW-'

    # nuclear treatment of 'j'
    >>> pprint(syllabify('R EH1 S K Y UW0'.split())) # rescue
    'R-EH1-S.K-Y UW0-'
    >>> pprint(syllabify('T R IH1 B Y UW0 T'.split())) # tribute
    'T R-IH1-B.Y-UW0-T'
    >>> pprint(syllabify('N EH1 B Y AH0 L AH0'.split())) # nebula
    'N-EH1-B.Y-AH0-.L-AH0-'
    >>> pprint(syllabify('S P AE1 CH UH0 L AH0'.split())) # spatula
    'S P-AE1-.CH-UH0-.L-AH0-'
    >>> pprint(syllabify('AH0 K Y UW1 M AH0 N'.split())) # acumen
    '-AH0-K.Y-UW1-.M-AH0-N'
    >>> pprint(syllabify('S AH1 K Y AH0 L IH0 N T'.split())) # succulent
    'S-AH1-K.Y-AH0-.L-IH0-N T'
    >>> pprint(syllabify('F AO1 R M Y AH0 L AH0'.split())) # formula
    'F-AO1 R-M.Y-AH0-.L-AH0-'
    >>> pprint(syllabify('V AE1 L Y UW0'.split())) # value
    'V-AE1-L.Y-UW0-'

    # everything else
    >>> pprint(syllabify('N AO0 S T AE1 L JH IH0 K'.split())) # nostalgic
    'N-AO0-.S T-AE1-L.JH-IH0-K'
    >>> pprint(syllabify('CH ER1 CH M AH0 N'.split())) # churchmen
    'CH-ER1-CH.M-AH0-N'
    >>> pprint(syllabify('K AA1 M P AH0 N S EY2 T'.split())) # compensate
    'K-AA1-M.P-AH0-N.S-EY2-T'
    >>> pprint(syllabify('IH0 N S EH1 N S'.split())) # inCENSE
    '-IH0-N.S-EH1-N S'
    >>> pprint(syllabify('IH1 N S EH2 N S'.split())) # INcense
    '-IH1-N.S-EH2-N S'
    >>> pprint(syllabify('AH0 S EH1 N D'.split())) # ascend
    '-AH0-.S-EH1-N D'
    >>> pprint(syllabify('R OW1 T EY2 T'.split())) # rotate
    'R-OW1-.T-EY2-T'
    >>> pprint(syllabify('AA1 R T AH0 S T'.split())) # artist
    '-AA1 R-.T-AH0-S T'
    >>> pprint(syllabify('AE1 K T ER0'.split())) # actor
    '-AE1-K.T-ER0-'
    >>> pprint(syllabify('P L AE1 S T ER0'.split())) # plaster
    'P L-AE1-S.T-ER0-'
    >>> pprint(syllabify('B AH1 T ER0'.split())) # butter
    'B-AH1-.T-ER0-'
    >>> pprint(syllabify('K AE1 M AH0 L'.split())) # camel
    'K-AE1-.M-AH0-L'
    >>> pprint(syllabify('AH1 P ER0'.split())) # upper
    '-AH1-.P-ER0-'
    >>> pprint(syllabify('B AH0 L UW1 N'.split())) # balloon
    'B-AH0-.L-UW1-N'
    >>> pprint(syllabify('P R OW0 K L EY1 M'.split())) # proclaim
    'P R-OW0-.K L-EY1-M'
    >>> pprint(syllabify('IH0 N S EY1 N'.split())) # insane
    '-IH0-N.S-EY1-N'
    >>> pprint(syllabify('IH0 K S K L UW1 D'.split())) # exclude
    '-IH0-K.S K L-UW1-D'
    """
    segments = list(pron)

    # Main pass: each vowel opens a nucleus, and all consonants since the
    # previous vowel are provisionally assigned to that syllable's onset.
    nuclei = []
    onsets = []
    last_vowel = -1
    for index, seg in enumerate(segments):
        if seg in VOWELS:
            nuclei.append([seg])
            onsets.append(segments[last_vowel + 1 : index])
            last_vowel = index
    # Whatever follows the last vowel is the coda of the final syllable.
    final_coda = segments[last_vowel + 1 :]

    # Second pass: split every medial cluster between the coda of the
    # preceding syllable and the onset of the current one.  The word-initial
    # onset (index 0) is left untouched.
    codas = []
    for i in range(1, len(onsets)):
        onset = onsets[i]
        coda = []

        # Boundary cases, applied in this order because each one changes
        # what the next one sees.
        if len(onset) > 1 and onset[0] == "R":
            # A cluster-initial "R" joins the preceding nucleus.
            nuclei[i - 1].append(onset.pop(0))
        if len(onset) > 2 and onset[-1] == "Y":
            # A cluster-final "Y" after two or more consonants joins the
            # following nucleus.
            nuclei[i].insert(0, onset.pop())
        if (
            len(onset) > 1
            and alaska_rule
            and nuclei[i - 1][-1] in SLAX
            and onset[0] == "S"
        ):
            coda.append(onset.pop(0))

        # Onset maximization: keep the longest licensed cluster (three
        # consonants via O3, two via O2, otherwise one) in the onset.
        depth = 1
        if len(onset) > 1 and tuple(onset[-2:]) in O2:
            depth = 3 if tuple(onset[-3:]) in O3 else 2

        # Everything in front of the kept onset moves to the coda.  The
        # count is clamped at zero because an empty onset gives -1.
        surplus = max(len(onset) - depth, 0)
        coda.extend(onset[:surplus])
        del onset[:surplus]

        codas.append(coda)
    codas.append(final_coda)

    # Verify that every input segment appears in the output, in order.
    output = list(zip(onsets, nuclei, codas))
    flat_output = list(chain.from_iterable(chain.from_iterable(output)))
    if flat_output != segments:
        raise ValueError(f"could not syllabify {segments}, got {flat_output}")
    return output
|
|
|
|
|
def pprint(syllab):
    """
    Pretty-print a syllabification

    `syllab` is an iterable of syllables, each an iterable of segment lists
    (as returned by `syllabify`).  The string is returned, not printed:
    segments within a part are joined with spaces, the parts of a syllable
    with "-", and syllables with ".".

    >>> pprint([(['K'], ['AE1'], []), (['M'], ['AH0'], ['L'])])
    'K-AE1-.M-AH0-L'
    """
    return ".".join(
        "-".join(" ".join(part) for part in syllable) for syllable in syllab
    )
|
|
|
|
|
def destress(syllab):
    """
    Generate a syllabification with nuclear stress information removed

    `syllab` is an iterable of `(onset, nucleus, coda)` triples.  A new list
    of triples is returned in which a trailing "0", "1" or "2" has been
    dropped from every nucleus segment.  Nuclei are rebuilt as new lists;
    onsets and codas are passed through as the same objects.

    >>> destress([(['M'], ['IH1'], []), (['L'], ['AH0'], ['T'])])
    [(['M'], ['IH'], []), (['L'], ['AH'], ['T'])]
    """
    stress_marks = ("0", "1", "2")
    return [
        (
            onset,
            # endswith() is also safe for an empty segment, where indexing
            # the last character would raise IndexError.
            [seg[:-1] if seg.endswith(stress_marks) else seg for seg in nucleus],
            coda,
        )
        for onset, nucleus, coda in syllab
    ]
|
|
|
|
|
# Running the module as a script executes the doctests embedded in the
# docstrings above.
if __name__ == "__main__":
    import doctest

    doctest.testmod()
|
|