Spaces:
Sleeping
Sleeping
File size: 7,739 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Christopher Hench <chris.l.hench@gmail.com>
# Alex Estes
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
"""
The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed
by Otto Jespersen in 1904. The sonorous quality of a phoneme is judged by the
openness of the lips. Syllable breaks occur before troughs in sonority. For more
on the SSP see Selkirk (1984).
The default implementation uses the English alphabet, but the `sonority_hierarchy`
can be modified to IPA or any other alphabet for the use-case. The SSP is a
universal syllabification algorithm, but that does not mean it performs equally
across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
if utilizing IPA (pg. 311).
Importantly, if a custom hierarchy is supplied and vowels span across more than
one level, they should be given separately to the `vowels` class attribute.
References:
- Otto Jespersen. 1904. Lehrbuch der Phonetik.
Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
Cambridge, MIT Press. pp. 107-136.
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
In HLT-NAACL. pp. 308-316.
"""
import re
import warnings
from string import punctuation
from nltk.tokenize.api import TokenizerI
from nltk.util import ngrams
class SyllableTokenizer(TokenizerI):
    """
    Syllabifies words based on the Sonority Sequencing Principle (SSP).

        >>> from nltk.tokenize import SyllableTokenizer
        >>> from nltk import word_tokenize
        >>> SSP = SyllableTokenizer()
        >>> SSP.tokenize('justification')
        ['jus', 'ti', 'fi', 'ca', 'tion']
        >>> text = "This is a foobar-like sentence."
        >>> [SSP.tokenize(token) for token in word_tokenize(text)]
        [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
    """

    def __init__(self, lang="en", sonority_hierarchy=False):
        """
        :param lang: Language parameter, default is English, 'en'
        :type lang: str
        :param sonority_hierarchy: Sonority hierarchy according to the
            Sonority Sequencing Principle, most sonorous level first.
            Any falsy value selects the built-in English hierarchy.
        :type sonority_hierarchy: list(str)
        :raises ValueError: if no hierarchy is given and ``lang`` is not 'en',
            because only an English default is built in.
        """
        # Sonority hierarchy should be provided in descending order.
        # The first level is taken as the vowels. If vowels are spread
        # across multiple levels, assign all of them to ``self.vowels``
        # after construction.
        if not sonority_hierarchy:
            if lang != "en":
                raise ValueError(
                    f"No default sonority hierarchy for lang={lang!r}; "
                    "pass `sonority_hierarchy` explicitly."
                )
            sonority_hierarchy = [
                "aeiouy",  # vowels.
                "lmnrw",  # sonorants (nasals, liquids, glides).
                "zvsf",  # fricatives.
                "bcdgtkpqxhj",  # stops and all remaining consonants.
            ]

        self.vowels = sonority_hierarchy[0]
        # Maps each phoneme (and its uppercase form) to a sonority value;
        # the first hierarchy level gets the highest value.
        self.phoneme_map = {}
        n_levels = len(sonority_hierarchy)
        for i, level in enumerate(sonority_hierarchy):
            sonority_level = n_levels - i
            for c in level:
                self.phoneme_map[c] = sonority_level
                self.phoneme_map[c.upper()] = sonority_level

    def _vowel_variants(self):
        """
        Return every vowel symbol together with its uppercase form.

        ``phoneme_map`` registers both the given and the uppercase form of
        each phoneme, so vowel detection has to accept both as well;
        otherwise capitalised words are treated as having no vowels.

        :return: Set of vowel symbols to look for in a token.
        :rtype: set(str)
        """
        variants = set()
        for vowel in self.vowels:
            variants.add(vowel)
            variants.add(vowel.upper())
        # An empty symbol would match (and be counted) everywhere.
        variants.discard("")
        return variants

    def assign_values(self, token):
        """
        Assigns each phoneme its value from the sonority hierarchy.
        Note: Sentence/text has to be tokenized first.

        Characters missing from the hierarchy that are neither digits nor
        punctuation trigger a warning, are given the highest sonority value
        and are added to ``self.vowels``. Digits and punctuation get -1.

        :param token: Single word or token
        :type token: str
        :return: List of tuples, first element is character/phoneme and
                 second is the sonority value.
        :rtype: list(tuple(str, int))
        """
        syllables_values = []
        for c in token:
            try:
                syllables_values.append((c, self.phoneme_map[c]))
            except KeyError:
                if c not in "0123456789" and c not in punctuation:
                    warnings.warn(
                        "Character not defined in sonority_hierarchy,"
                        f" assigning as vowel: '{c}'"
                    )
                    syllables_values.append((c, max(self.phoneme_map.values())))
                    if c not in self.vowels:
                        self.vowels += c
                else:  # If it's a punctuation or numbers, assign -1.
                    syllables_values.append((c, -1))
        return syllables_values

    def validate_syllables(self, syllable_list):
        """
        Ensures each syllable has at least one vowel.
        A syllable without a vowel is attached to the preceding entry, or,
        when nothing precedes it, prepended to the first syllable that
        contains a vowel. Punctuation entries are kept as they are and
        empty entries are dropped.

        :param syllable_list: Single word or token broken up into syllables.
        :type syllable_list: list(str)
        :return: Single word or token broken up into syllables
                 (with added syllables if necessary)
        :rtype: list(str)
        """
        vowel_variants = self._vowel_variants()
        valid_syllables = []
        front = ""  # leading vowel-less material waiting for a vowel syllable
        for syllable in syllable_list:
            if not syllable:
                # "" is a substring of every string, so it would otherwise
                # pass the punctuation test below and be kept.
                continue
            if syllable in punctuation:
                if front:
                    # Flush pending consonants instead of silently losing
                    # them once the result list becomes non-empty.
                    valid_syllables.append(front)
                    front = ""
                valid_syllables.append(syllable)
                continue
            # Literal substring test: vowel symbols are never interpreted
            # as regular-expression syntax.
            if any(vowel in syllable for vowel in vowel_variants):
                valid_syllables.append(front + syllable)
                front = ""
            elif not valid_syllables:
                front += syllable
            else:
                valid_syllables[-1] += syllable
        if front:
            # No vowel syllable ever followed; keep the text rather than
            # dropping it.
            valid_syllables.append(front)
        return valid_syllables

    def tokenize(self, token):
        """
        Apply the SSP to return a list of syllables.
        Note: Sentence/text has to be tokenized first.

        :param token: Single word or token
        :type token: str
        :return syllable_list: Single word or token broken up into syllables.
        :rtype: list(str)
        """
        # assign values from hierarchy (may extend self.vowels, so this has
        # to happen before the vowels are counted)
        syllables_values = self.assign_values(token)

        # if at most one vowel (in either case) return word
        vowel_count = sum(token.count(vowel) for vowel in self._vowel_variants())
        if vowel_count <= 1 or len(syllables_values) < 2:
            return [token]

        syllable_list = []
        syllable = syllables_values[0][0]  # start syllable with first phoneme
        for trigram in ngrams(syllables_values, n=3):
            # Previous, focal and following phoneme with their sonority.
            (_, prev_value), (focal_phoneme, focal_value), (_, next_value) = trigram

            # These cases trigger syllable break.
            if focal_value == -1:  # If it's a punctuation, just break.
                if syllable:  # nothing pending after consecutive breaks
                    syllable_list.append(syllable)
                syllable_list.append(focal_phoneme)
                syllable = ""
            elif prev_value >= focal_value == next_value:
                # Plateau: close the syllable after the focal phoneme.
                syllable_list.append(syllable + focal_phoneme)
                syllable = ""
            elif prev_value > focal_value < next_value:
                # Sonority trough: the focal phoneme opens a new syllable.
                if syllable:
                    syllable_list.append(syllable)
                syllable = focal_phoneme
            # no syllable break
            else:
                syllable += focal_phoneme

        syllable += syllables_values[-1][0]  # append last phoneme
        syllable_list.append(syllable)

        return self.validate_syllables(syllable_list)
|