# Transformations of text sequences for matching
from __future__ import annotations

import re
from typing import TYPE_CHECKING

from .symbols import consonants

if TYPE_CHECKING:
    from .cmudictext import CMUDictExt

_re_digit = re.compile(r'\d+')

# Names of the processing features tracked in the statistics dictionaries
_FEATURES = (
    'plural',
    'possessives',
    'contractions',
    'hyphenated',
    'compound',
    'compound_l2',
    'stem',
)

# ARPAbet sibilants: a possessive/plural after these is pronounced {IH0 Z}
_SIBILANT_ENDINGS = frozenset({'S', 'Z', 'CH', 'JH', 'SH', 'ZH'})
# ARPAbet voiced, non-sibilant consonants: suffix is pronounced {Z}
# (vowels are detected separately through their trailing stress digit)
_VOICED_ENDINGS = frozenset(
    {'B', 'D', 'G', 'M', 'N', 'R', 'L', 'NG', 'V', 'DH'}
)
# ARPAbet voiceless, non-sibilant consonants: suffix is pronounced {S}
_VOICELESS_ENDINGS = frozenset({'P', 'T', 'K', 'TH', 'F'})

# Part-of-speech tags that mark a plural noun
_PLURAL_TAGS = ('NNS', 'NNPS')


class Processor:
    """
    Collection of fallback transformations used to resolve phonemes for
    words that have no direct dictionary entry.

    Every ``auto_*`` method returns ``None`` when it cannot resolve the word.
    Each method also updates the ``stat_*`` counters so callers can inspect
    how often a feature was attempted and how often it succeeded.
    """

    def __init__(self, cde: CMUDictExt):
        """
        Binds the callables of the owning dictionary that the processors use.

        :param cde: Owning CMUDictExt instance
        """
        self._lookup = cde.lookup
        self._cmu_get = cde.dict.get
        self._segment = cde.segment
        self._tag = cde.h2p.tag
        self._stem = cde.stem
        # Number of times respective methods were called
        self.stat_hits = dict.fromkeys(_FEATURES, 0)
        # Number of times respective methods returned value (not None)
        self.stat_resolves = dict.fromkeys(_FEATURES, 0)
        # Holds events when features encountered unexpected language syntax
        # (a fresh list per feature, never a shared one)
        self.stat_unexpected = {feature: [] for feature in _FEATURES}

    def _lookup_and_join(self, parts) -> str | None:
        """
        Looks up every part in 'sds' format and joins the results.

        :param parts: Iterable of word fragments
        :return: Space-joined phonemes, or None as soon as a part is unknown
        """
        resolved = []
        for part in parts:
            ph_part = self._lookup(part, ph_format='sds')
            if ph_part is None:
                return None  # Part not found, stop without further lookups
            resolved.append(ph_part)
        return ' '.join(resolved)

    def auto_possessives(self, word: str) -> list[str] | None:
        """
        Auto-possessives

        The base word (input without the trailing "'s") is looked up with
        ph_format='list' and the phoneme(s) of the possessive suffix are
        appended. There are 3 general cases:

        1. Base words ending in a sibilant
            - i.e. Tess's, Rose's, Butch's, Midge's, Rush's, Garage's
            - In ARPAbet: {S}, {Z}, {CH}, {JH}, {SH}, {ZH}
            - These require a suffix of {IH0 Z}
        2. Base words ending in vowels and voiced consonants
            - i.e. Fay's, Hugh's, Bob's, Ted's, Meg's, Sam's, Dean's,
              Claire's, Paul's, Bing's, Dave's
            - In ARPAbet: any vowel (recognised by its trailing stress
              digit), {B}, {D}, {G}, {M}, {N}, {R}, {L}, {NG}, {V}, {DH}
            - These require a suffix of {Z}
        3. Base words ending in voiceless consonants
            - i.e. Hope's, Pat's, Clark's, Ruth's, Jeff's
            - In ARPAbet: {P}, {T}, {K}, {TH}, {F}
            - These require a suffix of {S}

        :param word: Input of possible possessive word
        :return: New list of phonemes (the lookup result is not modified),
                 or None if unresolvable
        """
        if not word.endswith("'s"):
            return None
        # If the word ends with "'s", register a hit
        self.stat_hits['possessives'] += 1

        core = word[:-2]  # Core word without the possessive marker
        ph = self._lookup(core, ph_format='list')
        if not ph:
            return None  # Core word not found (or resolved to nothing)

        last = ph[-1]
        if last in _SIBILANT_ENDINGS:
            suffix = ['IH0', 'Z']  # [Case 1]
        elif last in _VOICED_ENDINGS or last[-1:].isdigit():
            suffix = ['Z']  # [Case 2] voiced consonant or numbered vowel
        elif last in _VOICELESS_ENDINGS:
            suffix = ['S']  # [Case 3]
        else:
            return None  # No match found

        self.stat_resolves['possessives'] += 1
        # Build a new list: extending `ph` in place would alter whatever
        # object the lookup handed out (possibly a cached entry).
        return [*ph, *suffix]

    def auto_contractions(self, word: str) -> list[str] | None:
        """
        Auto contracts form and finds phonemes

        Supported contractions:
        - 'll  (appends {L})
        - 'd   (appends {D})

        Words with more than one apostrophe whose second segment is a
        supported contraction are recorded in stat_unexpected and skipped.

        :param word: Input of possible contraction
        :return: New list of phonemes (the lookup result is not modified),
                 or None if unresolvable
        """
        # First, check if the word is a contraction
        parts = word.split("'")
        if len(parts) == 1 or parts[1] not in {'ll', 'd'}:
            return None  # No contraction found
        if len(parts) > 2:
            # More than 2 parts, can't be a simple contraction
            self.stat_unexpected['contractions'].append(word)
            return None
        # If initial check passes, register a hit
        self.stat_hits['contractions'] += 1

        core, ending = parts
        ph = self._lookup(core, ph_format='list')
        if ph is None:
            return None  # Core word not found

        suffix = 'L' if ending == 'll' else 'D'
        self.stat_resolves['contractions'] += 1
        return [*ph, suffix]

    def auto_hyphenated(self, word: str) -> str | None:
        """
        Splits hyphenated words and attempts to resolve components

        :param word: Input of possible hyphenated word
        :return: Space-joined phonemes of all components, or None if the
                 word has no hyphen or any component is unresolvable
        """
        # First, check if the word is a hyphenated word
        if '-' not in word:
            return None  # No hyphen found
        # If initial check passes, register a hit
        self.stat_hits['hyphenated'] += 1

        ph = self._lookup_and_join(word.split('-'))
        if ph is None:
            return None
        self.stat_resolves['hyphenated'] += 1
        return ph

    def auto_compound(self, word: str) -> str | None:
        """
        Splits compound words and attempts to resolve components

        :param word: Input of possible compound word
        :return: Space-joined phonemes of all components, or None if the
                 word was not segmented or any component is unresolvable
        """
        # Split word into parts
        parts = self._segment(word)
        if len(parts) < 2:
            return None  # No compound found (also covers an empty split)
        # If initial check passes, register a hit
        self.stat_hits['compound'] += 1

        ph = self._lookup_and_join(parts)
        if ph is None:
            return None
        self.stat_resolves['compound'] += 1
        return ph

    def auto_plural(
        self, word: str, pos: str | None = None
    ) -> list[str] | None:
        """
        Finds singular form of plurals and attempts to resolve separately

        Optionally a pos tag can be provided. If no tags are provided,
        there will be a single word pos inference, which is not ideal.

        The singular is resolved through auto_possessives() because the
        plural and possessive suffixes are pronounced the same; this also
        updates the 'possessives' statistics.

        :param word: Input of possible plural word
        :param pos: Part-of-speech tag of the word; a sequence of tags is
                    accepted as well, in which case the first one is used
        :return: List of phonemes, or None if unresolvable
        """
        # Needs to end in 's' or 'es' (endswith is safe for empty input)
        if not word.endswith('s'):
            return None  # No plural found
        # Now check if the word is a plural using pos
        if pos is None:
            pos = self._tag(word)
        if isinstance(pos, str):
            # A bare tag: wrap it so pos[0] is the tag, not its first letter
            pos = [pos]
        if not pos or pos[0] not in _PLURAL_TAGS:
            return None  # No tag found, or not a plural noun
        # If initial check passes, register a hit
        self.stat_hits['plural'] += 1

        singulars = []
        # Case 1: word ends in 'oes' -> remove the 'es' to get the singular
        if len(word) > 3 and word.endswith('oes'):
            singulars.append(word[:-2])
        # Case 2: word ends in 's' -> remove the 's' to get the singular
        if len(word) > 1:
            singulars.append(word[:-1])

        for singular in singulars:
            ph = self.auto_possessives(singular + "'s")
            if ph is not None:
                self.stat_resolves['plural'] += 1
                return ph
        # If no matches, return None
        return None

    def auto_stem(self, word: str) -> str | None:
        """
        Attempts to resolve using the root stem of a word.

        Supported suffixes:
            - "ly"  -> root phonemes + 'L IY0'
            - "ing" -> root phonemes + 'IH0 NG'

        The root is the word with the suffix removed and is looked up as-is.
        Spelling adjustments are NOT applied here: a dropped [e]
        (taking -> take) or a doubled consonant (running -> run) only
        resolves if the lookup itself can handle 'tak' / 'runn'.

        :param word: Input word
        :return: Space-joined phonemes, or None if unresolvable
        """
        # Discontinue if word is too short or has no supported suffix
        if len(word) < 3 or not word.endswith(('ly', 'ing')):
            return None
        self.stat_hits['stem'] += 1  # Register hit

        if word.endswith('ly'):
            root, ph_suffix = word[:-2], 'L IY0'
        else:
            root, ph_suffix = word[:-3], 'IH0 NG'

        ph_root = self._lookup(root, ph_format='sds')
        if ph_root is None:
            return None  # Root not found

        self.stat_resolves['stem'] += 1
        return ' '.join((ph_root, ph_suffix))

    def auto_component(self, word: str) -> str | None:
        """
        Searches for target word as component of a larger word

        Not implemented yet. Planned approach:
        - i.e. 'synth' is not in the cmu dictionary
        - Stage 1: search for any word beginning with 'synth', since most
          unseen short words are likely shortened versions
        - Stage 2: search for any word containing 'synth'

        :param word: Input word
        :raises NotImplementedError: Always
        """
        raise NotImplementedError

    def auto_compound_l2(
        self, word: str, recursive: bool = True
    ) -> str | None:
        """
        Searches for target word as a compound word.

        > Does not use n-gram splitting like auto_compound()
        > Splits the word at every possible position into two parts
        > Among the splits where both parts resolve, picks the one whose
          shorter part is longest (on ties the split furthest to the
          right wins)

        :param word: Input word, must be alphabetic with at least 3 chars
        :param recursive: True to enable recursive lookups, otherwise only
                          use base CMU dictionary
        :return: Phonemes of both parts joined by a space, or None
        """
        # Word must be fully alphabetic
        if not word.isalpha() or len(word) < 3:
            return None
        self.stat_hits['compound_l2'] += 1  # Register hit

        # Define lookup mode
        def _lu(search_word: str) -> str | None:
            if recursive:
                return self._lookup(search_word, ph_format='sds')
            # NOTE(review): assumes the base dictionary stores values that
            # can be concatenated with a str below -- confirm
            return self._cmu_get(search_word)

        # A doubled final letter is likely silent, so drop one of them
        # i.e. 'Derakk' -> 'Derak'
        # NOTE(review): this applies to any doubled letter, vowels included
        if word[-1] == word[-2]:
            word = word[:-1]

        best = None
        best_score = 0
        # Splits the word into every possible combination
        for i in range(1, len(word)):
            head, tail = word[:i], word[i:]
            ph_head = _lu(head)
            if ph_head is None:
                continue  # Skip if not found
            ph_tail = _lu(tail)
            if ph_tail is None:
                continue  # Skip if not found
            # Score by the shorter part; '>=' keeps the last of equal scores
            score = min(len(head), len(tail))
            if score >= best_score:
                best_score = score
                best = (ph_head, ph_tail)

        if best is None:
            return None
        self.stat_resolves['compound_l2'] += 1  # Register resolve
        return best[0] + ' ' + best[1]