"""Text-to-phoneme processing with duration and stress estimation for singing."""

import re

import nltk
from nltk.tokenize import word_tokenize
import phonemizer  # noqa: F401 -- retained even though EspeakBackend is imported directly
from phonemizer.backend import EspeakBackend
import numpy as np


class TextProcessor:
    """Convert text into phonemes annotated with durations and stress markers.

    The heuristics are deliberately simple: vowels are lengthened, sonorant
    consonants get a medium duration, and the first vowel of each content
    word is marked for stress/emphasis when singing.
    """

    # Punctuation that introduces a pause, and the subset that ends a phrase.
    _PAUSE_TOKENS = {'.', ',', '!', '?', ';', ':'}
    _PHRASE_END_TOKENS = {'.', '!', '?'}

    # Rough vowel / sonorant tests applied to individual phoneme symbols.
    # NOTE(review): espeak emits IPA-like symbols, so matching plain ASCII
    # letters is an approximation -- confirm against the backend's alphabet.
    _VOWEL_RE = re.compile(r'[aeiou]')
    _SONORANT_RE = re.compile(r'[lrmnw]')

    def __init__(self):
        # espeak backend configured for US English.
        self.backend = EspeakBackend('en-us')

    def process(self, text):
        """Process text into phonemes with duration and stress markers for singing.

        Args:
            text (str): Input text to be processed.

        Returns:
            tuple: ``(phonemes, durations, stress_markers)`` where
            ``phonemes`` is a space-separated phoneme string, ``durations``
            is a list of per-phoneme durations in seconds (plus trailing
            pause entries contributed by punctuation tokens), and
            ``stress_markers`` is a numpy array with 1.0 at phonemes that
            should be emphasized.
        """
        text = self._clean_text(text)
        tokens = word_tokenize(text)
        phonemes = self._text_to_phonemes(text)
        durations = self._estimate_durations(tokens, phonemes)
        stress_markers = self._mark_stress(tokens, phonemes)
        return phonemes, durations, stress_markers

    def _clean_text(self, text):
        """Lowercase, collapse whitespace, and drop characters that are
        neither alphanumeric nor phrasing punctuation.
        """
        text = text.lower()
        text = re.sub(r'\s+', ' ', text).strip()
        # Keep punctuation that matters for phrasing (pauses, contractions).
        text = re.sub(r'[^a-z0-9\s.,!?\'"-]', '', text)
        return text

    def _text_to_phonemes(self, text):
        """Convert text to a whitespace-normalized phoneme string."""
        phonemes = self.backend.phonemize([text], strip=True)[0]
        return re.sub(r'\s+', ' ', phonemes).strip()

    def _estimate_durations(self, tokens, phonemes):
        """Assign a duration in seconds to each phoneme.

        Vowels get 2x the base duration, sonorant consonants 1.5x, and all
        other phonemes the base duration. Punctuation tokens contribute
        extra pause entries (phrase-final marks pause longest).

        NOTE(review): pause entries are appended at the END of the list
        rather than inserted at the punctuation position, so
        ``len(durations)`` can exceed the number of phonemes -- confirm
        that downstream consumers expect this alignment.
        """
        base_duration = 0.1  # seconds for a "short" phoneme

        durations = []
        for p in phonemes.split():
            if self._VOWEL_RE.search(p):
                durations.append(base_duration * 2)    # long: vowels
            elif self._SONORANT_RE.search(p):
                durations.append(base_duration * 1.5)  # medium: sonorants
            else:
                durations.append(base_duration)        # short: everything else

        # Pauses for punctuation (';' and ':' are defensive: _clean_text
        # strips them, but tokens may come from elsewhere).
        for token in tokens:
            if token in self._PAUSE_TOKENS:
                factor = 3 if token in self._PHRASE_END_TOKENS else 1.5
                durations.append(base_duration * factor)
        return durations

    def _mark_stress(self, tokens, phonemes):
        """Return a numpy array flagging which phonemes to stress.

        Heuristic: mark the first vowel phoneme of every content word
        (nouns, verbs, adjectives, adverbs longer than two characters).
        Word-to-phoneme alignment is approximated by each word's character
        length; a real forced aligner would be needed for accuracy.
        """
        phoneme_list = phonemes.split()
        stress_markers = np.zeros(len(phoneme_list))

        # POS-tag the tokens to identify content words.
        tagged = nltk.pos_tag(tokens)
        content_word_indices = {
            i for i, (word, tag) in enumerate(tagged)
            if tag.startswith(('N', 'V', 'J', 'R')) and len(word) > 2
        }

        phoneme_idx = 0
        for i, word in enumerate(tokens):
            if i in content_word_indices:
                # Mark the first vowel inside this word's approximate span.
                for j in range(len(word)):
                    if phoneme_idx + j >= len(phoneme_list):
                        break
                    if self._VOWEL_RE.search(phoneme_list[phoneme_idx + j]):
                        stress_markers[phoneme_idx + j] = 1
                        break
            phoneme_idx += len(word)  # approximate phoneme position
        return stress_markers