import re
import unicodedata

# Matches any character that is neither a word character (letters, digits,
# underscore) nor whitespace. Compiled once so repeated calls reuse it.
_NON_WORD_OR_SPACE_RE = re.compile(r'[^\w\s]')


def clean_transcription(text: str) -> str:
    """Return *text* with diacritics, the ʻ mark and punctuation removed.

    The cleaning steps run in this order:

    1. Normalize to NFKD so accented letters split into a base letter
       followed by combining marks (compatibility forms are expanded too).
    2. Drop every combining mark, leaving only the base characters.
    3. Remove the ``ʻ`` character.
    4. Remove every remaining character that is not a word character or
       whitespace. Underscores count as word characters and are kept.
    5. Strip leading and trailing whitespace. Interior whitespace is left
       untouched.

    Args:
        text: The transcription to clean.

    Returns:
        The cleaned string; empty if nothing survives the cleaning.

    >>> clean_transcription(' café, naïve! ')
    'cafe naive'
    """
    decomposed = unicodedata.normalize('NFKD', text)

    # unicodedata.combining() is non-zero for combining marks, i.e. the
    # accent pieces that NFKD separated from their base letters.
    without_marks = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )

    # Removed explicitly rather than relying on the punctuation pass below:
    # this mark is a letter-like character, so `\w` would let it through.
    without_okina = without_marks.replace('ʻ', '')

    return _NON_WORD_OR_SPACE_RE.sub('', without_okina).strip()