Spaces:
Running
on
T4
Running
on
T4
import unicodedata | |
import re | |
def clean_transcription(text):
    """Reduce a transcription to unaccented word characters and whitespace.

    The text is decomposed with NFKD so that accented letters split into a
    base character followed by combining marks. The combining marks and
    every occurrence of the ``ʻ`` character are then discarded, anything
    that is neither a word character nor whitespace is removed, and the
    result is trimmed of leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; whitespace inside the text is left as-is.
    """
    kept = []
    for ch in unicodedata.normalize('NFKD', text):
        # A non-zero combining class marks a diacritic split off by NFKD.
        if unicodedata.combining(ch):
            continue
        # This character counts as a word character for the regex below,
        # so it has to be dropped explicitly.
        if ch == 'ʻ':
            continue
        kept.append(ch)

    # Strip whatever punctuation/symbols are left (underscore is a word
    # character and therefore survives).
    without_symbols = re.sub(r'[^\w\s]', '', ''.join(kept))
    return without_symbols.strip()