Spaces:

MothersTongue
/

voice-matcher-api

Running on T4

File size: 686 Bytes

fe79a8f

import unicodedata
import re


def clean_transcription(text):
    """Return *text* reduced to unaccented word characters and whitespace.

    The string is decomposed with NFKD so that accented letters split into
    a base character plus combining marks; the marks are then discarded.
    Every occurrence of the ``ʻ`` character is removed as well, after which
    anything that is neither a word character (``\\w``, which includes the
    underscore) nor whitespace is dropped.  Surrounding whitespace is
    trimmed from the result.

    Args:
        text: The transcription string to clean.

    Returns:
        The cleaned string.
    """
    decomposed = unicodedata.normalize('NFKD', text)

    # One pass over the decomposed characters: skip combining marks
    # (diacritics) and the ʻ character.  ʻ has to be filtered explicitly
    # because it counts as a word character and would survive the regex.
    kept_chars = (
        ch
        for ch in decomposed
        if unicodedata.combining(ch) == 0 and ch != 'ʻ'
    )
    without_marks = ''.join(kept_chars)

    # Drop whatever punctuation or symbols are left, then trim the edges.
    return re.sub(r'[^\w\s]', '', without_marks).strip()