File size: 686 Bytes
fe79a8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import unicodedata
import re


def clean_transcription(text):
    """Reduce a transcription to unaccented word characters and whitespace.

    Processing order:

    1. Decompose ``text`` with NFKD so accents become separate combining
       marks (compatibility forms such as ligatures are expanded too).
    2. Discard every combining mark, leaving only the base characters.
    3. Delete every occurrence of the ``ʻ`` character (not only a
       leading one).
    4. Drop anything that is neither a word character nor whitespace.
    5. Trim whitespace from both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; interior whitespace is kept as-is.
    """
    decomposed = unicodedata.normalize('NFKD', text)

    # A combining class of 0 marks a base (non-combining) character.
    base_chars = []
    for ch in decomposed:
        if unicodedata.combining(ch) == 0:
            base_chars.append(ch)
    without_marks = ''.join(base_chars)

    # Handled separately from the regex below, which is applied afterwards.
    without_okina = without_marks.replace('ʻ', '')

    words_only = re.sub(r'[^\w\s]', '', without_okina)
    return words_only.strip()