import re
import unicodedata

from klpt.preprocess import Preprocess
from klpt.tokenize import Tokenize

preprocessor_ckb = Preprocess("Sorani", "Arabic", numeral="Arabic")
tokenizer_ckb = Tokenize("Sorani", "Arabic")

# Map Arabic and Eastern Arabic (Persian) digits to Latin digits
unify_numbers = {
    "٠|۰": "0",
    "١|۱": "1",
    "٢|۲": "2",
    "٣|۳": "3",
    "٤|۴": "4",
    "٥|۵": "5",
    "٦|۶": "6",
    "٧|۷": "7",
    "٨|۸": "8",
    "٩|۹": "9"
}


# Taken from the AsoSoft library
def number_to_word(text):
    # convert numbers to Latin digits
    for k, v in unify_numbers.items():
        text = re.sub(k, v, text)
    text = re.sub(r"([0-9]{1,3})[,،](?=[0-9]{3})", r"\1", text)  # remove thousand separator 12,345,678 => 12345678
    # (the remaining AsoSoft conversion rules are truncated in the source)
    return text


def contains_non_kurdish_characters(text):
    # NOTE: the original body was lost in the source; assumed to flag any character
    # outside the Kurdish character set used below
    kurdish_characters = set(f"{CKB_PUNCTUATIONS}ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ")
    non_kurdish_characters = set(text) - kurdish_characters
    return len(non_kurdish_characters) > 0


def keep_kurdish_characters(text):
    kurdish_characters = set(f"{CKB_PUNCTUATIONS}ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ")
    cleaned_text = ''.join(char for char in text if char in kurdish_characters)
    return cleaned_text


def remove_emojis(text):
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # Emoticons
                               "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                               "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                               "\U0001F700-\U0001F77F"  # Alchemical Symbols
                               "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               "\U0001FA00-\U0001FA6F"  # Chess Symbols
                               "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               "\U00002702-\U000027B0"  # Dingbats
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_language_families(text):
    patterns = [
        "[\u1100-\u11FF\u2E80-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF]+",  # Asian scripts
        "[\u0000-\u024F]+",   # Basic Latin and Latin-1 Supplement
        "[\u0400-\u04FF]+",   # Cyrillic
        "[\u0370-\u03FF]+",   # Greek
        "[\u0900-\u097F]+",   # Devanagari
        r"[\u0B80-\u0BFF]+",  # Tamil
        r"[\u4E00-\u9FFF]+",  # Han
        r"[\u10A0-\u10FF]+",  # Georgian
        r"[\u0C80-\u0CFF]+"   # Kannada
    ]
    combined_pattern = re.compile("|".join(patterns))
    cleaned_text = combined_pattern.sub(r'', text)
    return cleaned_text


# clean_punctuation = re.compile(r"(?<...")  # pattern truncated in the source


def normalize_punctuations(text: str) -> str:
    # Replace , with ،
    text = text.replace(',', '،')
    # Replace ? with ؟
    text = text.replace('?', '؟')
    # Replace two or three of the same punctuation marks with a single one
    text = re.sub(r'([.,;:?!،؛؟])\1{1,2}', r'\1', text)
    # Replace double opening and closing parentheses with guillemets
    text = re.sub(r'\(\(', '«', text)
    text = re.sub(r'\)\)', '»', text)
    # Normalize space around the guillemets and other punctuation marks
    text = re.sub(r'\s*«\s*', ' «', text)
    text = re.sub(r'\s*»\s*', '» ', text)
    # Additional punctuation normalization
    text = re.sub(r'\s*([,،؟])\s*', r'\1 ', text)
    # Ensure there is no space before a guillemet at the beginning of the text or after a
    # guillemet at the end of the text
    text = re.sub(r'^\s*«', '«', text)
    text = re.sub(r'»\s*$', '»', text)
    # If multiple punctuation marks come after each other, only keep the first one
    # text = re.sub(r'([.!?؟،؛])\1+', r'\1', text)
    # If consecutive punctuation marks come after each other, only keep the first one
    text = re.sub(r'([.!?؟،؛])\1+', r'\1', text)
    # If punctuation marks come after each other with a space between them, like "? ? ?",
    # keep the first one and remove the rest
    text = re.sub(r'([.!?؟،؛])(\s\1)+', r'\1', text)
    # Trim leading and trailing spaces and return the normalized text
    text = text.strip()
    return text


def fix_sentence(sentence):
    if sentence.startswith('"') and sentence.endswith('"'):
        # we can remove the surrounding quotation marks as they do not affect the sentence
        sentence = sentence[1:-1]
    if sentence and sentence[-1] not in [".", "?", "!", "؟"]:
        # append a full stop to sentences that do not end in punctuation
        sentence = sentence + "."
    # sentence = sentence[:-1].translate(str.maketrans('', '', string.punctuation)) + sentence[-1]
    return sentence


def add_period_abbreviations(text):
    abbreviations = set(["پ", "د"])  # Add more abbreviations as needed
    # Match an abbreviation letter followed by a space and then a word character
    pattern = re.compile(r'([{}]) (?=\w)'.format(''.join(abbreviations)))
    # Add a period after the abbreviation, keeping a space after the period
    text = pattern.sub(r'\1. ', text)
    # Add periods after each letter when "د" and "خ" appear together
    text = re.sub(r'د\sخ|خ ?د|د\.?خ|خ\.?د', 'د. خ.', text)
    # Abbreviated dates
    # text = re.sub(r'\b(پ\. ز)\b', r'\1.', text)
    return text


def process_text(text):
    # text = replace_words_in_corpus(text)
    text = resolve_ae(text)
    # text = number_to_word(text)
    text = preprocessor_ckb.preprocess(text)
    # text = normalizer(text).strip()
    text = remove_emojis(text)
    text = normalize_punctuations(text)
    text = fix_sentence(text)
    text = apply_char_replacements(text)
    return text


if __name__ == "__main__":
    # text = "لە ساڵی 1999دا بڕی 40% لە پارەکەیان واتە $102.1 یان وەرگرت. 'õ'\u200c\u200f\u200e'ھ'"
    # print(process_text(text))
    # print(contains_non_kurdish_characters(text))

    # text = "دەقی«کوردی » و ڕێنووس ،((خاڵبەندی )) چۆنە ؟"
    # correct = "دەقی «کوردی» و ڕێنووس، «خاڵبەندی» چۆنە؟"
    # print("Before punctuation normalization:", text)
    # print("After punctuation normalization:", normalize_punctuations(text))
    # print("Correct:\t\t\t", correct)
    # print(normalize_punctuations(text) == correct)

    # print(normalize_punctuations("ڕەوا بورهان 4 تەمموز ، کوردستانی سلێمانی?!!"))
    # print(normalize_punctuations("یانەی کوردژین تکایە چۆن بە شی سە ڕە کی و لاوە کی بۆ مالپە ڕە کە م زیاد بکە م؟؟ ؟ ؟ لە سکرێپە یتی ژومیلە"))

    # with open('data/data.ckb.txt', 'r', encoding='utf-8') as src_file:
    #     source_data = src_file.read()
    # unified_data = normalize_punctuations(source_data)

    # # Save the unified data to a new file
    # with open('data/unified_data.txt', 'w', encoding='utf-8') as file:
    #     file.writelines(unified_data)
    # print("Unified data saved to unified_data.txt")

    text = "Hello ((Friend)) Hello , Friend World"
    # print(remove_repeated_ngram(text, 2))
    # print(remove_repeated_ngrams(text, ))
    print(process_text(text))