# kurd-spell-app / ckb_helpers.py
# ariahmed's picture
# Upload folder using huggingface_hub
# e489264 verified
import re
from klpt.preprocess import Preprocess
from klpt.tokenize import Tokenize
import unicodedata
# Shared KLPT preprocessor, constructed with the "Sorani" dialect, the "Arabic"
# script and numeral="Arabic". process_text() calls its preprocess() method.
preprocessor_ckb = Preprocess("Sorani", "Arabic", numeral="Arabic")
# KLPT tokenizer constructed with the same dialect and script arguments.
tokenizer_ckb = Tokenize("Sorani", "Arabic")
# Digit unification table used by number_to_word().
# Each key is a regex alternation of two digit glyph variants; each value is the
# Latin digit that re.sub() substitutes for either variant.
unify_numbers = {
    "٠|۰": "0",
    "١|۱": "1",
    "٢|۲": "2",
    "٣|۳": "3",
    "٤|۴": "4",
    "٥|۵": "5",
    "٦|۶": "6",
    "٧|۷": "7",
    "٨|۸": "8",
    "٩|۹": "9"
}
# Taken from AsoSoft library
def number_to_word(text):
    """Spell out the numbers found in ``text`` as Sorani words.

    The rewrite happens in a fixed order: digit glyphs are unified to Latin
    digits, thousands separators are dropped, the minus sign, percent sign and
    the $, £ and € currency symbols are turned into words, and finally
    decimal numbers and the remaining integers are replaced by their names.

    Args:
        text (str): Input text.

    Returns:
        str: The text with numbers replaced by words.
    """
    # Unify the digit glyphs first so every later pattern only deals with [0-9].
    for digit_pattern in unify_numbers:
        text = re.sub(digit_pattern, unify_numbers[digit_pattern], text)

    # Ordered (pattern, replacement) rules; each rule sees the previous rule's output.
    # 1. thousands separator: 12,345,678 => 12345678
    # 2. negative sign before a number
    # 3. Arabic percent sign => Latin percent sign
    # 4. percent sign written before the number
    # 5. percent sign written after the number
    # 6-8. $, £ and € currency symbols written before the amount
    rewrite_rules = (
        (r"([0-9]{1,3})[,،](?=[0-9]{3})", r"\1"),
        (r"(?<![0-9])-([0-9]+)", r"ناقس \1"),
        ("٪", "%"),
        (r"(?<![0-9])% ?([0-9]+)", r"لە سەددا \1"),
        (r"([0-9]+) ?%", r"\1 لە سەد"),
        (r"\$ ?([0-9]+(\.[0-9]+)?)", r"\1 دۆلار"),
        (r"£ ?([0-9]+(\.[0-9]+)?)", r"\1 پاوەن"),
        (r"€ ?([0-9]+(\.[0-9]+)?)", r"\1 یۆرۆ"),
    )
    for pattern, replacement in rewrite_rules:
        text = re.sub(pattern, replacement, text)

    def _spell_float(found):
        return float_name(found.group(1), found.group(2))

    def _spell_integer(found):
        return integer_name(found.group(1))

    # Decimal numbers must be handled before the plain integers they contain.
    text = re.sub(r"([0-9]+)\.([0-9]+)", _spell_float, text)
    text = re.sub(r"([0-9]+)", _spell_integer, text)
    return text
def float_name(integerPart, decimalPart):
    """Return the Sorani reading of a decimal number.

    The integer part is named with ``integer_name``. The fraction is read as
    the word for "point", one "zero" word per leading zero of
    ``decimalPart``, and then the name of the remaining digits.

    An all-zero fraction (e.g. "1.0") is read with exactly one "zero" per
    digit. Previously the zeros were spelled by the prefix and then once more
    by ``integer_name("0")``, which yielded "point zero zero" for "1.0".

    Args:
        integerPart (str): Digits before the decimal point.
        decimalPart (str): Digits after the decimal point.

    Returns:
        str: The spelled-out number.
    """
    point = " پۆینت "
    # Each leading zero of the fraction is pronounced individually.
    leading_zeros = len(decimalPart) - len(decimalPart.lstrip("0"))
    point += " سفر " * leading_zeros
    significant = decimalPart[leading_zeros:]
    # Nothing is left to name when the fraction consisted of zeros only.
    fraction_words = integer_name(significant) if significant else ""
    return integer_name(integerPart) + point + fraction_words
# Word tables used by integer_name(); each list is indexed by a digit value
# (ones/teens/tens/hundreds) or by the three-digit group position (thousands).
ones = ["", "یەک", "دوو", "سێ", "چوار", "پێنج", "شەش", "حەوت", "هەشت", "نۆ"]
teens = [ "دە", "یازدە", "دوازدە", "سێزدە", "چواردە", "پازدە", "شازدە", "حەڤدە", "هەژدە", "نۆزدە" ]
tens = [ "", "", "بیست", "سی", "چل", "پەنجا", "شەست", "هەفتا", "هەشتا", "نەوەد"]
hundreds = ["", "سەد", "دووسەد", "سێسەد", "چوارسەد", "پێنسەد", "شەشسەد", "حەوتسەد", "هەشتسەد", "نۆسەد"]
thousands = ["", " هەزار", " ملیۆن", " ملیار", " ترلیۆن", " کوادرلیۆن", " کوینتلیۆن"]


def integer_name(inputInteger):
    """Return the Sorani name of a non-negative integer given as a digit string.

    The string is consumed from the right in groups of three digits; each
    non-zero group is named and suffixed with its scale word from
    ``thousands``, and the groups are joined with the conjunction.

    Exactly one thousand is read as the bare scale word (without the word for
    "one"). This is applied only when the thousands group equals 1; the former
    blanket string replacement also stripped "one" from groups such as 21 or
    101, turning 21000 into "twenty and thousand".

    Args:
        inputInteger (str): Digit string. Leading zeros are ignored; a string
            of several zeros yields "" (only the exact string "0" yields the
            word for zero).

    Returns:
        str: The spelled-out number.

    Raises:
        IndexError: If the string has more than 21 digits, because
            ``thousands`` has no scale word for the eighth group.
    """
    if inputInteger == "0":
        return "سفر"

    output = ""
    remaining = inputInteger
    for offset in range(0, len(inputInteger), 3):
        # Take the last (up to) three digits that are still unprocessed.
        tail = re.search(r"[0-9]{1,3}$", remaining)
        group = tail.group(0) if tail else ""
        remaining = remaining[:len(remaining) - len(group)]
        C, X, I = (int(digit) for digit in group.rjust(3, "0"))
        scale = offset // 3

        if scale == 1 and (C, X, I) == (0, 0, 1):
            # "one thousand" is read as just "thousand".
            words = thousands[1].strip()
        else:
            after_hundreds = " و " if C != 0 and (X != 0 or I != 0) else ""
            if X == 1:
                words = hundreds[C] + after_hundreds + teens[I]
            else:
                after_tens = " و " if X != 0 and I != 0 else ""
                words = hundreds[C] + after_hundreds + tens[X] + after_tens + ones[I]
            if words:
                words += thousands[scale]

        # All-zero groups contribute nothing, not even a conjunction.
        if words:
            output = words + (" و " + output if output else "")
    return output
def replace_words_in_corpus(sentence):
    """Replace known misspelled words in ``sentence`` with their corrections.

    The sentence is split on whitespace, every word that is a key of
    ``word_replacements`` is swapped for its value, and the words are joined
    back with single spaces.

    Args:
        sentence (str): Input sentence.

    Returns:
        str: The sentence with replacements applied.
    """
    return " ".join(
        word_replacements[token] if token in word_replacements else token
        for token in sentence.split()
    )
# Whole-word corrections applied by replace_words_in_corpus():
# key = word as it appears in the corpus, value = replacement word.
# TODO: move this table to a JSON file.
word_replacements = {
    "ههڵاڵەەي": "هەڵاڵەی",
    "وهەمهەمه": "وهەمهەمه",
    "ئهباتههوه": "ئەباتەوە",
    "بەخءرایی": "بەخێرایی",
    "ئیثانۆڵ": "ئیسانۆڵ",
    "عەبدوڵڵاهـ": "عەبدوڵڵا",
    "کولاهـ": "کولاه",
    "ئاھ": "ئاه",
}
# Character-level substitutions applied by apply_char_replacements():
# key = character to find, value = replacement ('' deletes the character).
char_replacements = {
    # U+200E LEFT-TO-RIGHT MARK
    '\u200e': '',
    # U+200F RIGHT-TO-LEFT MARK
    '\u200f': '',
    # U+200C ZERO WIDTH NON-JOINER
    '\u200c': '',
    'õ': '',
    # HEH DOACHASHMEE is unified to HEH
    'ھ': 'ه'
}
def apply_char_replacements(text: str):
    """Apply every substitution from ``char_replacements`` to ``text``.

    The substitutions are applied one after another, in the table's
    iteration order.

    Args:
        text (str): Input text.

    Returns:
        str: The text after all substitutions.
    """
    cleaned = text
    for target in char_replacements:
        cleaned = cleaned.replace(target, char_replacements[target])
    return cleaned
def remove_arabic_alphabets(text: str):
    """
    Delete every Arabic-alphabet character (letters and diacritics) from ``text``.

    Only the characters listed below are deleted; digits and all other
    characters are left untouched. The list is the full Arabic alphabet, so
    letters that Sorani shares with Arabic are deleted as well.

    Args:
        text (str): Sorani text

    Returns:
        str: ``str`` object with arabic alphabets removed
    """
    characters = "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْٰٱ"
    # The third argument of maketrans lists the characters to delete.
    deletion_table = str.maketrans("", "", characters)
    return text.translate(deletion_table)
def filtered_arabic_characters():
    """Return the Arabic-alphabet characters that are not Kurdish letters.

    Returns:
        str: The characters joined into one string. The result comes from a
        set, so the order of the characters is unspecified.
    """
    arabic_only = set("ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْٰٱ") - set("ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ")
    return ''.join(arabic_only)
def is_arabic_string(text):
    """Returns True if the text contains any Arabic-only character, False otherwise.

    "Arabic-only" means a character returned by ``filtered_arabic_characters``.
    """
    arabic_only = filtered_arabic_characters()
    return any(symbol in arabic_only for symbol in text)
def contains_arabic(text):
    """Return True if ``text`` has at least one character from ``filtered_arabic_characters``."""
    arabic_only = filtered_arabic_characters()
    for symbol in text:
        if symbol in arabic_only:
            return True
    return False
def is_english_string(text):
    """Returns True if the text contains at least one ASCII letter (a-z, A-Z), False otherwise."""
    return re.search(r'[a-zA-Z]', text) is not None
def remove_english_alphabets(text: str):
    """
    Delete every ASCII letter (A-Z, a-z) from ``text``.

    Digits, spaces and all other characters are kept.
    """
    return re.sub(r"[A-Za-z]", "", text)
def resolve_ae(text):
    """
    This function takes a text input in Central Kurdish (Sorani) script and performs a series of character replacements
    to standardize variations in the script. Specifically, it addresses cases where the character 'ە' (Arabic letter
    AE) may be used in different contexts.

    Every 'ه' (HEH) is first turned into 'ە' (AE); the rules that follow then turn selected AEs back into HEH
    depending on the neighbouring characters. The rules are applied in sequence and each one operates on the
    output of the previous one, so their order matters.

    Args:
        text (str): Sorani text.

    Returns:
        str: The text after all replacements.
    """
    # First replace all occurrences of 'ه' with 'ە'
    text = re.sub("ه", "ە", text)
    # Replace specific combinations with 'ها', 'هێ', and 'هۆ'
    text = re.sub("ەا", "ها", text) # Replace ەا with ها
    text = re.sub("ەێ", "هێ", text) # Replace ەێ with هێ
    text = re.sub("ەۆ", "هۆ", text) # Replace ەۆ with هۆ
    # Replace ە (AE) at the beginning of a word with ه (HEH)
    text = re.sub(r"\b(ە\w*)", lambda match: "ه" + match.group(1)[1:], text)
    # Replace ALEF+AE with ALEF+HEH
    text = re.sub("اە", "اه", text)
    # Special words should go here before the replacement of 'ە' at the end of the word
    # Special case: گەهـ or گاهـ but without the tatweel since tatweel is not a phoneme in Kurdish and it will be a class for the model
    text = re.sub(r'\bگەە[-ـ]?\b', "گەه", text)
    # Replace 'ەە' at the beginning and end of a word with 'هە'
    text = re.sub(r"\bەە|ەە\b", "هە", text)
    # Special case if two AEs come before ۆ it should be replaced with AE+HEH
    text = re.sub(r"ەە(?=ۆ)", "ەه", text)
    # Special case if two AEs come after a word-initial ب, و, ئ, ڕ, ق, ز or ژ they should be replaced with AE+HEH
    text = re.sub(r"(?<=\b[بوئڕقزژ])ەە", "ەه", text)
    # The following special case should happen after the previous special case and before the following special case
    # Special case when two words are together with waw and the AEs after the waw become HEH+AE
    text = re.sub(r'(?<=و)ەە(?=\w)', "هە", text)
    # Replace Three AEs with AE+HEH+AE (This has to be run before the following special case so words like لەهەوادا will not be ruined)
    text = re.sub(r"(?<=\w)ەەە(?=\w)", "ەهە", text)
    # Special case if two AEs are in the middle of a word and come before YEH ی or TCHEH چ or و they will be replaced with AE+HEH if the YEH or TCHEH are not at the END of the word
    text = re.sub(r"(?<=\w)ەە(?=[چیو]\B)", "ەه", text)
    # Replace 'ەە'AE+AE in the middle of a word with HEH+AE
    text = re.sub(r"(?<=\w)ەە(?=\w)", "هە", text)
    # Replace two AE with spaces in between with AE HEH
    text = re.sub("ە ە", "ە ه", text)
    # Replace all HEH DOACHASHMEE with HEH
    # text = text.replace('ھ', 'ە')
    return text
# Matches one punctuation mark (. , ; : ' ? ! /) that has no digit directly
# before it and no digit directly after it.
# NOTE: both names in this block are bound again further down in this file.
clean_punctuation = re.compile(r"(?<!\d)[.,;:'?!\/](?!\d)")


def remove_punctuation(text):
    """Remove punctuation from string, keeping any mark that is adjacent to a digit."""
    stripped = re.sub(clean_punctuation, "", text)
    return stripped
def extract_punctuation(text):
    """Return the set of distinct punctuation characters that occur in ``text``.

    A character counts as punctuation when its Unicode general category
    starts with 'P'.
    """
    return {
        symbol
        for symbol in text
        if unicodedata.category(symbol).startswith('P')
    }
# Punctuation marks written in Arabic script.
ARABIC_PUCTUATIONS = "،؛۔٫٪؟"
# Punctuation accepted in Central Kurdish text: Latin marks, guillemets and the
# Arabic-script marks above (some marks appear twice, which is harmless once the
# string is turned into a set).
CKB_PUNCTUATIONS = "!.:;?،؛؟«»" + ARABIC_PUCTUATIONS
# Every character treated as Kurdish: the punctuation above, the Sorani letters,
# the listed digit glyphs and the space character.
KURDISH_CHARS = set(f"{CKB_PUNCTUATIONS}ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ")
def contains_non_kurdish_characters(text):
    """Return True if ``text`` has at least one character outside ``KURDISH_CHARS``.

    Uses the module-level ``KURDISH_CHARS`` set instead of rebuilding an
    identical set from the same literal on every call.

    Args:
        text (str): Text to inspect.

    Returns:
        bool: True when any character is not in ``KURDISH_CHARS``; False for
        an empty string.
    """
    return any(char not in KURDISH_CHARS for char in text)
def keep_kurdish_characters(text):
    """Return ``text`` with every character outside ``KURDISH_CHARS`` dropped.

    Uses the module-level ``KURDISH_CHARS`` set instead of rebuilding an
    identical set from the same literal on every call.

    Args:
        text (str): Text to filter.

    Returns:
        str: The characters of ``text`` that are in ``KURDISH_CHARS``, in
        their original order.
    """
    return ''.join(char for char in text if char in KURDISH_CHARS)
def remove_emojis(text):
    """Delete every run of characters that falls in the emoji/symbol ranges below."""
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF",  # Transport & Map Symbols
        "\U0001F700-\U0001F77F",  # Alchemical Symbols
        "\U0001F780-\U0001F7FF",  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF",  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F",  # Chess Symbols
        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0",  # Dingbats
    )
    # All ranges go into one character class, matched greedily.
    character_class = "[" + "".join(emoji_ranges) + "]+"
    return re.sub(character_class, r'', text, flags=re.UNICODE)
def remove_language_families(text):
    """Delete every character that belongs to one of the listed Unicode ranges.

    The Tamil, Han, Georgian and Kannada ranges are now real character
    classes. They used to be written without ``[...]``, so each of them only
    matched the literal three-character sequence "<first>-<last>" and those
    scripts were never removed.

    NOTE(review): the range U+0000-U+024F also covers ASCII spaces, digits,
    punctuation and newlines, so those are deleted too. This is unchanged
    from the previous behaviour; confirm that it is intended.

    Args:
        text (str): Input text.

    Returns:
        str: The text without the characters from the listed ranges.
    """
    ranges = (
        r"\u1100-\u11FF\u2E80-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF",  # Asian scripts (includes Han)
        r"\u0000-\u024F",  # Basic Latin through Latin Extended-B
        r"\u0400-\u04FF",  # Cyrillic
        r"\u0370-\u03FF",  # Greek
        r"\u0900-\u097F",  # Devanagari
        r"\u0B80-\u0BFF",  # Tamil
        r"\u10A0-\u10FF",  # Georgian
        r"\u0C80-\u0CFF",  # Kannada
    )
    # One character class for all ranges; removing a run at once is
    # equivalent to removing each range separately.
    combined_pattern = re.compile("[" + "".join(ranges) + "]+")
    return combined_pattern.sub('', text)
# Matches one Latin or Arabic-script punctuation mark that has no digit directly
# before it and no digit directly after it.
clean_punctuation = re.compile(r"(?<!\d)[.,;:'?!،.؟؛:](?!\d)")


def remove_punctuation(text):
    """Remove punctuation from string, keeping any mark that is adjacent to a digit."""
    stripped = re.sub(clean_punctuation, "", text)
    return stripped
def contains_repeated_ngram(window, n):
    """Return True if any word n-gram occurs more than once in ``window``.

    Args:
        window (str): Whitespace-separated text.
        n (int): Number of words per n-gram.
    """
    all_ngrams = generate_ngrams(window, n)
    distinct_count = len(set(all_ngrams))
    return distinct_count != len(all_ngrams)
def generate_ngrams(text, n):
    """Return the list of word n-grams of ``text``, each as a tuple of words.

    Args:
        text (str): Whitespace-separated text.
        n (int): Number of words per n-gram.

    Returns:
        list[tuple]: The n-grams in order of appearance; empty when the text
        has fewer than ``n`` words.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    return [tuple(tokens[start:start + n]) for start in range(window_count)]
def remove_repeated_ngram(text, n):
    """Collapse immediately repeated word n-grams in ``text``.

    When a run of ``n`` words is directly followed by the same ``n`` words,
    the repetition is dropped (for example ``"a b a b c"`` with ``n=2``
    becomes ``"a b c"``). All other words are kept.

    The previous implementation tested each n-word window for a repeated
    n-gram; such a window contains exactly one n-gram, so nothing was ever
    detected and the function only cut off the last ``n - 1`` words.

    NOTE(review): "repeated" is interpreted here as "immediately repeated";
    confirm that this matches the intended use.

    Args:
        text (str): Whitespace-separated text.
        n (int): Number of words per n-gram. Values below 1 leave the words
            unchanged.

    Returns:
        str: The remaining words joined with single spaces.
    """
    words = text.split()
    if n < 1:
        return " ".join(words)

    output = []
    index = 0
    while index < len(words):
        candidate = words[index:index + n]
        # A full n-gram equal to the n words just emitted is a repetition.
        if len(candidate) == n and output[-n:] == candidate:
            index += n
        else:
            output.append(words[index])
            index += 1
    return " ".join(output)
def normalize_punctuations(text: str) -> str:
    """Normalize punctuation and the spacing around it.

    Steps, in order: Latin ``,`` and ``?`` become ``،`` and ``؟``; runs of
    the same mark are collapsed; ``((`` / ``))`` become guillemets; spacing
    around guillemets, commas and question marks is normalized; duplicates
    of the same mark, adjacent or separated by whitespace, are reduced to
    one; the result is stripped.

    The whitespace-separated duplicate rule now removes the whole run
    (``"? ? ?"`` -> ``"?"``). It used to match a single separated repeat
    only, leaving ``"? ?"`` behind.

    Args:
        text (str): Input text.

    Returns:
        str: The normalized text.
    """
    # Replace , with ،
    text = text.replace(',', '،')
    # Replace ? with ؟
    text = text.replace('?', '؟')
    # Replace two or three of the same punctuation marks with a single one
    text = re.sub(r'([.,;:?!،؛؟])\1{1,2}', r'\1', text)
    # Replace double opening and closing parentheses with guillemets
    text = re.sub(r'\(\(', '«', text)
    text = re.sub(r'\)\)', '»', text)
    # Normalize space around the guillemets: one space outside, none inside
    text = re.sub(r'\s*«\s*', ' «', text)
    text = re.sub(r'\s*»\s*', '» ', text)
    # No space before a comma or question mark, exactly one after it
    text = re.sub(r'\s*([,،؟])\s*', r'\1 ', text)
    # Ensure there is no space before a guillemet at the beginning of the text or after a
    # guillemet at the end of the text
    text = re.sub(r'^\s*«', '«', text)
    text = re.sub(r'»\s*$', '»', text)
    # If the same mark is repeated back to back, only keep the first one
    text = re.sub(r'([.!?؟،؛])\1+', r'\1', text)
    # If the same mark is repeated with whitespace in between (like: ? ? ?),
    # keep the first one and remove all the others
    text = re.sub(r'([.!?؟،؛])(?:\s+\1)+', r'\1', text)
    # Trim leading and trailing spaces and return the normalized text
    return text.strip()
def fix_sentence(sentence):
    """Strip surrounding double quotes and make sure the sentence ends in punctuation.

    Changes from the previous version:
    * An empty sentence (or one that is empty once the quotes are removed)
      is returned as is instead of raising ``IndexError``.
    * The quotes are only stripped when the sentence has at least two
      characters, so a lone ``"`` is not treated as a quoted sentence.
    * The Arabic question mark ``؟`` counts as terminal punctuation.
      ``process_text`` runs ``normalize_punctuations`` first, which turns
      ``?`` into ``؟``, so questions used to get a stray full stop appended.

    Args:
        sentence (str): Input sentence.

    Returns:
        str: The sentence, ending in one of ``.``, ``?``, ``!`` or ``؟``
        unless it is empty.
    """
    if len(sentence) >= 2 and sentence.startswith('"') and sentence.endswith('"'):
        # The surrounding quotation marks do not affect the sentence, so drop them
        sentence = sentence[1:-1]
    if not sentence:
        return sentence
    if not sentence.endswith((".", "?", "!", "؟")):
        # append a full-stop to sentences that do not end in punctuation
        sentence = sentence + "."
    return sentence
def add_period_abbreviations(text):
    """Add periods after the single-letter abbreviations "پ" and "د".

    Both patterns are now anchored so that they only match standalone
    tokens. Previously any word that merely ended in one of the letters and
    was followed by another word got a period appended, and the "د"/"خ" pair
    rule also fired inside ordinary words.

    Args:
        text (str): Input text.

    Returns:
        str: The text with periods added after the abbreviations.
    """
    abbreviations = "پد"  # Add more abbreviations as needed
    # A standalone abbreviation letter (no word character directly before it)
    # followed by a space and then a word character
    pattern = re.compile(r'(?<!\w)([{}]) (?=\w)'.format(abbreviations))
    # Add a period after the abbreviation, keeping the space after the period
    text = pattern.sub(r'\1. ', text)
    # Add periods after each letter if "د" and "خ" appear together as a standalone token
    text = re.sub(r'(?<!\w)(?:د\sخ|خ ?د|د\.?خ|خ\.?د)(?!\w)', 'د. خ.', text)
    # Abbreviated dates
    # text = re.sub(r'\b(پ\. ز)\b', r'\1.', text)
    return text
def process_text(text):
    """Run the cleaning pipeline on ``text`` and return the result.

    The steps run in this order: ``resolve_ae``, the KLPT preprocessor,
    ``remove_emojis``, ``normalize_punctuations``, ``fix_sentence`` and
    ``apply_char_replacements``.
    """
    # Disabled steps, kept for reference:
    #   replace_words_in_corpus (before resolve_ae)
    #   number_to_word (after resolve_ae)
    #   normalizer(text).strip() (after the KLPT preprocessor)
    pipeline = (
        resolve_ae,
        preprocessor_ckb.preprocess,
        remove_emojis,
        normalize_punctuations,
        fix_sentence,
        apply_char_replacements,
    )
    for step in pipeline:
        text = step(text)
    return text
# Manual smoke test: runs process_text() on a sample string and prints the result.
# The commented-out snippets below are earlier experiments kept for reference.
if __name__ == "__main__":
    # --- full pipeline on a sentence with numbers and special characters ---
    # text = "لە ساڵی 1999دا بڕی 40% لە پارەکەیان واتە $102.1 یان وەرگرت. 'õ'\u200c\u200f\u200e'ھ'"
    # print(process_text(text))
    # print(contains_non_kurdish_characters(text))
    # --- punctuation normalization with an expected result ---
    # text = "دەقی«کوردی » و ڕێنووس ،((خاڵبەندی )) چۆنە ؟"
    # correct = "دەقی «کوردی» و ڕێنووس، «خاڵبەندی» چۆنە؟"
    # print("Before punctuation normalization:", text)
    # print("After punctuation normalization:", normalize_punctuations(text))
    # print("Correct:\t\t\t", correct)
    # print(normalize_punctuations(text) == correct)
    # print(normalize_punctuations("ڕەوا بورهان 4 تەمموز ، کوردستانی سلێمانی?!!"))
    # print(normalize_punctuations("یانەی کوردژین تکایە چۆن بە شی سە ڕە کی و لاوە کی بۆ مالپە ڕە کە م زیاد بکە م؟؟ ؟ ؟ لە سکرێپە یتی ژومیلە"))
    # --- normalizing a whole data file ---
    # with open('data/data.ckb.txt', 'r', encoding='utf-8') as src_file:
    #     source_data = src_file.read()
    #     unified_data = normalize_punctuations(source_data)
    # # Save the unified data to a new file
    # with open('data/unified_data.txt', 'w', encoding='utf-8') as file:
    #     file.writelines(unified_data)
    # print("Unified data saved to unified_data.txt")
    text = "Hello ((Friend)) Hello , Friend World"
    # print(remove_repeated_ngram(text, 2))
    # NOTE: remove_repeated_ngrams (plural) is not defined in the visible part of this file.
    # print(remove_repeated_ngrams(text, ))
    print(process_text(text))