import re
import unicodedata

from klpt.preprocess import Preprocess
from klpt.tokenize import Tokenize

preprocessor_ckb = Preprocess("Sorani", "Arabic", numeral="Arabic")
tokenizer_ckb = Tokenize("Sorani", "Arabic")

unify_numbers = {
    "٠|۰": "0",
    "١|۱": "1",
    "٢|۲": "2",
    "٣|۳": "3",
    "٤|۴": "4",
    "٥|۵": "5",
    "٦|۶": "6",
    "٧|۷": "7",
    "٨|۸": "8",
    "٩|۹": "9"
}

# Taken from the AsoSoft library
def number_to_word(text):
    # Convert Eastern Arabic / Persian digits to Latin digits
    for k, v in unify_numbers.items():
        text = re.sub(k, v, text)
    text = re.sub(r"([0-9]{1,3})[,،](?=[0-9]{3})", r"\1", text)  # remove thousands separator: 12,345,678 => 12345678
    text = re.sub(r"(?<![0-9])-([0-9]+)", r"ناقس \1", text)  # negative numbers
    text = text.replace("٪", "%")  # replace the Arabic percent sign with the Latin one
    text = re.sub(r"(?<![0-9])% ?([0-9]+)", r"لە سەددا \1", text)  # percent sign before the number
    text = re.sub(r"([0-9]+) ?%", r"\1 لە سەد", text)  # percent sign after the number
    text = re.sub(r"\$ ?([0-9]+(\.[0-9]+)?)", r"\1 دۆلار", text)  # $ currency
    text = re.sub(r"£ ?([0-9]+(\.[0-9]+)?)", r"\1 پاوەن", text)  # £ currency
    text = re.sub(r"€ ?([0-9]+(\.[0-9]+)?)", r"\1 یۆرۆ", text)  # € currency
    # convert floating-point numbers
    text = re.sub(r"([0-9]+)\.([0-9]+)", lambda x: float_name(x.group(1), x.group(2)), text)
    # convert the remaining integer numbers
    text = re.sub(r"([0-9]+)", lambda match: integer_name(match.group(1)), text)
    return text

def float_name(integerPart, decimalPart):
    zeros = re.search("^0+", decimalPart)
    point = " پۆینت "
    if zeros:
        point = point + re.sub("0", " سفر ", zeros[0])
    return integer_name(integerPart) + point + integer_name(decimalPart)

ones = ["", "یەک", "دوو", "سێ", "چوار", "پێنج", "شەش", "حەوت", "هەشت", "نۆ"]
teens = ["دە", "یازدە", "دوازدە", "سێزدە", "چواردە", "پازدە", "شازدە", "حەڤدە", "هەژدە", "نۆزدە"]
tens = ["", "", "بیست", "سی", "چل", "پەنجا", "شەست", "هەفتا", "هەشتا", "نەوەد"]
hundreds = ["", "سەد", "دووسەد", "سێسەد", "چوارسەد", "پێنسەد", "شەشسەد", "حەوتسەد", "هەشتسەد", "نۆسەد"]
thousands = ["", " هەزار", " ملیۆن", " ملیار", " ترلیۆن", " کوادرلیۆن", " کوینتلیۆن"]

def integer_name(inputInteger):
    # Spell out a non-negative integer, processing three digits at a time from the right.
    output = ""
    if inputInteger != "0":
        temp = inputInteger
        for i in range(0, len(inputInteger), 3):
            matched_numbers = re.findall(r"[0-9]{1,3}$", temp)
            currentThree = matched_numbers[0] if matched_numbers else ""
            temp = temp[:len(temp) - len(currentThree)]
            currentThree = currentThree.rjust(3, '0')
            C = int(currentThree[0])  # hundreds digit
            X = int(currentThree[1])  # tens digit
            I = int(currentThree[2])  # ones digit
            conjunction1 = " و " if (C != 0) and (X != 0 or I != 0) else ""
            conjunction2 = " و " if X != 0 and I != 0 else ""
            if X == 1:
                currentThree = hundreds[C] + conjunction1 + teens[I]
            else:
                currentThree = hundreds[C] + conjunction1 + tens[X] + conjunction2 + ones[I]
            currentThree += "" if currentThree == "" else thousands[i // 3]
            conjunction3 = "" if output == "" else " و "
            if currentThree != "":
                output = currentThree + conjunction3 + output
        output = output.replace("یەک هەزار", "هەزار")
    else:  # input number is 0
        output = "سفر"
    return output
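
# Illustrative usage (a sketch traced through the rules above; the outputs are
# what the code is expected to produce, not verified against a reference):
# print(number_to_word("12"))     # -> "دوازدە"
# print(number_to_word("1999"))   # -> "هەزار و نۆسەد و نەوەد و نۆ"
# print(number_to_word("40%"))    # -> "چل لە سەد"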

def replace_words_in_corpus(sentence):
    words = sentence.split()
    modified_words = []
    for word in words:
        if word in word_replacements:
            modified_words.append(word_replacements[word])
        else:
            modified_words.append(word)
    return " ".join(modified_words)

# TODO: move these replacements into a JSON file
word_replacements = {
    "ههڵاڵەەي": "هەڵاڵەی",
    "وهەمهەمه": "وهەمهەمه",
    "ئهباتههوه": "ئەباتەوە",
    "بەخءرایی": "بەخێرایی",
    "ئیثانۆڵ": "ئیسانۆڵ",
    "عەبدوڵڵاهـ": "عەبدوڵڵا",
    "کولاهـ": "کولاه",
    "ئاھ": "ئاه",
}
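
# Illustrative usage (sketch; the replacement pair comes from the mapping above):
# print(replace_words_in_corpus("ئیثانۆڵ"))   # -> "ئیسانۆڵ"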

char_replacements = {
    '\u200e': '',  # LEFT-TO-RIGHT MARK
    '\u200f': '',  # RIGHT-TO-LEFT MARK
    '\u200c': '',  # ZERO WIDTH NON-JOINER
    'õ': '',
    'ھ': 'ه'       # HEH DOACHASHMEE -> HEH
}

def apply_char_replacements(text: str):
    for old, new in char_replacements.items():
        text = text.replace(old, new)
    return text
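
# Illustrative usage (sketch): invisible directional marks and the ZWNJ are
# stripped, and HEH DOACHASHMEE is unified with HEH:
# print(apply_char_replacements("کوردی\u200c"))   # -> "کوردی"
# print(apply_char_replacements("ئاھ"))           # -> "ئاه"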

def remove_arabic_alphabets(text: str):
    """
    Removes Arabic letters and diacritics from ``text``.

    Note that several of these letters are shared with the Kurdish alphabet, so this
    also strips them from Kurdish words; see ``filtered_arabic_characters`` for the
    Arabic-only subset.

    Args:
        text (str): Sorani text

    Returns:
        str: ``str`` object with the Arabic characters removed
    """
    characters = "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْٰٱ"
    table = str.maketrans({key: None for key in characters})
    return text.translate(table)

def filtered_arabic_characters():
    kurdish_characters = set("ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ")
    arabic_characters = set("ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْٰٱ")
    # Keep only the Arabic characters that are not part of the Kurdish alphabet
    filtered = arabic_characters - kurdish_characters
    return ''.join(filtered)

def is_arabic_string(text):
    """Returns True if the text contains any Arabic-only characters, False otherwise."""
    arabic_characters = filtered_arabic_characters()
    for ch in text:
        if ch in arabic_characters:
            return True
    return False


def contains_arabic(text):
    arabic_characters = filtered_arabic_characters()
    return any(char in arabic_characters for char in text)
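
# Illustrative usage (sketch): letters such as ث exist only in the Arabic set,
# while the letters of a Kurdish word are filtered out of that set:
# print(is_arabic_string("ئیثانۆڵ"))   # -> True  (contains ث)
# print(is_arabic_string("کوردی"))     # -> False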

def is_english_string(text):
    """Returns True if the text contains any English (Latin) letters, False otherwise."""
    english_pattern = re.compile(r'[a-zA-Z]')
    return bool(english_pattern.search(text))


def remove_english_alphabets(text: str):
    """
    Removes English (Latin) letters from ``text``; digits are left untouched.
    """
    characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    table = str.maketrans({key: None for key in characters})
    return text.translate(table)
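
# Illustrative usage (sketch):
# print(is_english_string("کوردی abc"))         # -> True
# print(remove_english_alphabets("abc کوردی"))  # -> " کوردی"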

def resolve_ae(text):
    """
    This function takes a text input in Central Kurdish (Sorani) script and performs a series of character replacements
    to standardize variations in the script. Specifically, it addresses cases where the character 'ە' (Arabic letter
    AE) may be used in different contexts.
    """
    # First replace all occurrences of 'ه' with 'ە'
    text = re.sub("ه", "ە", text)
    # Replace specific combinations with 'ها', 'هێ', and 'هۆ'
    text = re.sub("ەا", "ها", text)  # Replace ەا with ها
    text = re.sub("ەێ", "هێ", text)  # Replace ەێ with هێ
    text = re.sub("ەۆ", "هۆ", text)  # Replace ەۆ with هۆ
    # Replace ە (AE) at the beginning of a word with ه (HEH)
    text = re.sub(r"\b(ە\w*)", lambda match: "ه" + match.group(1)[1:], text)
    # Replace ALEF+AE with ALEF+HEH
    text = re.sub("اە", "اه", text)
    # Special words should go here, before the replacement of 'ە' at the end of a word
    # Special case: گەهـ or گاهـ, but without the tatweel, since the tatweel is not a phoneme in Kurdish and would become a class for the model
    text = re.sub(r'\bگەە[-ـ]?\b', "گەه", text)
    # Replace 'ەە' at the beginning and end of a word with 'هە'
    text = re.sub(r"\bەە|ەە\b", "هە", text)
    # Special case: if two AEs come before ۆ, they are replaced with AE+HEH
    text = re.sub(r"ەە(?=ۆ)", "ەه", text)
    # Special case: if two AEs come after و, ب, ئ, ڕ, ق, ز or ژ, they are replaced with AE+HEH
    text = re.sub(r"(?<=\b[بوئڕقزژ])ەە", "ەه", text)
    # The following special case has to happen after the previous one and before the next one
    # Special case: when two words are joined with waw, the AEs after the waw become HEH+AE
    text = re.sub(r'(?<=و)ەە(?=\w)', "هە", text)
    # Replace three AEs with AE+HEH+AE (this has to run before the next special case so words like لەهەوادا are not ruined)
    text = re.sub(r"(?<=\w)ەەە(?=\w)", "ەهە", text)
    # Special case: two AEs in the middle of a word, before ی, چ or و, are replaced with AE+HEH when that ی/چ/و is not at the end of the word
    text = re.sub(r"(?<=\w)ەە(?=[چیو]\B)", "ەه", text)
    # Replace AE+AE ('ەە') in the middle of a word with HEH+AE
    text = re.sub(r"(?<=\w)ەە(?=\w)", "هە", text)
    # Replace two AEs with a space in between with AE HEH
    text = re.sub("ە ە", "ە ه", text)
    # HEH DOACHASHMEE ('ھ') is unified separately via char_replacements
    # text = text.replace('ھ', 'ە')
    return text
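
# Illustrative usage (sketch; traced through the replacement rules above):
# print(resolve_ae("ەاوار"))   # -> "هاوار"  (ەا is rewritten as ها)
# print(resolve_ae("ئاە"))     # -> "ئاه"    (ALEF+AE becomes ALEF+HEH)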

# Punctuation between digits (e.g. 3.14 or 12,500) is preserved by the lookarounds.
clean_punctuation = re.compile(r"(?<!\d)[.,;:'?!،؟؛](?!\d)")


def remove_punctuation(text):
    """Remove Latin and Arabic punctuation from a string, except if it's between digits"""
    return clean_punctuation.sub("", text)

def extract_punctuation(text):
    # Collect every character whose Unicode category is punctuation (P*)
    extracted_punctuation = ""
    for char in text:
        if unicodedata.category(char).startswith('P'):
            extracted_punctuation += char
    return set(extracted_punctuation)
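
# Illustrative usage (sketch): the Arabic comma and question mark are in the
# Unicode punctuation categories, so they are picked up as well:
# print(extract_punctuation("کوردی، چۆنە؟"))   # -> {'،', '؟'}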

ARABIC_PUNCTUATIONS = "،؛۔٫٪؟"
CKB_PUNCTUATIONS = "!.:;?،؛؟«»" + ARABIC_PUNCTUATIONS
KURDISH_CHARS = set(f"{CKB_PUNCTUATIONS}ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ")


def contains_non_kurdish_characters(text):
    non_kurdish_chars = set(text) - KURDISH_CHARS
    return len(non_kurdish_chars) > 0


def keep_kurdish_characters(text):
    cleaned_text = ''.join(char for char in text if char in KURDISH_CHARS)
    return cleaned_text
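
# Illustrative usage (sketch): Latin letters are outside KURDISH_CHARS, while
# spaces and the CKB punctuation marks are kept:
# print(contains_non_kurdish_characters("Hello کوردی"))   # -> True
# print(keep_kurdish_characters("Hello کوردی!"))          # -> " کوردی!"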

def remove_emojis(text):
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # Emoticons
                               "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                               "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                               "\U0001F700-\U0001F77F"  # Alchemical Symbols
                               "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               "\U0001FA00-\U0001FA6F"  # Chess Symbols
                               "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               "\U00002702-\U000027B0"  # Dingbats
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
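
# Illustrative usage (sketch): characters in the emoji ranges are dropped,
# everything else (including the space) is kept:
# print(remove_emojis("کوردی 😀"))   # -> "کوردی "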

def remove_language_families(text):
    patterns = [
        "[\u1100-\u11FF\u2E80-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF]+",  # Asian scripts
        "[\u0000-\u024F]+",  # Basic Latin and Latin-1 Supplement (note: this also matches spaces, digits and ASCII punctuation)
        "[\u0400-\u04FF]+",  # Cyrillic
        "[\u0370-\u03FF]+",  # Greek
        "[\u0900-\u097F]+",  # Devanagari
        "[\u0B80-\u0BFF]+",  # Tamil
        "[\u4E00-\u9FFF]+",  # Han
        "[\u10A0-\u10FF]+",  # Georgian
        "[\u0C80-\u0CFF]+",  # Kannada
    ]
    combined_pattern = re.compile("|".join(patterns))
    cleaned_text = combined_pattern.sub(r'', text)
    return cleaned_text
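
# Illustrative usage (sketch): Cyrillic text is removed, and so is the ASCII
# space, because it falls inside the Basic Latin range above:
# print(remove_language_families("کوردی Привет"))   # -> "کوردی"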

def contains_repeated_ngram(window, n):
    ngrams = generate_ngrams(window, n)
    return len(ngrams) != len(set(ngrams))


def generate_ngrams(text, n):
    words = text.split()
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]


def remove_repeated_ngram(text, n):
    # Keep the first occurrence of each n-gram and skip over later repetitions.
    words = text.split()
    seen = set()
    output = []
    i = 0
    while i < len(words):
        ngram = tuple(words[i:i + n])
        if len(ngram) == n and ngram in seen:
            i += n  # skip the repeated n-gram entirely
            continue
        if len(ngram) == n:
            seen.add(ngram)
        output.append(words[i])
        i += 1
    return " ".join(output)

def normalize_punctuations(text: str) -> str:
    # Replace , with ،
    text = text.replace(',', '،')
    # Replace ? with ؟
    text = text.replace('?', '؟')
    # Replace two or three of the same punctuation marks with a single one
    text = re.sub(r'([.,;:?!،؛؟])\1{1,2}', r'\1', text)
    # Replace double opening and closing parentheses with guillemets
    text = re.sub(r'\(\(', '«', text)
    text = re.sub(r'\)\)', '»', text)
    # Normalize spaces around the guillemets and other punctuation marks
    text = re.sub(r'\s*«\s*', ' «', text)
    text = re.sub(r'\s*»\s*', '» ', text)
    # Additional punctuation normalization
    text = re.sub(r'\s*([,،؟])\s*', r'\1 ', text)
    # Ensure there is no space before a guillemet at the beginning of the text or after a
    # guillemet at the end of the text
    text = re.sub(r'^\s*«', '«', text)
    text = re.sub(r'»\s*$', '»', text)
    # If consecutive identical punctuation marks appear, keep only the first one
    text = re.sub(r'([.!?؟،؛])\1+', r'\1', text)
    # If identical punctuation marks are separated by a space (e.g. "? ? ?"), keep the first one and remove the rest
    text = re.sub(r'([.!?؟،؛])\s\1+', r'\1', text)
    # Trim leading and trailing spaces and return the normalized text
    text = text.strip()
    return text
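
# Illustrative usage (sketch; traced through the rules above, see also the
# examples in the __main__ block below):
# print(normalize_punctuations("ڕێنووس ،، چۆنە ??"))   # -> "ڕێنووس، چۆنە؟"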

def fix_sentence(sentence):
    if sentence.startswith('"') and sentence.endswith('"'):
        # we can remove the surrounding quotation marks as they do not affect the sentence
        sentence = sentence[1:-1]
    # the Arabic question mark is included because normalize_punctuations converts ? to ؟
    if sentence and sentence[-1] not in [".", "?", "!", "؟"]:
        # append a full stop to sentences that do not end in punctuation
        sentence = sentence + "."
    # sentence = sentence[:-1].translate(str.maketrans('', '', string.punctuation)) + sentence[-1]
    return sentence

def add_period_abbreviations(text):
    abbreviations = set(["پ", "د"])  # Add more abbreviations as needed
    # Regular expression pattern to match an abbreviation letter followed by a space and then a word character
    pattern = re.compile(r'([{}]) (?=\w)'.format(''.join(abbreviations)))
    # Add periods after the specified abbreviations, with a space after the period
    text = pattern.sub(r'\1. ', text)
    # Add periods after each letter if "د" and "خ" appear together
    text = re.sub(r'د\sخ|خ ?د|د\.?خ|خ\.?د', 'د. خ.', text)
    # Abbreviated dates
    # text = re.sub(r'\b(پ\. ز)\b', r'\1.', text)
    return text

def process_text(text):
    # text = replace_words_in_corpus(text)
    text = resolve_ae(text)
    # text = number_to_word(text)
    text = preprocessor_ckb.preprocess(text)
    # text = normalizer(text).strip()
    text = remove_emojis(text)
    text = normalize_punctuations(text)
    text = fix_sentence(text)
    text = apply_char_replacements(text)
    return text

if __name__ == "__main__":
    # text = "لە ساڵی 1999دا بڕی 40% لە پارەکەیان واتە $102.1 یان وەرگرت. 'õ'\u200c\u200f\u200e'ھ'"
    # print(process_text(text))
    # print(contains_non_kurdish_characters(text))
    # text = "دەقی«کوردی » و ڕێنووس ،((خاڵبەندی )) چۆنە ؟"
    # correct = "دەقی «کوردی» و ڕێنووس، «خاڵبەندی» چۆنە؟"
    # print("Before punctuation normalization:", text)
    # print("After punctuation normalization:", normalize_punctuations(text))
    # print("Correct:\t\t\t", correct)
    # print(normalize_punctuations(text) == correct)
    # print(normalize_punctuations("ڕەوا بورهان 4 تەمموز ، کوردستانی سلێمانی?!!"))
    # print(normalize_punctuations("یانەی کوردژین تکایە چۆن بە شی سە ڕە کی و لاوە کی بۆ مالپە ڕە کە م زیاد بکە م؟؟ ؟ ؟ لە سکرێپە یتی ژومیلە"))
    # with open('data/data.ckb.txt', 'r', encoding='utf-8') as src_file:
    #     source_data = src_file.read()
    #     unified_data = normalize_punctuations(source_data)
    #     # Save the unified data to a new file
    #     with open('data/unified_data.txt', 'w', encoding='utf-8') as file:
    #         file.writelines(unified_data)
    #     print("Unified data saved to unified_data.txt")
    text = "Hello ((Friend)) Hello , Friend World"
    # print(remove_repeated_ngram(text, 2))
    print(process_text(text))