Spaces:

aubmindlab
/

Arabic-NLP

Runtime error

App Files Files Community

Arabic-NLP / backend /processor.py

wissamantoun

added language generation

c59ebda about 3 years ago

raw

history blame

6.11 kB

	import streamlit as st
	import awesome_streamlit as ast
	from .preprocess import (
	ArabertPreprocessor,
	white_spaced_back_quotation_regex,
	white_spaced_double_quotation_regex,
	white_spaced_em_dash,
	white_spaced_single_quotation_regex,
	left_and_right_spaced_chars,
	left_spaced_chars,
	right_spaced_chars,
	)
	import re

	# Model names offered by the sidebar "Model Selector" dropdown in `write()`.
	# Order matters: `write()` preselects the entry at index 3
	# ("bert-base-arabertv02"). The first entry is the literal string "None";
	# when it is chosen, `write()` shows one checkbox per preprocessing option
	# and passes those values to `ArabertPreprocessor` explicitly.
	MODELS_to_SELECT = [
	"None",
	"bert-base-arabertv01",
	"bert-base-arabert",
	"bert-base-arabertv02",
	"bert-base-arabertv2",
	"bert-large-arabertv02",
	"bert-large-arabertv2",
	"araelectra-base",
	"araelectra-base-discriminator",
	"araelectra-base-generator",
	"araelectra-base-artydiqa",
	"aragpt2-base",
	"aragpt2-medium",
	"aragpt2-large",
	"aragpt2-mega",
	]


	def unpreprocess(text: str) -> str:
	    """Re-format preprocessed text so it reads like naturally written text.

	    The text is first passed through ``desegment`` to drop Farasa '+'
	    segmentation markers. Then the whitespace that preprocessing put around
	    quotation marks, em dashes, dots, decimal separators and the characters
	    matched by the spaced-character patterns is removed.

	    Args:
	        text (:obj:`str`): input text to be un-preprocessed

	    Returns:
	        str: The unpreprocessed, Farasa-desegmented text.
	    """
	    text = desegment(text)

	    # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
	    # https://stackoverflow.com/a/53436792/5381220
	    # The templates contain the bare delimiter on purpose: `re.sub` keeps an
	    # escaped non-letter (backslash + back quote, backslash + em dash)
	    # verbatim, so escaping the delimiter would leak a backslash into the
	    # result.
	    text = re.sub(white_spaced_double_quotation_regex, r'"\1"', text)
	    text = re.sub(white_spaced_single_quotation_regex, r"'\1'", text)
	    text = re.sub(white_spaced_back_quotation_regex, r"`\1`", text)
	    # Em dashes use their own pattern; matching the back-quote pattern here
	    # would rewrite back-quoted spans with em dashes instead.
	    # NOTE(review): assumes `white_spaced_em_dash` captures the inner text as
	    # group 1 like the three quotation patterns — confirm in `.preprocess`.
	    text = re.sub(white_spaced_em_dash, r"—\1—", text)

	    # during generation, sometimes the models don't put a space after the dot, this handles it
	    text = text.replace(".", " . ")
	    # Collapse every run of whitespace to a single space and trim both ends.
	    text = " ".join(text.split())

	    # handle decimals: the dot spacing above also split numbers such as
	    # "3.14" into "3 . 14", so digit-separator-digit is glued back together.
	    text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
	    text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

	    # Each match is replaced by its first capture group, dropping the
	    # surrounding whitespace that the patterns match.
	    text = re.sub(left_and_right_spaced_chars, r"\1", text)
	    text = re.sub(left_spaced_chars, r"\1", text)
	    text = re.sub(right_spaced_chars, r"\1", text)

	    return text


	def desegment(text: str) -> str:
	    """Rebuild whole words from Farasa-segmented AraBERT text.

	    Farasa segmentation as applied by AraBERT leaves a space after the '+'
	    of a prefix and a space before the '+' of a suffix. Those spaces are
	    removed so the segments of a word are joined by bare '+' signs, and each
	    space-separated token is then handed to ``_desegmentword``.

	    Example:
	        >>> desegment('ال+ دراس +ات')
	        'الدراسات'
	    """
	    # Glue prefixes ("+ ") and suffixes (" +") back onto their stems.
	    glued = text.replace("+ ", "+").replace(" +", "+")
	    # Split on single spaces (not arbitrary whitespace) so spacing is kept.
	    return " ".join(_desegmentword(token) for token in glued.split(" "))


	def _desegmentword(orig_word: str) -> str:
	"""
	Word segmentor that takes a Farasa Segmented Word and removes the '+' signs

	Example:
	>>> _desegmentword("ال+يومي+ة")
	اليومية
	"""
	word = orig_word.replace("ل+ال+", "لل")
	if "ال+ال" not in orig_word:
	word = word.replace("ل+ال", "لل")
	word = word.replace("+", "")
	word = word.replace("للل", "لل")
	return word


	def write():
	"""Render the Streamlit page of the Arabic text pre-processor demo.

	The page has two parts. The first takes a text, builds an
	``ArabertPreprocessor`` from the sidebar selection and shows the
	preprocessed text when "Run Pre-Processor" is pressed. The second takes a
	text and shows the result of ``unpreprocess`` when "Run Un-Pre-Processor"
	is pressed. Nothing is returned; all output goes through ``st`` calls.
	"""
	# Three columns are requested and only the middle one is kept; it hosts
	# the page title.
	# NOTE(review): `beta_columns` is a beta-prefixed Streamlit API; newer
	# Streamlit releases expose it as `st.columns` — confirm against the
	# Streamlit version this app is deployed with.
	_, col1, _ = st.beta_columns(3)

	# NOTE(review): the extent of this `with` block cannot be recovered from
	# this copy of the file — confirm which widgets are meant to render
	# inside the middle column.
	with col1:
	col1.title("Arabic Text Pre-Processor")
	# Inject CSS that right-aligns p, div, input and label elements.
	st.markdown(
	"""
	<style>
	p, div, input, label {
	text-align: right;
	}
	</style>
	""",
	unsafe_allow_html=True,
	)
	input_text = st.text_input(
	"Text to Pre-Process",
	value="ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري",
	)

	st.sidebar.title("Model Selector")
	# index=3 preselects MODELS_to_SELECT[3].
	model_selector = st.sidebar.selectbox(
	"""Select None to enable further filters""", options=MODELS_to_SELECT, index=3
	)
	# The per-option checkboxes below are read only in the "None" branch of
	# the preprocessor construction further down.
	if model_selector == "None":
	keep_emojis = st.sidebar.checkbox("Keep emojis", False)
	remove_html_markup = st.sidebar.checkbox("Remove html markup", True)
	strip_tashkeel = st.sidebar.checkbox("Strip tashkeel", True)
	replace_urls_emails_mentions = st.sidebar.checkbox(
	"Replace urls and emails", True
	)
	strip_tatweel = st.sidebar.checkbox("Strip tatweel", True)
	insert_white_spaces = st.sidebar.checkbox("Insert white spaces", True)
	remove_non_digit_repetition = st.sidebar.checkbox(
	"Remove non-digit repetition", True
	)
	# NOTE(review): the next three checkboxes receive None as their initial
	# value — presumably treated as unchecked; confirm.
	replace_slash_with_dash = st.sidebar.checkbox("Replace slash with dash", None)
	map_hindi_numbers_to_arabic = st.sidebar.checkbox(
	"Map hindi numbers to arabic", None
	)
	apply_farasa_segmentation = st.sidebar.checkbox(
	"Apply farasa segmentation", None
	)

	run_preprocessor = st.button("Run Pre-Processor")

	# Holds the preprocessed text once it has been computed; it is used below
	# as the default value of the second text box.
	prep_text = None
	if run_preprocessor:
	if model_selector == "None":
	# Positional arguments: the model name followed by the checkbox values.
	# NOTE(review): the order must match ArabertPreprocessor's signature,
	# which is not visible here — confirm.
	arabert_preprocessor = ArabertPreprocessor(
	model_selector,
	keep_emojis,
	remove_html_markup,
	replace_urls_emails_mentions,
	strip_tashkeel,
	strip_tatweel,
	insert_white_spaces,
	remove_non_digit_repetition,
	replace_slash_with_dash,
	map_hindi_numbers_to_arabic,
	apply_farasa_segmentation,
	)
	else:
	arabert_preprocessor = ArabertPreprocessor(model_name=model_selector)
	# NOTE(review): the private `_preprocess_v3` method is called whatever
	# model is selected — confirm this is intended.
	prep_text = arabert_preprocessor._preprocess_v3(input_text)
	st.write(prep_text)

	st.write("-----")
	# Default value: the text just preprocessed if there is one, otherwise a
	# built-in segmented sample sentence.
	input_text_unprep = st.text_input(
	"Text to Undo the Pre-Processing",
	value=prep_text
	if prep_text
	else "و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري",
	)
	run_unpreprocessor = st.button("Run Un-Pre-Processor")

	if run_unpreprocessor:
	st.write(unpreprocess(input_text_unprep))