import re

import streamlit as st
import awesome_streamlit as ast

from .preprocess import (
    ArabertPreprocessor,
    white_spaced_back_quotation_regex,
    white_spaced_double_quotation_regex,
    white_spaced_em_dash,
    white_spaced_single_quotation_regex,
    left_and_right_spaced_chars,
    left_spaced_chars,
    right_spaced_chars,
)

MODELS_to_SELECT = [
    "None",
    "bert-base-arabertv01",
    "bert-base-arabert",
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "araelectra-base-artydiqa",
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]


def unpreprocess(text: str) -> str:
    """Re-formats the text to a classic format where punctuation, brackets, and
    parentheses are not separated by whitespace.

    The objective is to make the generated text of any model appear natural and
    not preprocessed.

    Args:
        text (:obj:`str`): input text to be un-preprocessed.

    Returns:
        str: The unpreprocessed (and Farasa-desegmented) text.
    """
    text = desegment(text)

    # Remove the spaces around quotation marks, e.g. i " ate " an apple --> i "ate" an apple
    # https://stackoverflow.com/a/53436792/5381220
    text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"', text)
    text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'", text)
    text = re.sub(white_spaced_back_quotation_regex, "`" + r"\1" + "`", text)
    text = re.sub(white_spaced_em_dash, "\u2014" + r"\1" + "\u2014", text)  # em dash

    # During generation, the models sometimes don't put a space after the dot; this handles it
    text = text.replace(".", " . ")
    text = " ".join(text.split())

    # Re-join decimal points and thousands separators that the previous step split apart
    text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
    text = re.sub(r"(\d+) , (\d+)", r"\1,\2", text)

    text = re.sub(left_and_right_spaced_chars, r"\1", text)
    text = re.sub(left_spaced_chars, r"\1", text)
    text = re.sub(right_spaced_chars, r"\1", text)

    return text


def desegment(text: str) -> str:
    """
    Use this function if sentence tokenization was done using
    `from arabert.preprocess_arabert import preprocess` with Farasa enabled.
    AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
    and before the '+' for suffixes.

    Example:
    >>> desegment('ال+ دراس +ات')
    الدراسات
    """
    text = text.replace("+ ", "+")
    text = text.replace(" +", "+")
    text = " ".join([_desegmentword(word) for word in text.split(" ")])
    return text


def _desegmentword(orig_word: str) -> str:
    """
    Word desegmentor that takes a Farasa-segmented word and removes the '+' signs.

    Example:
    >>> _desegmentword("ال+يومي+ة")
    اليومية
    """
    word = orig_word.replace("ل+ال+", "لل")
    if "ال+ال" not in orig_word:
        word = word.replace("ل+ال", "لل")
    word = word.replace("+", "")
    word = word.replace("للل", "لل")
    return word


def write():
    st.markdown(
        """