Spaces:
Running
Running
File size: 6,161 Bytes
9c398de 9b5004f 9c398de 3036e92 c59ebda 3036e92 9c398de 3036e92 9c398de 3036e92 9c398de 3036e92 9c398de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
import streamlit as st
import awesome_streamlit as ast
from .preprocess import (
ArabertPreprocessor,
white_spaced_back_quotation_regex,
white_spaced_double_quotation_regex,
white_spaced_em_dash,
white_spaced_single_quotation_regex,
left_and_right_spaced_chars,
left_spaced_chars,
right_spaced_chars,
)
import re
# Choices offered in the sidebar model selector. The order matters: write()
# preselects an entry by position (index=3).
MODELS_to_SELECT = [
    # Sentinel entry: compared against the string "None" in write() to enable
    # the manual filter checkboxes.
    "None",
    # AraBERT
    "bert-base-arabertv01",
    "bert-base-arabert",
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    # AraELECTRA
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "araelectra-base-artydiqa",
    # AraGPT2
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]
def unpreprocess(text: str) -> str:
    """Re-format preprocessed text so that it reads like natural text.

    Punctuation, brackets and quotation marks that were separated from their
    neighbours by whitespace are attached again, so that generated model
    output does not look preprocessed.

    Args:
        text (:obj:`str`): input text to be un-preprocessed. Farasa
            segmentation markers ('+') are always removed first via
            ``desegment``.

    Returns:
        str: The un-preprocessed, Farasa-desegmented text.
    """
    text = desegment(text)

    # Remove the spaces just inside paired quotation marks,
    # ex: i " ate " an apple --> i "ate" an apple
    # https://stackoverflow.com/a/53436792/5381220
    # The patterns come from .preprocess; each is expected to capture the
    # quoted content as group 1.
    # The replacement templates deliberately contain no backslash before the
    # quote character: re.sub leaves unknown non-letter escapes such as "\`"
    # untouched, which would put a literal backslash into the output.
    text = re.sub(white_spaced_double_quotation_regex, r'"\1"', text)
    text = re.sub(white_spaced_single_quotation_regex, r"'\1'", text)
    text = re.sub(white_spaced_back_quotation_regex, r"`\1`", text)
    # Em dashes have their own pattern; reusing the back-quote pattern here
    # would leave spaced em dashes unprocessed.
    text = re.sub(white_spaced_em_dash, r"—\1—", text)

    # during generation, sometimes the models don't put a space after the dot,
    # this handles it: isolate every dot, then collapse runs of whitespace
    text = text.replace(".", " . ")
    text = " ".join(text.split())

    # handle decimals: re-join digits that were split around '.' or ','
    text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
    text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

    # Keep only group 1 of each spacing pattern (defined in .preprocess;
    # presumably this drops the whitespace around the matched character).
    text = re.sub(left_and_right_spaced_chars, r"\1", text)
    text = re.sub(left_spaced_chars, r"\1", text)
    text = re.sub(right_spaced_chars, r"\1", text)

    return text
def desegment(text: str) -> str:
    """
    Undo Farasa segmentation on a whole text.

    Use this function if sentence tokenization was done using
    `from arabert.preprocess_arabert import preprocess` with Farasa enabled.
    AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
    and before the '+' for suffixes.

    Example:
    >>> desegment('ال+ دراس +ات')
    الدراسات
    """
    # Glue prefixes and suffixes back onto their stem, leaving only the '+'
    # markers between the segments of each word.
    glued = text.replace("+ ", "+").replace(" +", "+")
    # Split on single spaces (not arbitrary whitespace) and rebuild each word.
    return " ".join(map(_desegmentword, glued.split(" ")))
def _desegmentword(orig_word: str) -> str:
"""
Word segmentor that takes a Farasa Segmented Word and removes the '+' signs
Example:
>>> _desegmentword("ال+يومي+ة")
اليومية
"""
word = orig_word.replace("ل+ال+", "لل")
if "ال+ال" not in orig_word:
word = word.replace("ل+ال", "لل")
word = word.replace("+", "")
word = word.replace("للل", "لل")
return word
def write():
    """Render the "Arabic Text Pre-Processor" Streamlit page.

    The page has two parts, separated by a rule:

    * a text input plus a "Run Pre-Processor" button that runs
      ``ArabertPreprocessor`` on the input and writes the result. The sidebar
      model selector picks the model; choosing "None" exposes one checkbox per
      preprocessing option and passes their values to the preprocessor.
    * a second text input plus a "Run Un-Pre-Processor" button that writes
      ``unpreprocess`` of that input. The input is pre-filled with the output
      of the first part when one was produced in this run, otherwise with a
      segmented sample sentence.
    """
    # Page title, then CSS that right-aligns text elements.
    st.markdown(
        '\n<h1 style="text-align:left;">Arabic Text Pre-Processor</h1>\n',
        unsafe_allow_html=True,
    )
    st.markdown(
        "\n<style>\np, div, input, label {\ntext-align: right;\n}\n</style>\n",
        unsafe_allow_html=True,
    )

    raw_text = st.text_input(
        "Text to Pre-Process",
        value="ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري",
    )

    sidebar = st.sidebar
    sidebar.title("Model Selector")
    chosen_model = sidebar.selectbox(
        "Select None to enable further filters", options=MODELS_to_SELECT, index=3
    )

    # The manual option checkboxes only exist when no model is selected.
    manual_mode = chosen_model == "None"
    if manual_mode:
        # Widgets are created in display order, which differs from the order
        # in which the values are passed to ArabertPreprocessor below.
        keep_emojis = sidebar.checkbox("Keep emojis", False)
        remove_html_markup = sidebar.checkbox("Remove html markup", True)
        strip_tashkeel = sidebar.checkbox("Strip tashkeel", True)
        replace_urls_emails_mentions = sidebar.checkbox(
            "Replace urls and emails", True
        )
        strip_tatweel = sidebar.checkbox("Strip tatweel", True)
        insert_white_spaces = sidebar.checkbox("Insert white spaces", True)
        remove_non_digit_repetition = sidebar.checkbox(
            "Remove non-digit repetition", True
        )
        replace_slash_with_dash = sidebar.checkbox("Replace slash with dash", None)
        map_hindi_numbers_to_arabic = sidebar.checkbox(
            "Map hindi numbers to arabic", None
        )
        apply_farasa_segmentation = sidebar.checkbox(
            "Apply farasa segmentation", None
        )

    processed = None
    if st.button("Run Pre-Processor"):
        if manual_mode:
            preprocessor = ArabertPreprocessor(
                chosen_model,
                keep_emojis,
                remove_html_markup,
                replace_urls_emails_mentions,
                strip_tashkeel,
                strip_tatweel,
                insert_white_spaces,
                remove_non_digit_repetition,
                replace_slash_with_dash,
                map_hindi_numbers_to_arabic,
                apply_farasa_segmentation,
            )
        else:
            preprocessor = ArabertPreprocessor(model_name=chosen_model)
        # NOTE(review): calls the private _preprocess_v3 directly for every
        # model choice; confirm this is intended for all listed models.
        processed = preprocessor._preprocess_v3(raw_text)
        st.write(processed)

    st.write("-----")

    # Sample shown when the pre-processor produced no (non-empty) output in
    # this run.
    sample_segmented = "و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري"
    text_to_restore = st.text_input(
        "Text to Undo the Pre-Processing",
        value=processed or sample_segmented,
    )

    if st.button("Run Un-Pre-Processor"):
        st.write(unpreprocess(text_to_restore))
|