Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
e2db848
1
Parent(s):
6114f21
reorder preprocessing steps and add new
Browse files- src/utils.py +36 -16
src/utils.py
CHANGED
@@ -18,7 +18,7 @@ from stqdm import stqdm
|
|
18 |
from textacy.preprocessing import make_pipeline, normalize, remove, replace
|
19 |
|
20 |
from .configs import Languages, ModelConfigs, SupportedFiles
|
21 |
-
|
22 |
stqdm.pandas()
|
23 |
|
24 |
|
@@ -27,13 +27,17 @@ def get_logo(path):
|
|
27 |
return Image.open(path)
|
28 |
|
29 |
|
30 |
-
# @st.cache(suppress_st_warning=True)
|
|
|
31 |
def read_file(uploaded_file) -> pd.DataFrame:
|
32 |
|
33 |
file_type = uploaded_file.name.split(".")[-1]
|
34 |
if file_type in set(i.name for i in SupportedFiles):
|
35 |
read_f = SupportedFiles[file_type].value[0]
|
36 |
-
|
|
|
|
|
|
|
37 |
|
38 |
else:
|
39 |
st.error("File type not supported")
|
@@ -155,16 +159,20 @@ def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs
|
|
155 |
|
156 |
# more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
|
157 |
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
|
158 |
-
|
|
|
|
|
159 |
|
|
|
|
|
|
|
160 |
|
|
|
161 |
def normalize_useless_spaces(t):
|
162 |
return _re_space.sub(" ", t)
|
163 |
|
164 |
|
165 |
_re_rep = re.compile(r"(\S)(\1{2,})")
|
166 |
-
|
167 |
-
|
168 |
def normalize_repeating_chars(t):
|
169 |
def _replace_rep(m):
|
170 |
c, cc = m.groups()
|
@@ -174,8 +182,6 @@ def normalize_repeating_chars(t):
|
|
174 |
|
175 |
|
176 |
_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
|
177 |
-
|
178 |
-
|
179 |
def normalize_repeating_words(t):
|
180 |
def _replace_wrep(m):
|
181 |
c, cc, e = m.groups()
|
@@ -248,18 +254,20 @@ class TextPreprocessor:
|
|
248 |
("normalize_hyphenated_words", normalize.hyphenated_words),
|
249 |
("normalize_quotation_marks", normalize.quotation_marks),
|
250 |
("normalize_whitespace", normalize.whitespace),
|
251 |
-
("
|
252 |
-
("remove_brackets", remove.brackets),
|
253 |
-
("remove_html_tags", remove.html_tags),
|
254 |
-
("remove_punctuation", remove.punctuation),
|
255 |
("replace_currency_symbols", replace.currency_symbols),
|
256 |
("replace_emails", replace.emails),
|
257 |
("replace_emojis", replace.emojis),
|
258 |
("replace_hashtags", replace.hashtags),
|
259 |
("replace_numbers", replace.numbers),
|
260 |
("replace_phone_numbers", replace.phone_numbers),
|
261 |
-
("replace_urls", replace.urls),
|
262 |
("replace_user_handles", replace.user_handles),
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
("normalize_useless_spaces", normalize_useless_spaces),
|
264 |
("normalize_repeating_chars", normalize_repeating_chars),
|
265 |
("normalize_repeating_words", normalize_repeating_words),
|
@@ -286,15 +294,27 @@ class TextPreprocessor:
|
|
286 |
|
287 |
def plot_labels_prop(data: pd.DataFrame, label_column: str):
|
288 |
|
289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
290 |
|
291 |
-
source
|
|
|
|
|
292 |
|
293 |
bars = (
|
294 |
alt.Chart(source)
|
295 |
.mark_bar()
|
296 |
.encode(
|
297 |
-
x="Labels:O",
|
298 |
y="Counts:Q",
|
299 |
)
|
300 |
)
|
|
|
18 |
from textacy.preprocessing import make_pipeline, normalize, remove, replace
|
19 |
|
20 |
from .configs import Languages, ModelConfigs, SupportedFiles
|
21 |
+
import string
|
22 |
stqdm.pandas()
|
23 |
|
24 |
|
|
|
27 |
return Image.open(path)
|
28 |
|
29 |
|
30 |
+
# @st.cache(suppress_st_warning=True)
|
31 |
+
@st.cache(allow_output_mutation=True)
|
32 |
def read_file(uploaded_file) -> pd.DataFrame:
|
33 |
|
34 |
file_type = uploaded_file.name.split(".")[-1]
|
35 |
if file_type in set(i.name for i in SupportedFiles):
|
36 |
read_f = SupportedFiles[file_type].value[0]
|
37 |
+
df = read_f(uploaded_file)
|
38 |
+
# remove any NA
|
39 |
+
df = df.dropna()
|
40 |
+
return df
|
41 |
|
42 |
else:
|
43 |
st.error("File type not supported")
|
|
|
159 |
|
160 |
# more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
|
161 |
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
|
162 |
+
# Two or more consecutive "<letter>." groups, e.g. "u.s.a." or "U.K.".
_re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")


def normalize_acronyms(t):
    """Rewrite dotted acronyms in ``t`` as upper-case letters without dots.

    Only the matched acronym is rewritten ("u.s.a." -> "USA"); the rest of
    the text is returned unchanged.

    Args:
        t: The text to normalize.

    Returns:
        ``t`` with every dotted acronym replaced by its dot-free,
        upper-cased form.
    """

    def _compact(match):
        # A match holds only letters and dots, so dropping the dots is all
        # the punctuation stripping that is needed.
        return match.group(0).replace(".", "").upper()

    # Use a callable replacement so each match is transformed on its own;
    # passing the transformed full text would paste the entire string in
    # place of every acronym.
    return _re_normalize_acronyms.sub(_compact, t)
|
165 |
|
166 |
+
# Any single character that is not a letter, digit, or underscore.
# Raw string: "\W" in a plain literal is an invalid escape sequence.
_re_non_word = re.compile(r"\W")


def remove_non_word(t):
    """Replace every non-word character in ``t`` with a single space.

    Each non-word character is replaced one-for-one, so a run of such
    characters yields a run of spaces; nothing is collapsed here.

    Args:
        t: The text to clean.

    Returns:
        ``t`` with each character matched by ``\\W`` replaced by ``" "``.
    """
    return _re_non_word.sub(" ", t)
|
169 |
|
170 |
+
_re_space = re.compile(" {2,}")
|
171 |
def normalize_useless_spaces(t):
    """Collapse every match of ``_re_space`` in ``t`` into a single space."""
    collapsed = _re_space.sub(" ", t)
    return collapsed
|
173 |
|
174 |
|
175 |
_re_rep = re.compile(r"(\S)(\1{2,})")
|
|
|
|
|
176 |
def normalize_repeating_chars(t):
|
177 |
def _replace_rep(m):
|
178 |
c, cc = m.groups()
|
|
|
182 |
|
183 |
|
184 |
_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
|
|
|
|
|
185 |
def normalize_repeating_words(t):
|
186 |
def _replace_wrep(m):
|
187 |
c, cc, e = m.groups()
|
|
|
254 |
("normalize_hyphenated_words", normalize.hyphenated_words),
|
255 |
("normalize_quotation_marks", normalize.quotation_marks),
|
256 |
("normalize_whitespace", normalize.whitespace),
|
257 |
+
("replace_urls", replace.urls),
|
|
|
|
|
|
|
258 |
("replace_currency_symbols", replace.currency_symbols),
|
259 |
("replace_emails", replace.emails),
|
260 |
("replace_emojis", replace.emojis),
|
261 |
("replace_hashtags", replace.hashtags),
|
262 |
("replace_numbers", replace.numbers),
|
263 |
("replace_phone_numbers", replace.phone_numbers),
|
|
|
264 |
("replace_user_handles", replace.user_handles),
|
265 |
+
("normalize_acronyms", normalize_acronyms),
|
266 |
+
("remove_accents", remove.accents),
|
267 |
+
("remove_brackets", remove.brackets),
|
268 |
+
("remove_html_tags", remove.html_tags),
|
269 |
+
("remove_punctuation", remove.punctuation),
|
270 |
+
("remove_non_words", remove_non_word),
|
271 |
("normalize_useless_spaces", normalize_useless_spaces),
|
272 |
("normalize_repeating_chars", normalize_repeating_chars),
|
273 |
("normalize_repeating_words", normalize_repeating_words),
|
|
|
294 |
|
295 |
def plot_labels_prop(data: pd.DataFrame, label_column: str):
|
296 |
|
297 |
+
unique_value_limit = 100
|
298 |
+
|
299 |
+
if data[label_column].nunique() > unique_value_limit:
|
300 |
+
|
301 |
+
st.warning(f"""
|
302 |
+
The column you selected has more than {unique_value_limit}.
|
303 |
+
Are you sure it's the right column? If it is, please note that
|
304 |
+
this will impact __Wordify__ performance.
|
305 |
+
""")
|
306 |
+
|
307 |
+
return
|
308 |
|
309 |
+
source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
|
310 |
+
source["Props"] = source["Counts"] / source["Counts"].sum()
|
311 |
+
source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
|
312 |
|
313 |
bars = (
|
314 |
alt.Chart(source)
|
315 |
.mark_bar()
|
316 |
.encode(
|
317 |
+
x=alt.X("Labels:O", sort="-y"),
|
318 |
y="Counts:Q",
|
319 |
)
|
320 |
)
|