Spaces:
Runtime error
Runtime error
Create sync_streamlit_to_space.yml
Browse files- README.md +3 -25
- app.py +144 -0
- assets/ltr.css +5 -0
- assets/rtl.css +14 -0
- assets/style.css +8 -0
- libs/__init__.py +0 -0
- libs/dummy.py +3 -0
- libs/examples.py +31 -0
- libs/normalizer.py +86 -0
- libs/utils.py +10 -0
- meta.py +9 -0
- requirements.txt +5 -0
README.md
CHANGED
@@ -1,33 +1,11 @@
|
|
1 |
---
|
2 |
title: Typo Detector
|
3 |
-
emoji:
|
4 |
colorFrom: red
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
---
|
10 |
|
11 |
-
#
|
12 |
-
|
13 |
-
`title`: _string_
|
14 |
-
Display title for the Space
|
15 |
-
|
16 |
-
`emoji`: _string_
|
17 |
-
Space emoji (emoji-only character allowed)
|
18 |
-
|
19 |
-
`colorFrom`: _string_
|
20 |
-
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
21 |
-
|
22 |
-
`colorTo`: _string_
|
23 |
-
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
24 |
-
|
25 |
-
`sdk`: _string_
|
26 |
-
Can be either `gradio` or `streamlit`
|
27 |
-
|
28 |
-
`app_file`: _string_
|
29 |
-
Path to your main application file (which contains either `gradio` or `streamlit` Python code).
|
30 |
-
Path is relative to the root of the repository.
|
31 |
-
|
32 |
-
`pinned`: _boolean_
|
33 |
-
Whether the Space stays on top of your list.
|
|
|
1 |
---
|
2 |
title: Typo Detector
|
3 |
+
emoji: ⚡
|
4 |
colorFrom: red
|
5 |
+
colorTo: red
|
6 |
sdk: streamlit
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
---
|
10 |
|
11 |
+
# Typo Detector using Transformers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from transformers import pipeline, set_seed
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
|
7 |
+
from libs.normalizer import Normalizer
|
8 |
+
from libs.examples import LANGUAGES, EXAMPLES
|
9 |
+
from libs.dummy import outputs as dummy_outputs
|
10 |
+
from libs.utils import local_css, remote_css
|
11 |
+
|
12 |
+
import meta
|
13 |
+
|
14 |
+
# Human-readable language label -> HuggingFace Hub model id for each
# supported typo-detection model.
MODELS = {
    "English (en)": "m3hrdadfi/typo-detector-distilbert-en",
    "Persian (fa)": "m3hrdadfi/typo-detector-distilbert-fa",
    "Icelandic (is)": "m3hrdadfi/typo-detector-distilbert-is",
}
19 |
+
|
20 |
+
|
21 |
+
class TypoDetector:
    """Token-classification wrapper that highlights typos in a sentence.

    Loads a per-language DistilBERT typo model and, given a sentence,
    wraps every flagged span in ``<span class="typo">…</span>`` so the
    Streamlit app can highlight it via CSS.
    """

    def __init__(
        self,
        model_name_or_path: str = "m3hrdadfi/typo-detector-distilbert-en"
    ) -> None:
        # When debug is True, detect() returns a canned example instead of
        # running the model (no download / GPU needed).
        self.debug = False
        self.dummy_outputs = dummy_outputs
        self.model_name_or_path = model_name_or_path
        self.task_name = "token-classification"

        self.tokenizer = None
        self.nlp = None
        self.normalizer = None

    def load(self):
        """Instantiate the tokenizer, the HF pipeline, and the normalizer."""
        if not self.debug:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
            self.nlp = pipeline(self.task_name, model=self.model_name_or_path, tokenizer=self.model_name_or_path)

        self.normalizer = Normalizer()

    def detect(self, sentence):
        """Return *sentence* with each detected typo wrapped in a span tag.

        Fix: the original collected typo substrings and ran ``str.replace``
        for each one, which (a) replaced *every* occurrence of that
        substring, not only the flagged position, and (b) could nest span
        tags when one typo string contained another. Spans are now
        inserted by character offset, right-to-left so earlier offsets
        stay valid.
        """
        if self.debug:
            return self.dummy_outputs[0]

        # Process hits from the end of the string backwards so inserting
        # markup never shifts the offsets still to be processed.
        results = sorted(self.nlp(sentence), key=lambda r: r["start"], reverse=True)

        detected = sentence
        for result in results:
            start, end = result["start"], result["end"]
            detected = (
                f'{detected[:start]}'
                f'<span class="typo">{detected[start:end]}</span>'
                f'{detected[end:]}'
            )

        return detected
53 |
+
|
54 |
+
|
55 |
+
@st.cache(allow_output_mutation=True)
def load_typo_detectors():
    """Build, load, and cache one TypoDetector per supported language.

    Returns:
        Dict mapping the language code ("en", "fa", "is") to a loaded
        TypoDetector instance.
    """
    labels = {
        "en": "English (en)",
        "fa": "Persian (fa)",
        "is": "Icelandic (is)",
    }
    detectors = {}
    for code, label in labels.items():
        detector = TypoDetector(MODELS[label])
        detector.load()
        detectors[code] = detector
    return detectors
70 |
+
|
71 |
+
|
72 |
+
def main():
    """Streamlit entry point: page config, CSS, and the typo-detector UI."""
    st.set_page_config(
        page_title="Typo Detector",
        page_icon="⚡",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    remote_css("https://cdn.jsdelivr.net/gh/rastikerdar/vazir-font/dist/font-face.css")
    local_css("assets/style.css")
    detectors = load_typo_detectors()

    # NOTE(review): st.beta_columns / st.cache imply an old Streamlit pin;
    # upgrading would require st.columns / st.cache_resource.
    col1, col2 = st.beta_columns([6, 4])
    with col2:
        st.markdown(meta.INFO, unsafe_allow_html=True)

    with col1:
        # Fix: this selectbox chooses the LANGUAGE but reused the
        # "Examples (select from this list)" label of the selectbox below,
        # which was misleading in the UI.
        language = st.selectbox(
            'Language (select from this list)',
            LANGUAGES,
            index=0
        )
        detector = detectors[language]
        # CSS class for the output paragraph: Persian renders right-to-left.
        is_rtl = "rtl" if language == "fa" else "ltr"
        if language == "fa":
            local_css("assets/rtl.css")
        else:
            local_css("assets/ltr.css")

        prompts = list(EXAMPLES[language].keys()) + ["Custom"]
        prompt = st.selectbox(
            'Examples (select from this list)',
            prompts,
            index=0
        )

        if prompt == "Custom":
            prompt_box = ""
        else:
            prompt_box = EXAMPLES[language][prompt]

        text = st.text_area(
            'Insert your text: ',
            detector.normalizer(prompt_box),
            height=100
        )
        text = detector.normalizer(text)
        entered_text = st.empty()

        detect_typos = st.button('Detect Typos !')

        st.markdown(
            "<hr />",
            unsafe_allow_html=True
        )
        if detect_typos:
            words = text.split()
            with st.spinner("Detecting..."):
                # Fix: the original condition was ``not len(words) > 3``,
                # which demanded at least FOUR words while the message asks
                # for three; now aligned with the message.
                if len(words) < 3:
                    entered_text.markdown(
                        "Insert your text (at least three words)"
                    )
                else:
                    detected = detector.detect(text)
                    detected = f"<p class='typo-box {is_rtl}'>{detected}</p>"
                    st.markdown(
                        detected,
                        unsafe_allow_html=True
                    )


if __name__ == '__main__':
    main()
|
assets/ltr.css
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* Left-to-right text-area styling, applied for non-Persian languages. */
textarea {
    direction: ltr;
    text-align: left;
    font-family: "IBM Plex Sans", sans-serif;
}
assets/rtl.css
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* Right-to-left styling for Persian: Vazir webfont + rtl direction. */
.rtl,
textarea {
    direction: rtl !important;
    text-align: right;
    font-family: Vazir !important;
}

/* Bordered container for rtl output. */
.rtl-box {
    border-bottom: 1px solid #ddd;
    padding-bottom: 20px;
}

/* Force ltr inside an otherwise rtl context. */
.ltr {
    direction: ltr !important;
    text-align: left;
}
assets/style.css
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
/* Highlight box around each detected typo in the output paragraph. */
span.typo {
    background: #ff520059;
    border: 1px solid #ff5200a6;
    margin: auto 2px;
    padding: 2px 3px;
}
libs/__init__.py
ADDED
File without changes
|
libs/dummy.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Canned detector output returned when TypoDetector.debug is enabled,
# so the app can run without downloading any model.
outputs = [
    "He had also <span class='typo'>stgruggled</span> with addiction during his time in Congress ."
]
libs/examples.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Language codes the app supports, in display order.
LANGUAGES = ["en", "fa", "is"]

# Pre-baked example sentences (each containing typos) per language code.
EXAMPLES = {
    "en": {
        "Example 1": "He had also stgruggled with addiction during his time in Congress .",
        "Example 2": "The review thoroughla assessed all aspects of JLENS SuR and CPG esign maturit and confidence .",
        "Example 3": "Letterma also apologized two his staff for the satyation .",
        "Example 4": "Vincent Jay had earlier won France 's first gold in gthe 10km biathlon sprint .",
        "Example 5": "It is left to the directors to figure out hpw to bring the stry across to tye audience .",
    },
    "fa": {
        "Example 1": "و گلوله دور مقابکل غلم بود .",
        "Example 2": "شلام تاریکی، دوسته قدیمی من",
        "Example 3": "در سدای سکوت، در روایئ ناآرام تنها غدم میزنم",
        "Example 4": "زیر هلقه نور چراغ خیابان",
        "Example 5": "و در صدای سکوت ضمضمه می شود",
        "Example 6": "ویرایستیار متن برای نویسندگان ، روزنامه نگاران و اسحاب رصانهه",
        "Example 7": "جکیم ابوالقفاسم فرذدوسی ساعر حماصی سصرای غرن جهارم استت ( تمامما قلط )",
        "Example 8": "میان عاشق و معشوق هیچ هائل نیست",
        "Example 9": "عذاهای زود حزم برای معده بهتر است .",
        "Example 10": "غضا خوردم و رفتم .",
        "Example 11": "او شاگرد خاص و عقرب به استاد بود ",
    },
    "is": {
        "Example 1": "Páli, vini mínum, langaði að horfa á sjónnvarpið.",
        "Example 2": "Leggir þciðursins eru þaktir fjöðrum til bað edravn fuglnn gekgn kuldanué .",
        "Example 3": "Þar hitta þeir konu Björns og segir ovs :",
        "Example 4": "Ingvar Sæmundsson ekgk rú sveitinni árið 2015 og etnbeitii sér að hinni þungarokkssvedt svnni Momentum .",
        "Example 5": "Þar hitta þeir konu Björns og segir ovs :",
        "Example 6": "Var hann síðaún hkluti af leikhópnum sem ferðaðist um Bandaríkin til að sýan söngleikinn ."
    }
}
libs/normalizer.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import regex
|
3 |
+
import sys
|
4 |
+
import textwrap
|
5 |
+
from typing import Any, Dict, Optional
|
6 |
+
|
7 |
+
# Punctuation characters preserved by the whitelist and padded with spaces
# during text-level normalization.
punctuations = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.',
    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
    '`', '{', '|', '}', '~', '»', '«', '“', '”', "-",
]


class Normalizer:
    """A general normalizer for every language."""

    # Fix: the \p{...} Unicode property escapes must live in a raw string.
    # In a plain string "\p" is an invalid escape sequence (a
    # DeprecationWarning today, a SyntaxError in future Pythons). Only the
    # third-party `regex` module understands \p classes, which is why
    # matching below goes through `regex`, not `re`.
    _whitelist = r"[" + r"\p{N}\p{L}\p{M}" + re.escape("".join(punctuations)) + r"]+"
    # Default character/phrase mapping; empty means chars_to_map is a no-op.
    _dictionary: Dict[str, str] = {}

    def __init__(
        self,
        whitelist: Optional[str] = None,
        dictionary: Optional[Dict[str, str]] = None,
    ) -> None:
        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
        # Fix: fall back to a COPY of the class-level default so instances
        # never share (and accidentally mutate) the same dict object.
        self.dictionary = dictionary if dictionary and isinstance(dictionary, dict) else dict(self._dictionary)

    def chars_to_map(self, sentence: str) -> str:
        """Maps every character, words, and phrase into a proper one.

        Args:
            sentence (str): A piece of text.
        """
        if not self.dictionary:
            return sentence

        pattern = "|".join(map(re.escape, self.dictionary.keys()))
        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

    def chars_to_preserve(
        self,
        sentence: str,
    ) -> str:
        """Keeps specified characters from sentence.

        Args:
            sentence (str): A piece of text.
        """
        try:
            tokenized = regex.findall(self.whitelist, sentence)
            return " ".join(tokenized)
        except Exception as error:
            print(
                textwrap.dedent(
                    f"""
                    Bad characters range {self.whitelist},
                    {error}
                    """
                )
            )
            raise

    def text_level_normalizer(self, text: str) -> str:
        """A text level of normalization: pad punctuation with spaces."""

        text = regex.sub(r"([" + re.escape("".join(punctuations)) + r"])", r" \1 ", text)
        text = text.strip()

        return text

    def __call__(
        self,
        text: str,
        do_lowercase: Optional[bool] = False
    ) -> Any:
        """Normalization caller: map, filter, pad, and collapse whitespace."""

        text = self.chars_to_map(text)
        text = self.chars_to_preserve(text)
        text = self.text_level_normalizer(text)
        text = re.sub(r"\s+", " ", text)

        if do_lowercase:
            text = text.lower()

        return text
libs/utils.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
|
4 |
+
def local_css(css_path):
    """Inline a local CSS file into the Streamlit page.

    Args:
        css_path: Path to a CSS file, relative to the app working directory.
    """
    # Fix: read with an explicit encoding so the CSS renders identically on
    # platforms where the locale default is not UTF-8 (e.g. Windows).
    with open(css_path, encoding="utf-8") as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
7 |
+
|
8 |
+
|
9 |
+
def remote_css(css_url):
    """Inject a <link> tag loading a remote stylesheet into the page."""
    link_tag = f'<link href="{css_url}" rel="stylesheet">'
    st.markdown(link_tag, unsafe_allow_html=True)
meta.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Markdown shown in the app's info column, next to the detector UI.
INFO = """
# Typo Detector ⚡

Currently, Typo Detector supports English, Persian and Icelandic.

- [typo-detector-distilbert-fa 🇮🇷](https://huggingface.co/m3hrdadfi/typo-detector-distilbert-fa)
- [typo-detector-distilbert-en 🇺🇸](https://huggingface.co/m3hrdadfi/typo-detector-distilbert-en)
- [typo-detector-distilbert-is 🇮🇸](https://huggingface.co/m3hrdadfi/typo-detector-distilbert-is)
""".strip()
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
transformers
|
3 |
+
torch
|
4 |
+
regex
|
5 |
+
plotly
|