m3hrdadfi committed on
Commit
5a1441b
1 Parent(s): c8f36af

Create sync_streamlit_to_space.yml

Files changed (12)
  1. README.md +3 -25
  2. app.py +144 -0
  3. assets/ltr.css +5 -0
  4. assets/rtl.css +14 -0
  5. assets/style.css +8 -0
  6. libs/__init__.py +0 -0
  7. libs/dummy.py +3 -0
  8. libs/examples.py +31 -0
  9. libs/normalizer.py +86 -0
  10. libs/utils.py +10 -0
  11. meta.py +9 -0
  12. requirements.txt +5 -0
README.md CHANGED
@@ -1,33 +1,11 @@
  ---
  title: Typo Detector
- emoji: 📚
+ emoji:
  colorFrom: red
- colorTo: green
+ colorTo: red
  sdk: streamlit
  app_file: app.py
  pinned: false
  ---

- # Configuration
-
- `title`: _string_
- Display title for the Space
-
- `emoji`: _string_
- Space emoji (emoji-only character allowed)
-
- `colorFrom`: _string_
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
-
- `colorTo`: _string_
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
-
- `sdk`: _string_
- Can be either `gradio` or `streamlit`
-
- `app_file`: _string_
- Path to your main application file (which contains either `gradio` or `streamlit` Python code).
- Path is relative to the root of the repository.
-
- `pinned`: _boolean_
- Whether the Space stays on top of your list.
+ # Typo Detector using Transformers
app.py ADDED
@@ -0,0 +1,144 @@
import streamlit as st

import torch
from transformers import pipeline, set_seed
from transformers import AutoTokenizer

from libs.normalizer import Normalizer
from libs.examples import LANGUAGES, EXAMPLES
from libs.dummy import outputs as dummy_outputs
from libs.utils import local_css, remote_css

import meta

MODELS = {
    "English (en)": "m3hrdadfi/typo-detector-distilbert-en",
    "Persian (fa)": "m3hrdadfi/typo-detector-distilbert-fa",
    "Icelandic (is)": "m3hrdadfi/typo-detector-distilbert-is",
}


class TypoDetector:
    def __init__(
        self,
        model_name_or_path: str = "m3hrdadfi/typo-detector-distilbert-en"
    ) -> None:
        self.debug = False
        self.dummy_outputs = dummy_outputs
        self.model_name_or_path = model_name_or_path
        self.task_name = "token-classification"

        self.tokenizer = None
        self.nlp = None
        self.normalizer = None

    def load(self):
        if not self.debug:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
            self.nlp = pipeline(self.task_name, model=self.model_name_or_path, tokenizer=self.model_name_or_path)

        self.normalizer = Normalizer()

    def detect(self, sentence):
        if self.debug:
            return self.dummy_outputs[0]

        typos = [sentence[r["start"]: r["end"]] for r in self.nlp(sentence)]

        detected = sentence
        for typo in typos:
            detected = detected.replace(typo, f'<span class="typo">{typo}</span>')

        return detected


@st.cache(allow_output_mutation=True)
def load_typo_detectors():
    en_detector = TypoDetector(MODELS["English (en)"])
    en_detector.load()

    fa_detector = TypoDetector(MODELS["Persian (fa)"])
    fa_detector.load()

    is_detector = TypoDetector(MODELS["Icelandic (is)"])
    is_detector.load()
    return {
        "en": en_detector,
        "fa": fa_detector,
        "is": is_detector
    }


def main():
    st.set_page_config(
        page_title="Typo Detector",
        page_icon="⚡",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    remote_css("https://cdn.jsdelivr.net/gh/rastikerdar/vazir-font/dist/font-face.css")
    local_css("assets/style.css")
    detectors = load_typo_detectors()

    col1, col2 = st.beta_columns([6, 4])
    with col2:
        st.markdown(meta.INFO, unsafe_allow_html=True)

    with col1:
        language = st.selectbox(
            'Language (select from this list)',
            LANGUAGES,
            index=0
        )
        detector = detectors[language]
        is_rtl = "rtl" if language == "fa" else "ltr"
        if language == "fa":
            local_css("assets/rtl.css")
        else:
            local_css("assets/ltr.css")

        prompts = list(EXAMPLES[language].keys()) + ["Custom"]
        prompt = st.selectbox(
            'Examples (select from this list)',
            prompts,
            # index=len(prompts) - 1,
            index=0
        )

        if prompt == "Custom":
            prompt_box = ""
        else:
            prompt_box = EXAMPLES[language][prompt]

        text = st.text_area(
            'Insert your text: ',
            detector.normalizer(prompt_box),
            height=100
        )
        text = detector.normalizer(text)
        entered_text = st.empty()

        detect_typos = st.button('Detect Typos!')

        st.markdown(
            "<hr />",
            unsafe_allow_html=True
        )
        if detect_typos:
            words = text.split()
            with st.spinner("Detecting..."):
                if not len(words) > 3:
                    entered_text.markdown(
                        "Insert your text (at least three words)"
                    )
                else:
                    detected = detector.detect(text)
                    detected = f"<p class='typo-box {is_rtl}'>{detected}</p>"
                    st.markdown(
                        detected,
                        unsafe_allow_html=True
                    )


if __name__ == '__main__':
    main()
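For reference, `TypoDetector` in app.py is a thin wrapper around a `transformers` token-classification pipeline: each prediction carries character offsets (`start`/`end`), and `detect()` slices the flagged substrings out of the sentence and wraps them in `<span class="typo">` tags. Below is a minimal sketch of the same flow outside Streamlit, assuming `transformers` and `torch` are installed and the model can be downloaded from the Hub; the printed output is illustrative, not a recorded run.

```python
from transformers import pipeline

model_name = "m3hrdadfi/typo-detector-distilbert-en"  # taken from MODELS above
nlp = pipeline("token-classification", model=model_name, tokenizer=model_name)

sentence = "He had also stgruggled with addiction during his time in Congress ."

# Each prediction carries character offsets; detect() uses them to slice the
# suspected typos out of the sentence before highlighting them.
typos = [sentence[r["start"]:r["end"]] for r in nlp(sentence)]
print(typos)  # illustrative; may contain sub-word spans depending on tokenization
```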
assets/ltr.css ADDED
@@ -0,0 +1,5 @@
textarea {
    font-family: "IBM Plex Sans", sans-serif;
    text-align: left;
    direction: ltr;
}
assets/rtl.css ADDED
@@ -0,0 +1,14 @@
.rtl,
textarea {
    font-family: Vazir !important;
    text-align: right;
    direction: rtl !important;
}
.rtl-box {
    border-bottom: 1px solid #ddd;
    padding-bottom: 20px;
}
.ltr {
    text-align: left;
    direction: ltr !important;
}
assets/style.css ADDED
@@ -0,0 +1,8 @@
span.typo {
    background: #ff520059;
    border: 1px solid #ff5200a6;
    padding: 2px 3px;
    margin: auto 2px;
}
libs/__init__.py ADDED
File without changes
libs/dummy.py ADDED
@@ -0,0 +1,3 @@
outputs = [
    "He had also <span class='typo'>stgruggled</span> with addiction during his time in Congress ."
]
libs/examples.py ADDED
@@ -0,0 +1,31 @@
LANGUAGES = ["en", "fa", "is"]
EXAMPLES = {
    "en": {
        "Example 1": "He had also stgruggled with addiction during his time in Congress .",
        "Example 2": "The review thoroughla assessed all aspects of JLENS SuR and CPG esign maturit and confidence .",
        "Example 3": "Letterma also apologized two his staff for the satyation .",
        "Example 4": "Vincent Jay had earlier won France 's first gold in gthe 10km biathlon sprint .",
        "Example 5": "It is left to the directors to figure out hpw to bring the stry across to tye audience .",
    },
    "fa": {
        "Example 1": "و گلوله دور مقابکل غلم بود .",
        "Example 2": "شلام تاریکی، دوسته قدیمی من",
        "Example 3": "در سدای سکوت، در روایئ ناآرام تنها غدم می‌زنم",
        "Example 4": "زیر هلقه نور چراغ خیابان",
        "Example 5": "و در صدای سکوت ضمضمه می شود",
        "Example 6": "ویرایستیار متن برای نویسندگان ، روزنامه نگاران و اسحاب رصانهه",
        "Example 7": "جکیم ابوالقفاسم فرذدوسی ساعر حماصی سصرای غرن جهارم استت ( تمامما قلط )",
        "Example 8": "میان عاشق و معشوق هیچ هائل نیست",
        "Example 9": "عذاهای زود حزم برای معده بهتر است .",
        "Example 10": "غضا خوردم و رفتم .",
        "Example 11": "او شاگرد خاص و عقرب به استاد بود ",
    },
    "is": {
        "Example 1": "Páli, vini mínum, langaði að horfa á sjónnvarpið.",
        "Example 2": "Leggir þciðursins eru þaktir fjöðrum til bað edravn fuglnn gekgn kuldanué .",
        "Example 3": "Þar hitta þeir konu Björns og segir ovs :",
        "Example 4": "Ingvar Sæmundsson ekgk rú sveitinni árið 2015 og etnbeitii sér að hinni þungarokkssvedt svnni Momentum .",
        "Example 5": "Þar hitta þeir konu Björns og segir ovs :",
        "Example 6": "Var hann síðaún hkluti af leikhópnum sem ferðaðist um Bandaríkin til að sýan söngleikinn ."
    }
}
libs/normalizer.py ADDED
@@ -0,0 +1,86 @@
import re
import regex
import sys
import textwrap
from typing import Any, Dict, Optional

punctuations = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.',
    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
    '`', '{', '|', '}', '~', '»', '«', '“', '”', "-",
]


class Normalizer:
    """A general normalizer for every language"""

    _whitelist = r"[" + "\p{N}\p{L}\p{M}" + re.escape("".join(punctuations)) + "]+"
    _dictionary = {}

    def __init__(
        self,
        whitelist: str = None,
        dictionary: Dict[str, str] = None,
    ) -> None:
        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
        self.dictionary = dictionary if dictionary and isinstance(dictionary, dict) else self._dictionary

    def chars_to_map(self, sentence: str) -> str:
        """Maps every character, word, and phrase onto its proper form.

        Args:
            sentence (str): A piece of text.
        """
        if not len(self.dictionary) > 0:
            return sentence

        pattern = "|".join(map(re.escape, self.dictionary.keys()))
        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

    def chars_to_preserve(
        self,
        sentence: str,
    ) -> str:
        """Keeps only the whitelisted characters of the sentence.

        Args:
            sentence (str): A piece of text.
        """
        try:
            tokenized = regex.findall(self.whitelist, sentence)
            return " ".join(tokenized)
        except Exception as error:
            print(
                textwrap.dedent(
                    f"""
                    Bad characters range {self.whitelist},
                    {error}
                    """
                )
            )
            raise

    def text_level_normalizer(self, text: str) -> str:
        """Text-level normalization"""

        text = regex.sub(r"([" + re.escape("".join(punctuations)) + "])", r" \1 ", text)
        text = text.strip()

        return text

    def __call__(
        self,
        text: str,
        do_lowercase: Optional[bool] = False
    ) -> Any:
        """Normalization caller"""

        text = self.chars_to_map(text)
        text = self.chars_to_preserve(text)
        text = self.text_level_normalizer(text)
        text = re.sub(r"\s+", " ", text)

        if do_lowercase:
            text = text.lower()

        return text
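libs/normalizer.py is what both the example prompts and the user's text pass through before detection: `chars_to_map` applies a replacement dictionary (empty by default), `chars_to_preserve` drops everything outside the letter/digit/mark/punctuation whitelist, `text_level_normalizer` pads punctuation with spaces, and the caller collapses repeated whitespace. A small sketch of the effect, assuming it is run from the repository root so `libs.normalizer` imports; the outputs in the comments are worked out by hand rather than executed.

```python
from libs.normalizer import Normalizer

normalizer = Normalizer()

# Punctuation is padded with spaces and runs of whitespace collapse.
print(normalizer("Hello,world...   how are you?"))
# -> "Hello , world . . . how are you ?"

# Optional lowercasing via do_lowercase.
print(normalizer("Hello WORLD!", do_lowercase=True))
# -> "hello world !"
```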
libs/utils.py ADDED
@@ -0,0 +1,10 @@
import streamlit as st


def local_css(css_path):
    with open(css_path) as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)


def remote_css(css_url):
    st.markdown(f'<link href="{css_url}" rel="stylesheet">', unsafe_allow_html=True)
meta.py ADDED
@@ -0,0 +1,9 @@
INFO = """
# Typo Detector ⚡

Currently, Typo Detector supports English, Persian and Icelandic.

- [typo-detector-distilbert-fa 🇮🇷](https://huggingface.co/m3hrdadfi/typo-detector-distilbert-fa)
- [typo-detector-distilbert-en 🇺🇸](https://huggingface.co/m3hrdadfi/typo-detector-distilbert-en)
- [typo-detector-distilbert-is 🇮🇸](https://huggingface.co/m3hrdadfi/typo-detector-distilbert-is)
""".strip()
requirements.txt ADDED
@@ -0,0 +1,5 @@
streamlit
transformers
torch
regex
plotly