m3hrdadfi committed on
Commit
5a1441b
1 Parent(s): c8f36af

Create sync_streamlit_to_space.yml

Files changed (12)
  1. README.md +3 -25
  2. app.py +144 -0
  3. assets/ltr.css +5 -0
  4. assets/rtl.css +14 -0
  5. assets/style.css +8 -0
  6. libs/__init__.py +0 -0
  7. libs/dummy.py +3 -0
  8. libs/examples.py +31 -0
  9. libs/normalizer.py +86 -0
  10. libs/utils.py +10 -0
  11. meta.py +9 -0
  12. requirements.txt +5 -0
README.md CHANGED
@@ -1,33 +1,11 @@
  ---
  title: Typo Detector
- emoji: 📚
+ emoji:
  colorFrom: red
- colorTo: green
+ colorTo: red
  sdk: streamlit
  app_file: app.py
  pinned: false
  ---

- # Configuration
-
- `title`: _string_
- Display title for the Space
-
- `emoji`: _string_
- Space emoji (emoji-only character allowed)
-
- `colorFrom`: _string_
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
-
- `colorTo`: _string_
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
-
- `sdk`: _string_
- Can be either `gradio` or `streamlit`
-
- `app_file`: _string_
- Path to your main application file (which contains either `gradio` or `streamlit` Python code).
- Path is relative to the root of the repository.
-
- `pinned`: _boolean_
- Whether the Space stays on top of your list.
+ # Typo Detector using Transformers
app.py ADDED
@@ -0,0 +1,144 @@
import streamlit as st

import torch
from transformers import pipeline, set_seed
from transformers import AutoTokenizer

from libs.normalizer import Normalizer
from libs.examples import LANGUAGES, EXAMPLES
from libs.dummy import outputs as dummy_outputs
from libs.utils import local_css, remote_css

import meta

MODELS = {
    "English (en)": "m3hrdadfi/typo-detector-distilbert-en",
    "Persian (fa)": "m3hrdadfi/typo-detector-distilbert-fa",
    "Icelandic (is)": "m3hrdadfi/typo-detector-distilbert-is",
}


class TypoDetector:
    def __init__(
        self,
        model_name_or_path: str = "m3hrdadfi/typo-detector-distilbert-en"
    ) -> None:
        self.debug = False
        self.dummy_outputs = dummy_outputs
        self.model_name_or_path = model_name_or_path
        self.task_name = "token-classification"

        self.tokenizer = None
        self.nlp = None
        self.normalizer = None

    def load(self):
        if not self.debug:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
            self.nlp = pipeline(self.task_name, model=self.model_name_or_path, tokenizer=self.model_name_or_path)

        self.normalizer = Normalizer()

    def detect(self, sentence):
        if self.debug:
            return self.dummy_outputs[0]

        typos = [sentence[r["start"]: r["end"]] for r in self.nlp(sentence)]

        detected = sentence
        for typo in typos:
            detected = detected.replace(typo, f'<span class="typo">{typo}</span>')

        return detected


@st.cache(allow_output_mutation=True)
def load_typo_detectors():
    en_detector = TypoDetector(MODELS["English (en)"])
    en_detector.load()

    fa_detector = TypoDetector(MODELS["Persian (fa)"])
    fa_detector.load()

    is_detector = TypoDetector(MODELS["Icelandic (is)"])
    is_detector.load()
    return {
        "en": en_detector,
        "fa": fa_detector,
        "is": is_detector
    }


def main():
    st.set_page_config(
        page_title="Typo Detector",
        page_icon="⚡",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    remote_css("https://cdn.jsdelivr.net/gh/rastikerdar/vazir-font/dist/font-face.css")
    local_css("assets/style.css")
    detectors = load_typo_detectors()

    col1, col2 = st.beta_columns([6, 4])
    with col2:
        st.markdown(meta.INFO, unsafe_allow_html=True)

    with col1:
        language = st.selectbox(
            'Language (select from this list)',
            LANGUAGES,
            index=0
        )
        detector = detectors[language]
        is_rtl = "rtl" if language == "fa" else "ltr"
        if language == "fa":
            local_css("assets/rtl.css")
        else:
            local_css("assets/ltr.css")

        prompts = list(EXAMPLES[language].keys()) + ["Custom"]
        prompt = st.selectbox(
            'Examples (select from this list)',
            prompts,
            # index=len(prompts) - 1,
            index=0
        )

        if prompt == "Custom":
            prompt_box = ""
        else:
            prompt_box = EXAMPLES[language][prompt]

        text = st.text_area(
            'Insert your text: ',
            detector.normalizer(prompt_box),
            height=100
        )
        text = detector.normalizer(text)
        entered_text = st.empty()

        detect_typos = st.button('Detect Typos!')

        st.markdown(
            "<hr />",
            unsafe_allow_html=True
        )
        if detect_typos:
            words = text.split()
            with st.spinner("Detecting..."):
                if not len(words) > 3:
                    entered_text.markdown(
                        "Insert your text (at least three words)"
                    )
                else:
                    detected = detector.detect(text)
                    detected = f"<p class='typo-box {is_rtl}'>{detected}</p>"
                    st.markdown(
                        detected,
                        unsafe_allow_html=True
                    )


if __name__ == '__main__':
    main()
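For reference, `TypoDetector` in app.py is a thin wrapper around a `transformers` token-classification pipeline: each prediction carries character offsets (`start`/`end`), and `detect()` slices the flagged substrings out of the sentence and wraps them in `<span class="typo">` tags. Below is a minimal sketch of the same flow outside Streamlit, assuming `transformers` and `torch` are installed and the model can be downloaded from the Hub; the printed output is illustrative, not a recorded run.

```python
from transformers import pipeline

model_name = "m3hrdadfi/typo-detector-distilbert-en"  # taken from MODELS above
nlp = pipeline("token-classification", model=model_name, tokenizer=model_name)

sentence = "He had also stgruggled with addiction during his time in Congress ."

# Each prediction carries character offsets; detect() uses them to slice the
# suspected typos out of the sentence before highlighting them.
typos = [sentence[r["start"]:r["end"]] for r in nlp(sentence)]
print(typos)  # illustrative; may contain sub-word spans depending on tokenization
```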
assets/ltr.css ADDED
@@ -0,0 +1,5 @@
textarea {
    font-family: "IBM Plex Sans", sans-serif;
    text-align: left;
    direction: ltr;
}
assets/rtl.css ADDED
@@ -0,0 +1,14 @@
.rtl,
textarea {
    font-family: Vazir !important;
    text-align: right;
    direction: rtl !important;
}
.rtl-box {
    border-bottom: 1px solid #ddd;
    padding-bottom: 20px;
}
.ltr {
    text-align: left;
    direction: ltr !important;
}
assets/style.css ADDED
@@ -0,0 +1,8 @@
span.typo {
    background: #ff520059;
    border: 1px solid #ff5200a6;
    padding: 2px 3px;
    margin: auto 2px;
}
libs/__init__.py ADDED
File without changes
libs/dummy.py ADDED
@@ -0,0 +1,3 @@
outputs = [
    "He had also <span class='typo'>stgruggled</span> with addiction during his time in Congress ."
]
libs/examples.py ADDED
@@ -0,0 +1,31 @@
LANGUAGES = ["en", "fa", "is"]
EXAMPLES = {
    "en": {
        "Example 1": "He had also stgruggled with addiction during his time in Congress .",
        "Example 2": "The review thoroughla assessed all aspects of JLENS SuR and CPG esign maturit and confidence .",
        "Example 3": "Letterma also apologized two his staff for the satyation .",
        "Example 4": "Vincent Jay had earlier won France 's first gold in gthe 10km biathlon sprint .",
        "Example 5": "It is left to the directors to figure out hpw to bring the stry across to tye audience .",
    },
    "fa": {
        "Example 1": "و گلوله دور مقابکل غلم بود .",
        "Example 2": "شلام تاریکی، دوسته قدیمی من",
        "Example 3": "در سدای سکوت، در روایئ ناآرام تنها غدم می‌زنم",
        "Example 4": "زیر هلقه نور چراغ خیابان",
        "Example 5": "و در صدای سکوت ضمضمه می شود",
        "Example 6": "ویرایستیار متن برای نویسندگان ، روزنامه نگاران و اسحاب رصانهه",
        "Example 7": "جکیم ابوالقفاسم فرذدوسی ساعر حماصی سصرای غرن جهارم استت ( تمامما قلط )",
        "Example 8": "میان عاشق و معشوق هیچ هائل نیست",
        "Example 9": "عذاهای زود حزم برای معده بهتر است .",
        "Example 10": "غضا خوردم و رفتم .",
        "Example 11": "او شاگرد خاص و عقرب به استاد بود ",
    },
    "is": {
        "Example 1": "Páli, vini mínum, langaði að horfa á sjónnvarpið.",
        "Example 2": "Leggir þciðursins eru þaktir fjöðrum til bað edravn fuglnn gekgn kuldanué .",
        "Example 3": "Þar hitta þeir konu Björns og segir ovs :",
        "Example 4": "Ingvar Sæmundsson ekgk rú sveitinni árið 2015 og etnbeitii sér að hinni þungarokkssvedt svnni Momentum .",
        "Example 5": "Þar hitta þeir konu Björns og segir ovs :",
        "Example 6": "Var hann síðaún hkluti af leikhópnum sem ferðaðist um Bandaríkin til að sýan söngleikinn ."
    }
}
libs/normalizer.py ADDED
@@ -0,0 +1,86 @@
import re
import regex
import sys
import textwrap
from typing import Any, Dict, Optional

punctuations = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.',
    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
    '`', '{', '|', '}', '~', '»', '«', '“', '”', "-",
]


class Normalizer:
    """A general normalizer for every language"""

    _whitelist = r"[" + "\p{N}\p{L}\p{M}" + re.escape("".join(punctuations)) + "]+"
    _dictionary = {}

    def __init__(
        self,
        whitelist: str = None,
        dictionary: Dict[str, str] = None,
    ) -> None:
        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
        self.dictionary = dictionary if dictionary and isinstance(dictionary, dict) else self._dictionary

    def chars_to_map(self, sentence: str) -> str:
        """Maps every character, word, and phrase onto its proper form.

        Args:
            sentence (str): A piece of text.
        """
        if not len(self.dictionary) > 0:
            return sentence

        pattern = "|".join(map(re.escape, self.dictionary.keys()))
        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

    def chars_to_preserve(
        self,
        sentence: str,
    ) -> str:
        """Keeps only the whitelisted characters of the sentence.

        Args:
            sentence (str): A piece of text.
        """
        try:
            tokenized = regex.findall(self.whitelist, sentence)
            return " ".join(tokenized)
        except Exception as error:
            print(
                textwrap.dedent(
                    f"""
                    Bad characters range {self.whitelist},
                    {error}
                    """
                )
            )
            raise

    def text_level_normalizer(self, text: str) -> str:
        """Text-level normalization"""

        text = regex.sub(r"([" + re.escape("".join(punctuations)) + "])", r" \1 ", text)
        text = text.strip()

        return text

    def __call__(
        self,
        text: str,
        do_lowercase: Optional[bool] = False
    ) -> Any:
        """Normalization caller"""

        text = self.chars_to_map(text)
        text = self.chars_to_preserve(text)
        text = self.text_level_normalizer(text)
        text = re.sub(r"\s+", " ", text)

        if do_lowercase:
            text = text.lower()

        return text
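libs/normalizer.py is what both the example prompts and the user's text pass through before detection: `chars_to_map` applies a replacement dictionary (empty by default), `chars_to_preserve` drops everything outside the letter/digit/mark/punctuation whitelist, `text_level_normalizer` pads punctuation with spaces, and the caller collapses repeated whitespace. A small sketch of the effect, assuming it is run from the repository root so `libs.normalizer` imports; the outputs in the comments are worked out by hand rather than executed.

```python
from libs.normalizer import Normalizer

normalizer = Normalizer()

# Punctuation is padded with spaces and runs of whitespace collapse.
print(normalizer("Hello,world...   how are you?"))
# -> "Hello , world . . . how are you ?"

# Optional lowercasing via do_lowercase.
print(normalizer("Hello WORLD!", do_lowercase=True))
# -> "hello world !"
```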
libs/utils.py ADDED
@@ -0,0 +1,10 @@
import streamlit as st


def local_css(css_path):
    with open(css_path) as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)


def remote_css(css_url):
    st.markdown(f'<link href="{css_url}" rel="stylesheet">', unsafe_allow_html=True)
meta.py ADDED
@@ -0,0 +1,9 @@
INFO = """
# Typo Detector ⚡

Currently, Typo Detector supports English, Persian and Icelandic.

- [typo-detector-distilbert-fa 🇮🇷](https://huggingface.co/m3hrdadfi/typo-detector-distilbert-fa)
- [typo-detector-distilbert-en 🇺🇸](https://huggingface.co/m3hrdadfi/typo-detector-distilbert-en)
- [typo-detector-distilbert-is 🇮🇸](https://huggingface.co/m3hrdadfi/typo-detector-distilbert-is)
""".strip()
requirements.txt ADDED
@@ -0,0 +1,5 @@
streamlit
transformers
torch
regex
plotly