Spaces:

ml6team
/

toxic-comment-detection-german

Sleeping

App Files Files Community

Konstantin committed on Feb 28, 2022

Commit

bd8327d

•

1 Parent(s): 091e9bd

Add spaces application

Browse files

Files changed (4) hide show

.gitignore +3 -0
README.md +3 -3
app.py +149 -0
requirements.txt +4 -0

.gitignore CHANGED Viewed

@@ -3,3 +3,6 @@
 !.gitignore
 !.gitattributes
 !README.md

 !.gitignore
 !.gitattributes
 !README.md
+!app.py
+!requirements.txt

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
-title: Toxic Comments German
-emoji: 📚
 colorFrom: red
 colorTo: gray
 sdk: streamlit
@@ -26,7 +26,7 @@ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gr
 Can be either `gradio`, `streamlit`, or `static`
 `sdk_version` : _string_
-Only applicable for `streamlit` SDK.
 See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
 `app_file`: _string_

 ---
+title: Toxic Comment Detection German
+emoji: 🤬
 colorFrom: red
 colorTo: gray
 sdk: streamlit
 Can be either `gradio`, `streamlit`, or `static`
 `sdk_version` : _string_
+Only applicable for `streamlit` SDK.
 See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
 `app_file`: _string_

app.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import random
+import streamlit as st
+from bs4 import BeautifulSoup
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import pipeline
+from transformers_interpret import SequenceClassificationExplainer
+model_hub_url = 'https://huggingface.co/ml6team/distilbert-base-german-cased-toxic-comments'
+model_name = 'ml6team/distilbert-base-german-cased-toxic-comments'
+about_page_markdown = f"""# 🤬 Toxic Comment Detection Space
+Made by [ML6](https://ml6.eu/).
+Token attribution is performed using [transformers-interpret](https://github.com/cdpierse/transformers-interpret).
+"""
+regular_emojis = [
+    '😐', '🙂', '👶', '😇',
+]
+undecided_emojis = [
+    '🤨', '🧐', '🥸', '🥴', '🤷',
+]
+potty_mouth_emojis = [
+    '🤐', '👿', '😡', '🤬', '☠️', '☣️', '☢️',
+]
+# Page setup
+st.set_page_config(
+    page_title="Toxic Comment Detection Space",
+    page_icon="🤬",
+    layout="centered",
+    initial_sidebar_state="auto",
+    menu_items={
+        'Get help': None,
+        'Report a bug': None,
+        'About': about_page_markdown,
+    }
+)
+# Model setup
+@st.cache(allow_output_mutation=True,
+          suppress_st_warning=True,
+          show_spinner=False)
+def load_pipeline():
+    with st.spinner('Loading the model (this might take a while)...'):
+        toxicity_pipeline = pipeline(
+            'text-classification',
+            model=model_name,
+            tokenizer=model_name)
+        cls_explainer = SequenceClassificationExplainer(
+            toxicity_pipeline.model,
+            toxicity_pipeline.tokenizer)
+    return toxicity_pipeline, cls_explainer
+toxicity_pipeline, cls_explainer = load_pipeline()
+# Auxiliary functions
+def format_explainer_html(html_string):
+    """Extract tokens with attribution-based background color."""
+    soup = BeautifulSoup(html_string, 'html.parser')
+    p = soup.new_tag('p')
+    # Select token elements and remove model specific tokens
+    for token in soup.find_all('td')[-1].find_all('mark')[1:-1]:
+        p.append(token)
+    return p.prettify()
+def classify_comment(comment):
+    """Classify the given comment and augment with additional information."""
+    result = toxicity_pipeline(comment)[0]
+    # Add explanation
+    result['word_attribution'] = cls_explainer(comment, class_name="non_toxic")
+    result['visualitsation_html'] = cls_explainer.visualize()._repr_html_()
+    result['tokens_with_background'] = format_explainer_html(
+        result['visualitsation_html'])
+    # Choose emoji reaction
+    label, score = result['label'], result['score']
+    if label == 'toxic' and score > 0.1:
+        emoji = random.choice(potty_mouth_emojis)
+    elif label == 'non_toxic' and score > 0.1:
+        emoji = random.choice(regular_emojis)
+    else:
+        emoji = random.choice(undecided_emojis)
+    result.update({'text': comment, 'emoji': emoji})
+    # Add result to session
+    st.session_state.results.append(result)
+# Start session
+if 'results' not in st.session_state:
+    st.session_state.results = []
+# Page
+st.title('🤬 German Toxic Comment Detection')
+st.markdown("""This demo showcases the German toxic comment detection model.""")
+# Introduction
+st.markdown(f"""The model was trained using a sequence classification task on a combination of multiple German datasets containing toxicity, profanity, and hate speech. For a more comprehensive overview of the model check out the [model card on 🤗 Model Hub]({model_hub_url}).
+""")
+st.markdown("""Enter a comment that you want to classify below. The model will determine the probability that it is toxic and highlights how much each token contributes to its decision:
+    <font color="black">
+        <span style="background-color: rgb(250, 219, 219); opacity: 1;">r</span><span style="background-color: rgb(244, 179, 179); opacity: 1;">e</span><span style="background-color: rgb(238, 135, 135); opacity: 1;">d</span>
+    </font>
+    tokens indicate toxicity whereas
+    <font color="black">
+    <span style="background-color: rgb(224, 251, 224); opacity: 1;">g</span><span style="background-color: rgb(197, 247, 197); opacity: 1;">re</span><span style="background-color: rgb(121, 236, 121); opacity: 1;">en</span>
+    </font> tokens indicate indicate the opposite.
+Try it yourself! 👇""",
+    unsafe_allow_html=True)
+# Demo
+with st.form("german-toxic-comment-detection-input", clear_on_submit=True):
+    text = st.text_area(
+        label='Enter the comment you want to classify below (in German):')
+    _, rightmost_col = st.columns([6,1])
+    submitted = rightmost_col.form_submit_button("Classify",
+                                                 help="Classify comment")
+# Listener
+if submitted:
+    if text:
+        with st.spinner('Analysing comment...'):
+            classify_comment(text)
+    else:
+        st.error('**Error**: No comment to classify. Please provide a comment.')
+# Results
+if 'results' in st.session_state and st.session_state.results:
+    first = True
+    for result in st.session_state.results[::-1]:
+        if not first:
+            st.markdown("---")
+        st.markdown(f"Text:\n> {result['text']}")
+        col_1, col_2, col_3 = st.columns([1,2,2])
+        col_1.metric(label='', value=f"{result['emoji']}")
+        col_2.metric(label='Label', value=f"{result['label']}")
+        col_3.metric(label='Score', value=f"{result['score']:.3f}")
+        st.markdown(f"Token Attribution:\n{result['tokens_with_background']}",
+         unsafe_allow_html=True)
+        first = False

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+beautifulsoup4==4.10.0
+streamlit==1.0.0
+transformers==4.15.0
+transformers-interpret==0.5.2