Spaces:

EdBianchi
/

Social_Toximeter

Build error

App Files Files Community

EdBianchi committed on May 3, 2022

Commit

721d0e0

1 Parent(s): 869860f

Upload app.py

Browse files

Files changed (1) hide show

app.py +107 -0

app.py ADDED Viewed

	@@ -0,0 +1,107 @@

+import streamlit as st
+# UTILITY
+from joblib import load
+# NLP
+import re
+from nltk.corpus import wordnet
+from nltk.tokenize import word_tokenize
+from nltk import SnowballStemmer
+import spacy
+from gensim.models.doc2vec import Doc2Vec
+# Initial values for the comment text and the toxicity threshold; both names
+# are reassigned further down from the text-area and slider widgets.
+comment = ""
+tresh = 0.5
+# Configure the page title of this Streamlit app.
+st.set_page_config(page_title='Toxic Comments')
+# Create an empty `history` list in the session state if the key is missing.
+# NOTE(review): `history` is not read anywhere else in the visible code.
+if 'history' not in st.session_state:
+    st.session_state.history = []
+# import similarity (to be cached)
+def importModel(filename):
+    model = load(filename)
+    return model
+# Artifacts loaded at module level from paths relative to the working directory.
+# In compute() the two joblib objects are used through `.transform` (normalizer)
+# and `.predict_proba` (classifier); the Doc2Vec model supplies `infer_vector`.
+normalizer = importModel("normalizerD2V.joblib")
+classifier = importModel("toxicCommModel.joblib")
+model_d2v= Doc2Vec.load("d2v_comments.model")
+# REGEX
+def apply_regex(corpus):
+    corpus = re.sub("\S*\d\S*"," ", corpus)
+    corpus = re.sub("\S*@\S*\s?"," ", corpus)
+    corpus = re.sub("\S*#\S*\s?"," ", corpus)
+    corpus = re.sub(r'http\S+', ' ', corpus)
+    corpus = re.sub(r'[^a-zA-Z0-9 ]', ' ',corpus)
+    corpus = corpus.replace(u'\ufffd', '8')
+    corpus = re.sub(' +', ' ', corpus)
+    return corpus
+# STOP WORDS - taken from the defaults of spaCy's "en_core_web_sm" pipeline.
+# Only the stop-word collection of the loaded pipeline is used in the visible code.
+spacy_model = spacy.load("en_core_web_sm")
+stop_words = spacy_model.Defaults.stop_words
+# TOKENIZE TEXT and STOP WORDS REMOVAL - execution (removes also the words shorter than 2 and longer than 15 chars)
+def tokenize(doc):
+    tokens_1 = word_tokenize(str(doc))
+    return [word.lower() for word in tokens_1 if len(word) > 1 and len(word) < 15 and word not in stop_words and not word.isdigit()]
+# STEMMING
+stemmer = SnowballStemmer(language="english")
+def applyStemming(listOfTokens):
+    return [stemmer.stem(token) for token in listOfTokens]
+# PROBS TO CLASS
+def probs_to_prediction(probs, threshold):
+    pred=[]
+    for x in probs[:,1]:
+        if x>threshold:
+            pred.append([1, x])
+        else:
+            pred.append([0,x])
+    return pred
+# PROCESSING
+def compute(comment, tresh):
+    comment = apply_regex(comment)
+    comment = tokenize(comment)
+    comment = applyStemming(comment)
+    vectorizedComment =  model_d2v.infer_vector(comment, epochs=70)
+    normComment = normalizer.transform([vectorizedComment])
+    probs = classifier.predict_proba(normComment)
+    preds = probs_to_prediction(probs, tresh)
+    print(tresh)
+    col1, col2 = st.columns(2)
+    col1.metric("Toxic", round(preds[0][1], 4))
+    col2.metric("Non Toxic", round(1-preds[0][1], 4))
+    return None
+# TITLE
+st.write("# Toxic Comments Classification")
+st.write("#### Drop a comment, choose a threshold and wait for toxicity.")
+# INPUT TEXTBOX
+comment = st.text_area('', '''
+        Write here the comment to analyze...
+     ''')
+# IMPUT THRESHOLD
+tresh = st.slider('Set the Threshold, default 0.5', 0.00, 1.00, step=0.0001)
+compute(comment, tresh)
+# STEMMED SHOWING
+#st.write(classResult)
+# sidebar
+st.sidebar.write("""
+This is a Toxic Comment Classifier that uses tokenization, stemming, Doc2Vec encoding and tuned logistic regression model.
+It was trained on a large corpus of comments.
+""")