EdBianchi commited on
Commit
721d0e0
·
1 Parent(s): 869860f

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -0
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ # UTILITY
3
+ from joblib import load
4
+ # NLP
5
+ import re
6
+ from nltk.corpus import wordnet
7
+ from nltk.tokenize import word_tokenize
8
+ from nltk import SnowballStemmer
9
+ import spacy
10
+ from gensim.models.doc2vec import Doc2Vec
11
+
12
# Module-level defaults; both are overwritten below by the text-area and
# slider widgets before compute() is called.
comment = ""
tresh = 0.5
14
+
15
# Page-level Streamlit configuration (browser tab title).
st.set_page_config(page_title='Toxic Comments')

# Create a per-session history list exactly once; Streamlit reruns the whole
# script on every interaction, so guard against re-initialising it.
# NOTE(review): 'history' is initialised here but never appended to in this
# file — confirm whether it is still needed.
if 'history' not in st.session_state:
    st.session_state.history = []
21
+
22
# import similarity (to be cached)
# NOTE(review): the comment above suggests caching was intended; consider a
# Streamlit caching decorator appropriate to the installed version.
def importModel(filename):
    """Deserialize and return a joblib-persisted model from *filename*."""
    return load(filename)
26
+
27
# Load the persisted pipeline artifacts once at startup.
# NOTE(review): paths are relative to the working directory — confirm these
# files ship alongside app.py.
normalizer = importModel("normalizerD2V.joblib")  # scaler applied to Doc2Vec vectors
classifier = importModel("toxicCommModel.joblib")  # classifier (logistic regression, per sidebar text)
model_d2v= Doc2Vec.load("d2v_comments.model")  # gensim Doc2Vec used to embed comments
30
+
31
# REGEX
def apply_regex(corpus):
    """Normalise raw comment text with a fixed sequence of regex passes.

    Removes digit-containing tokens, e-mail/@-handles, #hashtags and URLs,
    replaces every remaining non-alphanumeric character with a space, then
    collapses runs of spaces. Returns the cleaned string (single leading or
    trailing spaces may remain).

    Fix: patterns are now raw strings — the original plain strings relied on
    invalid escape sequences like "\\S", which emit SyntaxWarning on
    Python 3.12+ (behavior is otherwise unchanged).
    """
    corpus = re.sub(r"\S*\d\S*", " ", corpus)       # tokens containing a digit
    corpus = re.sub(r"\S*@\S*\s?", " ", corpus)     # e-mail addresses / @mentions
    corpus = re.sub(r"\S*#\S*\s?", " ", corpus)     # hashtags
    corpus = re.sub(r'http\S+', ' ', corpus)        # URLs
    corpus = re.sub(r'[^a-zA-Z0-9 ]', ' ', corpus)  # remaining punctuation/symbols
    corpus = corpus.replace(u'\ufffd', '8')         # Unicode replacement char seen in data
    corpus = re.sub(r' +', ' ', corpus)             # collapse multiple spaces
    return corpus
41
+
42
# TOKENIZE TEXT - the stop-word list comes from spaCy's English model defaults
spacy_model = spacy.load("en_core_web_sm")
stop_words = spacy_model.Defaults.stop_words  # set of lower-case stop words
45
+
46
# TOKENIZE TEXT and STOP WORDS REMOVAL - execution (removes also the words shorter than 2 and longer than 15 chars)
def tokenize(doc):
    """Tokenize *doc*, lower-case the tokens, and drop stop words, pure
    digits, and words outside the 2..14 character range.

    Fix: the stop-word test now runs on the lower-cased token; previously
    capitalised stop words (e.g. "The") slipped through because spaCy's
    stop-word set is lower-case.
    """
    kept = []
    for word in word_tokenize(str(doc)):
        lowered = word.lower()
        if 1 < len(word) < 15 and lowered not in stop_words and not lowered.isdigit():
            kept.append(lowered)
    return kept
50
+
51
# STEMMING
stemmer = SnowballStemmer(language="english")

def applyStemming(listOfTokens):
    """Return the Snowball (English) stem of every token, preserving order."""
    return list(map(stemmer.stem, listOfTokens))
55
+
56
# PROBS TO CLASS
def probs_to_prediction(probs, threshold):
    """Map the positive-class probability column of *probs* to
    [label, probability] pairs, where label is 1 iff p > threshold.

    *probs* is a 2-column array of class probabilities (column 1 = positive).
    """
    return [[1 if p > threshold else 0, p] for p in probs[:, 1]]
65
+
66
# PROCESSING
def compute(comment, tresh):
    """Classify *comment* and render the toxicity metrics on the page.

    Parameters
    ----------
    comment : str
        Raw user text from the text area.
    tresh : float
        Decision threshold forwarded to probs_to_prediction().

    Returns
    -------
    None — output is rendered through Streamlit widgets.

    Fix: removed a leftover debug ``print(tresh)`` that wrote to the server
    console on every rerun.
    """
    # Preprocess: regex cleanup -> tokenize -> stem.
    comment = apply_regex(comment)
    comment = tokenize(comment)
    comment = applyStemming(comment)

    # Embed the token list with the trained Doc2Vec model.
    # NOTE(review): infer_vector is stochastic unless the model seed is
    # fixed, so repeated runs can give slightly different scores — confirm.
    vectorizedComment = model_d2v.infer_vector(comment, epochs=70)

    # Scale the vector, then take class probabilities and apply the threshold.
    normComment = normalizer.transform([vectorizedComment])
    probs = classifier.predict_proba(normComment)
    preds = probs_to_prediction(probs, tresh)

    # Render toxic / non-toxic probabilities side by side.
    col1, col2 = st.columns(2)
    col1.metric("Toxic", round(preds[0][1], 4))
    col2.metric("Non Toxic", round(1 - preds[0][1], 4))
    return None
83
+
84
# TITLE
st.write("# Toxic Comments Classification")
st.write("#### Drop a comment, choose a threshold and wait for toxicity.")

# INPUT TEXTBOX (the placeholder text is also the initial value, so it is
# scored until the user replaces it).
comment = st.text_area('', '''
Write here the comment to analyze...
''')

# INPUT THRESHOLD
# Fix: the label promises a default of 0.5, but no initial value was passed,
# so the slider previously started at the minimum (0.0). Pass value=0.5.
tresh = st.slider('Set the Threshold, default 0.5', 0.00, 1.00, value=0.5, step=0.0001)
compute(comment, tresh)
96
+
97
+
98
# sidebar: static description of the pipeline shown beside the app
st.sidebar.write("""
This is a Toxic Comment Classifier that uses tokenization, stemming, Doc2Vec encoding and tuned logistic regression model.
It was trained on a large corpus of comments.
""")
107
+