# -*- coding: utf-8 -*-
"""
Created on Mon Jun  6 20:56:08 2022

@author: Aziz Baran Kurtuluş
"""
import os
# install runtime dependencies (conventionally these would be pinned in requirements.txt)
os.system('pip install nltk')
os.system('pip install scikit-learn')

import nltk

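# download the NLTK resources used below: punkt for tokenization, stopwords,
# and wordnet/omw-1.4 for lemmatization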
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


import streamlit as st
import joblib
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer



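# page layout containers (several are placeholders that are never filled in below)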
site_header = st.container()
business_context = st.container()
data_desc = st.container()
performance = st.container()
tweet_input = st.container()
model_results = st.container()
sentiment_analysis = st.container()
contact = st.container()

with site_header:
    st.title('Toxic Comment Detection')
   

with tweet_input:
    st.header('Is Your Text Considered Toxic?')
    st.write("""*Please note that predictions reflect the data the model was trained on, so they may not always be accurate.*""")
    user_text = st.text_input('Enter Text', max_chars=280)

with model_results:
    st.subheader('Prediction:')
    if user_text:
        # processing user_text
        # removing punctuation
        user_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
        # tokenizing
        stop_words = set(stopwords.words('english'))
        tokens = nltk.word_tokenize(user_text)
        # removing stop words
        stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
        # lemmatizing each remaining token to its root form
        lemmatizer = WordNetLemmatizer()
        lemmatized_output = [lemmatizer.lemmatize(word) for word in stopwords_removed]

        # instantiating the tfidf vectorizer and fitting it on the saved training corpus
        # so the user text is encoded in the same feature space the model was trained on
        tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        X_train = joblib.load(open('resources/X_train.pickel', 'rb'))
        # joining the lemmatized tokens back into a single document before vectorizing
        X_test = [' '.join(lemmatized_output)]
        X_train_count = tfidf.fit_transform(X_train)
        X_test_count = tfidf.transform(X_test)

        # loading in the trained Naive Bayes model
        final_model = joblib.load(open('resources/final_bayes.pickel', 'rb'))

        # applying the model to the single user document
        prediction = final_model.predict(X_test_count)[0]

        if prediction == 0:
            st.subheader('**Not Toxic**')
        else:
            st.subheader('**Toxic**')
        st.text('')
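
# To try the app locally (assuming this file is saved as app.py; the name is illustrative):
#   streamlit run app.py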