Spaces: Runtime error
import streamlit as st
import pickle
import re
import html
import string
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The NLTK corpora below are not bundled with the library; a missing one
# raises a LookupError at startup, which surfaces as a Spaces runtime error.
nltk.download('punkt')      # needed by word_tokenize
nltk.download('stopwords')  # needed by stopwords.words('english')
nltk.download('wordnet')    # needed by WordNetLemmatizer
nltk.download('omw-1.4')    # wordnet dependency on newer NLTK versions

stop_words = stopwords.words('english')
def remove_special_chars(text):
    """Undo common HTML-escape artifacts and collapse repeated spaces."""
    re1 = re.compile(r' +')
    x1 = text.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x1))
def to_lowercase(text):
    return text.lower()

def remove_punctuation(text):
    """Remove punctuation from the text."""
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def replace_numbers(text):
    """Remove all integer occurrences from the text."""
    return re.sub(r'\d+', '', text)

def remove_whitespaces(text):
    return text.strip()
def remove_stopwords(words, stop_words):
    return [word for word in words if word not in stop_words]

def stem_words(words):
    """Stem words in text"""
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words]

def lemmatize_words(words):
    """Lemmatize words in text"""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]

def lemmatize_verbs(words):
    """Lemmatize verbs in text and join the tokens back into one string"""
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word, pos='v') for word in words])

def text2words(text):
    return word_tokenize(text)
def clean_text(text):
    text = remove_special_chars(text)
    text = remove_punctuation(text)
    text = to_lowercase(text)
    text = replace_numbers(text)
    words = text2words(text)
    words = remove_stopwords(words, stop_words)
    # words = stem_words(words)  # either stem or lemmatize
    words = lemmatize_words(words)
    # lemmatize_verbs already joins the tokens back into a single string
    return lemmatize_verbs(words)
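A quick sanity check of the pipeline (a hypothetical snippet, not part of the app; the exact output depends on the installed NLTK data):

print(clean_text("I loved these 3 movies!!"))  # -> 'love movie'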
df = pd.read_csv('train.csv')
df['comment_text'] = df['comment_text'].apply(clean_text)

# Note: pickling a Keras model is fragile across TensorFlow versions;
# model.save() / keras.models.load_model() is the supported route.
model = pickle.load(open('tox_model.pkl', 'rb'))

# Fit the tokenizer on the cleaned training corpus
tok = Tokenizer(num_words=1000, oov_token='UNK')
tok.fit_on_texts(df['comment_text'])

st.title('Toxic comment classification')
user_input = st.text_area('Enter your comment')

if user_input:
    # st.text_area returns a plain string, so clean it directly
    # (the original called .apply() on it, which only exists on a Series)
    cleaned = clean_text(user_input)
    # the method is texts_to_sequences, and it expects a list of texts
    x_test = tok.texts_to_sequences([cleaned])
    input_text = pad_sequences(x_test,
                               maxlen=50,
                               truncating='post',
                               padding='post')
    out = model.predict(input_text)
    # convert the NumPy array so st.json can serialize it
    st.json(out.tolist())
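As a design note: the app re-reads train.csv and refits the tokenizer on every run, which is slow and only reproduces the training word index if the corpus and parameters match training exactly. A safer pattern (a sketch, not the author's code; 'tokenizer.pkl' is an assumed filename) is to pickle the fitted tokenizer once during training and load it in the app:

# in the training script, after fitting (hypothetical):
# pickle.dump(tok, open('tokenizer.pkl', 'wb'))

# in the app, replacing the read_csv / fit_on_texts block:
tok = pickle.load(open('tokenizer.pkl', 'rb'))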