# Test / app.py
# Adrian8a's picture
# Upload 7 files
# 0b3f2a8
from statistics import mode
from joblib import load
from tqdm import tqdm
import pandas as pd
import gradio as gr
import numpy as np
import regex as re
# Pre-built artifacts deserialized with joblib when the module is imported.
# NOTE(review): joblib.load unpickles its input, so these files must come
# from a trusted source.

# Collection of stop words; used for membership tests when filtering tokens.
stopwords = load('stopwords.data')
# Callable text pipeline whose tokens expose `.lemma_`
# (presumably a spaCy language model -- TODO confirm).
nlp = load('nlp.path')
class Preprocessor:
    """Clean raw texts and convert them into vectorizer features.

    The vectorizer is deserialized from ``vectorizer.model`` with joblib when
    an instance is created. Each cleaning step is exposed as its own method;
    :meth:`transform` chains them and returns the vectorized result.

    Attributes set on the instance:
        vectorizer: object loaded from ``vectorizer.model``; must provide
            ``fit`` and ``transform``.
        stopwords: collection of tokens dropped by :meth:`remove_stopwords`.
        vectorizer_fitted: when falsy, :meth:`transform` fits the vectorizer
            on the cleaned texts before transforming them.
        train_idx / test_idx: index of the rows kept by the last
            :meth:`transform` call (which one is set depends on ``mode``).
    """

    # Patterns are compiled once, at class-definition time, instead of on
    # every method call. Raw strings keep regex escapes such as \w and \S
    # from being interpreted as (invalid) Python string escapes.
    _URL_RE = re.compile(r'(\w+\.com ?/ ?.+)|(http\S+)')
    _MULTI_SPACE_RE = re.compile(' +')
    _NON_LETTER_RE = re.compile('[^a-z ]')
    _EMOJI_RE = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

    def __init__(self, stopwords=stopwords):
        """Load the vectorizer and remember the stop-word collection.

        Args:
            stopwords: collection supporting ``in``; defaults to the
                module-level ``stopwords`` object.
        """
        self.vectorizer = load('vectorizer.model')
        self.stopwords = stopwords
        # The loaded vectorizer is treated as already fitted, so transform()
        # only fits it if a caller resets this flag.
        self.vectorizer_fitted = True

    def remove_urls(self, texts):
        """Return ``texts`` with URL-like fragments removed.

        A fragment is either ``<word>.com/`` followed by the rest of the line,
        or any run of non-whitespace characters starting with ``http``.
        """
        print('Removing URLs...')
        return [self._URL_RE.sub('', text) for text in texts]

    def remove_double_space(self, texts):
        """Return ``texts`` with every run of spaces collapsed to one space."""
        print('Removing double space...')
        return [self._MULTI_SPACE_RE.sub(' ', text) for text in texts]

    def remove_punctuation(self, texts):
        """Return ``texts`` with every character other than a-z or space
        replaced by a space.

        The texts are expected to be lower-cased already; upper-case letters
        would be replaced as well.
        """
        print('Removing Punctuation...')
        return [self._NON_LETTER_RE.sub(' ', text) for text in texts]

    def remove_stopwords(self, texts):
        """Tokenize each text on single spaces and drop stop words.

        Returns:
            A list with one list of tokens per input text (the tokens are
            not joined back into a string here).
        """
        print('Removing stopwords...')
        return [[w for w in text.split(' ') if w not in self.stopwords]
                for text in tqdm(texts)]

    def remove_numbers(self, texts):
        """Drop all-digit tokens and join the remaining tokens with spaces.

        Args:
            texts: iterable of token sequences, as produced by
                :meth:`remove_stopwords`.

        Returns:
            A list with one space-joined string per input item.
        """
        print('Removing numbers...')
        return [' '.join([w for w in text if not w.isdigit()])
                for text in tqdm(texts)]

    def remove_emojis(self, texts):
        """Return ``texts`` with characters from the emoji ranges removed."""
        print('Removing emojis...')
        return [self._EMOJI_RE.sub(r'', text) for text in texts]

    def lemmatize(self, texts):
        """Replace every token of each text by its lemma.

        Each text is run through the module-level ``nlp`` callable and the
        ``lemma_`` attributes of the resulting tokens are joined with spaces.
        """
        print('Lemmatizing...')
        return [' '.join([token.lemma_ for token in nlp(text)])
                for text in tqdm(texts)]

    def transform(self, X, y=None, mode='train'):
        """Clean and vectorize a collection of texts.

        Args:
            X: pandas Series-like object of strings (it must support
                ``copy``, ``isnull``, ``duplicated`` and ``index``).
            y: accepted for signature compatibility; not used.
            mode: ``'train'`` stores the index of the surviving rows in
                ``self.train_idx``; any other value stores it in
                ``self.test_idx``.

        Returns:
            The result of ``self.vectorizer.transform`` on the cleaned texts.
            Null and duplicated rows of ``X`` are dropped first, so the
            result can have fewer rows than ``X``.
        """
        X = X.copy()
        print('Removing Nans...')
        X = X[~X.isnull()]
        X = X[~X.duplicated()]
        # Remember which rows survived so callers can align labels with them.
        if mode == 'train':
            self.train_idx = X.index
        else:
            self.test_idx = X.index
        print('Lowering...')
        X = [text.lower() for text in X]
        X = self.remove_urls(X)
        X = self.remove_punctuation(X)
        X = self.remove_double_space(X)
        # remove_punctuation has already replaced everything outside a-z and
        # space, so the emoji and number steps below act as safety nets only;
        # the step order is kept so the output stays identical.
        X = self.remove_emojis(X)
        X = self.remove_stopwords(X)  # strings -> token lists
        X = self.remove_numbers(X)    # token lists -> strings
        X = self.lemmatize(X)
        if not self.vectorizer_fitted:
            self.vectorizer_fitted = True
            print('Fitting vectorizer...')
            self.vectorizer.fit(X)
        print('Vectorizing...')
        X = self.vectorizer.transform(X)
        return X
def gettext(r):
    """Translate the most common prediction in ``r`` into a sentiment label.

    Args:
        r: non-empty iterable of class predictions.

    Returns:
        'Irrelevant' for 0, 'Negative' for 1, 'Neutral' for 2 and
        'Positive' for any other winning value.
    """
    known_labels = {0: 'Irrelevant', 1: 'Negative', 2: 'Neutral'}
    winner = mode(r)
    # Anything that is not one of the three known classes is 'Positive'.
    return known_labels.get(winner, 'Positive')
# Files holding the three classifiers that vote on the sentiment.
_CLASSIFIER_FILES = ('log_reg.model', 'tree.model', 'forest.model')
# Filled on first use by _get_classifiers(); None means "not loaded yet".
_classifiers = None


def _get_classifiers():
    """Return the three classifiers, loading them from disk on first use."""
    global _classifiers
    if _classifiers is None:
        # Build the whole tuple before publishing it so a concurrent caller
        # never sees a partially loaded set of models.
        _classifiers = tuple(load(path) for path in _CLASSIFIER_FILES)
    return _classifiers


def greet(text):
    """Classify the sentiment of one text by majority vote of three models.

    The text is preprocessed with ``Preprocessor.transform``, each of the
    logistic-regression, decision-tree and random-forest models predicts a
    class for it, and ``gettext`` turns the most common prediction into a
    label.

    Args:
        text: the text to classify.

    Returns:
        The label string produced by ``gettext``.
    """
    # A fresh Preprocessor is used per request because transform() records
    # per-call state (the surviving row index) on the instance.
    features = Preprocessor().transform(pd.DataFrame([text])[0])
    # The classifiers are cached, so they are deserialized only once instead
    # of on every request.
    votes = [clf.predict(features)[0] for clf in _get_classifiers()]
    return gettext(votes)
# Header text shown above the input box (HTML), assembled from its parts.
description_html = "".join([
    "<h3>The idea is to classify a text provided by the user according to the emotion contained in that text. ",
    "The possible outputs are the following: Irrelevant, Negative, Neutral, and Positive. </h3>",
    "<b>Models:</b> Logistic Regression, Decision Trees and Random Forest",
    "<br><b>Metrics:</b> Accuracy: 0.95, Precision: 0.953, Recall: 0.945, F1 Score: 0.948 <br> <br><b>Please provide a text example:</b>",
])

# Footer text shown below the interface (HTML).
article_html = 'Step-by-step on GitHub <a href="https://github.com/Adrian8aS/-Twitter-Sentiment-Analysis/blob/4558716d85e18bb18dde25f597f010af13a5deb5/Exam%20JAOS.ipynb"> notebook </a> <br> ~ José Adrián Ochoa Sánchez'

# Sample tweets offered as clickable examples.
example_tweets = (
    'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣',
    'BBC News - Amazon boss Jeff Bezos rejects claims company acted like a drug dealer bbc.co.uk/news/av/busine…',
    '@Microsoft Why do I pay for WORD when it functions so poorly on my @SamsungUS Chromebook? 🙄',
    'FUCKKKKKK I CANT WAIT',
)

tweet_input = gr.Text(label="Write a tweet")
sentiment_output = gr.Text(label="Sentiment detected")

# Single text in, single text out; greet() does the classification.
interface = gr.Interface(
    fn=greet,
    inputs=[tweet_input],
    outputs=[sentiment_output],
    # One single-element row per example, matching the single input.
    examples=[[tweet] for tweet in example_tweets],
    title="😄 Twitter Sentiment Analysis 😡 - UMG",
    description=description_html,
    article=article_html,
    allow_flagging="never",
)

# Start the web server and request a public share link.
interface.launch(share=True)