|
from statistics import mode |
|
from joblib import load |
|
from tqdm import tqdm |
|
|
|
import pandas as pd |
|
import gradio as gr |
|
import numpy as np |
|
import regex as re |
|
|
|
# Module-level artifacts shared by the preprocessing code below, read once at
# import time:
#   * stopwords - collection tested for membership by
#                 Preprocessor.remove_stopwords.
#   * nlp       - callable applied to each text by Preprocessor.lemmatize; the
#                 tokens it yields expose ``lemma_`` (presumably a spaCy
#                 pipeline - confirm against how 'nlp.path' was produced).
# NOTE(review): joblib's ``load`` unpickles the files, so they must come from a
# trusted source.
stopwords, nlp = (
    load('stopwords.data'),
    load('nlp.path'),
)
|
|
|
|
|
class Preprocessor:
    """Clean raw texts and convert them into vectorizer features.

    ``transform`` drops missing and duplicated entries, lowercases the texts,
    strips URLs, replaces every character that is not ``a-z`` or a space with
    a space, collapses repeated spaces, strips emojis, removes stopwords and
    all-digit tokens, lemmatizes with the module-level ``nlp`` callable and
    finally vectorizes with the vectorizer loaded from ``vectorizer.model``.
    """

    # Patterns are compiled once for the class instead of on every call.
    # The URL pattern is a raw string so that ``\w``, ``\.`` and ``\S`` are not
    # treated as (invalid) string escape sequences.
    _URL_PATTERN = re.compile(r'(\w+\.com ?/ ?.+)|(http\S+)')
    _MULTI_SPACE_PATTERN = re.compile(' +')
    _NON_LETTER_PATTERN = re.compile('[^a-z ]')
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "]+",
        flags=re.UNICODE,
    )

    def __init__(self, stopwords=stopwords):
        """Load the vectorizer from disk and remember the stopword collection.

        Args:
            stopwords: Collection of words dropped by ``remove_stopwords``.
                Defaults to the module-level ``stopwords`` object.
        """
        self.vectorizer = load('vectorizer.model')
        self.stopwords = stopwords
        # The vectorizer comes from disk, so it is flagged as fitted and
        # ``transform`` will not refit it unless this flag is reset.
        self.vectorizer_fitted = True

    def remove_urls(self, texts):
        """Return ``texts`` with URL-like substrings removed.

        Removes ``<word>.com/<rest of line>`` and any run of non-space
        characters that starts with ``http``.
        """
        print('Removing URLs...')
        return [self._URL_PATTERN.sub('', text) for text in texts]

    def remove_double_space(self, texts):
        """Return ``texts`` with every run of spaces collapsed to one space."""
        print('Removing double space...')
        return [self._MULTI_SPACE_PATTERN.sub(' ', text) for text in texts]

    def remove_punctuation(self, texts):
        """Return ``texts`` with every non ``a-z``/space character replaced by a space.

        Uppercase letters and digits are replaced too, so callers are expected
        to lowercase the texts first (``transform`` does).
        """
        print('Removing Punctuation...')
        return [self._NON_LETTER_PATTERN.sub(' ', text) for text in texts]

    def remove_stopwords(self, texts):
        """Split each text on single spaces and drop the stopwords.

        Returns:
            A list with one list of remaining tokens per input text. Empty
            tokens produced by leading/trailing spaces are kept unless the
            empty string is itself a stopword.
        """
        print('Removing stopwords...')
        # Build a set once so each membership test is O(1) regardless of the
        # container type that was loaded or passed in.
        stop = self.stopwords
        if not isinstance(stop, (set, frozenset)):
            stop = frozenset(stop)
        return [
            [word for word in text.split(' ') if word not in stop]
            for text in tqdm(texts)
        ]

    def remove_numbers(self, texts):
        """Drop all-digit tokens and join the remaining tokens with spaces.

        Args:
            texts: Iterable of token lists, as returned by
                ``remove_stopwords``.

        Returns:
            A list of strings, one per token list.
        """
        print('Removing numbers...')
        return [
            ' '.join(word for word in tokens if not word.isdigit())
            for tokens in tqdm(texts)
        ]

    def remove_emojis(self, texts):
        """Return ``texts`` with characters from the emoji ranges removed."""
        print('Removing emojis...')
        return [self._EMOJI_PATTERN.sub('', text) for text in texts]

    def lemmatize(self, texts):
        """Return each text rebuilt from the lemmas of its tokens.

        Every text is passed through the module-level ``nlp`` callable and the
        ``lemma_`` of each resulting token is joined with single spaces.
        """
        print('Lemmatizing...')
        return [
            ' '.join(token.lemma_ for token in nlp(text))
            for text in tqdm(texts)
        ]

    def transform(self, X, y=None, mode='train'):
        """Run the full cleaning pipeline on ``X`` and vectorize the result.

        Args:
            X: pandas Series of raw texts. It is not modified.
            y: Unused; accepted for signature compatibility.
            mode: ``'train'`` stores the index of the rows kept in
                ``self.train_idx``; any other value stores it in
                ``self.test_idx``.

        Returns:
            Whatever ``self.vectorizer.transform`` returns for the cleaned
            texts (one row per text kept after dropping NaNs and duplicates).
        """
        print('Removing Nans...')
        # Both calls return new objects, so the caller's Series is untouched.
        X = X.dropna().drop_duplicates()

        # Remember which rows survived the filtering above.
        if mode == 'train':
            self.train_idx = X.index
        else:
            self.test_idx = X.index

        print('Lowering...')
        X = [text.lower() for text in X]

        # The step order is significant: remove_stopwords yields token lists
        # that remove_numbers joins back into strings, and the character-level
        # steps run on the lowercased text. Keep the order as is.
        X = self.remove_urls(X)
        X = self.remove_punctuation(X)
        X = self.remove_double_space(X)
        X = self.remove_emojis(X)
        X = self.remove_stopwords(X)
        X = self.remove_numbers(X)
        X = self.lemmatize(X)

        if not self.vectorizer_fitted:
            print('Fitting vectorizer...')
            self.vectorizer.fit(X)
            # Only flag the vectorizer as fitted once fitting has succeeded.
            self.vectorizer_fitted = True

        print('Vectorizing...')
        return self.vectorizer.transform(X)
|
|
|
|
|
def gettext(r):
    """Map the most common prediction in ``r`` to its sentiment label.

    Args:
        r: Non-empty sequence of class predictions.

    Returns:
        ``'Irrelevant'`` for 0, ``'Negative'`` for 1, ``'Neutral'`` for 2 and
        ``'Positive'`` for any other most-common value.
    """
    labels = {0: 'Irrelevant', 1: 'Negative', 2: 'Neutral'}
    # Anything outside the table falls back to 'Positive'.
    return labels.get(mode(r), 'Positive')
|
|
|
|
|
# Files holding the classifiers whose predictions are combined, in the order
# their predictions are handed to ``gettext``.
_MODEL_PATHS = ('log_reg.model', 'tree.model', 'forest.model')

# Classifiers already read from disk, keyed by file path; filled on first use
# so that requests after the first one do not reload them.
_MODEL_CACHE = {}


def _get_model(path):
    """Return the model stored at ``path``, loading it only on first request."""
    try:
        return _MODEL_CACHE[path]
    except KeyError:
        model = _MODEL_CACHE[path] = load(path)
        return model


def greet(text):
    """Classify ``text`` and return the sentiment label chosen by the models.

    The text is preprocessed and vectorized, each classifier listed in
    ``_MODEL_PATHS`` predicts a class for it, and ``gettext`` turns the most
    common prediction into a label.

    Args:
        text: The text to classify.

    Returns:
        The label string returned by ``gettext``.
    """
    df_new = pd.DataFrame([text])

    # A fresh Preprocessor per call: ``transform`` records per-call state
    # (the kept index) on the instance.
    pr = Preprocessor()
    X_test = pr.transform(df_new[0])

    # One prediction per classifier for the single input row.
    r = [_get_model(path).predict(X_test)[0] for path in _MODEL_PATHS]

    return gettext(r)
|
|
|
|
|
# Gradio UI: a single text box in, a single text box out, backed by ``greet``.
interface = gr.Interface(
    fn=greet,
    inputs=[gr.Text(label="Write a tweet")],
    outputs=[gr.Text(label="Sentiment detected")],
    # Sample inputs offered to the user; each inner list holds the value for
    # the single input component.
    examples=[
        ['I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣'],
        ['BBC News - Amazon boss Jeff Bezos rejects claims company acted like a drug dealer bbc.co.uk/news/av/busine…'],
        ['@Microsoft Why do I pay for WORD when it functions so poorly on my @SamsungUS Chromebook? 🙄'],
        ['FUCKKKKKK I CANT WAIT'],
    ],
    title="😄 Twitter Sentiment Analysis 😡 - UMG",
    # Adjacent literals are concatenated into one HTML description string.
    description=(
        "<h3>The idea is to classify a text provided by the user according to the emotion contained in that text. "
        "The possible outputs are the following: Irrelevant, Negative, Neutral, and Positive. </h3>"
        "<b>Models:</b> Logistic Regression, Decision Trees and Random Forest"
        "<br><b>Metrics:</b> Accuracy: 0.95, Precision: 0.953, Recall: 0.945, F1 Score: 0.948 <br> <br><b>Please provide a text example:</b>"
    ),
    article='Step-by-step on GitHub <a href="https://github.com/Adrian8aS/-Twitter-Sentiment-Analysis/blob/4558716d85e18bb18dde25f597f010af13a5deb5/Exam%20JAOS.ipynb"> notebook </a> <br> ~ José Adrián Ochoa Sánchez',
    allow_flagging="never",
)

# Start the app when the module is executed; ``share=True`` asks Gradio for a
# publicly shareable link.
interface.launch(share=True)