Adrian8a committed on
Commit
0b3f2a8
1 Parent(s): 3a90b83

Upload 7 files

Files changed (8)
  1. .gitattributes +1 -0
  2. app.py +160 -0
  3. forest.model +3 -0
  4. log_reg.model +3 -0
  5. nlp.path +3 -0
  6. stopwords.data +0 -0
  7. tree.model +3 -0
  8. vectorizer.model +3 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+nlp.path filter=lfs diff=lfs merge=lfs -text
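The new rule stores nlp.path through Git LFS like the other binary artifacts, so the repository keeps only a small pointer file. As a minimal sketch (assuming it runs from the repository root), the LFS-tracked patterns can be listed straight from .gitattributes:

# List which .gitattributes patterns are routed through Git LFS.
with open('.gitattributes') as f:
    for line in f:
        parts = line.split()
        if len(parts) > 1 and 'filter=lfs' in parts[1:]:
            print(parts[0])  # e.g. '*.zip', '*.zst', 'nlp.path'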
app.py ADDED
@@ -0,0 +1,160 @@
+from statistics import mode
+from joblib import load
+from tqdm import tqdm
+
+import pandas as pd
+import gradio as gr
+import numpy as np
+import regex as re
+
+stopwords = load('stopwords.data')
+nlp = load('nlp.path')
+
+
+class Preprocessor:
+    def __init__(self, stopwords=stopwords):
+        self.vectorizer = load('vectorizer.model')  # pre-fitted vectorizer loaded from disk
+        self.stopwords = stopwords
+        self.vectorizer_fitted = True  # already fitted, so no refit at inference time
+
+    def remove_urls(self, texts):
+        print('Removing URLs...')
+        pattern = re.compile(r'(\w+\.com ?/ ?.+)|(http\S+)')
+        return [re.sub(pattern, '', text) for text in texts]
+
+    def remove_double_space(self, texts):
+        print('Removing double spaces...')
+        pattern = re.compile(' +')
+        return [re.sub(pattern, ' ', text) for text in texts]
+
+    def remove_punctuation(self, texts):
+        print('Removing punctuation...')
+        pattern = re.compile('[^a-z ]')
+        return [re.sub(pattern, ' ', text) for text in texts]
+
+    def remove_stopwords(self, texts):
+        print('Removing stopwords...')
+        return [[w for w in text.split(' ') if w not in self.stopwords]
+                for text in tqdm(texts)]
+
+    def remove_numbers(self, texts):
+        # Each text is a list of tokens here (remove_stopwords runs first),
+        # so this step also joins the tokens back into a single string.
+        print('Removing numbers...')
+        return [' '.join([w for w in text if not w.isdigit()]) for text in tqdm(texts)]
+
+    def remove_emojis(self, texts):
+        print('Removing emojis...')
+        pattern = re.compile('['
+                             '\U0001F600-\U0001F64F'  # emoticons
+                             '\U0001F300-\U0001F5FF'  # symbols & pictographs
+                             '\U0001F680-\U0001F6FF'  # transport & map symbols
+                             '\U0001F1E0-\U0001F1FF'  # flags (iOS)
+                             ']+', flags=re.UNICODE)
+        return [re.sub(pattern, '', text) for text in texts]
+
+    def lemmatize(self, texts):
+        print('Lemmatizing...')
+        lemmatized_texts = []
+        for text in tqdm(texts):
+            doc = nlp(text)
+            lemmatized_texts.append(' '.join([token.lemma_ for token in doc]))
+        return lemmatized_texts
+
+    def transform(self, X, y=None, mode='train'):
+        X = X.copy()
+
+        print('Removing NaNs...')
+        X = X[~X.isnull()]
+        X = X[~X.duplicated()]
+
+        if mode == 'train':
+            self.train_idx = X.index
+        else:
+            self.test_idx = X.index
+
+        print('Counting capitalized...')
+        # Kept from the training notebook; the result is not used at inference time.
+        capitalized = [np.sum([t.isupper() for t in text.split()])
+                       for text in np.array(X.values)]
+
+        print('Lowering...')
+        X = [text.lower() for text in X]
+
+        X = self.remove_urls(X)
+        X = self.remove_punctuation(X)
+        X = self.remove_double_space(X)
+        X = self.remove_emojis(X)
+        X = self.remove_stopwords(X)
+        X = self.remove_numbers(X)
+        X = self.lemmatize(X)
+
+        if not self.vectorizer_fitted:
+            self.vectorizer_fitted = True
+            print('Fitting vectorizer...')
+            self.vectorizer.fit(X)
+
+        print('Vectorizing...')
+        X = self.vectorizer.transform(X)
+
+        return X
+
+
+def gettext(r):
+    # Majority vote over the three classifiers' predicted class ids.
+    pred = mode(r)
+
+    if pred == 0:
+        text = 'Irrelevant'
+    elif pred == 1:
+        text = 'Negative'
+    elif pred == 2:
+        text = 'Neutral'
+    else:
+        text = 'Positive'
+
+    return text
+
+
+def greet(text):
+    df_new = pd.DataFrame([text])
+
+    pr = Preprocessor()
+    X_test = pr.transform(df_new[0])
+
+    log_reg = load('log_reg.model')
+    y_lr = log_reg.predict(X_test)
+
+    tree = load('tree.model')
+    y_tree = tree.predict(X_test)
+
+    forest = load('forest.model')
+    y_forest = forest.predict(X_test)
+
+    r = [y_lr[0], y_tree[0], y_forest[0]]
+
+    return gettext(r)
+
+
+interface = gr.Interface(
+    title="😄 Twitter Sentiment Analysis 😡 - UMG",
+    description="<h3>The idea is to classify a text provided by the user according to the emotion contained in that text. " +
+                "The possible outputs are the following: Irrelevant, Negative, Neutral, and Positive.</h3>" +
+                "<b>Models:</b> Logistic Regression, Decision Tree, and Random Forest" +
+                "<br><b>Metrics:</b> Accuracy: 0.95, Precision: 0.953, Recall: 0.945, F1 Score: 0.948 <br><br><b>Please provide a text example:</b>",
+    article='Step-by-step on GitHub <a href="https://github.com/Adrian8aS/-Twitter-Sentiment-Analysis/blob/4558716d85e18bb18dde25f597f010af13a5deb5/Exam%20JAOS.ipynb">notebook</a> <br> ~ José Adrián Ochoa Sánchez',
+    allow_flagging="never",
+    fn=greet,
+    inputs=[gr.Text(label="Write a tweet")],
+    outputs=[gr.Text(label="Sentiment detected")],
+    examples=[
+        ['I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣'],
+        ['BBC News - Amazon boss Jeff Bezos rejects claims company acted like a drug dealer bbc.co.uk/news/av/busine…'],
+        ['@Microsoft Why do I pay for WORD when it functions so poorly on my @SamsungUS Chromebook? 🙄'],
+        ['FUCKKKKKK I CANT WAIT'],
+    ],
+)
+
+interface.launch(share=True)
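For reference, a minimal sketch of the ensemble step that greet and gettext implement: each of the three classifiers predicts a class id, and statistics.mode takes the majority vote. The per-model outputs below are hypothetical:

from statistics import mode

y_lr, y_tree, y_forest = 3, 1, 3  # hypothetical predictions: log_reg, tree, forest
labels = {0: 'Irrelevant', 1: 'Negative', 2: 'Neutral', 3: 'Positive'}
print(labels[mode([y_lr, y_tree, y_forest])])  # -> 'Positive'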
forest.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0a2a8ab5baa096bec20274cbe7a491e7b4043722aad5bf493f4d329706fd78a
+size 317318393
log_reg.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06643ef7dd1bc3d5602e5f0ea640e77a3ae3a7c08c159e3f3144110e478c020f
+size 401812
nlp.path ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:541a40e7a0a0592389a37f1598dfe6922458e6f634eb3fbc19c8a590c3d1e724
+size 17042384
stopwords.data ADDED
Binary file (1.31 kB)
tree.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a28fbe3807527ce0f507b5351a33a2e29d76745a36ad9dbb3a936aa7d4eb5741
+size 1922126
vectorizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57852a03c4c7f7675ad62b77e3e9b94edf63d54ac60aea045edd9eeb1cf6a49a
+size 11457423
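Each *.model file and nlp.path above is a Git LFS pointer; the actual binaries are fetched with `git lfs pull`. A minimal sketch for checking that the artifacts were materialized before app.py tries to joblib-load them (the helper name is hypothetical):

PREFIX = b'version https://git-lfs'

def is_lfs_pointer(path):
    # LFS pointer files start with the spec line shown in the diffs above.
    with open(path, 'rb') as f:
        return f.read(len(PREFIX)) == PREFIX

for name in ['forest.model', 'log_reg.model', 'nlp.path',
             'tree.model', 'vectorizer.model']:
    status = 'pointer only (run git lfs pull)' if is_lfs_pointer(name) else 'materialized'
    print(name, status)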