Spaces: Runtime error
Update app.py
app.py
CHANGED
@@ -1,3 +1,83 @@
+# utilities
+import re
+import pickle
+import numpy as np
+import pandas as pd
+# plotting
+import seaborn as sns
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+# nltk
+import nltk
+from nltk.stem import WordNetLemmatizer
+# sklearn
+from sklearn.svm import LinearSVC
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import confusion_matrix, classification_report
+
+# The lemmatizer needs the WordNet corpus; fetch it once at startup.
+nltk.download('wordnet')
+
+DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
+DATASET_ENCODING = "ISO-8859-1"
+dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',
+                      encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
+
+# Keep only the columns we need.
+dataset = dataset[['sentiment', 'text']]
+# Map the positive label 4 to 1 to ease understanding.
+dataset['sentiment'] = dataset['sentiment'].replace(4, 1)
+
+# Store the data in lists.
+text, sentiment = list(dataset['text']), list(dataset['sentiment'])
+
+# Assumed stub: preprocess() references `emojis`, which the commit never
+# defines. Extend this emoticon-to-meaning map as needed.
+emojis = {':)': 'smile', ':(': 'sad', ':D': 'laugh', ':P': 'playful'}
+
+def preprocess(textdata):
+    processedText = []
+
+    # Create the lemmatizer.
+    wordLemm = WordNetLemmatizer()
+
+    # Define regex patterns.
+    urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
+    userPattern = r"@[^\s]+"
+    alphaPattern = r"[^a-zA-Z0-9]"
+    sequencePattern = r"(.)\1\1+"
+    seqReplacePattern = r"\1\1"
+
+    for tweet in textdata:
+        tweet = tweet.lower()
+
+        # Replace all URLs with 'URL'.
+        tweet = re.sub(urlPattern, ' URL', tweet)
+        # Replace all emoticons with 'EMOJI' plus their meaning.
+        for emoji in emojis.keys():
+            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
+        # Replace @USERNAME with 'USER'.
+        tweet = re.sub(userPattern, ' USER', tweet)
+        # Replace all non-alphanumeric characters with a space.
+        tweet = re.sub(alphaPattern, " ", tweet)
+        # Collapse 3 or more consecutive repeated characters to 2.
+        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
+
+        tweetwords = ''
+        for word in tweet.split():
+            # Keep only tokens longer than one character.
+            if len(word) > 1:
+                # Lemmatize the word.
+                word = wordLemm.lemmatize(word)
+                tweetwords += (word + ' ')
+
+        processedText.append(tweetwords)
+
+    return processedText
+
import gradio as gr
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
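The hunk ends at the Gradio and VADER imports, before any interface code. As a quick sanity check that the pieces fit together, here is a minimal sketch of how preprocess and SentimentIntensityAnalyzer could be wired into a Gradio app; the analyze helper and the text-in/JSON-out interface are assumptions for illustration, not part of this commit.

# Sketch (assumed, not in the commit): score a cleaned tweet with VADER
# behind a simple Gradio interface.
nltk.download('vader_lexicon')  # VADER's lexicon, fetched once like wordnet
analyzer = SentimentIntensityAnalyzer()

def analyze(tweet):
    # preprocess() takes and returns a list, so wrap and unwrap one tweet.
    cleaned = preprocess([tweet])[0]
    # polarity_scores returns {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}
    return analyzer.polarity_scores(cleaned)

demo = gr.Interface(fn=analyze, inputs="text", outputs="json")
demo.launch()

One design note: VADER is tuned for raw social-media text (it uses capitalization, punctuation, and emoticons as signals), so scoring the unprocessed tweet instead of the cleaned one is also a defensible choice.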