vibha-mah committed
Commit 64f1623 · 1 Parent(s): eedf0cf

Update app.py

Files changed (1)
  1. app.py +119 -0
app.py CHANGED
@@ -1,3 +1,122 @@
+ # utilities
+ import re
+ import pickle
+ import numpy as np
+ import pandas as pd
+ # plotting
+ import seaborn as sns
+ from wordcloud import WordCloud
+ import matplotlib.pyplot as plt
+ # nltk
+ from nltk.stem import WordNetLemmatizer
+ # sklearn
+ from sklearn.svm import LinearSVC
+ from sklearn.naive_bayes import BernoulliNB
+ from sklearn.linear_model import LogisticRegression
+
+ from sklearn.model_selection import train_test_split
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics import confusion_matrix, classification_report
+
+ from datasets import load_dataset
+
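The WordNetLemmatizer used in preprocess() needs the WordNet corpus on disk, and this hunk does not fetch it. A minimal setup step, assuming a fresh environment (not part of this commit):

import nltk

# Fetch the corpora the lemmatizer relies on (no-ops if already present).
nltk.download('wordnet')
nltk.download('omw-1.4')  # used by the lemmatizer on newer NLTK releases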
+ # Load the Sentiment140 CSV via the datasets 'csv' builder
+ # (the file is re-read below with pandas to attach column names).
+ dataset = load_dataset('csv', data_files='Twitter_Emoticon_Analysis_NLP/training.1600000.processed.noemoticon.csv')
+
+ DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
+ DATASET_ENCODING = "ISO-8859-1"
+ dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',
+                       encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
+
+ # Removing the unnecessary columns.
+ dataset = dataset[['sentiment', 'text']]
+ # Replacing the positive label 4 with 1 to ease understanding.
+ dataset['sentiment'] = dataset['sentiment'].replace(4, 1)
+
+ # Storing data in lists.
+ text, sentiment = list(dataset['text']), list(dataset['sentiment'])
+
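preprocess() below substitutes emoticons through an `emojis` mapping that is never defined in this hunk, so the loop raises a NameError as committed. A minimal placeholder so the function can run; the entries are illustrative, not the original table:

# Placeholder emoticon-to-meaning map; extend as needed.
emojis = {
    ':)': 'smile', ':-)': 'smile', ':(': 'sad', ':-(': 'sad',
    ':D': 'laugh', ':P': 'playful', ';)': 'wink', ':/': 'skeptical',
}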
+ def preprocess(textdata):
+     processedText = []
+
+     # Create Lemmatizer.
+     wordLemm = WordNetLemmatizer()
+
+     # Defining regex patterns.
+     urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
+     userPattern = r"@[^\s]+"
+     alphaPattern = r"[^a-zA-Z0-9]"
+     sequencePattern = r"(.)\1\1+"
+     seqReplacePattern = r"\1\1"
+
+     for tweet in textdata:
+         tweet = tweet.lower()
+
+         # Replace all URLs with ' URL'.
+         tweet = re.sub(urlPattern, ' URL', tweet)
+         # Replace all emoticons with 'EMOJI' plus their meaning.
+         for emoji in emojis.keys():
+             tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
+         # Replace @USERNAME with ' USER'.
+         tweet = re.sub(userPattern, ' USER', tweet)
+         # Replace all non-alphanumeric characters with a space.
+         tweet = re.sub(alphaPattern, " ", tweet)
+         # Replace 3 or more consecutive letters with 2 letters.
+         tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
+
+         tweetwords = ''
+         for word in tweet.split():
+             # Checking if the word is a stopword.
+             # if word not in stopwordlist:
+             if len(word) > 1:
+                 # Lemmatizing the word.
+                 word = wordLemm.lemmatize(word)
+                 tweetwords += (word + ' ')
+
+         processedText.append(tweetwords)
+
+     return processedText
+
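The sklearn imports (train_test_split, TfidfVectorizer, the three classifiers, confusion_matrix, classification_report) and the plotting imports are never used in this hunk. A rough sketch of how these pieces are typically wired together on this data; the model choice and parameters are illustrative, not taken from the commit:

# Clean the tweets and split into train / test sets.
processedtext = preprocess(text)
X_train, X_test, y_train, y_test = train_test_split(
    processedtext, sentiment, test_size=0.05, random_state=0)

# TF-IDF features over word uni- and bigrams.
vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=500000)
X_train = vectoriser.fit_transform(X_train)
X_test = vectoriser.transform(X_test)

# Any of the imported models could go here; logistic regression as an example.
model = LogisticRegression(C=2, max_iter=1000, n_jobs=-1)
model.fit(X_train, y_train)

# Evaluate and plot the confusion matrix with seaborn.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')
plt.show()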
  import gradio as gr
  import nltk
  from nltk.sentiment.vader import SentimentIntensityAnalyzer
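The pre-existing lines above import gradio and VADER's SentimentIntensityAnalyzer, but no interface is defined in this hunk. A minimal sketch of the kind of demo those imports point to, assuming a plain text-in / scores-out app; the function name and title are made up for illustration:

nltk.download('vader_lexicon')  # VADER's lexicon must be available locally
sia = SentimentIntensityAnalyzer()

def analyze(tweet: str) -> dict:
    # Return VADER's negative / neutral / positive / compound scores.
    return sia.polarity_scores(tweet)

demo = gr.Interface(fn=analyze, inputs="text", outputs="json",
                    title="Tweet sentiment")
demo.launch()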