kajalag committed on
Commit
27c3a9d
·
1 Parent(s): 2ece148

Delete preprocessor.py

Browse files
Files changed (1) hide show
  1. preprocessor.py +0 -111
preprocessor.py DELETED
@@ -1,111 +0,0 @@
1
- import pandas as pd
2
- import re
3
- from textblob import TextBlob
4
- import numpy as np
5
- import nltk
6
- import nltk.data
7
- from nltk.sentiment.vader import SentimentIntensityAnalyzer
8
- from tqdm.notebook import tqdm
9
# Fetch the VADER lexicon BEFORE building the analyzer: the analyzer loads the
# lexicon when it is constructed, so on an environment where the resource has
# not been downloaded yet the original order (construct, then download) fails
# at import time.
nltk.download('vader_lexicon')

# Shared module-level analyzer used by preprocess() to score each message.
sia = SentimentIntensityAnalyzer()
11
-
12
# Timestamp prefix that opens every message, e.g. "12/31/21, 23:59 - ".
_TIMESTAMP_PATTERN = re.compile(r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s')

# Lazily captures everything up to the first ": " (treated as the sender name).
_SENDER_PATTERN = re.compile(r'([\w\W]+?):\s')

# Column order of the per-message VADER scores in the returned frame.
_VADER_COLUMNS = ['neg', 'neu', 'pos', 'compound']


def _split_sender(raw_message):
    """Return ``(sender, text)`` for one raw message body.

    Only the FIRST ": " is used as the separator (``maxsplit=1``) so that
    message text which itself contains ": " is kept intact.  Entries without
    any ": " are attributed to the pseudo-user ``'group_notification'``.
    """
    parts = _SENDER_PATTERN.split(raw_message, maxsplit=1)
    if parts[1:]:
        return parts[1], parts[2]
    return 'group_notification', parts[0]


def _period_label(hour):
    """Return the one-hour bucket label for *hour* (e.g. ``'23-00'``, ``'00-1'``, ``'7-8'``)."""
    if hour == 23:
        return str(hour) + "-" + '00'
    if hour == 0:
        return '00' + "-" + str(hour + 1)
    return str(hour) + "-" + str(hour + 1)


def _clean_text(text):
    """Strip ``@mentions``, ``#`` characters and newlines from *text*."""
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#', '', text)
    return text.replace('\n', "")


def _textblob_label(score):
    """Map a TextBlob polarity score to 'Negative' / 'Neutral' / 'Positive'."""
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'


def _vader_label(score):
    """Map a VADER compound score to 'Negative' / 'Neutral' / 'Positive'.

    NOTE(review): thresholds are kept exactly as in the original code, which
    labels a compound score of exactly 0 as 'Negative' -- confirm that this is
    intended.
    """
    if score <= 0:
        return 'Negative'
    if score < 0.2960:
        return 'Neutral'
    return 'Positive'


def preprocess(data):
    """Parse an exported chat log and attach per-message sentiment scores.

    The text is split on timestamps of the form ``M/D/YY, HH:MM - ``
    (presumably a WhatsApp export -- confirm with the caller).  Each message
    is separated into sender and text, calendar columns are derived from the
    timestamp, group notifications / ``<Media omitted>`` entries / empty
    messages are discarded, and every remaining message is scored with the
    module-level VADER analyzer ``sia`` and with TextBlob.

    Compared with the previous implementation:
      * VADER scores are computed per message.  They used to be stored in a
        dict keyed by user, so every message of a sender received the scores
        of that sender's last message.
      * The sender split uses ``maxsplit=1``; message text containing ": " is
        no longer truncated.
      * An input without any usable message yields an empty frame instead of
        failing during the merge.

    Args:
        data: Full text of the exported chat.

    Returns:
        A ``pandas.DataFrame`` with one row per kept message and the columns
        ``users, message, neg, neu, pos, compound, date, year, day, hour,
        minute, Day_name, Date, Month, Month_name, period, Subjectivity,
        Polarity, Analysis, vader_Analysis``.

    Raises:
        ValueError: If a timestamp matched by the pattern cannot be parsed
            with the ``'%m/%d/%y, %H:%M - '`` format (raised by
            ``pandas.to_datetime``).
    """
    stamps = _TIMESTAMP_PATTERN.findall(data)
    # Element 0 is whatever precedes the first timestamp; it is not a message.
    bodies = _TIMESTAMP_PATTERN.split(data)[1:]
    parsed = [_split_sender(body) for body in bodies]

    # NOTE(review): the pattern accepts 2-4 digit years but the format below
    # only parses month-first, two-digit-year stamps -- confirm export locale.
    df = pd.DataFrame({
        'date': pd.to_datetime(
            pd.Series(stamps, dtype='object'), format='%m/%d/%y, %H:%M - '
        ),
        'users': [sender for sender, _ in parsed],
        'message': [text for _, text in parsed],
    })

    # Calendar features derived from the timestamp.
    stamp = df['date'].dt
    df['year'] = stamp.year
    df['day'] = stamp.day
    df['hour'] = stamp.hour
    df['minute'] = stamp.minute
    df['Day_name'] = stamp.day_name()
    df['Date'] = stamp.date
    df['Month'] = stamp.month
    df['Month_name'] = stamp.month_name()
    df['period'] = [_period_label(hour) for hour in df['hour']]

    # Keep only real user messages.  Building a fresh frame (no inplace edits
    # on a filtered slice) avoids pandas chained-assignment problems.
    is_user_text = (df['users'] != 'group_notification') & (
        df['message'] != '<Media omitted>\n'
    )
    temp = df[is_user_text].replace("", np.nan).dropna().reset_index(drop=True)
    temp['message'] = temp['message'].apply(_clean_text)
    temp['users'] = temp['users'].apply(_clean_text)

    # One VADER score dict per message, in row order.
    score_rows = [
        sia.polarity_scores(text)
        for text in tqdm(temp['message'], total=len(temp))
    ]
    scores = pd.DataFrame(score_rows, columns=_VADER_COLUMNS)

    # Same column layout as before: users, message, VADER scores, the rest.
    vaders = pd.concat(
        [
            temp[['users', 'message']],
            scores,
            temp.drop(columns=['users', 'message']),
        ],
        axis=1,
    )

    # Evaluate TextBlob once per message and reuse the result for both columns.
    sentiments = [TextBlob(text).sentiment for text in vaders['message']]
    vaders['Subjectivity'] = pd.Series(
        [s.subjectivity for s in sentiments], index=vaders.index, dtype=float
    )
    vaders['Polarity'] = pd.Series(
        [s.polarity for s in sentiments], index=vaders.index, dtype=float
    )

    vaders['Analysis'] = vaders['Polarity'].apply(_textblob_label)
    vaders['vader_Analysis'] = vaders['compound'].apply(_vader_label)

    return vaders