smith2020 committed on
Commit
630e844
·
1 Parent(s): 8b65149

Create precessing_data.py

Browse files
Files changed (1) hide show
  1. precessing_data.py +51 -0
precessing_data.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+
4
def preprocess(data):
    """Parse an exported chat transcript into a per-message DataFrame.

    The text is cut at every timestamp prefix of the form
    ``D/M/YY[YY], H:MM - ``. Any text before the first timestamp is
    discarded. Each remaining chunk becomes one row.

    Args:
        data: The full transcript as a single string.

    Returns:
        A ``pandas.DataFrame`` with one row per message and the columns
        ``message_date`` (raw timestamp prefix), ``date`` (parsed datetime),
        ``user``, ``message``, ``only_date``, ``year``, ``month_num``,
        ``month``, ``day``, ``day_name``, ``hour``, ``minute`` and ``period``
        (an hour-bucket label such as ``"10-11"``, ``"23-00"`` or ``"00-1"``).
        Rows whose text has no ``"<sender>: "`` prefix get the user
        ``'group_notification'`` and keep their whole text as the message.

    Raises:
        ValueError: Propagated from ``pandas.to_datetime`` when a timestamp
            does not fit the day/month/year format (for example a file that
            mixes 2-digit and 4-digit years).
    """
    stamp_pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

    # split() and findall() walk the same matches, so the two lists line up
    # once the text preceding the first timestamp is dropped.
    bodies = re.split(stamp_pattern, data)[1:]
    dates = re.findall(stamp_pattern, data)

    df = pd.DataFrame({'user_message': bodies, 'message_date': dates})

    # The timestamp regex accepts 2-digit years, so pick the matching year
    # directive instead of always assuming a 4-digit year. Anything that is
    # not uniformly 2-digit keeps the original '%Y' behaviour.
    short_year = re.compile(r'\d{1,2}/\d{1,2}/\d{2},')
    if dates and all(short_year.match(stamp) for stamp in dates):
        year_code = '%y'
    else:
        year_code = '%Y'
    df['date'] = pd.to_datetime(
        df['message_date'], format='%d/%m/' + year_code + ', %H:%M - '
    )

    sender_split = re.compile(r'([\w\W]+?):\s')
    users = []
    messages = []
    for raw in df['user_message']:
        # maxsplit=1: only the first ": " separates sender from text; later
        # colons belong to the message and must be left untouched.
        parts = sender_split.split(raw, maxsplit=1)
        if len(parts) > 1:
            users.append(parts[1])
            messages.append(parts[2])
        else:
            # No "<sender>: " prefix at all.
            users.append('group_notification')
            messages.append(parts[0])

    df['user'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    # Hour-bucket labels; the midnight edges are spelled with '00' exactly as
    # before so existing consumers of these labels keep working.
    period = []
    for hour in df['hour']:
        if hour == 23:
            period.append(str(hour) + "-" + '00')
        elif hour == 0:
            period.append('00' + "-" + str(hour + 1))
        else:
            period.append(str(hour) + "-" + str(hour + 1))

    df['period'] = period

    return df