Spaces:
Runtime error
Runtime error
Create precessing_data.py
Browse files- precessing_data.py +51 -0
precessing_data.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
# Timestamp prefix that opens every exported chat entry, e.g. "12/05/2021, 10:30 - ".
_TIMESTAMP_PATTERN = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

# The shortest leading text followed by ": " is treated as the sender's name.
_SENDER_PATTERN = r'([\w\W]+?):\s'

# The timestamp pattern accepts both four- and two-digit years, so both
# layouts are tried when converting the prefix to a datetime.
_DATE_FORMATS = ('%d/%m/%Y, %H:%M - ', '%d/%m/%y, %H:%M - ')


def _parse_message_dates(stamps):
    """Convert the raw timestamp prefixes in *stamps* to datetimes.

    Each format in ``_DATE_FORMATS`` is tried in order; a value is parsed by
    the first format that accepts it. Raises ``ValueError`` if any value
    matches none of them.
    """
    parsed = None
    for fmt in _DATE_FORMATS:
        attempt = pd.to_datetime(stamps, format=fmt, errors='coerce')
        # Only fill the slots that earlier formats could not parse.
        parsed = attempt if parsed is None else parsed.fillna(attempt)

    failed = parsed.isna()
    if failed.any():
        raise ValueError(
            f"unrecognised message timestamp: {stamps[failed].iloc[0]!r}"
        )
    return parsed


def _split_sender(entry_text):
    """Return ``(user, message)`` for one chat entry.

    Entries without a ``"<name>: "`` prefix are attributed to
    ``'group_notification'`` and returned unchanged.
    """
    # maxsplit=1: only the first ": " separates sender from text, so any
    # later ": " inside the message body is preserved verbatim.
    parts = re.split(_SENDER_PATTERN, entry_text, maxsplit=1)
    if parts[1:]:
        return parts[1], " ".join(parts[2:])
    return 'group_notification', parts[0]


def _period_label(hour):
    """Return the one-hour bucket label ("<start>-<end>") for *hour*."""
    # Labels are kept exactly as previously emitted ("23-00", "00-1", "9-10").
    if hour == 23:
        return "23-00"
    if hour == 0:
        return "00-1"
    return f"{hour}-{hour + 1}"


def preprocess(data):
    """Parse an exported chat transcript into a pandas DataFrame.

    *data* is the full text of the export, where every entry starts with a
    ``"d/m/y, H:MM - "`` timestamp (day first, 24-hour clock, two- or
    four-digit year).

    Returns a DataFrame with one row per entry and the columns
    ``message_date`` (raw timestamp prefix), ``date`` (parsed datetime),
    ``user``, ``message``, ``only_date``, ``year``, ``month_num``, ``month``,
    ``day``, ``day_name``, ``hour``, ``minute`` and ``period`` (one-hour
    bucket label such as ``"10-11"``).

    Raises ``ValueError`` if a timestamp prefix cannot be parsed as a
    day-first date.
    """
    # Text before the first timestamp is not a message, hence the [1:].
    bodies = re.split(_TIMESTAMP_PATTERN, data)[1:]
    stamps = re.findall(_TIMESTAMP_PATTERN, data)

    df = pd.DataFrame({'user_message': bodies, 'message_date': stamps})
    df['date'] = _parse_message_dates(df['message_date'])

    # Separate the sender from the text of each entry.
    split_entries = [_split_sender(text) for text in df['user_message']]
    df['user'] = [user for user, _ in split_entries]
    df['message'] = [text for _, text in split_entries]
    df.drop(columns=['user_message'], inplace=True)

    # Calendar/time breakdown of the parsed timestamp.
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    df['period'] = [_period_label(hour) for hour in df['hour']]

    return df
|