Spaces:
Sleeping
Sleeping
import re | |
import pandas as pd | |
def checker(data):
    """Return True if *data* contains at least one chat timestamp header.

    A header looks like ``d/m/yy, h:mm am - `` (12-hour clock, lower-case
    ``am``/``pm`` only) or ``d/m/yy, h:mm - `` (24-hour clock). The year may
    have 2 to 4 digits. Presumably this validates a chat-export text file
    before it is parsed; confirm against the caller.

    Args:
        data: Full text to inspect.

    Returns:
        bool: True when either header form occurs anywhere in *data*.
    """
    # Raw strings keep the regex escapes (\d, \s) out of Python's own
    # string-escape processing, which warns on unknown escapes.
    # The meridiem is matched as exactly "am" or "pm"; a character class
    # such as [a,m,p]+ would also accept commas and arbitrary letter runs.
    pattern_12 = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{1,2}\s[ap]m\s-\s'
    pattern_24 = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{1,2}\s-\s'
    return bool(re.search(pattern_12, data) or re.search(pattern_24, data))
def _parse_message_dates(stamps, short_year_format, long_year_format):
    """Parse header strings, trying the 2-digit-year format before the 4-digit one."""
    try:
        return pd.to_datetime(stamps, format=short_year_format)
    except ValueError:
        # A 4-digit year does not fit %y, so retry with %Y.
        return pd.to_datetime(stamps, format=long_year_format)


def _hour_period(hour):
    """Return the one-hour bucket label for *hour*, e.g. ``'22 - 23'``."""
    if hour == 23:
        return str(hour) + ' - ' + '00'
    if hour == 0:
        return '00' + ' - ' + str(hour + 1)
    return str(hour) + ' - ' + str(hour + 1)


def preprocess(data):
    """Parse exported chat text into a per-message DataFrame.

    The text is split on timestamp headers of the form
    ``d/m/yy, h:mm am - `` (12-hour clock, lower-case ``am``/``pm``) or
    ``d/m/yy, h:mm - `` (24-hour clock). If any 12-hour header is present
    the whole text is treated as 12-hour. Dates are read day-first, with
    a 2-digit year tried before a 4-digit year.

    Args:
        data: Full chat text.

    Returns:
        pandas.DataFrame: One row per message, with columns ``date``,
        ``user``, ``message``, ``only_date``, ``year``, ``month``, ``day``,
        ``day_name``, ``hour``, ``minute`` and ``period``. Lines without a
        ``sender: `` prefix get the user ``'group_notification'``. Senders
        that look like a phone number are prefixed with ``'M:'``.

    Raises:
        ValueError: If the matched headers cannot be parsed with either
            the 2-digit-year or the 4-digit-year format.
    """
    # Raw strings keep regex escapes out of Python's string-escape handling.
    # The meridiem is exactly "am"/"pm" rather than a character class that
    # would also accept commas.
    pattern_12 = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{1,2}\s[ap]m\s-\s'
    pattern_24 = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{1,2}\s-\s'
    pattern_ph_num = r'\d{1,2} \d{5} \d{5}'
    sender_split = re.compile(r'([\w\W]+?):\s')

    if re.search(pattern_12, data):
        # For 12 hour format
        stamp_pattern = pattern_12
        date_formats = ('%d/%m/%y, %I:%M %p - ', '%d/%m/%Y, %I:%M %p - ')
    else:
        # For 24 hour format
        stamp_pattern = pattern_24
        date_formats = ('%d/%m/%y, %H:%M - ', '%d/%m/%Y, %H:%M - ')

    # Element 0 of the split is whatever precedes the first header; drop it
    # so messages and headers line up one-to-one.
    raw_messages = re.split(stamp_pattern, data)[1:]
    stamps = re.findall(stamp_pattern, data)
    df = pd.DataFrame({'user_message': raw_messages, 'message_date': stamps})
    df['message_date'] = _parse_message_dates(df['message_date'], *date_formats)
    df.rename(columns={'message_date': 'date'}, inplace=True)

    users = []
    messages = []
    for raw in df['user_message']:
        # maxsplit=1: split only at the first "sender: " boundary. Without
        # it, every later ": " in the body is also split on and the message
        # text is lost.
        # NOTE(review): a system line that itself contains ": " is still
        # read as "sender: text"; the format gives no way to tell them apart.
        entry = sender_split.split(raw, maxsplit=1)
        if entry[1:]:
            sender = entry[1]
            if re.search(pattern_ph_num, sender):
                users.append('M:' + str(sender))
            else:
                users.append(sender)
            messages.append(entry[2])
        else:
            users.append('group_notification')
            messages.append(entry[0])

    df['user'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute
    df['period'] = [_hour_period(hour) for hour in df['hour']]
    return df