#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Wed Feb 28 18:21:59 2024 @author: selbl """ import streamlit as st from script import WhatsappAnalysis import re import pandas as pd def main(): st.set_page_config( page_title="Whatsapp Conversation Analysis", page_icon=":bar_chart:", # You can use an emoji or a URL to an image #layout="wide", initial_sidebar_state="auto", ) st.header("Whatsapp Chat Analysis",divider="blue") st.text("Always wanted to know who talks the most in your Whatsapp groups?") st.text("Want to check how much more active your group has been in the past years?") st.text("This script will take care of doing the hard word for you!") st.text("Just upload your conversation as a .txt file below") st.text("The file you upload IS NOT SAVED") st.text("If you are unsure on how to get a txt file refer to the sidebar") st.text("Please only upload .txt files") # Button to generate text uploaded_file = st.file_uploader("Upload your conversation here",type="txt") #Check that file exists if uploaded_file is not None: if not uploaded_file.name.endswith(".txt"): st.error("Please upload a valid .txt file!") else: df,nombregrupo = buildDF(uploaded_file) WhatsappAnalysis(df, nombregrupo) #Auxiliary function for converting from 12 hour to 24 hour format def convert_to_24hour(time_str): hour, minute, second, am_pm = re.findall('\d+|\w+', time_str) hour = int(hour) if am_pm == 'PM' and hour != 12: hour += 12 elif am_pm == 'AM' and hour == 12: hour = 0 return f'{hour:02d}:{minute}:{second}' def buildDF(file): #file_name = file.name #This can be obtained from the file name and finding what is between with and .txt #Define the column names of the databases we work with column_names = ["Date", "Time", "Sender","Message"] #Initialize an empty dataframe df = pd.DataFrame(columns = column_names) #Get the lines of the file #lines = file.iter_lines() #Work with each line #with open(file,encoding="utf8") as f: #Obtain text #lines = f.readlines() #Create a counter to extract group name cont = 0 #create boolean for bracket formatting (explained below) bracket_form = False #Make another boolean for 24hr format twentyfour_hr = True #Iterate over lines for line in file: #Decode line = line.decode('utf-8') #use initial line to check the format of the line if cont == 0: #If the first element is a bracket ([) then that means it has iPhone formatting #Which is: #‎[Date, Time AM/PM] User: ‎Message #This is opposed to Android formatting which is: #Date, Time (24hrs) - User: Message #Check by seeing if the brackets are there if line[0] == '[': bracket_form = True print(line) #Check the AM,PM time if (line.find('AM') != -1) or (line.find('PM') != -1): twentyfour_hr = False #Add to the counter cont += 1 continue #Check the counter if cont == 1: # Use re.search to find the match #match = re.search(r'created group "(.*?)"', str(line)) #Extract #nombregrupo = match.group(1) x_list = line.split() nombregrupo = x_list[-1][1:(len(x_list[-1])-1)] #Increase counter so it does not bother anymore cont += 1 continue #Check if it has delimiters #if (", " in line) and (": " in line) and (" - " in line): #With bracket format, the pattern changes if bracket_form: #Remove first bracket #line = line[1:] #Find other punctuations #comma_ind = line.find(',') #bracket_ind = line.find(']') #colon_ind = line.find(': ') #Store into a splitline #splitline = [line[:comma_ind],line[comma_ind+2:bracket_ind],line[bracket_ind+2:colon_ind],line[colon_ind+2:]] pattern = re.compile(r'(\d+/\d+/\d+), (\d+:\d+:\d+\s[APMapm]+)\] (.*?):\s(.*)') matches = pattern.findall(line) #if the pattern is not there just skip if not matches: continue splitline = list(matches[0]) #Define pattern depending on the type of format elif (", " in line) and (": " in line) and (" - " in line): splitline = re.split(', | - |: ',line) else: continue #Split #splitline = re.split(', | - |: ',line) #Delete the \n value #Only process if the length of the splitline allows it if len(splitline) == 4: splitline[len(splitline)-1] = splitline[len(splitline)-1].replace("\n","",1) #Check if I have to move the non-24 hour format to 24 hour format if not twentyfour_hr: splitline[1] = convert_to_24hour(splitline[1]) #If the sender is the same as the name of the group, then ignore if splitline[2] != nombregrupo: #Create line as a dictionary and then a dataframe line = {'Date' : splitline[0], 'Time' : splitline[1], 'Sender' : splitline[2],"Message":splitline[3]} line = pd.DataFrame([line]) #Add to dataframe df = pd.concat([df,line],ignore_index=True) #Move counter for the first line 0 index return df, nombregrupo if __name__ == "__main__": main()