File size: 3,967 Bytes
bd9abb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import pandas as pd
import zipfile
import re
from io import BytesIO


def detect_file_type(file_path):
    """
    Classify a chat export by its file extension.

    Args:
        file_path (str): Name or path of the uploaded file.

    Returns:
        str: "txt" or "zip" when the name ends in that extension (matched
        case-insensitively), otherwise "unknown".
    """
    # Require a real ".ext" suffix so a name such as "myzip" is not mistaken
    # for an archive, and fold case so "CHAT.ZIP" is still recognised.
    _, dot, extension = str(file_path).rpartition(".")
    extension = extension.lower()
    if dot and extension in ("txt", "zip"):
        return extension
    return "unknown"

# URLs are removed before lower-casing, so only lower-case schemes match.
_URL_PATTERN = re.compile(r'https?:\/\/\S+')

# Applied in order to the already lower-cased message text.
_NOISE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'(?<!\w)(:\s|\s:\s|\s:)',        # stray colons that do not follow a sender name
    r'\[image omitted\]',
    r'\[sticker omitted\]',
    r'\[document omitted\]',
    r'<se editó este mensaje\.?>',    # Spanish "message edited" marker
    r'<this message was edited\.?>',  # English marker; trailing period is optional
))


def _read_chat_text(file_path, file_type):
    """Return the decoded chat text from a zip archive, an in-memory upload, or a path."""
    if file_type == "zip":
        with zipfile.ZipFile(file_path, 'r') as archive:
            names = archive.namelist()
            # Exports with media list attachments next to the chat log, so
            # prefer a .txt member and skip macOS resource-fork entries.
            txt_names = [
                name for name in names
                if name.lower().endswith('.txt') and not name.startswith('__MACOSX/')
            ]
            member = txt_names[0] if txt_names else names[0]
            raw = archive.read(member)
    elif hasattr(file_path, 'getvalue'):
        # In-memory upload object exposing its bytes through getvalue().
        raw = file_path.getvalue()
    else:
        with open(file_path, 'rb') as handle:
            raw = handle.read()
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which would
    # otherwise make the first line's timestamp unparsable.
    return raw.decode('utf-8-sig')


def _clean_message_text(text):
    """Remove URLs, lower-case the text and strip WhatsApp placeholder markers."""
    text = _URL_PATTERN.sub('', text).lower()
    for pattern in _NOISE_PATTERNS:
        text = pattern.sub('', text)
    return text


def _empty_daily_frame():
    """Return an empty frame with the same columns as a normal result."""
    return pd.DataFrame({
        'date': pd.Series(dtype='datetime64[ns]'),
        'text': pd.Series(dtype=object),
    })


def preprocess_whatsapp_messages(file_path, file_type):
    """
    Preprocess a WhatsApp chat export into a Pandas DataFrame with one row
    per day, holding all messages of that day joined by newlines.

    Only lines of the form "[dd/mm/yy, HH:MM:SS] text" are kept; any line
    without a parsable leading timestamp (for example the continuation lines
    of a multi-line message) is dropped.

    Args:
        file_path: Path or file-like object of the zip archive when
            file_type is "zip"; otherwise an object exposing getvalue()
            that returns the raw bytes, or a path to a text file.
        file_type (str): "zip" to read the chat from an archive; any other
            value treats the input as a plain text export.

    Returns:
        pandas.DataFrame: Columns "date" (datetime64) and "text" (str),
        sorted by date. Empty when no line could be parsed.

    Raises:
        IndexError: If the zip archive contains no files.
        UnicodeDecodeError: If the chat file is not valid UTF-8.
    """
    text_data = _read_chat_text(file_path, file_type)

    # Split into lines; drop the '\r' left behind by CRLF line endings.
    lines = [line.rstrip('\r') for line in text_data.strip().split('\n')]

    # Separate "[timestamp" from the message text. reindex guarantees both
    # columns exist even when no line contains ']'.
    parts = (
        pd.Series(lines, dtype=object)
        .str.split(']', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    timestamps = pd.to_datetime(
        parts[0].str.strip('['),
        format='%d/%m/%y, %H:%M:%S',
        errors='coerce',
    )

    # Keep only the date; rows with a failed split or an unparsable
    # timestamp (NaT) are discarded.
    df = pd.DataFrame({'timestamp': timestamps.dt.date, 'text': parts[1]})
    df = df.dropna(subset=['timestamp', 'text'])
    if df.empty:
        return _empty_daily_frame()

    # Remove initial WhatsApp system messages in English and Spanish
    filter_text_en = "Your messages and calls are end-to-end encrypted"
    filter_text_es = "Los mensajes y las llamadas están cifrados de extremo a extremo"
    is_notice = (
        df['text'].str.contains(filter_text_en, regex=False, na=False)
        | df['text'].str.contains(filter_text_es, regex=False, na=False)
    )
    df = df[~is_notice].copy()
    if df.empty:
        return _empty_daily_frame()

    # Remove URLs, lower-case, and strip media/edit placeholders
    df['text'] = df['text'].map(_clean_message_text)

    # Group by date and concatenate all messages from the same date
    df = df.groupby('timestamp')['text'].agg('\n'.join).reset_index()
    df.columns = ['date', 'text']
    df['date'] = pd.to_datetime(df['date'])
    df['text'] = df['text'].astype(str)

    return df

def get_dated_input(data, selected_date):
    """
    Return the chat text stored for a single day.

    Args:
        data (pandas.DataFrame): Frame with a datetime64 "date" column and a
            "text" column (the shape returned by
            preprocess_whatsapp_messages).
        selected_date: Any value accepted by pandas.to_datetime; only its
            calendar date is used for matching.

    Returns:
        The "text" value of the first row whose date matches.

    Raises:
        IndexError: If no row exists for the requested date.
    """
    selected_day = pd.to_datetime(selected_date).date()
    matches = data.loc[data['date'].dt.date == selected_day, 'text']
    if matches.empty:
        # Same exception type that .iloc[0] raises on an empty selection,
        # but with a message that names the missing date.
        raise IndexError(f"No messages found for {selected_day}")
    return matches.iloc[0]