|
import re |
|
|
|
|
|
def clean_corpus(chat_export_file):
    """Turn a WhatsApp chat export into a tuple of training messages.

    The export is processed in two stages: per-message metadata is
    stripped first, then lines that are not part of the conversation
    are discarded.

    Args:
        chat_export_file (str): The name of the chat export file

    Returns:
        tuple: The cleaned messages, ready for training with chatterbot
    """
    return remove_non_message_text(remove_chat_metadata(chat_export_file))
|
|
|
|
|
def remove_chat_metadata(chat_export_file):
    """Remove WhatsApp chat metadata.

    WhatsApp chat exports come with metadata about each message:

     date    time    username  message
    ---------------------------------------
    8/26/22, 17:47 - Jane Doe: Message text

    This function removes all the metadata up to the text of each message.
    Text that does not match the metadata pattern is left untouched, so
    lines without a "date, time - username: " prefix are returned as-is.

    Args:
        chat_export_file (str): The name of the chat export file

    Returns:
        tuple: The file content split on newlines after metadata removal.
            If the file ends with a newline, the last element is an
            empty string.
    """
    date_time = r"(\d+\/\d+\/\d+,\s\d+:\d+)"  # e.g. "8/26/22, 17:47"
    dash_whitespace = r"\s-\s"  # the " - " separator
    username = r"([\w\s]+)"  # e.g. "Jane Doe"
    metadata_end = r":\s"  # the ": " before the message text
    pattern = date_time + dash_whitespace + username + metadata_end

    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale encoding, which would mangle or reject non-ASCII text and
    # emoji on systems whose default encoding is not UTF-8.
    with open(chat_export_file, "r", encoding="utf-8") as corpus_file:
        content = corpus_file.read()

    cleaned_corpus = re.sub(pattern, "", content)
    return tuple(cleaned_corpus.split("\n"))
|
|
|
|
|
def remove_non_message_text(export_text_lines):
    """Drop lines that are not part of the actual conversation.

    The first line (WhatsApp's standardized intro) and the last line
    (the empty line at the end of the file) are always discarded.
    Of the remaining lines, placeholders that text exports insert in
    place of media messages are removed as well.

    Args:
        export_text_lines (tuple): All lines from the export file

    Returns:
        tuple: Messages that are a relevant part of the conversation
    """
    placeholders = ("<Media omitted>",)

    # Slice off the intro line and the trailing empty line.
    candidate_lines = export_text_lines[1:-1]

    kept = []
    for line in candidate_lines:
        if line in placeholders:
            continue
        kept.append(line)
    return tuple(kept)
|
|