Arafath10 committed on
Commit
5e574c8
1 Parent(s): deeb5eb

Upload cleaner.py

Browse files
Files changed (1) hide show
  1. cleaner.py +57 -0
cleaner.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
def clean_corpus(chat_export_file):
    """Prepare a WhatsApp chat export for training with chatterbot.

    The export is processed in two stages: ``remove_chat_metadata`` reads
    the file and strips the per-message metadata prefix, then
    ``remove_non_message_text`` discards the lines that are not part of
    the conversation.

    Args:
        chat_export_file (str): The name of the chat export file

    Returns:
        tuple: The result of ``remove_non_message_text`` applied to the
        metadata-free lines of the export
    """
    return remove_non_message_text(remove_chat_metadata(chat_export_file))
9
+
10
+
11
def remove_chat_metadata(chat_export_file):
    """Remove WhatsApp chat metadata.

    WhatsApp chat exports come with metadata about each message:

     date    time    username  message
    ---------------------------------------
    8/26/22, 17:47 - Jane Doe: Message text

    This function removes all the metadata up to the text of each message.
    Lines that do not carry a matching metadata prefix are left unchanged.

    Args:
        chat_export_file (str): The name of the chat export file

    Returns:
        tuple: The file content split on "\\n" with every metadata prefix
        removed. A file that ends with a newline therefore yields a
        trailing empty string as its last element.
    """
    date_time = r"(\d+\/\d+\/\d+,\s\d+:\d+)"  # e.g. "8/26/22, 17:47"
    dash_whitespace = r"\s-\s"  # " - "
    # Word characters and whitespace, but never a line break: the metadata
    # prefix lives on one line, so a username must not be allowed to run
    # into the following line and merge two lines together.
    username = r"((?:\w|[^\S\r\n])+)"  # e.g. "Jane Doe"
    metadata_end = r":\s"  # ": "
    pattern = date_time + dash_whitespace + username + metadata_end

    # Read explicitly as UTF-8 so that emoji and accented characters are
    # decoded the same way regardless of the platform's default encoding.
    with open(chat_export_file, "r", encoding="utf-8") as corpus_file:
        content = corpus_file.read()
    cleaned_corpus = re.sub(pattern, "", content)
    return tuple(cleaned_corpus.split("\n"))
38
+
39
+
40
def remove_non_message_text(export_text_lines):
    """Remove conversation-irrelevant text from chat export.

    WhatsApp chat exports come with a standardized intro line,
    and an empty line at the end of the file.
    Text exports also replace media messages with text that isn't
    relevant for the conversation. This function removes all that.

    The first line is always dropped. The last line is dropped only when
    it is empty, so an input without the trailing empty line does not
    lose its final message.

    Args:
        export_text_lines (tuple): All lines from the export file

    Returns:
        tuple: Messages that are a relevant part of the conversation
    """
    # Skip the intro line.
    messages = export_text_lines[1:]
    # Strip the trailing empty line only if it is actually there.
    if messages and messages[-1] == "":
        messages = messages[:-1]

    filter_out_msgs = ("<Media omitted>",)
    return tuple(msg for msg in messages if msg not in filter_out_msgs)