r3aperdev committed on
Commit
088dc83
·
1 Parent(s): 95c1bb3

Create parsing.py

Browse files
Files changed (1) hide show
  1. parsing.py +46 -0
parsing.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import typing as t
3
+
4
# Character class matching any single one of these regex metacharacters:
#   - / \ ^ $ * + ? . ( ) | [ ] { }
BAD_CHARS_FOR_REGEX_REGEX = re.compile(
    r"[-\/\\^$*+?.()|[\]{}]"
)
5
+
6
+
7
def _sanitize_string_for_use_in_a_regex(string: str) -> str:
    '''
    Backslash-escapes regex metacharacters in `string`.

    Every character matched by BAD_CHARS_FOR_REGEX_REGEX is prefixed with a
    single backslash; all other characters are passed through unchanged.
    '''

    def _prefix_with_backslash(hit):
        # hit.group(0) is the one metacharacter that was matched.
        return "\\" + hit.group(0)

    return BAD_CHARS_FOR_REGEX_REGEX.sub(_prefix_with_backslash, string)
10
+
11
+
12
def parse_messages_from_str(string: str, names: t.List[str]) -> t.List[str]:
    '''
    Splits a big string of raw chat history into individual messages.

    A message starts at any line that begins with one of `names` followed by
    a colon (e.g. "Alice: hi") and runs up to the next such line, so
    multi-line messages stay in one piece.

    Args:
        string: Raw chat history.
        names: Speaker names to look for. They are matched literally, even
            when they contain regex metacharacters.

    Returns:
        The stripped messages in order of appearance, each still carrying
        its "Name:" prefix. When fewer than two speaker lines are found, the
        whole stripped input is returned as a single item. Otherwise, any
        text before the first speaker line is discarded.
    '''
    # re.escape is the standard-library way to make each name match
    # literally (e.g. "Mr. X" or "C-3PO (bot)").
    name_alternatives = "|".join(re.escape(name) for name in names)
    speaker_regex = re.compile(rf"^({name_alternatives}): ?", re.MULTILINE)

    message_start_indexes = [
        match.start() for match in speaker_regex.finditer(string)
    ]

    if len(message_start_indexes) < 2:
        # Zero or one speaker line: treat the whole string as one message.
        return [string.strip()]

    # Each message ends where the next one starts. The last message has no
    # successor, so its end is None (slice to end of string); without this
    # the final message would be silently dropped.
    message_end_indexes = message_start_indexes[1:] + [None]

    return [
        string[start_idx:end_idx].strip()
        for start_idx, end_idx in zip(message_start_indexes,
                                      message_end_indexes)
    ]
42
+
43
+
44
def serialize_chat_history(history: t.List[str]) -> str:
    '''
    Collapses a chat history (a list of message strings) into one string,
    with consecutive messages separated by a single newline.
    '''
    line_break = "\n"
    return line_break.join(history)