Create parsing.py
Browse files- parsing.py +46 -0
parsing.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import typing as t
|
| 3 |
+
|
| 4 |
+
BAD_CHARS_FOR_REGEX_REGEX = re.compile(r"[-\/\\^$*+?.()|[\]{}]")
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def _sanitize_string_for_use_in_a_regex(string: str) -> str:
|
| 8 |
+
'''Sanitizes `string` so it can be used inside of a regexp.'''
|
| 9 |
+
return BAD_CHARS_FOR_REGEX_REGEX.sub(r"\\\g<0>", string)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def parse_messages_from_str(string: str, names: t.List[str]) -> t.List[str]:
    '''
    Given a big string containing raw chat history, this function attempts to
    parse it out into a list where each item is an individual message.

    A message starts at any line that begins with `<name>:` for one of the
    given `names`, and runs until the next such line or the end of `string`.

    :param string: The raw chat history.
    :param names: Speaker names that can open a message. They are matched
        literally; regex metacharacters in a name have no special meaning.
    :returns: The messages in order of appearance, each stripped of
        surrounding whitespace. If fewer than two speaker prefixes are found
        (or `names` is empty), the whole stripped `string` is returned as a
        single message. Otherwise, any text that comes before the first
        speaker prefix is not included in the result.
    '''
    if not names:
        # Without any speaker names there is nothing to split on.
        return [string.strip()]

    # Escape each name so characters like "." or "(" are matched literally.
    name_alternation = "|".join(re.escape(name) for name in names)
    speaker_regex = re.compile(rf"^({name_alternation}): ?", re.MULTILINE)

    message_start_indexes = [
        match.start() for match in speaker_regex.finditer(string)
    ]

    if len(message_start_indexes) < 2:
        # Zero or one speaker prefix: treat the input as a single message.
        return [string.strip()]

    # Each message ends where the next one starts; the final message runs to
    # the end of the string (previously this last slice was never emitted).
    message_end_indexes = message_start_indexes[1:] + [len(string)]

    return [
        string[start_idx:end_idx].strip()
        for start_idx, end_idx in zip(message_start_indexes,
                                      message_end_indexes)
    ]
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def serialize_chat_history(history: t.List[str]) -> str:
    '''
    Flattens `history` into a single string by placing a newline character
    between consecutive entries.
    '''
    separator = "\n"
    return separator.join(history)
|