MedNERN-CR-JA / utils.py
gabrielandrade2's picture
Update model with additional negative examples, improve support scripts
576d564
import re
# Sentence-boundary pattern, compiled once at import time.
#
# Written with raw strings so that regex escapes such as ``\w``, ``\.`` and
# ``\s`` are not parsed as (invalid) Python string escapes, which newer
# interpreters warn about.
#
# NOTE(review): the duplicated characters in the classes below (``..``,
# ``??``, ``!!``, ``0-90-9``) look like full-width variants that were
# flattened somewhere along the way -- confirm against the upstream file.
# NOTE(review): the ``A-z`` range also covers the ASCII characters between
# ``Z`` and ``a`` (``[ \ ] ^ _ `` and the backtick); kept as-is so existing
# results do not change.
_SENTENCE_BOUNDARY = re.compile(
    # Alternative 1: one whitespace character (plus any following newlines)
    # that comes right after ``.``, ``?`` or ``!``, unless the preceding text
    # looks like a dotted abbreviation ("e.g.") or a capitalised two-letter
    # abbreviation ("Mr.").
    r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s\n*"
    r"|"
    # Alternative 2: the position right after a sentence-final mark whose
    # preceding character is not an ASCII letter, digit or space, provided the
    # mark is not followed by ``.`` or a closing ``」``. Only trailing newlines
    # are consumed, so this alternative may match an empty string.
    r"(?<=[^A-zA-z0-90-9 ].)(?<=[。..??!!])(?![\.」])\n*"
)


def split_sentences(text):
    """Given a string, split it into sentences.

    The text is cut at every match of ``_SENTENCE_BOUNDARY``. Each resulting
    piece is stripped of surrounding whitespace and empty pieces are dropped,
    so an empty or whitespace-only input yields an empty list.

    The second alternative of the pattern can match an empty string; splitting
    on empty matches is only performed by ``re.split`` on Python 3.7 and later.

    :param text: The string to be processed.
    :return: The list of split sentences.
    """
    stripped_pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [piece for piece in stripped_pieces if piece]