Spaces:
Sleeping
Sleeping
ljyflores
committed on
Commit
•
facce4e
1
Parent(s):
850fcc9
Split report into sentences
Browse files
__pycache__/utils_casemaker.cpython-310.pyc
CHANGED
Binary files a/__pycache__/utils_casemaker.cpython-310.pyc and b/__pycache__/utils_casemaker.cpython-310.pyc differ
|
|
utils_casemaker.py
CHANGED
@@ -34,11 +34,20 @@ def clean(s: str) -> str:
|
|
34 |
return s
|
35 |
|
36 |
|
37 |
-
def
|
|
|
38 |
paragraphs = text.split("\n\n")
|
39 |
paragraphs = list(map(clean, paragraphs))
|
40 |
paragraphs = list(filter(lambda s: len(s.split()) > 10, paragraphs))
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
|
44 |
def format_casemaker_data(
|
@@ -137,8 +146,8 @@ class CaseMaker:
|
|
137 |
"""
|
138 |
report_string_by_organ = dict[str, str]()
|
139 |
|
140 |
-
# Split the report into a list of
|
141 |
-
paragraphs =
|
142 |
# Collect a list of paragraphs related to each organ
|
143 |
for p in paragraphs:
|
144 |
# Figure out which organ is being referenced
|
|
|
34 |
return s
|
35 |
|
36 |
|
37 |
+
def split_into_sentences(text: str):
|
38 |
+
# Split paragraphs
|
39 |
paragraphs = text.split("\n\n")
|
40 |
paragraphs = list(map(clean, paragraphs))
|
41 |
paragraphs = list(filter(lambda s: len(s.split()) > 10, paragraphs))
|
42 |
+
|
43 |
+
# Split into sentences
|
44 |
+
sentences = [sent_tokenize(p) for p in paragraphs]
|
45 |
+
sentences = [
|
46 |
+
sent
|
47 |
+
for lst in sentences
|
48 |
+
for sent in lst if isinstance(sent, str)
|
49 |
+
]
|
50 |
+
return sentences
|
51 |
|
52 |
|
53 |
def format_casemaker_data(
|
|
|
146 |
"""
|
147 |
report_string_by_organ = dict[str, str]()
|
148 |
|
149 |
+
# Split the report into a list of sentences
|
150 |
+
paragraphs = split_into_sentences(report)
|
151 |
# Collect a list of paragraphs related to each organ
|
152 |
for p in paragraphs:
|
153 |
# Figure out which organ is being referenced
|