Spaces:
Sleeping
Sleeping
abnerguzman
committed on
Commit
•
27178c5
1
Parent(s):
6584d90
Update demo.py
Browse files
demo.py
CHANGED
@@ -1,7 +1,15 @@
|
|
1 |
from collections import defaultdict
|
|
|
2 |
import time
|
3 |
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
<style>
|
6 |
.section-title {
|
7 |
/* font-family: cursive, sans-serif; */
|
@@ -64,6 +72,10 @@ style_str = """
|
|
64 |
/* font-weight: bolder; */
|
65 |
/* font-style: italic; */
|
66 |
}
|
|
|
|
|
|
|
|
|
67 |
|
68 |
.doc-title {
|
69 |
/* font-family: cursive, sans-serif; */
|
@@ -72,7 +84,7 @@ style_str = """
|
|
72 |
display: inline-block;
|
73 |
font-size: 2em;
|
74 |
font-weight: bolder;
|
75 |
-
padding-top:
|
76 |
/* font-style: italic; */
|
77 |
}
|
78 |
.doc-url {
|
@@ -97,6 +109,150 @@ style_str = """
|
|
97 |
/* font-style: italic; */
|
98 |
color: #0000FF;
|
99 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
.doc-title > img {
|
101 |
width: 22px;
|
102 |
height: 22px;
|
@@ -119,27 +275,166 @@ chunk_separator = '<span class="chunk-separator">[...]</span>'
|
|
119 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
120 |
|
121 |
sentence_splitter = RecursiveCharacterTextSplitter(
|
122 |
-
chunk_size=
|
123 |
chunk_overlap=0,
|
124 |
separators=["\n\n", "\n", "."],
|
125 |
keep_separator=False
|
126 |
)
|
127 |
|
128 |
-
def
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
for atom_support in atom_support_l:
|
132 |
for url, aggmatch_determination in atom_support.items():
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
|
144 |
def format_chunk_texts_for_display(cid_ctext_tuples):
|
145 |
ids_l = [int(x[0].split('-')[1]) for x in cid_ctext_tuples]
|
@@ -161,5 +456,74 @@ def format_chunk_texts_for_display(cid_ctext_tuples):
|
|
161 |
match_text += chunk_separator
|
162 |
return match_text
|
163 |
|
164 |
-
|
165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from collections import defaultdict
|
2 |
+
import os
|
3 |
import time
|
4 |
|
5 |
+
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
|
6 |
+
PINECONE_HOST = 'prorata-postman-ds-ul-dp9xwvt.svc.aped-4627-b74a.pinecone.io'
|
7 |
+
|
8 |
+
from pinecone import Pinecone
|
9 |
+
pc = Pinecone(api_key=PINECONE_API_KEY)
|
10 |
+
pc_ul = pc.Index('prorata-postman-ds-ul')
|
11 |
+
|
12 |
+
style1_str = """
|
13 |
<style>
|
14 |
.section-title {
|
15 |
/* font-family: cursive, sans-serif; */
|
|
|
72 |
/* font-weight: bolder; */
|
73 |
/* font-style: italic; */
|
74 |
}
|
75 |
+
.claim-text .one-quote {
|
76 |
+
/* font-style: italic; */
|
77 |
+
color: #C70039;
|
78 |
+
}
|
79 |
|
80 |
.doc-title {
|
81 |
/* font-family: cursive, sans-serif; */
|
|
|
84 |
display: inline-block;
|
85 |
font-size: 2em;
|
86 |
font-weight: bolder;
|
87 |
+
padding-top: 30px;
|
88 |
/* font-style: italic; */
|
89 |
}
|
90 |
.doc-url {
|
|
|
109 |
/* font-style: italic; */
|
110 |
color: #0000FF;
|
111 |
}
|
112 |
+
.doc-text .one-quote {
|
113 |
+
/* font-style: italic; */
|
114 |
+
color: #C70039;
|
115 |
+
}
|
116 |
+
.doc-title > img {
|
117 |
+
width: 22px;
|
118 |
+
height: 22px;
|
119 |
+
border-radius: 50%;
|
120 |
+
overflow: hidden;
|
121 |
+
background-color: transparent;
|
122 |
+
display: inline-block;
|
123 |
+
vertical-align: middle;
|
124 |
+
}
|
125 |
+
.doc-title > score {
|
126 |
+
font-family: Optima, sans-serif;
|
127 |
+
font-weight: normal;
|
128 |
+
float: right;
|
129 |
+
}
|
130 |
+
</style>
|
131 |
+
"""
|
132 |
+
|
133 |
+
style2_str = """
|
134 |
+
<style>
|
135 |
+
.section-title {
|
136 |
+
/* font-family: cursive, sans-serif; */
|
137 |
+
font-family: Optima, sans-serif;
|
138 |
+
width: 100%;
|
139 |
+
font-size: 2.5em;
|
140 |
+
font-weight: bolder;
|
141 |
+
padding-bottom: 20px;
|
142 |
+
padding-top: 20px;
|
143 |
+
/* font-style: italic; */
|
144 |
+
}
|
145 |
+
.claim-header {
|
146 |
+
/* font-family: cursive, sans-serif; */
|
147 |
+
font-family: Optima, sans-serif;
|
148 |
+
width: 100%;
|
149 |
+
font-size: 1.5em;
|
150 |
+
font-weight: normal;
|
151 |
+
padding-bottom: 10px;
|
152 |
+
padding-top: 10px;
|
153 |
+
/* font-style: italic; */
|
154 |
+
}
|
155 |
+
.claim-doc-title {
|
156 |
+
/* font-family: cursive, sans-serif; */
|
157 |
+
font-family: Optima, sans-serif;
|
158 |
+
width: 100%;
|
159 |
+
font-size: 1.25em;
|
160 |
+
font-weight: normal;
|
161 |
+
padding-left: 20px;
|
162 |
+
padding-bottom: 5px;
|
163 |
+
padding-top: 10px;
|
164 |
+
/* font-style: italic; */
|
165 |
+
}
|
166 |
+
.claim-doc-url {
|
167 |
+
/* font-family: cursive, sans-serif; */
|
168 |
+
font-size: 0.75em;
|
169 |
+
padding-left: 20px;
|
170 |
+
padding-bottom: 10px;
|
171 |
+
padding-top: 0px;
|
172 |
+
/* font-weight: bolder; */
|
173 |
+
/* font-style: italic; */
|
174 |
+
}
|
175 |
+
.claim-determination {
|
176 |
+
/* font-family: cursive, sans-serif; */
|
177 |
+
font-family: Optima, sans-serif;
|
178 |
+
width: 100%;
|
179 |
+
font-size: 1em;
|
180 |
+
font-weight: normal;
|
181 |
+
padding-left: 60px;
|
182 |
+
padding-bottom: 10px;
|
183 |
+
/* font-style: italic; */
|
184 |
+
}
|
185 |
+
.claim-text {
|
186 |
+
/* font-family: cursive, sans-serif; */
|
187 |
+
font-family: Optima, sans-serif;
|
188 |
+
font-size: 1em;
|
189 |
+
white-space: pre-wrap;
|
190 |
+
padding-left: 80px;
|
191 |
+
text-indent: -20px;
|
192 |
+
padding-bottom: 20px;
|
193 |
+
/* font-weight: bolder; */
|
194 |
+
/* font-style: italic; */
|
195 |
+
}
|
196 |
+
.claim-text .one-quote {
|
197 |
+
/* font-style: italic; */
|
198 |
+
color: #C70039;
|
199 |
+
}
|
200 |
+
|
201 |
+
.doc-title {
|
202 |
+
/* font-family: cursive, sans-serif; */
|
203 |
+
font-family: Optima, sans-serif;
|
204 |
+
width: 100%;
|
205 |
+
display: inline-block;
|
206 |
+
font-size: 2em;
|
207 |
+
font-weight: bolder;
|
208 |
+
padding-top: 30px;
|
209 |
+
/* font-style: italic; */
|
210 |
+
}
|
211 |
+
.doc-url {
|
212 |
+
/* font-family: cursive, sans-serif; */
|
213 |
+
font-size: 1em;
|
214 |
+
padding-left: 40px;
|
215 |
+
padding-bottom: 10px;
|
216 |
+
/* font-weight: bolder; */
|
217 |
+
/* font-style: italic; */
|
218 |
+
}
|
219 |
+
.doc-text-wrapper {
|
220 |
+
width: 100%;
|
221 |
+
overflow: hidden;
|
222 |
+
|
223 |
+
/* font-family: cursive, sans-serif; */
|
224 |
+
font-family: Optima, sans-serif;
|
225 |
+
font-size: 1.25em;
|
226 |
+
padding-left: 40px;
|
227 |
+
padding-bottom: 20px;
|
228 |
+
padding-top: 10px;
|
229 |
+
/* font-weight: bolder; */
|
230 |
+
/* font-style: italic; */
|
231 |
+
}
|
232 |
+
.doc-text-wrapper .doc-text-left {
|
233 |
+
float: left;
|
234 |
+
width: 60%;
|
235 |
+
padding-right: 20px;
|
236 |
+
overflow-y: auto;
|
237 |
+
height: 200px;
|
238 |
+
white-space: pre-wrap;
|
239 |
+
}
|
240 |
+
.doc-text-wrapper .doc-text-right {
|
241 |
+
float: left;
|
242 |
+
width: 40%;
|
243 |
+
padding-left: 20px;
|
244 |
+
overflow-y: auto;
|
245 |
+
height: 200px;
|
246 |
+
white-space: pre-wrap;
|
247 |
+
}
|
248 |
+
.doc-text-wrapper .chunk-separator {
|
249 |
+
/* font-style: italic; */
|
250 |
+
color: #0000FF;
|
251 |
+
}
|
252 |
+
.doc-text-wrapper .one-quote {
|
253 |
+
/* font-style: italic; */
|
254 |
+
color: #C70039;
|
255 |
+
}
|
256 |
.doc-title > img {
|
257 |
width: 22px;
|
258 |
height: 22px;
|
|
|
275 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
276 |
|
277 |
sentence_splitter = RecursiveCharacterTextSplitter(
|
278 |
+
chunk_size=1024,
|
279 |
chunk_overlap=0,
|
280 |
separators=["\n\n", "\n", "."],
|
281 |
keep_separator=False
|
282 |
)
|
283 |
|
284 |
+
# def get_article_from_url(url):
|
285 |
+
# headers = {
|
286 |
+
# "Content-Type": "application/json",
|
287 |
+
# "Api-Key": PINECONE_API_KEY
|
288 |
+
# }
|
289 |
+
# data = {
|
290 |
+
# "id": url,
|
291 |
+
# "topK": 1,
|
292 |
+
# "includeMetadata": True,
|
293 |
+
# }
|
294 |
+
# res = requests.post(f"https://{PINECONE_HOST}/query", headers=headers, json=data)
|
295 |
+
|
296 |
+
# if not res:
|
297 |
+
# return {}
|
298 |
+
|
299 |
+
# top_match_metadata = res.json()['matches'][0]['metadata']
|
300 |
+
# return {
|
301 |
+
# 'title': top_match_metadata['title'],
|
302 |
+
# 'url': top_match_metadata['url'],
|
303 |
+
# 'text': top_match_metadata['text'],
|
304 |
+
# }
|
305 |
+
def get_article_from_url(url):
    """Fetch the stored article for ``url`` from the ``pc_ul`` Pinecone index.

    The index is queried by id (``url``) for a single result with metadata
    included. The ``title``, ``url`` and ``text`` metadata fields of the top
    match are returned as a dict; an empty dict is returned when the query
    produces no matches.
    """
    response = pc_ul.query(id=url, top_k=1, include_metadata=True)
    matches = response['matches']
    if not matches:
        return {}

    metadata = matches[0]['metadata']
    return {field: metadata[field] for field in ('title', 'url', 'text')}
|
315 |
+
|
316 |
+
def print_w_time_elapsed(msg, start_time, file=None):
    """Print ``msg`` followed by the seconds elapsed since ``start_time``.

    ``start_time`` is a ``time.perf_counter()`` reading taken earlier. The
    elapsed time is rendered with two decimals, and ``file`` is forwarded to
    ``print`` (``None`` means standard output).
    """
    elapsed_secs = time.perf_counter() - start_time
    print(f"{msg} ({elapsed_secs:.2f} secs)", file=file)
|
318 |
+
|
319 |
+
# def _add_chunk_text_formatted_l_aggmatch_determination(aggmatch_determination):
|
320 |
+
# chunk_text_l = aggmatch_determination['chunk_text_l']
|
321 |
+
# n_chunks = len(chunk_text_l)
|
322 |
+
|
323 |
+
# if 'quote_matches_l' not in aggmatch_determination:
|
324 |
+
# aggmatch_determination['chunk_support_flags'] = n_chunks*[True]
|
325 |
+
# aggmatch_determination['chunk_text_formatted_l'] = chunk_text_l
|
326 |
+
# return
|
327 |
+
|
328 |
+
# quote_matches_l = aggmatch_determination['quote_matches_l']
|
329 |
+
|
330 |
+
# last_end, coffset = 0, 0
|
331 |
+
# chunk_support_flags = [False]*n_chunks
|
332 |
+
# chunk_text_formatted_l = []
|
333 |
+
|
334 |
+
# for cidx, ctext in enumerate(chunk_text_l):
|
335 |
+
# ctext_formatted = ""
|
336 |
+
|
337 |
+
# for quote_match in quote_matches_l:
|
338 |
+
# if quote_match['start'] > coffset and quote_match['end'] <= coffset + len(ctext):
|
339 |
+
# chunk_support_flags[cidx] = True
|
340 |
+
# # TODO: handle case where quote spans across chunks
|
341 |
+
# ctext_formatted += ctext[last_end-coffset:quote_match['start']-coffset]
|
342 |
+
# ctext_formatted += quote_start + ctext[quote_match['start']-coffset:quote_match['end']-coffset] + quote_end
|
343 |
+
# last_end = quote_match['end']
|
344 |
+
|
345 |
+
# ctext_formatted += ctext[last_end-coffset:]
|
346 |
+
# chunk_text_formatted_l.append(ctext_formatted)
|
347 |
+
|
348 |
+
# coffset += len(ctext) + 2
|
349 |
+
# last_end = coffset
|
350 |
+
|
351 |
+
# aggmatch_determination['chunk_support_flags'] = chunk_support_flags
|
352 |
+
# aggmatch_determination['chunk_text_formatted_l'] = chunk_text_formatted_l
|
353 |
+
|
354 |
+
# # TODO: need to operate on single copy of each chunk (so all quotes are kept)
|
355 |
+
# def _add_chunk_text_formatted_l(atom_support_l):
|
356 |
+
# for atom_support in atom_support_l:
|
357 |
+
# for url, aggmatch_determination in atom_support.items():
|
358 |
+
# _add_chunk_text_formatted_l_aggmatch_determination(aggmatch_determination)
|
359 |
+
|
360 |
+
def create_url_to_cid_to_ctext_formatted_map(atom_support_l):
|
361 |
+
url_to_cid_to_ctext_map = defaultdict(dict)
|
362 |
+
url_to_cid_to_ctext_formatted_map = defaultdict(dict)
|
363 |
+
url_to_cid_to_nquotes_map = defaultdict(dict)
|
364 |
+
|
365 |
for atom_support in atom_support_l:
|
366 |
for url, aggmatch_determination in atom_support.items():
|
367 |
+
cid_to_ctext_map = url_to_cid_to_ctext_map[url]
|
368 |
+
cid_to_ctext_formatted_map = url_to_cid_to_ctext_formatted_map[url]
|
369 |
+
cid_to_nquotes_map = url_to_cid_to_nquotes_map[url]
|
370 |
+
|
371 |
+
chunk_id_l = aggmatch_determination['id_l']
|
372 |
+
chunk_text_l = aggmatch_determination['chunk_text_l']
|
373 |
+
|
374 |
+
for cid, ctext in zip(chunk_id_l, chunk_text_l):
|
375 |
+
cid_to_ctext_map[cid] = ctext
|
376 |
+
|
377 |
+
quote_matches_l = aggmatch_determination.get('quote_matches_l', None)
|
378 |
+
if quote_matches_l:
|
379 |
+
last_end, coffset = 0, 0
|
380 |
+
chunk_text_formatted_l = []
|
381 |
+
|
382 |
+
for cid, ctext in zip(chunk_id_l, chunk_text_l):
|
383 |
+
nquotes = 0
|
384 |
+
ctext_formatted = ""
|
385 |
+
|
386 |
+
for quote_match in quote_matches_l:
|
387 |
+
if quote_match['start'] >= coffset and quote_match['end'] <= coffset + len(ctext):
|
388 |
+
nquotes += 1
|
389 |
+
# TODO: handle case were quote spans across chunks
|
390 |
+
ctext_formatted += ctext[last_end-coffset:quote_match['start']-coffset]
|
391 |
+
ctext_formatted += quote_start + ctext[quote_match['start']-coffset:quote_match['end']-coffset] + quote_end
|
392 |
+
last_end = quote_match['end']
|
393 |
+
|
394 |
+
ctext_formatted += ctext[last_end-coffset:]
|
395 |
+
chunk_text_formatted_l.append(ctext_formatted)
|
396 |
+
|
397 |
+
coffset += len(ctext) + 2
|
398 |
+
last_end = coffset
|
399 |
+
|
400 |
+
# this one used in per claim breakdown
|
401 |
+
aggmatch_determination['chunk_text_formatted_l'] = chunk_text_formatted_l
|
402 |
+
|
403 |
+
# these are for the main view
|
404 |
+
if not cid in cid_to_nquotes_map or nquotes > cid_to_nquotes_map[cid]:
|
405 |
+
print(f"\n\n### {url} storing formatted cid={cid} ctext:")
|
406 |
+
print(f"quote_matches_l={quote_matches_l}")
|
407 |
+
print(f"nquotes={nquotes}, ctext_formatted={ctext_formatted}")
|
408 |
+
cid_to_nquotes_map[cid] = nquotes
|
409 |
+
cid_to_ctext_formatted_map[cid] = ctext_formatted
|
410 |
+
|
411 |
+
return url_to_cid_to_ctext_map, url_to_cid_to_ctext_formatted_map, url_to_cid_to_nquotes_map
|
412 |
+
|
413 |
+
# def get_url_to_supporting_cid_ctext_tuples(atom_support_l):
|
414 |
+
# url_to_supporting_cid_quote_flag_map = defaultdict(dict)
|
415 |
+
# url_to_supporting_cid_ctext_map = defaultdict(dict)
|
416 |
+
# for atom_support in atom_support_l:
|
417 |
+
# for url, aggmatch_determination in atom_support.items():
|
418 |
+
# if aggmatch_determination['true']:
|
419 |
+
# use_formatted = 'chunk_text_formatted_l' in aggmatch_determination
|
420 |
+
# include_only_formatted = use_formatted and any(aggmatch_determination['chunk_support_flags'])
|
421 |
+
|
422 |
+
# chunk_text_l_key = 'chunk_text_formatted_l' if use_formatted else 'chunk_text_l'
|
423 |
+
|
424 |
+
# for lidx, (cid, ctext) in enumerate(zip(aggmatch_determination['id_l'], aggmatch_determination[chunk_text_l_key])):
|
425 |
+
# chunk_has_quote = aggmatch_determination['chunk_support_flags'][lidx]
|
426 |
+
|
427 |
+
# if cid not in url_to_supporting_cid_quote_flag_map[url] or not url_to_supporting_cid_quote_flag_map[url][cid]:
|
428 |
+
# if not include_only_formatted or chunk_has_quote:
|
429 |
+
# url_to_supporting_cid_quote_flag_map[url][cid] = chunk_has_quote
|
430 |
+
# url_to_supporting_cid_ctext_map[url][cid] = ctext
|
431 |
+
# # now sort each list of chunks
|
432 |
+
# url_to_supporting_cid_ctext_tuples = {}
|
433 |
+
# for url, cid_ctext_map in url_to_supporting_cid_ctext_map.items():
|
434 |
+
# # url_to_supporting_cid_ctext_tuples[url] = sorted(cid_ctext_tuple_l, key=lambda x: x[0])
|
435 |
+
# url_to_supporting_cid_ctext_tuples[url] = sorted(list(cid_ctext_map.items()), key=lambda x: x[0])
|
436 |
+
# # pprint.pp(url_to_supporting_cid_ctext_tuples)
|
437 |
+
# return url_to_supporting_cid_ctext_tuples
|
438 |
|
439 |
def format_chunk_texts_for_display(cid_ctext_tuples):
|
440 |
ids_l = [int(x[0].split('-')[1]) for x in cid_ctext_tuples]
|
|
|
456 |
match_text += chunk_separator
|
457 |
return match_text
|
458 |
|
459 |
+
quote_start = '<span class="one-quote">'
|
460 |
+
quote_end = '</span>'
|
461 |
+
|
462 |
+
|
463 |
+
import re
|
464 |
+
|
465 |
+
quote_pattern_l = [
|
466 |
+
r"(\n[\s]*){1}\<span class=\"one-quote\"\>[\S\s]*\</span\>",
|
467 |
+
r"(\n\s*[A-Z“\"]){1}.*\<span class=\"one-quote\"\>[\S\s]*\</span\>",
|
468 |
+
r"(\n\s*[A-Z“\"]){1}[\S\s]*\<span class=\"one-quote\"\>[\S\s]*\</span\>",
|
469 |
+
r"(\n|^){1}[\S\s]*\<span class=\"one-quote\"\>[\S\s]*\</span\>",
|
470 |
+
]
|
471 |
+
|
472 |
+
def format_chunk_texts_for_display2(url, cid_ctext_tuples):
    """Join chunk texts into one display string, favouring quoted chunks.

    Args:
        url: only used in the debug output.
        cid_ctext_tuples: sequence of ``(chunk_id, chunk_text)`` tuples. The
            integer after the first ``'-'`` in each chunk id is taken as the
            chunk's index.

    Returns:
        The chunk texts joined with blank lines. A chunk is followed by
        ``chunk_separator`` when it was truncated or when its index differs
        from the next chunk's. If at least one chunk contains a highlighted
        quote (any pattern in ``quote_pattern_l`` matches), only chunks with
        quotes are included.

    Chunks longer than 512 characters are cut to start at the first quote
    match (if any) and then reduced to the first piece produced by
    ``sentence_splitter``.
    """
    ids_l = [int(x[0].split('-')[1]) for x in cid_ctext_tuples]
    n_chunks = len(cid_ctext_tuples)
    print(f"Formatting {url} n_chunks={n_chunks}...")

    ctext_formatted_l, has_quote_l, needs_ellipsis_l = [], [], []
    for j, cid_ctext_tuple in enumerate(cid_ctext_tuples):
        ctext = cid_ctext_tuple[1]
        print(f"cid={cid_ctext_tuple[0]}:")
        print(f"```{ctext}```")

        # NOTE(review): this is true for any two distinct indices, not only
        # for non-adjacent ones -- confirm whether `ids_l[j] + 1` was intended.
        needs_ellipsis = j < n_chunks - 1 and ids_l[j] != ids_l[j + 1]

        # Detect the quote for every chunk, not only for long ones, so that
        # `has_quote_l` never reads an unset or stale value.
        first_quote_idx = -1
        for pidx, quote_pattern in enumerate(quote_pattern_l):
            match = re.search(quote_pattern, ctext)
            if match:
                print(f"pidx={pidx} found match: {match}")
                first_quote_idx = match.start()
                break

        if len(ctext) > 512:
            if first_quote_idx >= 0:
                ctext = ctext[first_quote_idx:]
            pieces = sentence_splitter.split_text(ctext)
            if pieces:  # keep the text as is if the splitter returns nothing
                ctext = pieces[0]
            needs_ellipsis = True

        ctext_formatted_l.append(ctext)
        has_quote_l.append(first_quote_idx >= 0)
        needs_ellipsis_l.append(needs_ellipsis)

    if any(has_quote_l):
        kept = [
            (text, ellipsis)
            for text, ellipsis, has_quote
            in zip(ctext_formatted_l, needs_ellipsis_l, has_quote_l)
            if has_quote
        ]
        ctext_formatted_l = [text for text, _ in kept]
        needs_ellipsis_l = [ellipsis for _, ellipsis in kept]

    parts = []
    for j, ctext_formatted in enumerate(ctext_formatted_l):
        if j > 0:
            parts.append('\n\n')
        parts.append(ctext_formatted)
        if needs_ellipsis_l[j]:
            parts.append(chunk_separator)
    return "".join(parts)
|
517 |
+
|
518 |
+
def format_chunk_texts_for_display3(url, cid_to_ctext_map, cid_to_ctext_formatted_map, cid_to_nquotes_map):
    """Format one URL's chunks for display, preferring chunks that hold quotes.

    Args:
        url: passed through to ``format_chunk_texts_for_display2``.
        cid_to_ctext_map: chunk id -> raw chunk text.
        cid_to_ctext_formatted_map: chunk id -> chunk text with quote markup.
        cid_to_nquotes_map: chunk id -> number of highlighted quotes.

    Returns:
        The string built by ``format_chunk_texts_for_display2`` from the
        formatted chunks that have at least one quote, or from all raw chunks
        when no chunk has a quote.
    """
    def _chunk_order(item):
        # Order by the numeric chunk index (the same parse that
        # format_chunk_texts_for_display2 uses), so "x-10" follows "x-2"
        # instead of preceding it as a plain string sort would.
        cid = item[0]
        return int(cid.split('-')[1]), cid

    cid_w_quotes_map = {
        cid: cid_to_ctext_formatted_map[cid]
        for cid, nquotes in cid_to_nquotes_map.items()
        if nquotes > 0
    }
    chosen_map = cid_w_quotes_map if cid_w_quotes_map else cid_to_ctext_map
    cid_ctext_tuples = sorted(chosen_map.items(), key=_chunk_order)

    # print(f"{url}:")
    # print(f"cid_ctext_tuples={cid_ctext_tuples}")

    return format_chunk_texts_for_display2(url, cid_ctext_tuples)
|