abnerguzman committed on
Commit
27178c5
1 Parent(s): 6584d90

Update demo.py

Browse files
Files changed (1) hide show
  1. demo.py +382 -18
demo.py CHANGED
@@ -1,7 +1,15 @@
1
  from collections import defaultdict
 
2
  import time
3
 
4
- style_str = """
 
 
 
 
 
 
 
5
  <style>
6
  .section-title {
7
  /* font-family: cursive, sans-serif; */
@@ -64,6 +72,10 @@ style_str = """
64
  /* font-weight: bolder; */
65
  /* font-style: italic; */
66
  }
 
 
 
 
67
 
68
  .doc-title {
69
  /* font-family: cursive, sans-serif; */
@@ -72,7 +84,7 @@ style_str = """
72
  display: inline-block;
73
  font-size: 2em;
74
  font-weight: bolder;
75
- padding-top: 20px;
76
  /* font-style: italic; */
77
  }
78
  .doc-url {
@@ -97,6 +109,150 @@ style_str = """
97
  /* font-style: italic; */
98
  color: #0000FF;
99
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  .doc-title > img {
101
  width: 22px;
102
  height: 22px;
@@ -119,27 +275,166 @@ chunk_separator = '<span class="chunk-separator">[...]</span>'
119
  from langchain.text_splitter import RecursiveCharacterTextSplitter
120
 
121
  sentence_splitter = RecursiveCharacterTextSplitter(
122
- chunk_size=512,
123
  chunk_overlap=0,
124
  separators=["\n\n", "\n", "."],
125
  keep_separator=False
126
  )
127
 
128
- def get_url_to_supporting_cid_ctext_tuples(atom_support_l):
129
- url_to_supporting_cid_sets = defaultdict(set)
130
- url_to_supporting_cid_ctext_tuples = defaultdict(list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  for atom_support in atom_support_l:
132
  for url, aggmatch_determination in atom_support.items():
133
- if aggmatch_determination['true']:
134
- for cid, ctext in zip(aggmatch_determination['id_l'], aggmatch_determination['chunk_text_l']):
135
- if cid not in url_to_supporting_cid_sets[url]:
136
- url_to_supporting_cid_sets[url].add(cid)
137
- url_to_supporting_cid_ctext_tuples[url].append((cid, ctext))
138
- # now sort each list of chunks
139
- for url, cid_ctext_tuple_l in url_to_supporting_cid_ctext_tuples.items():
140
- url_to_supporting_cid_ctext_tuples[url] = sorted(cid_ctext_tuple_l, key=lambda x: x[0])
141
- # pprint.pp(url_to_supporting_cid_ctext_tuples)
142
- return url_to_supporting_cid_ctext_tuples
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  def format_chunk_texts_for_display(cid_ctext_tuples):
145
  ids_l = [int(x[0].split('-')[1]) for x in cid_ctext_tuples]
@@ -161,5 +456,74 @@ def format_chunk_texts_for_display(cid_ctext_tuples):
161
  match_text += chunk_separator
162
  return match_text
163
 
164
- def print_w_time_elapsed(msg, start_time, file=None):
165
- print(f"{msg} ({time.perf_counter()-start_time:.2f} secs)", file=file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from collections import defaultdict
import os
import time

# Pinecone configuration: the API key is read from the environment so it is
# never hard-coded; PINECONE_HOST is kept for the (commented-out) REST-based
# lookup path further down in this file.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_HOST = 'prorata-postman-ds-ul-dp9xwvt.svc.aped-4627-b74a.pinecone.io'

# Module-level Pinecone client and the index handle used by
# get_article_from_url().
from pinecone import Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
pc_ul = pc.Index('prorata-postman-ds-ul')
11
+
12
+ style1_str = """
13
  <style>
14
  .section-title {
15
  /* font-family: cursive, sans-serif; */
 
72
  /* font-weight: bolder; */
73
  /* font-style: italic; */
74
  }
75
+ .claim-text .one-quote {
76
+ /* font-style: italic; */
77
+ color: #C70039;
78
+ }
79
 
80
  .doc-title {
81
  /* font-family: cursive, sans-serif; */
 
84
  display: inline-block;
85
  font-size: 2em;
86
  font-weight: bolder;
87
+ padding-top: 30px;
88
  /* font-style: italic; */
89
  }
90
  .doc-url {
 
109
  /* font-style: italic; */
110
  color: #0000FF;
111
  }
112
+ .doc-text .one-quote {
113
+ /* font-style: italic; */
114
+ color: #C70039;
115
+ }
116
+ .doc-title > img {
117
+ width: 22px;
118
+ height: 22px;
119
+ border-radius: 50%;
120
+ overflow: hidden;
121
+ background-color: transparent;
122
+ display: inline-block;
123
+ vertical-align: middle;
124
+ }
125
+ .doc-title > score {
126
+ font-family: Optima, sans-serif;
127
+ font-weight: normal;
128
+ float: right;
129
+ }
130
+ </style>
131
+ """
132
+
133
+ style2_str = """
134
+ <style>
135
+ .section-title {
136
+ /* font-family: cursive, sans-serif; */
137
+ font-family: Optima, sans-serif;
138
+ width: 100%;
139
+ font-size: 2.5em;
140
+ font-weight: bolder;
141
+ padding-bottom: 20px;
142
+ padding-top: 20px;
143
+ /* font-style: italic; */
144
+ }
145
+ .claim-header {
146
+ /* font-family: cursive, sans-serif; */
147
+ font-family: Optima, sans-serif;
148
+ width: 100%;
149
+ font-size: 1.5em;
150
+ font-weight: normal;
151
+ padding-bottom: 10px;
152
+ padding-top: 10px;
153
+ /* font-style: italic; */
154
+ }
155
+ .claim-doc-title {
156
+ /* font-family: cursive, sans-serif; */
157
+ font-family: Optima, sans-serif;
158
+ width: 100%;
159
+ font-size: 1.25em;
160
+ font-weight: normal;
161
+ padding-left: 20px;
162
+ padding-bottom: 5px;
163
+ padding-top: 10px;
164
+ /* font-style: italic; */
165
+ }
166
+ .claim-doc-url {
167
+ /* font-family: cursive, sans-serif; */
168
+ font-size: 0.75em;
169
+ padding-left: 20px;
170
+ padding-bottom: 10px;
171
+ padding-top: 0px;
172
+ /* font-weight: bolder; */
173
+ /* font-style: italic; */
174
+ }
175
+ .claim-determination {
176
+ /* font-family: cursive, sans-serif; */
177
+ font-family: Optima, sans-serif;
178
+ width: 100%;
179
+ font-size: 1em;
180
+ font-weight: normal;
181
+ padding-left: 60px;
182
+ padding-bottom: 10px;
183
+ /* font-style: italic; */
184
+ }
185
+ .claim-text {
186
+ /* font-family: cursive, sans-serif; */
187
+ font-family: Optima, sans-serif;
188
+ font-size: 1em;
189
+ white-space: pre-wrap;
190
+ padding-left: 80px;
191
+ text-indent: -20px;
192
+ padding-bottom: 20px;
193
+ /* font-weight: bolder; */
194
+ /* font-style: italic; */
195
+ }
196
+ .claim-text .one-quote {
197
+ /* font-style: italic; */
198
+ color: #C70039;
199
+ }
200
+
201
+ .doc-title {
202
+ /* font-family: cursive, sans-serif; */
203
+ font-family: Optima, sans-serif;
204
+ width: 100%;
205
+ display: inline-block;
206
+ font-size: 2em;
207
+ font-weight: bolder;
208
+ padding-top: 30px;
209
+ /* font-style: italic; */
210
+ }
211
+ .doc-url {
212
+ /* font-family: cursive, sans-serif; */
213
+ font-size: 1em;
214
+ padding-left: 40px;
215
+ padding-bottom: 10px;
216
+ /* font-weight: bolder; */
217
+ /* font-style: italic; */
218
+ }
219
+ .doc-text-wrapper {
220
+ width: 100%;
221
+ overflow: hidden;
222
+
223
+ /* font-family: cursive, sans-serif; */
224
+ font-family: Optima, sans-serif;
225
+ font-size: 1.25em;
226
+ padding-left: 40px;
227
+ padding-bottom: 20px;
228
+ padding-top: 10px;
229
+ /* font-weight: bolder; */
230
+ /* font-style: italic; */
231
+ }
232
+ .doc-text-wrapper .doc-text-left {
233
+ float: left;
234
+ width: 60%;
235
+ padding-right: 20px;
236
+ overflow-y: auto;
237
+ height: 200px;
238
+ white-space: pre-wrap;
239
+ }
240
+ .doc-text-wrapper .doc-text-right {
241
+ float: left;
242
+ width: 40%;
243
+ padding-left: 20px;
244
+ overflow-y: auto;
245
+ height: 200px;
246
+ white-space: pre-wrap;
247
+ }
248
+ .doc-text-wrapper .chunk-separator {
249
+ /* font-style: italic; */
250
+ color: #0000FF;
251
+ }
252
+ .doc-text-wrapper .one-quote {
253
+ /* font-style: italic; */
254
+ color: #C70039;
255
+ }
256
  .doc-title > img {
257
  width: 22px;
258
  height: 22px;
 
275
  from langchain.text_splitter import RecursiveCharacterTextSplitter
276
 
277
  sentence_splitter = RecursiveCharacterTextSplitter(
278
+ chunk_size=1024,
279
  chunk_overlap=0,
280
  separators=["\n\n", "\n", "."],
281
  keep_separator=False
282
  )
283
 
284
def get_article_from_url(url):
    """Fetch an article's stored metadata from the Pinecone index by URL.

    The index appears to store one record per article keyed by its URL
    (the commented-out REST predecessor of this function queried by the
    same id), so a top-1 fetch-by-id query returns the article itself.
    The superseded REST implementation has been removed; see git history.

    Args:
        url: article URL, used as the Pinecone record id.

    Returns:
        dict with 'title', 'url' and 'text' keys taken from the match
        metadata, or {} when the URL is not present in the index.
    """
    res = pc_ul.query(id=url, top_k=1, include_metadata=True)
    if not res['matches']:
        return {}
    top_match_metadata = res['matches'][0]['metadata']
    return {
        'title': top_match_metadata['title'],
        'url': top_match_metadata['url'],
        'text': top_match_metadata['text'],
    }
315
+
316
def print_w_time_elapsed(msg, start_time, file=None):
    """Print *msg* annotated with the seconds elapsed since *start_time*.

    *start_time* is expected to come from time.perf_counter(); output is
    written to *file* (default: stdout), formatted to two decimals.
    """
    elapsed_secs = time.perf_counter() - start_time
    print(f"{msg} ({elapsed_secs:.2f} secs)", file=file)
318
+
319
+ # def _add_chunk_text_formatted_l_aggmatch_determination(aggmatch_determination):
320
+ # chunk_text_l = aggmatch_determination['chunk_text_l']
321
+ # n_chunks = len(chunk_text_l)
322
+
323
+ # if 'quote_matches_l' not in aggmatch_determination:
324
+ # aggmatch_determination['chunk_support_flags'] = n_chunks*[True]
325
+ # aggmatch_determination['chunk_text_formatted_l'] = chunk_text_l
326
+ # return
327
+
328
+ # quote_matches_l = aggmatch_determination['quote_matches_l']
329
+
330
+ # last_end, coffset = 0, 0
331
+ # chunk_support_flags = [False]*n_chunks
332
+ # chunk_text_formatted_l = []
333
+
334
+ # for cidx, ctext in enumerate(chunk_text_l):
335
+ # ctext_formatted = ""
336
+
337
+ # for quote_match in quote_matches_l:
338
+ # if quote_match['start'] > coffset and quote_match['end'] <= coffset + len(ctext):
339
+ # chunk_support_flags[cidx] = True
340
+ # # TODO: handle case were quote spans across chunks
341
+ # ctext_formatted += ctext[last_end-coffset:quote_match['start']-coffset]
342
+ # ctext_formatted += quote_start + ctext[quote_match['start']-coffset:quote_match['end']-coffset] + quote_end
343
+ # last_end = quote_match['end']
344
+
345
+ # ctext_formatted += ctext[last_end-coffset:]
346
+ # chunk_text_formatted_l.append(ctext_formatted)
347
+
348
+ # coffset += len(ctext) + 2
349
+ # last_end = coffset
350
+
351
+ # aggmatch_determination['chunk_support_flags'] = chunk_support_flags
352
+ # aggmatch_determination['chunk_text_formatted_l'] = chunk_text_formatted_l
353
+
354
+ # # TODO: need to operate on single copy of each chunk (so all quotes are kept)
355
+ # def _add_chunk_text_formatted_l(atom_support_l):
356
+ # for atom_support in atom_support_l:
357
+ # for url, aggmatch_determination in atom_support.items():
358
+ # _add_chunk_text_formatted_l_aggmatch_determination(aggmatch_determination)
359
+
360
def create_url_to_cid_to_ctext_formatted_map(atom_support_l):
    """Build per-URL chunk-text maps from a list of atom-support records.

    Each element of *atom_support_l* maps url -> aggmatch_determination,
    where the determination carries parallel lists 'id_l' / 'chunk_text_l'
    and, optionally, 'quote_matches_l' whose 'start'/'end' offsets index
    into the concatenation of the chunks (chunks assumed joined by a
    2-character separator — TODO confirm against the chunking code).

    Side effect: when quote matches exist, the determination gains a
    'chunk_text_formatted_l' entry (quote spans wrapped in
    quote_start/quote_end markup), used by the per-claim breakdown view.
    Leftover debug print statements have been removed.

    Returns:
        (url_to_cid_to_ctext_map, url_to_cid_to_ctext_formatted_map,
         url_to_cid_to_nquotes_map) — defaultdicts keyed by url then chunk
        id; for each chunk the formatted text kept is the one from the
        determination with the most quotes inside that chunk.
    """
    url_to_cid_to_ctext_map = defaultdict(dict)
    url_to_cid_to_ctext_formatted_map = defaultdict(dict)
    url_to_cid_to_nquotes_map = defaultdict(dict)

    for atom_support in atom_support_l:
        for url, aggmatch_determination in atom_support.items():
            cid_to_ctext_map = url_to_cid_to_ctext_map[url]
            cid_to_ctext_formatted_map = url_to_cid_to_ctext_formatted_map[url]
            cid_to_nquotes_map = url_to_cid_to_nquotes_map[url]

            chunk_id_l = aggmatch_determination['id_l']
            chunk_text_l = aggmatch_determination['chunk_text_l']

            # Always record the raw chunk text for every chunk id.
            for cid, ctext in zip(chunk_id_l, chunk_text_l):
                cid_to_ctext_map[cid] = ctext

            quote_matches_l = aggmatch_determination.get('quote_matches_l', None)
            if quote_matches_l:
                # coffset tracks the current chunk's start inside the
                # concatenated stream (+2 per chunk for the join separator).
                last_end, coffset = 0, 0
                chunk_text_formatted_l = []

                for cid, ctext in zip(chunk_id_l, chunk_text_l):
                    nquotes = 0
                    ctext_formatted = ""

                    for quote_match in quote_matches_l:
                        if quote_match['start'] >= coffset and quote_match['end'] <= coffset + len(ctext):
                            nquotes += 1
                            # TODO: handle case where quote spans across chunks
                            ctext_formatted += ctext[last_end-coffset:quote_match['start']-coffset]
                            ctext_formatted += quote_start + ctext[quote_match['start']-coffset:quote_match['end']-coffset] + quote_end
                            last_end = quote_match['end']

                    ctext_formatted += ctext[last_end-coffset:]
                    chunk_text_formatted_l.append(ctext_formatted)

                    coffset += len(ctext) + 2
                    last_end = coffset

                    # This one is used in the per-claim breakdown.
                    aggmatch_determination['chunk_text_formatted_l'] = chunk_text_formatted_l

                    # These feed the main view: keep the formatted text with
                    # the highest quote count seen so far for this chunk.
                    if cid not in cid_to_nquotes_map or nquotes > cid_to_nquotes_map[cid]:
                        cid_to_nquotes_map[cid] = nquotes
                        cid_to_ctext_formatted_map[cid] = ctext_formatted

    return url_to_cid_to_ctext_map, url_to_cid_to_ctext_formatted_map, url_to_cid_to_nquotes_map
412
+
413
+ # def get_url_to_supporting_cid_ctext_tuples(atom_support_l):
414
+ # url_to_supporting_cid_quote_flag_map = defaultdict(dict)
415
+ # url_to_supporting_cid_ctext_map = defaultdict(dict)
416
+ # for atom_support in atom_support_l:
417
+ # for url, aggmatch_determination in atom_support.items():
418
+ # if aggmatch_determination['true']:
419
+ # use_formatted = 'chunk_text_formatted_l' in aggmatch_determination
420
+ # include_only_formatted = use_formatted and any(aggmatch_determination['chunk_support_flags'])
421
+
422
+ # chunk_text_l_key = 'chunk_text_formatted_l' if use_formatted else 'chunk_text_l'
423
+
424
+ # for lidx, (cid, ctext) in enumerate(zip(aggmatch_determination['id_l'], aggmatch_determination[chunk_text_l_key])):
425
+ # chunk_has_quote = aggmatch_determination['chunk_support_flags'][lidx]
426
+
427
+ # if cid not in url_to_supporting_cid_quote_flag_map[url] or not url_to_supporting_cid_quote_flag_map[url][cid]:
428
+ # if not include_only_formatted or chunk_has_quote:
429
+ # url_to_supporting_cid_quote_flag_map[url][cid] = chunk_has_quote
430
+ # url_to_supporting_cid_ctext_map[url][cid] = ctext
431
+ # # now sort each list of chunks
432
+ # url_to_supporting_cid_ctext_tuples = {}
433
+ # for url, cid_ctext_map in url_to_supporting_cid_ctext_map.items():
434
+ # # url_to_supporting_cid_ctext_tuples[url] = sorted(cid_ctext_tuple_l, key=lambda x: x[0])
435
+ # url_to_supporting_cid_ctext_tuples[url] = sorted(list(cid_ctext_map.items()), key=lambda x: x[0])
436
+ # # pprint.pp(url_to_supporting_cid_ctext_tuples)
437
+ # return url_to_supporting_cid_ctext_tuples
438
 
439
  def format_chunk_texts_for_display(cid_ctext_tuples):
440
  ids_l = [int(x[0].split('-')[1]) for x in cid_ctext_tuples]
 
456
  match_text += chunk_separator
457
  return match_text
458
 
459
# Markup wrapped around verbatim quote spans when chunk text is rendered.
quote_start = '<span class="one-quote">'
quote_end = '</span>'


import re

# Patterns used (by format_chunk_texts_for_display2) to find a good display
# start point inside a long chunk: tried in order from the most specific
# anchor (quote right after a line break) to the most permissive (anything
# up to the quote markup).
quote_pattern_l = [
    r"(\n[\s]*){1}\<span class=\"one-quote\"\>[\S\s]*\</span\>",
    r"(\n\s*[A-Z“\"]){1}.*\<span class=\"one-quote\"\>[\S\s]*\</span\>",
    r"(\n\s*[A-Z“\"]){1}[\S\s]*\<span class=\"one-quote\"\>[\S\s]*\</span\>",
    r"(\n|^){1}[\S\s]*\<span class=\"one-quote\"\>[\S\s]*\</span\>",
]
471
+
472
def format_chunk_texts_for_display2(url, cid_ctext_tuples):
    """Join a URL's supporting chunks into one display string.

    Args:
        url: source URL (kept for interface compatibility; it was only
            used by debug prints, which have been removed).
        cid_ctext_tuples: (chunk_id, chunk_text) tuples, expected sorted;
            chunk ids look like '<prefix>-<int>' and the int part is used
            to decide where the chunk_separator ellipsis marker goes.

    Chunks longer than 512 chars are trimmed to start at the first quote
    located via quote_pattern_l and cut to one piece via sentence_splitter.
    If any chunk contains a quote, quote-less chunks are dropped. A chunk
    whose numeric id differs from the next chunk's is followed by the
    chunk_separator marker.

    Returns:
        the formatted match text ("" for empty input).
    """
    ids_l = [int(x[0].split('-')[1]) for x in cid_ctext_tuples]
    n_chunks = len(cid_ctext_tuples)

    ctext_formatted_l, has_quote_l, needs_ellipsis_l = [], [], []
    for j, cid_ctext_tuple in enumerate(cid_ctext_tuples):
        ctext = cid_ctext_tuple[1]

        needs_ellipsis = False
        if j < n_chunks-1 and ids_l[j] != ids_l[j+1]:
            needs_ellipsis = True

        # Initialized before the length check so short chunks always have a
        # defined value (as rendered, the assignment sat inside the >512
        # branch, leaving the name unbound/stale for short chunks).
        first_quote_idx = -1
        if len(ctext) > 512:
            for quote_pattern in quote_pattern_l:
                match = re.search(quote_pattern, ctext)
                if match:
                    first_quote_idx = match.span()[0]
                    break

            if first_quote_idx >= 0:
                ctext = ctext[first_quote_idx:]
                ctext = sentence_splitter.split_text(ctext)[0]
                needs_ellipsis = True

        ctext_formatted_l.append(ctext)
        has_quote_l.append(first_quote_idx >= 0)
        needs_ellipsis_l.append(needs_ellipsis)

    if any(has_quote_l):
        # At least one chunk has a quote: show only the quoted chunks.
        ctext_formatted_l = [ctext_formatted_l[i] for i in range(n_chunks) if has_quote_l[i]]
        needs_ellipsis_l = [needs_ellipsis_l[i] for i in range(n_chunks) if has_quote_l[i]]

    match_text = ""
    for j, ctext_formatted in enumerate(ctext_formatted_l):
        if j > 0:
            match_text += '\n\n'
        match_text += ctext_formatted
        if needs_ellipsis_l[j]:
            match_text += chunk_separator
    return match_text
517
+
518
def format_chunk_texts_for_display3(url, cid_to_ctext_map, cid_to_ctext_formatted_map, cid_to_nquotes_map):
    """Choose which chunk texts to render for *url* and delegate to
    format_chunk_texts_for_display2.

    Quote-formatted chunk texts (positive quote count) are preferred;
    when no chunk has quotes, the raw chunk texts are used instead.
    Chunks are passed on sorted by chunk id.
    """
    quoted_cids = {
        cid: cid_to_ctext_formatted_map[cid]
        for cid, nquotes in cid_to_nquotes_map.items()
        if nquotes > 0
    }
    chosen = quoted_cids if quoted_cids else cid_to_ctext_map
    cid_ctext_tuples = sorted(chosen.items(), key=lambda pair: pair[0])
    return format_chunk_texts_for_display2(url, cid_ctext_tuples)