sidphbot committed on
Commit adf54e4
Parent: c952ddd

stdout capture

Files changed (2)
  1. app.py +6 -34
  2. src/Surveyor.py +219 -207
app.py CHANGED
@@ -4,38 +4,11 @@ import numpy as np
 
 from src.Surveyor import Surveyor
 
-import contextlib
-from functools import wraps
-from io import StringIO
-
-def capture_output(func):
-    """Capture output from running a function and write using streamlit."""
-
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        # Redirect output to string buffers
-        stdout, stderr = StringIO(), StringIO()
-        try:
-            with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stderr):
-                return func(*args, **kwargs)
-        except Exception as err:
-            st.write(f"Failure while executing: {err}")
-        finally:
-            if _stdout := stdout.getvalue():
-                st.write("Execution stdout:")
-                st.code(_stdout)
-            if _stderr := stderr.getvalue():
-                st.write("Execution stderr:")
-                st.code(_stderr)
-
-    return wrapper
-
 def run_survey(surveyor, research_keywords, max_search, num_papers):
-    survey_fn = capture_output(surveyor.survey)
-    zip_file_name, survey_file_name = survey_fn(research_keywords,
-                                                max_search=max_search,
-                                                num_papers=num_papers
-                                                )
 
     with open(str(zip_file_name), "rb") as file:
         btn = st.download_button(
@@ -55,7 +28,7 @@ def run_survey(surveyor, research_keywords, max_search, num_papers):
 
 
 def survey_space(surveyor):
-    st.sidebar.title('Auto-Research V0.1 - Automated Survey generation from research keywords')
     form = st.sidebar.form(key='survey_form')
     research_keywords = form.text_input("What would you like to research in today?")
     max_search = form.number_input("num_papers_to_search", help="maximium number of papers to glance through - defaults to 20",
@@ -65,11 +38,10 @@ def survey_space(surveyor):
     submit = form.form_submit_button('Submit')
 
     if submit:
-        st.write("hello")
         run_survey(surveyor, research_keywords, max_search, num_papers)
 
 
 if __name__ == '__main__':
     global surveyor
-    surveyor_obj = Surveyor()
     survey_space(surveyor_obj)
 
 
 from src.Surveyor import Surveyor
 
 def run_survey(surveyor, research_keywords, max_search, num_papers):
+    zip_file_name, survey_file_name = surveyor.survey(research_keywords,
+                                                      max_search=max_search,
+                                                      num_papers=num_papers
+                                                      )
 
     with open(str(zip_file_name), "rb") as file:
         btn = st.download_button(
 
 
 def survey_space(surveyor):
+    st.container().title('Auto-Research V0.1 - Automated Survey generation from research keywords')
     form = st.sidebar.form(key='survey_form')
     research_keywords = form.text_input("What would you like to research in today?")
     max_search = form.number_input("num_papers_to_search", help="maximium number of papers to glance through - defaults to 20",
 
     submit = form.form_submit_button('Submit')
 
     if submit:
         run_survey(surveyor, research_keywords, max_search, num_papers)
 
 
 if __name__ == '__main__':
     global surveyor
+    surveyor_obj = Surveyor(print_fn=st.write)
  survey_space(surveyor_obj)
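
With this change the Streamlit app no longer captures stdout: it hands st.write to Surveyor and lets the class emit its progress messages directly. A minimal sketch of the resulting call pattern (the keyword string and counts below are placeholders, not part of the commit):

    import streamlit as st
    from src.Surveyor import Surveyor

    # st.write is just a one-argument callable; Surveyor stores it as
    # self.print_fn and calls it wherever it previously called print()
    surveyor = Surveyor(print_fn=st.write)
    zip_path, survey_path = surveyor.survey("example research keywords",
                                            max_search=20,
                                            num_papers=10)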
src/Surveyor.py CHANGED
@@ -44,7 +44,8 @@ class Surveyor:
                 kw_model_name=None,
                 high_gpu=False,
                 refresh_models=False,
-                no_save_models=False
                 ):
        '''
        Initializes models and directory structure for the surveyor
@@ -71,10 +72,14 @@ class Surveyor:
        - num_papers: int maximium number of papers to download and analyse - defaults to 25
 
        '''
        self.torch_device = 'cpu'
-       print("\nTorch_device: " + self.torch_device)
        if torch.cuda.is_available():
-           print("\nloading defaults for gpu")
            self.torch_device = 'cuda'
            spacy.require_gpu()
 
@@ -109,7 +114,7 @@ class Surveyor:
109
  similarity_nlp_name = self.DEFAULTS["similarity_nlp_name"]
110
 
111
  if refresh_models or not models_found:
112
- print(f'\nInitializing models {"and saving (about 5GB)" if not no_save_models else ""}')
113
  if not no_save_models:
114
  self.clean_dirs([models_dir])
115
 
@@ -148,7 +153,7 @@ class Surveyor:
148
  if not no_save_models:
149
  self.embedder.save(models_dir + "/embedder")
150
  else:
151
- print("\nInitializing from previously saved models at" + models_dir)
152
  self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
153
  self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
154
  self.title_model.eval()
@@ -230,9 +235,9 @@ class Surveyor:
230
 
231
  papers = papers_meta[:self.num_papers]
232
  selected_papers = papers
233
- print("\nFirst stage paper collection...")
234
  ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
235
- print("\nFirst stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
236
  new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
237
  _ = self.get_freq_cited(cites)
238
  '''
@@ -243,16 +248,16 @@ class Surveyor:
243
  new_papers.extend(new_searched_papers)
244
  '''
245
  selected_papers.extend(new_papers)
246
- print("\nSecond stage paper collection...")
247
  _, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
248
- print("\nSecond stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
249
  papers.extend(new_papers)
250
 
251
  joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp')
252
  copy_tree(img_dir, dump_dir + os.path.basename(img_dir))
253
  copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir))
254
 
255
- print("\nExtracting section-wise highlights.. ")
256
  papers = self.extract_highlights(papers)
257
 
258
  return papers, selected_papers
@@ -265,7 +270,7 @@ class Surveyor:
265
  [cites_list.append(val) for val in v]
266
  cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
267
  sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
268
- print("\nThe most cited paper ids are:\n" + str(sorted_cites))
269
 
270
  return sorted_cites.keys()
271
 
@@ -275,7 +280,7 @@ class Surveyor:
275
 
276
  if repeat:
277
  with tempfile.TemporaryDirectory() as dirpath:
278
- print("\n- downloading extra pdfs.. ")
279
  # full text preparation of selected papers
280
  self.download_pdfs(papers, dirpath)
281
  dirpath_pdfs = os.listdir(dirpath)
@@ -283,22 +288,22 @@ class Surveyor:
283
  full_file_name = os.path.join(dirpath, file_name)
284
  if os.path.isfile(full_file_name):
285
  shutil.copy(full_file_name, pdf_dir)
286
- print("\n- converting extra pdfs.. ")
287
  self.convert_pdfs(dirpath, txt_dir)
288
  else:
289
- print("\n- downloading pdfs.. ")
290
  # full text preparation of selected papers
291
  self.download_pdfs(papers, pdf_dir)
292
- print("\n- converting pdfs.. ")
293
  self.convert_pdfs(pdf_dir, txt_dir)
294
  # plugging citations to our papers object
295
- print("\n- plugging in citation network.. ")
296
  papers, cites = self.cocitation_network(papers, txt_dir)
297
  joblib.dump(papers, dump_dir + 'papers_selected_pdf_route.dmp')
298
  from distutils.dir_util import copy_tree
299
  copy_tree(txt_dir, dump_dir + os.path.basename(txt_dir))
300
  copy_tree(pdf_dir, dump_dir + os.path.basename(pdf_dir))
301
- print("\n- extracting structure.. ")
302
  papers, ids_none = self.extract_structure(papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir)
303
  return ids_none, papers, cites
304
 
@@ -328,92 +333,92 @@ class Surveyor:
328
  def build_doc(self, research_sections, papers, query=None, filename='survey.txt'):
329
 
330
  import arxiv2bib
331
- print("\nbuilding bibliography entries.. ")
332
  bibentries = arxiv2bib.arxiv2bib([p['id'] for p in papers])
333
  bibentries = [r.bibtex() for r in bibentries]
334
 
335
- print("\nbuilding final survey file .. at "+ filename)
336
  file = open(filename, 'w+')
337
  if query is None:
338
  query = 'Internal(existing) research'
339
  file.write("----------------------------------------------------------------------")
340
  file.write("Title: A survey on " + query)
341
- print("")
342
- print("----------------------------------------------------------------------")
343
- print("Title: A survey on " + query)
344
  file.write("Author: Auto-Research (github.com/sidphbot/Auto-Research)")
345
- print("Author: Auto-Research (github.com/sidphbot/Auto-Research)")
346
  file.write("Dev: Auto-Research (github.com/sidphbot/Auto-Research)")
347
- print("Dev: Auto-Research (github.com/sidphbot/Auto-Research)")
348
  file.write("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+
349
  "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+
350
  "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+
351
  "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+
352
  "\nentries(only to avoid LaTex overhead). ")
353
- print("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+
354
  "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+
355
  "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+
356
  "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+
357
  "\nentries(only to avoid LaTex overhead). ")
358
  file.write("----------------------------------------------------------------------")
359
- print("----------------------------------------------------------------------")
360
  file.write("")
361
- print("")
362
  file.write('ABSTRACT')
363
- print('ABSTRACT')
364
- print("=================================================")
365
  file.write("=================================================")
366
  file.write("")
367
- print("")
368
  file.write(research_sections['abstract'])
369
- print(research_sections['abstract'])
370
  file.write("")
371
- print("")
372
  file.write('INTRODUCTION')
373
- print('INTRODUCTION')
374
- print("=================================================")
375
  file.write("=================================================")
376
  file.write("")
377
- print("")
378
  file.write(research_sections['introduction'])
379
- print(research_sections['introduction'])
380
  file.write("")
381
- print("")
382
  for k, v in research_sections.items():
383
  if k not in ['abstract', 'introduction', 'conclusion']:
384
  file.write(k.upper())
385
- print(k.upper())
386
- print("=================================================")
387
  file.write("=================================================")
388
  file.write("")
389
- print("")
390
  file.write(v)
391
- print(v)
392
  file.write("")
393
- print("")
394
  file.write('CONCLUSION')
395
- print('CONCLUSION')
396
- print("=================================================")
397
  file.write("=================================================")
398
  file.write("")
399
- print("")
400
  file.write(research_sections['conclusion'])
401
- print(research_sections['conclusion'])
402
  file.write("")
403
- print("")
404
 
405
  file.write('REFERENCES')
406
- print('REFERENCES')
407
- print("=================================================")
408
  file.write("=================================================")
409
  file.write("")
410
- print("")
411
  for entry in bibentries:
412
  file.write(entry)
413
- print(entry)
414
  file.write("")
415
- print("")
416
- print("========================XXX=========================")
417
  file.write("========================XXX=========================")
418
  file.close()
419
 
@@ -421,14 +426,15 @@ class Surveyor:
421
 
422
  research_blocks = {}
423
  for head, textarr in corpus_known_sections.items():
424
- torch.cuda.empty_cache()
425
- # print(head.upper())
 
426
  with torch.no_grad():
427
  summtext = self.model(" ".join([l.lower() for l in textarr]), ratio=0.5)
428
  res = self.nlp(summtext)
429
  res = set([str(sent) for sent in list(res.sents)])
430
  summtext = ''.join([line for line in res])
431
- # pprint(summtext)
432
  research_blocks[head] = summtext
433
 
434
  return research_blocks
@@ -444,7 +450,8 @@ class Surveyor:
444
  sequences = ledmodel.generate(input_ids, global_attention_mask=global_attention_mask).sequences
445
  summary = ledtokenizer.batch_decode(sequences)
446
  '''
447
- torch.cuda.empty_cache()
 
448
  inputs = self.ledtokenizer.prepare_seq2seq_batch(longtext, truncation=True, padding='longest',
449
  return_tensors='pt').to(self.torch_device)
450
  with torch.no_grad():
@@ -454,7 +461,7 @@ class Surveyor:
454
  res = self.nlp(summary[0])
455
  res = set([str(sent) for sent in list(res.sents)])
456
  summtext = ''.join([line for line in res])
457
- #print("abstractive summary type:" + str(type(summary)))
458
  return summtext
459
 
460
  def get_abstract(self, abs_lines, corpus_known_sections, research_blocks):
@@ -463,7 +470,7 @@ class Surveyor:
463
  abs_lines = ""
464
  abs_lines += " ".join([l.lower() for l in corpus_known_sections['abstract']])
465
  abs_lines += research_blocks['abstract']
466
- # print(abs_lines)
467
 
468
  try:
469
  return self.abstractive_summary(abs_lines)
@@ -475,12 +482,12 @@ class Surveyor:
475
  abs_lines = []
476
  types = set()
477
  for k, v in corpus.items():
478
- # print(v)
479
  types.add(type(v))
480
  abstext = k + '. ' + v.replace('\n', ' ')
481
  abstext = self.nlp(abstext)
482
  abs_lines.extend([str(sent).lower() for sent in list(abstext.sents)])
483
- #print("unique corpus value types:" + str(types))
484
  # abs_lines = '\n'.join([str(sent) for sent in abs_lines.sents])
485
  return abs_lines
486
 
@@ -501,9 +508,9 @@ class Surveyor:
501
  if p['id'] not in selected_pids:
502
  meta_abs.append(self.generate_title(p['abstract']))
503
  docs.extend(meta_abs)
504
- #print("meta_abs num"+str(len(meta_abs)))
505
- #print("selected_pids num"+str(len(selected_pids)))
506
- #print("papers_meta num"+str(len(papers_meta)))
507
  #assert (len(meta_abs) + len(selected_pids) == len(papers_meta))
508
  assert ('str' in str(type(random.sample(docs, 1)[0])))
509
  return [doc for doc in docs if doc != '']
@@ -513,7 +520,8 @@ class Surveyor:
513
  from sklearn.cluster import KMeans
514
  # from bertopic import BERTopic
515
  # topic_model = BERTopic(embedding_model=embedder)
516
- torch.cuda.empty_cache()
 
517
  corpus_embeddings = self.embedder.encode(abs_lines)
518
  # Normalize the embeddings to unit length
519
  corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
@@ -533,17 +541,17 @@ class Surveyor:
533
  clustered_sentences[cluster_id] = []
534
  '''
535
  if dummy_count < 5:
536
- print("abs_line: "+abs_lines[sentence_id])
537
- print("cluster_ID: "+str(cluster_id))
538
- print("embedding: "+str(corpus_embeddings[sentence_id]))
539
  dummy_count += 1
540
  '''
541
  clustered_sentences[cluster_id].append(abs_lines[sentence_id])
542
 
543
  # for i, cluster in clustered_sentences.items():
544
- # print("Cluster ", i+1)
545
- # print(cluster)
546
- # print("")
547
 
548
  return self.get_clustered_sections(clustered_sentences), clustered_sentences
549
 
@@ -552,7 +560,8 @@ class Surveyor:
552
  from sklearn.cluster import KMeans
553
  # from bertopic import BERTopic
554
  # topic_model = BERTopic(embedding_model=embedder)
555
- torch.cuda.empty_cache()
 
556
  abs_lines = self.get_sectioned_docs(papers, papers_meta)
557
  corpus_embeddings = self.embedder.encode(abs_lines)
558
  # Normalize the embeddings to unit length
@@ -573,22 +582,23 @@ class Surveyor:
573
  clustered_sentences[cluster_id] = []
574
  '''
575
  if dummy_count < 5:
576
- print("abs_line: "+abs_lines[sentence_id])
577
- print("cluster_ID: "+str(cluster_id))
578
- print("embedding: "+str(corpus_embeddings[sentence_id]))
579
  dummy_count += 1
580
  '''
581
  clustered_sentences[cluster_id].append(abs_lines[sentence_id])
582
 
583
  # for i, cluster in clustered_sentences.items():
584
- # print("Cluster ", i+1)
585
- # print(cluster)
586
- # print("")
587
 
588
  return self.get_clustered_sections(clustered_sentences), clustered_sentences
589
 
590
  def generate_title(self, longtext):
591
- torch.cuda.empty_cache()
 
592
 
593
  inputs = self.title_tokenizer.prepare_seq2seq_batch(longtext, truncation=True, padding='longest',
594
  return_tensors='pt').to(self.torch_device)
@@ -602,7 +612,7 @@ class Surveyor:
602
  def get_clustered_sections(self, clustered_lines):
603
  clusters_dict = {}
604
  for i, cluster in clustered_lines.items():
605
- # print(cluster)
606
  try:
607
  clusters_dict[self.generate_title(str(" ".join(cluster)))] = self.abstractive_summary(
608
  str(" ".join(cluster)).lower())
@@ -641,7 +651,7 @@ class Surveyor:
641
  for section in p['sections']:
642
  if kh in section['heading']:
643
  khtext.extend(section['highlights'])
644
- # print(khtext)
645
  corpus_known_sections[kh] = khtext
646
  return corpus_known_sections
647
 
@@ -649,15 +659,15 @@ class Surveyor:
649
  known = ['abstract', 'introduction', 'discussion', 'relatedwork', 'contribution', 'analysis', 'experiments',
650
  'conclusion']
651
  for p in papers:
652
- # print("================================")
653
  headings = [section['heading'] for section in p['sections'] if len(section['heading'].split()) < 3]
654
- # print("id: "+ str(p['id'])+"\nHeadings: \n"+str('\n'.join(headings)))
655
  for kh in known:
656
  for section in p['sections']:
657
  if len(section['heading'].split()) < 3:
658
- # print(section['heading'])
659
  if kh in ''.join(filter(str.isalpha, section['heading'].replace(' ', '').lower())):
660
- # print("orig head: "+ section['heading'] +", plain head:" + kh)
661
  section['heading'] = kh
662
  return papers
663
 
@@ -671,14 +681,14 @@ class Surveyor:
671
  if pid == p['id']:
672
  corpus[pid] = p['abstract'] + str(' '.join(ph))
673
  '''
674
- print("================== final corpus ====================")
675
- print('\n'.join([str("paper: "+ get_by_pid(pid, papers_meta)['title']+" \nhighlight count: " + str(len(phs))) for pid, phs in corpus.items()]))
676
- print("======== sample point ========")
677
  p = random.choice(list(papers))
678
- print("paper: "+ p['title']+" \nhighlights: " + str(corpus[p['id']]))
679
- print("======== sample meta point ========")
680
  p = random.choice(list(papers_meta))
681
- print("meta paper: "+ p['title']+" \nhighlights: " + str(corpus[p['id']]))
682
  '''
683
  return corpus
684
 
@@ -690,25 +700,25 @@ class Surveyor:
690
  def build_meta_corpus(self, papers):
691
  meta_corpus = {}
692
  for p in papers:
693
- # pprint(p)
694
  pid = p['id']
695
  ptext = p['title'] + ". " + p['abstract']
696
  doc = self.nlp(ptext)
697
  phs, _, _ = self.extractive_highlights([str(sent) for sent in list(doc.sents)])
698
  meta_corpus[pid] = str(' '.join(phs))
699
  '''
700
- print("================== meta corpus ====================")
701
- print('\n'.join([str("paper: "+ get_by_pid(pid, papers)['title']+" \nhighlight count: " + str(len(phs))) for pid, phs in meta_corpus.items()]))
702
- print("======== sample point ========")
703
  p = random.choice(list(papers))
704
- print("paper: "+ p['title']+" \nhighlights: " + str(meta_corpus[p['id']]))
705
  '''
706
  return meta_corpus
707
 
708
  def select_papers(self, papers, query, num_papers=20):
709
  import numpy as np
710
- # print("paper sample: ")
711
- # print(papers)
712
  meta_corpus = self.build_meta_corpus(papers)
713
  scores = []
714
  pids = []
@@ -716,32 +726,33 @@ class Surveyor:
716
  score = self.text_para_similarity(query, highlights)
717
  scores.append(score)
718
  pids.append(id)
719
- print("corpus item: " + str(self.get_by_pid(id, papers)['title']))
720
 
721
  idx = np.argsort(scores)[:num_papers]
722
  #for i in range(len(scores)):
723
- # print("paper: " + str(self.get_by_pid(pids[i], papers)['title']))
724
- # print("score: " + str(scores[i]))
725
- # print("argsort ids("+str(num_papers)+" papers): "+ str(idx))
726
  idx = [pids[i] for i in idx]
727
- # print("argsort pids("+str(num_papers)+" papers): "+ str(idx))
728
  papers_selected = [p for p in papers if p['id'] in idx]
729
  # assert(len(papers_selected)==num_papers)
730
- print("num papers selected: " + str(len(papers_selected)))
731
  for p in papers_selected:
732
- print("Selected Paper: " + p['title'])
733
 
734
- print("constrast with natural selection: forward")
735
  for p in papers[:4]:
736
- print("Selected Paper: " + p['title'])
737
- print("constrast with natural selection: backward")
738
  for p in papers[-4:]:
739
- print("Selected Paper: " + p['title'])
740
  # arxiv search producing better relevnce
741
  return papers_selected
742
 
743
  def extractive_summary(self, text):
744
- torch.cuda.empty_cache()
 
745
  with torch.no_grad():
746
  res = self.model(text, ratio=0.5)
747
  res_doc = self.nlp(res)
@@ -751,12 +762,13 @@ class Surveyor:
751
  # text = " ".join(lines)
752
  # text_doc = nlp(" ".join([l.lower() for l in lines]))
753
  # text = ' '.join([ str(sent) for sent in list(text_doc.sents)])
754
- torch.cuda.empty_cache()
 
755
  with torch.no_grad():
756
  res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, )
757
  res_doc = self.nlp(res)
758
  res_lines = set([str(sent) for sent in list(res_doc.sents)])
759
- # print("\n".join(res_sents))
760
  with torch.no_grad():
761
  keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english')
762
  keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])),
@@ -782,14 +794,14 @@ class Surveyor:
782
  return papers
783
 
784
  def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False):
785
- print("\nextracting sections.. ")
786
  papers, ids_none = self.extract_parts(papers, txt_dir, dump_dir)
787
 
788
- print("\nextracting images.. for future correlation use-cases ")
789
  papers = self.extract_images(papers, pdf_dir, img_dir)
790
 
791
  if tables:
792
- print("\nextracting tables.. for future correlation use-cases ")
793
  papers = self.extract_tables(papers, pdf_dir, tab_dir)
794
 
795
  return papers, ids_none
@@ -816,12 +828,12 @@ class Surveyor:
816
  '''
817
  for f, h in headings_all.items():
818
  if len(h) < 4:
819
- print("=================headings almost undetected================")
820
- print(f)
821
- print(h)
822
  '''
823
  # from pprint import pprint
824
- # pprint({f: len(h) for f,h in headings_all.items()})
825
  papers_none = [p for p in papers if p['id'] in ids_none]
826
  for p in papers_none:
827
  os.remove(txt_dir + '/'+ p['id'] + '.txt')
@@ -848,7 +860,7 @@ class Surveyor:
848
  start = headings[i]
849
  end = headings[i + 1]
850
  section = self.get_section(start, end, lines)
851
- # print(start + " : "+ str(len(section)) +" lines")
852
  '''
853
  if i > 0:
854
  old = headings[i-1]
@@ -884,19 +896,19 @@ class Surveyor:
884
  start = [i for i in range(len(lines)) if first is lines[i]][0]
885
  end = [i for i in range(len(lines)) if last is lines[i]][0]
886
  section_lines = lines[start + 1:end]
887
- # print("heading: " + str(first))
888
- # print("section_lines: "+ str(section_lines))
889
- # print(section_lines)
890
  return section_lines
891
  except ValueError:
892
- print("value error :")
893
- print("first heading :" + str(first) + ", second heading :" + str(last))
894
- print("first index :" + str(start) + ", second index :" + str(end))
895
  return ""
896
 
897
  def check_list_elems_in_list(self, headings, lines):
898
  import numpy as np
899
- # [print(head) for head in headings if head not in lines ]
900
  return np.all([True if head in lines else False for head in headings])
901
 
902
  def check_first_char_upper(self, text):
@@ -916,25 +928,25 @@ class Surveyor:
916
  assert (self.check_list_elems_in_list(headings, refined))
917
  headings = self.check_duplicates(headings)
918
 
919
- # print('===========================================')
920
- # print(txt_file +": first scan: \n"+str(len(headings))+" headings")
921
- # print('\n'.join(headings))
922
 
923
  # scan_failed - rescan with first match for abstract hook
924
  if len(headings) == 0:
925
- # print('===================')
926
- # print("run 1 failed")
927
  abs_cans = [line for line in lines if 'abstract' in re.sub("\s+", "", line.strip().lower())]
928
  if len(abs_cans) != 0:
929
  abs_head = abs_cans[0]
930
  refined, headings = self.scan_text(lines, abs_head=abs_head)
931
  self.check_list_elems_in_list(headings, refined)
932
  headings = self.check_duplicates(headings)
933
- # print('===================')
934
- # print(txt_file +": second scan: \n"+str(len(headings))+" headings")
935
 
936
  # if len(headings) == 0:
937
- # print("heading scan failed completely")
938
 
939
  return refined, headings
940
 
@@ -944,7 +956,7 @@ class Surveyor:
944
  if len(dups) > 0:
945
  [my_finallist.append(n) for n in my_list if n not in my_finallist]
946
 
947
- # print("original: "+str(len(my_list))+" new: "+str(len(my_finallist)))
948
  return my_finallist
949
 
950
  def clean_lines(self, text):
@@ -973,7 +985,7 @@ class Surveyor:
973
 
974
  def scan_text(self, lines, abs_head=None):
975
  import re
976
- # print('\n'.join(lines))
977
  record = False
978
  headings = []
979
  refined = []
@@ -994,7 +1006,7 @@ class Surveyor:
994
  refined.append(line)
995
  break
996
  refined, headings = self.scanline(record, headings, refined, i, lines)
997
- # print('=========in scan_text loop i : '+str(i)+' heading count : '+str(len(headings))+' =========')
998
  return refined, headings
999
 
1000
  def scanline(self, record, headings, refined, id, lines):
@@ -1003,22 +1015,22 @@ class Surveyor:
1003
  line = lines[id]
1004
 
1005
  if not len(line) == 0:
1006
- # print("in scanline")
1007
- # print(line)
1008
  if record:
1009
  refined.append(line)
1010
  if len(lines[id - 1]) == 0 or len(lines[id + 1]) == 0 or re.match(
1011
  "^[1-9XVIABCD]{0,4}(\.{0,1}[1-9XVIABCD]{0,4}){0, 3}\s{0,2}[A-Z][a-zA-Z\:\-\s]*$",
1012
  line) and self.char_length(line) > 7:
1013
- # print("candidate")
1014
- # print(line)
1015
  if np.mean([len(s) for s in lines[id + 2:id + 6]]) > 40 and self.check_first_char_upper(
1016
  line) and re.match("^[a-zA-Z1-9\.\:\-\s]*$", line) and len(line.split()) < 10:
1017
  # if len(line) < 20 and np.mean([len(s) for s in lines[i+1:i+5]]) > 30 :
1018
  headings.append(line)
1019
  assert (line in refined)
1020
- # print("selected")
1021
- # print(line)
1022
  else:
1023
  known_headings = ['introduction', 'conclusion', 'abstract', 'references', 'bibliography']
1024
  missing = [h for h in known_headings if not np.any([True for head in headings if h in head])]
@@ -1045,7 +1057,7 @@ class Surveyor:
1045
  for p in papers:
1046
  if p['id'] == pid:
1047
  return p
1048
- print("\npaper not found by file, \nfile: "+file+"\nall papers: "+', '.join([p['id'] for p in papers]))
1049
 
1050
 
1051
  def alpha_length(self, s):
@@ -1066,7 +1078,7 @@ class Surveyor:
1066
 
1067
  def extract_images(self, papers, pdf_dir, img_dir):
1068
  import fitz
1069
- # print("in images")
1070
  for p in papers:
1071
  file = pdf_dir + p['id'] + ".pdf"
1072
  pdf_file = fitz.open(file)
@@ -1076,10 +1088,10 @@ class Surveyor:
1076
  images.extend(page.getImageList())
1077
  images_files = [self.save_image(pdf_file.extractImage(img[0]), i, p['id'], img_dir) for i, img in
1078
  enumerate(set(images)) if img[0]]
1079
- # print(len(images_per_paper))
1080
  p['images'] = images_files
1081
- # print(len(p.keys()))
1082
- # print(papers[0].keys())
1083
  return papers
1084
 
1085
 
@@ -1105,7 +1117,7 @@ class Surveyor:
1105
  # save it to local disk
1106
  fname = img_dir + "/" + str(pid) + "_" + str(img_index + 1) + "." + image_ext
1107
  image.save(open(f"{fname}", "wb"))
1108
- # print(fname)
1109
  return fname
1110
 
1111
  def save_tables(self, dfs, pid, tab_dir):
@@ -1125,7 +1137,7 @@ class Surveyor:
1125
  for p in papers:
1126
  dfs = tabula.read_pdf(pdf_dir + p['id'] + ".pdf", pages='all', multiple_tables=True, silent=True)
1127
  p['tables'] = self.save_tables(dfs, p['id'], tab_dir)
1128
- # print(papers[0].keys())
1129
  return papers
1130
 
1131
  def extract_tables_from_file(self, pdf_file_name, tab_dir):
@@ -1179,7 +1191,7 @@ class Surveyor:
1179
  else:
1180
  discarded_ids.append(urlparse(result.entry_id).path.split('/')[-1].split('v')[0])
1181
 
1182
- print("\nPapers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))
1183
 
1184
  return results, searched_papers
1185
 
@@ -1187,8 +1199,8 @@ class Surveyor:
1187
  import arxiv
1188
  from urllib.parse import urlparse
1189
  ids = [p['id'] for p in papers]
1190
- print("\ndownloading below selected papers: ")
1191
- print(ids)
1192
  # asert(False)
1193
  papers_filtered = arxiv.Search(id_list=ids).get()
1194
  for p in papers_filtered:
@@ -1201,7 +1213,7 @@ class Surveyor:
1201
  import arxiv
1202
  from urllib.parse import urlparse
1203
  ids = [p['id'] for p in papers]
1204
- print(ids)
1205
  # asert(False)
1206
  papers_filtered = arxiv.Search(id_list=ids).get()
1207
  for p in papers_filtered:
@@ -1230,8 +1242,8 @@ class Surveyor:
1230
 
1231
 
1232
  cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
1233
- print("\ncitation-network: ")
1234
- print(cites)
1235
 
1236
  for p in papers:
1237
  p['cites'] = cites[p['id']]
@@ -1242,7 +1254,7 @@ class Surveyor:
1242
  from scholarly import scholarly
1243
  import operator
1244
  # Retrieve the author's data, fill-in, and print
1245
- print("Searching Author: " + author_query)
1246
  search_result = next(scholarly.search_author(author_query), None)
1247
 
1248
  if search_result is not None:
@@ -1262,7 +1274,7 @@ class Surveyor:
1262
  'url_picture': author['url_picture'],
1263
  }
1264
  else:
1265
- print("author not found")
1266
  author_stats = {
1267
  'name': author_query,
1268
  'affiliation': "",
@@ -1276,7 +1288,7 @@ class Surveyor:
1276
  'url_picture': "",
1277
  }
1278
 
1279
- # pprint(author_stats)
1280
  return author_stats
1281
 
1282
  def author_stats(self, papers):
@@ -1314,9 +1326,9 @@ class Surveyor:
1314
  start_positions = torch.tensor([1])
1315
  end_positions = torch.tensor([3])
1316
  outputs = self.qamodel(**inputs, start_positions=start_positions, end_positions=end_positions)
1317
- print("context: " + text)
1318
- print("question: " + question)
1319
- print("outputs: " + outputs)
1320
  return outputs
1321
 
1322
  def zip_outputs(self, dump_dir, query):
@@ -1342,10 +1354,10 @@ class Surveyor:
1342
  if not num_papers:
1343
  num_papers = self.DEFAULTS['num_papers']
1344
  # arxiv api relevance search and data preparation
1345
- print("\nsearching arXiv for top 100 papers.. ")
1346
  results, searched_papers = self.search(query, max_search=max_search)
1347
  joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
1348
- print("\nfound " + str(len(searched_papers)) + " papers")
1349
 
1350
  # paper selection by scibert vector embedding relevance scores
1351
  # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
@@ -1358,23 +1370,23 @@ class Surveyor:
1358
 
1359
  joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')
1360
 
1361
- print("\nStandardizing known section headings per paper.. ")
1362
  papers_standardized = self.standardize_headings(papers_highlighted)
1363
  joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')
1364
 
1365
- print("\nBuilding paper-wise corpus.. ")
1366
  corpus = self.build_corpus(papers_highlighted, searched_papers)
1367
  joblib.dump(corpus, self.dump_dir + 'corpus.dmp')
1368
 
1369
- print("\nBuilding section-wise corpus.. ")
1370
  corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
1371
  joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')
1372
 
1373
- print("\nBuilding basic research highlights.. ")
1374
  research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
1375
  joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')
1376
 
1377
- print("\nReducing corpus to lines.. ")
1378
  corpus_lines = self.get_corpus_lines(corpus)
1379
  joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')
1380
 
@@ -1390,67 +1402,67 @@ class Surveyor:
1390
  '''
1391
 
1392
  '''
1393
- print("papers_highlighted types:"+ str(np.unique([str(type(p['sections'][0]['highlights'])) for p in papers_highlighted])))
1394
- print("papers_highlighted example:")
1395
- print(random.sample(list(papers_highlighted), 1)[0]['sections'][0]['highlights'])
1396
- print("corpus types:"+ str(np.unique([str(type(txt)) for k,txt in corpus.items()])))
1397
- print("corpus example:")
1398
- print(random.sample(list(corpus.items()), 1)[0])
1399
- print("corpus_lines types:"+ str(np.unique([str(type(txt)) for txt in corpus_lines])))
1400
- print("corpus_lines example:")
1401
- print(random.sample(list(corpus_lines), 1)[0])
1402
- print("corpus_sectionwise types:"+ str(np.unique([str(type(txt)) for k,txt in corpus_sectionwise.items()])))
1403
- print("corpus_sectionwise example:")
1404
- print(random.sample(list(corpus_sectionwise.items()), 1)[0])
1405
- print("research_blocks types:"+ str(np.unique([str(type(txt)) for k,txt in research_blocks.items()])))
1406
- print("research_blocks example:")
1407
- print(random.sample(list(research_blocks.items()), 1)[0])
1408
  '''
1409
- # print("corpus types:"+ str(np.unique([type(txt) for k,txt in corpus.items()])))
1410
 
1411
- print("\nBuilding abstract.. ")
1412
  abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
1413
  joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
1414
  '''
1415
- print("abstract_block type:"+ str(type(abstract_block)))
1416
- print("abstract_block:")
1417
- print(abstract_block)
1418
  '''
1419
 
1420
- print("\nBuilding introduction.. ")
1421
  intro_block = self.get_intro(corpus_sectionwise, research_blocks)
1422
  joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
1423
  '''
1424
- print("intro_block type:"+ str(type(intro_block)))
1425
- print("intro_block:")
1426
- print(intro_block)
1427
  '''
1428
- print("\nBuilding custom sections.. ")
1429
  clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
1430
  joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
1431
  joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')
1432
 
1433
  '''
1434
- print("clusters extracted")
1435
- print("clustered_sentences types:"+ str(np.unique([str(type(txt)) for k,txt in clustered_sentences.items()])))
1436
- print("clustered_sentences example:")
1437
- print(random.sample(list(clustered_sections.items()), 1)[0])
1438
- print("clustered_sections types:"+ str(np.unique([str(type(txt)) for k,txt in clustered_sections.items()])))
1439
- print("clustered_sections example:")
1440
- print(random.sample(list(clustered_sections.items()), 1)[0])
1441
  '''
1442
  clustered_sections['abstract'] = abstract_block
1443
  clustered_sections['introduction'] = intro_block
1444
  joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')
1445
 
1446
- print("\nBuilding conclusion.. ")
1447
  conclusion_block = self.get_conclusion(clustered_sections)
1448
  joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
1449
  clustered_sections['conclusion'] = conclusion_block
1450
  '''
1451
- print("conclusion_block type:"+ str(type(conclusion_block)))
1452
- print("conclusion_block:")
1453
- print(conclusion_block)
1454
  '''
1455
 
1456
  survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
@@ -1460,7 +1472,7 @@ class Surveyor:
1460
  shutil.copy(self.dump_dir + survey_file, survey_file)
1461
  assert (os.path.exists(survey_file))
1462
  output_zip = self.zip_outputs(self.dump_dir, query)
1463
- print("\nSurvey complete.. \nSurvey file path :" + os.path.abspath(
1464
  survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))
1465
 
1466
  return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)
 
                 kw_model_name=None,
                 high_gpu=False,
                 refresh_models=False,
+                no_save_models=False,
+                print_fn=None
                 ):
        '''
        Initializes models and directory structure for the surveyor
 
        - num_papers: int maximium number of papers to download and analyse - defaults to 25
 
        '''
+       self.print_fn = print
+       if print_fn is not None:
+           self.print_fn = print_fn
+
        self.torch_device = 'cpu'
+       self.print_fn("\nTorch_device: " + self.torch_device)
        if torch.cuda.is_available():
+           self.print_fn("\nloading defaults for gpu")
            self.torch_device = 'cuda'
            spacy.require_gpu()
 
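
Because print_fn defaults to the builtin print and is only ever called with a single string, any writer can be injected without further changes to the class. An illustrative sketch (the logging setup below is an assumption, not part of this commit):

    import logging

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("surveyor")

    # print, streamlit's st.write, or a logger method are interchangeable here,
    # since Surveyor only ever calls self.print_fn(<one string>)
    surveyor = Surveyor(print_fn=log.info)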
 
114
  similarity_nlp_name = self.DEFAULTS["similarity_nlp_name"]
115
 
116
  if refresh_models or not models_found:
117
+ self.print_fn(f'\nInitializing models {"and saving (about 5GB)" if not no_save_models else ""}')
118
  if not no_save_models:
119
  self.clean_dirs([models_dir])
120
 
 
153
  if not no_save_models:
154
  self.embedder.save(models_dir + "/embedder")
155
  else:
156
+ self.print_fn("\nInitializing from previously saved models at" + models_dir)
157
  self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
158
  self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
159
  self.title_model.eval()
 
235
 
236
  papers = papers_meta[:self.num_papers]
237
  selected_papers = papers
238
+ self.print_fn("\nFirst stage paper collection...")
239
  ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
240
+ self.print_fn("\nFirst stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
241
  new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
242
  _ = self.get_freq_cited(cites)
243
  '''
 
248
  new_papers.extend(new_searched_papers)
249
  '''
250
  selected_papers.extend(new_papers)
251
+ self.print_fn("\nSecond stage paper collection...")
252
  _, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
253
+ self.print_fn("\nSecond stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
254
  papers.extend(new_papers)
255
 
256
  joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp')
257
  copy_tree(img_dir, dump_dir + os.path.basename(img_dir))
258
  copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir))
259
 
260
+ self.print_fn("\nExtracting section-wise highlights.. ")
261
  papers = self.extract_highlights(papers)
262
 
263
  return papers, selected_papers
 
270
  [cites_list.append(val) for val in v]
271
  cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
272
  sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
273
+ self.print_fn("\nThe most cited paper ids are:\n" + str(sorted_cites))
274
 
275
  return sorted_cites.keys()
276
 
 
280
 
281
  if repeat:
282
  with tempfile.TemporaryDirectory() as dirpath:
283
+ self.print_fn("\n- downloading extra pdfs.. ")
284
  # full text preparation of selected papers
285
  self.download_pdfs(papers, dirpath)
286
  dirpath_pdfs = os.listdir(dirpath)
 
288
  full_file_name = os.path.join(dirpath, file_name)
289
  if os.path.isfile(full_file_name):
290
  shutil.copy(full_file_name, pdf_dir)
291
+ self.print_fn("\n- converting extra pdfs.. ")
292
  self.convert_pdfs(dirpath, txt_dir)
293
  else:
294
+ self.print_fn("\n- downloading pdfs.. ")
295
  # full text preparation of selected papers
296
  self.download_pdfs(papers, pdf_dir)
297
+ self.print_fn("\n- converting pdfs.. ")
298
  self.convert_pdfs(pdf_dir, txt_dir)
299
  # plugging citations to our papers object
300
+ self.print_fn("\n- plugging in citation network.. ")
301
  papers, cites = self.cocitation_network(papers, txt_dir)
302
  joblib.dump(papers, dump_dir + 'papers_selected_pdf_route.dmp')
303
  from distutils.dir_util import copy_tree
304
  copy_tree(txt_dir, dump_dir + os.path.basename(txt_dir))
305
  copy_tree(pdf_dir, dump_dir + os.path.basename(pdf_dir))
306
+ self.print_fn("\n- extracting structure.. ")
307
  papers, ids_none = self.extract_structure(papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir)
308
  return ids_none, papers, cites
309
 
 
333
  def build_doc(self, research_sections, papers, query=None, filename='survey.txt'):
334
 
335
  import arxiv2bib
336
+ self.print_fn("\nbuilding bibliography entries.. ")
337
  bibentries = arxiv2bib.arxiv2bib([p['id'] for p in papers])
338
  bibentries = [r.bibtex() for r in bibentries]
339
 
340
+ self.print_fn("\nbuilding final survey file .. at "+ filename)
341
  file = open(filename, 'w+')
342
  if query is None:
343
  query = 'Internal(existing) research'
344
  file.write("----------------------------------------------------------------------")
345
  file.write("Title: A survey on " + query)
346
+ self.print_fn("")
347
+ self.print_fn("----------------------------------------------------------------------")
348
+ self.print_fn("Title: A survey on " + query)
349
  file.write("Author: Auto-Research (github.com/sidphbot/Auto-Research)")
350
+ self.print_fn("Author: Auto-Research (github.com/sidphbot/Auto-Research)")
351
  file.write("Dev: Auto-Research (github.com/sidphbot/Auto-Research)")
352
+ self.print_fn("Dev: Auto-Research (github.com/sidphbot/Auto-Research)")
353
  file.write("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+
354
  "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+
355
  "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+
356
  "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+
357
  "\nentries(only to avoid LaTex overhead). ")
358
+ self.print_fn("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+
359
  "\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+
360
  "\nmined with proper citations. As All of the text is practically quoted texted, hence to "+
361
  "\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+
362
  "\nentries(only to avoid LaTex overhead). ")
363
  file.write("----------------------------------------------------------------------")
364
+ self.print_fn("----------------------------------------------------------------------")
365
  file.write("")
366
+ self.print_fn("")
367
  file.write('ABSTRACT')
368
+ self.print_fn('ABSTRACT')
369
+ self.print_fn("=================================================")
370
  file.write("=================================================")
371
  file.write("")
372
+ self.print_fn("")
373
  file.write(research_sections['abstract'])
374
+ self.print_fn(research_sections['abstract'])
375
  file.write("")
376
+ self.print_fn("")
377
  file.write('INTRODUCTION')
378
+ self.print_fn('INTRODUCTION')
379
+ self.print_fn("=================================================")
380
  file.write("=================================================")
381
  file.write("")
382
+ self.print_fn("")
383
  file.write(research_sections['introduction'])
384
+ self.print_fn(research_sections['introduction'])
385
  file.write("")
386
+ self.print_fn("")
387
  for k, v in research_sections.items():
388
  if k not in ['abstract', 'introduction', 'conclusion']:
389
  file.write(k.upper())
390
+ self.print_fn(k.upper())
391
+ self.print_fn("=================================================")
392
  file.write("=================================================")
393
  file.write("")
394
+ self.print_fn("")
395
  file.write(v)
396
+ self.print_fn(v)
397
  file.write("")
398
+ self.print_fn("")
399
  file.write('CONCLUSION')
400
+ self.print_fn('CONCLUSION')
401
+ self.print_fn("=================================================")
402
  file.write("=================================================")
403
  file.write("")
404
+ self.print_fn("")
405
  file.write(research_sections['conclusion'])
406
+ self.print_fn(research_sections['conclusion'])
407
  file.write("")
408
+ self.print_fn("")
409
 
410
  file.write('REFERENCES')
411
+ self.print_fn('REFERENCES')
412
+ self.print_fn("=================================================")
413
  file.write("=================================================")
414
  file.write("")
415
+ self.print_fn("")
416
  for entry in bibentries:
417
  file.write(entry)
418
+ self.print_fn(entry)
419
  file.write("")
420
+ self.print_fn("")
421
+ self.print_fn("========================XXX=========================")
422
  file.write("========================XXX=========================")
423
  file.close()
424
 
 
 
        research_blocks = {}
        for head, textarr in corpus_known_sections.items():
+           if 'cuda' in self.torch_device:
+               torch.cuda.empty_cache()
+           # self.print_fn(head.upper())
            with torch.no_grad():
                summtext = self.model(" ".join([l.lower() for l in textarr]), ratio=0.5)
            res = self.nlp(summtext)
            res = set([str(sent) for sent in list(res.sents)])
            summtext = ''.join([line for line in res])
+           # pself.print_fn(summtext)
            research_blocks[head] = summtext
 
        return research_blocks
 
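
The same device guard recurs at every empty_cache() call site touched by this commit. A standalone sketch of the idiom, assuming the 'cpu'/'cuda' string that Surveyor keeps in self.torch_device (the helper name is illustrative):

    import torch

    def maybe_free_gpu_memory(device: str) -> None:
        # Releasing cached CUDA memory only makes sense when the models
        # actually run on a GPU; on CPU the call is skipped entirely.
        if 'cuda' in device:
            torch.cuda.empty_cache()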
450
  sequences = ledmodel.generate(input_ids, global_attention_mask=global_attention_mask).sequences
451
  summary = ledtokenizer.batch_decode(sequences)
452
  '''
453
+ if 'cuda' in self.torch_device:
454
+ torch.cuda.empty_cache()
455
  inputs = self.ledtokenizer.prepare_seq2seq_batch(longtext, truncation=True, padding='longest',
456
  return_tensors='pt').to(self.torch_device)
457
  with torch.no_grad():
 
461
  res = self.nlp(summary[0])
462
  res = set([str(sent) for sent in list(res.sents)])
463
  summtext = ''.join([line for line in res])
464
+ #self.print_fn("abstractive summary type:" + str(type(summary)))
465
  return summtext
466
 
467
  def get_abstract(self, abs_lines, corpus_known_sections, research_blocks):
 
470
  abs_lines = ""
471
  abs_lines += " ".join([l.lower() for l in corpus_known_sections['abstract']])
472
  abs_lines += research_blocks['abstract']
473
+ # self.print_fn(abs_lines)
474
 
475
  try:
476
  return self.abstractive_summary(abs_lines)
 
482
  abs_lines = []
483
  types = set()
484
  for k, v in corpus.items():
485
+ # self.print_fn(v)
486
  types.add(type(v))
487
  abstext = k + '. ' + v.replace('\n', ' ')
488
  abstext = self.nlp(abstext)
489
  abs_lines.extend([str(sent).lower() for sent in list(abstext.sents)])
490
+ #self.print_fn("unique corpus value types:" + str(types))
491
  # abs_lines = '\n'.join([str(sent) for sent in abs_lines.sents])
492
  return abs_lines
493
 
 
508
  if p['id'] not in selected_pids:
509
  meta_abs.append(self.generate_title(p['abstract']))
510
  docs.extend(meta_abs)
511
+ #self.print_fn("meta_abs num"+str(len(meta_abs)))
512
+ #self.print_fn("selected_pids num"+str(len(selected_pids)))
513
+ #self.print_fn("papers_meta num"+str(len(papers_meta)))
514
  #assert (len(meta_abs) + len(selected_pids) == len(papers_meta))
515
  assert ('str' in str(type(random.sample(docs, 1)[0])))
516
  return [doc for doc in docs if doc != '']
 
520
  from sklearn.cluster import KMeans
521
  # from bertopic import BERTopic
522
  # topic_model = BERTopic(embedding_model=embedder)
523
+ if 'cuda' in self.torch_device:
524
+ torch.cuda.empty_cache()
525
  corpus_embeddings = self.embedder.encode(abs_lines)
526
  # Normalize the embeddings to unit length
527
  corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
 
541
  clustered_sentences[cluster_id] = []
542
  '''
543
  if dummy_count < 5:
544
+ self.print_fn("abs_line: "+abs_lines[sentence_id])
545
+ self.print_fn("cluster_ID: "+str(cluster_id))
546
+ self.print_fn("embedding: "+str(corpus_embeddings[sentence_id]))
547
  dummy_count += 1
548
  '''
549
  clustered_sentences[cluster_id].append(abs_lines[sentence_id])
550
 
551
  # for i, cluster in clustered_sentences.items():
552
+ # self.print_fn("Cluster ", i+1)
553
+ # self.print_fn(cluster)
554
+ # self.print_fn("")
555
 
556
  return self.get_clustered_sections(clustered_sentences), clustered_sentences
557
 
 
560
  from sklearn.cluster import KMeans
561
  # from bertopic import BERTopic
562
  # topic_model = BERTopic(embedding_model=embedder)
563
+ if 'cuda' in self.torch_device:
564
+ torch.cuda.empty_cache()
565
  abs_lines = self.get_sectioned_docs(papers, papers_meta)
566
  corpus_embeddings = self.embedder.encode(abs_lines)
567
  # Normalize the embeddings to unit length
 
582
  clustered_sentences[cluster_id] = []
583
  '''
584
  if dummy_count < 5:
585
+ self.print_fn("abs_line: "+abs_lines[sentence_id])
586
+ self.print_fn("cluster_ID: "+str(cluster_id))
587
+ self.print_fn("embedding: "+str(corpus_embeddings[sentence_id]))
588
  dummy_count += 1
589
  '''
590
  clustered_sentences[cluster_id].append(abs_lines[sentence_id])
591
 
592
  # for i, cluster in clustered_sentences.items():
593
+ # self.print_fn("Cluster ", i+1)
594
+ # self.print_fn(cluster)
595
+ # self.print_fn("")
596
 
597
  return self.get_clustered_sections(clustered_sentences), clustered_sentences
598
 
599
  def generate_title(self, longtext):
600
+ if 'cuda' in self.torch_device:
601
+ torch.cuda.empty_cache()
602
 
603
  inputs = self.title_tokenizer.prepare_seq2seq_batch(longtext, truncation=True, padding='longest',
604
  return_tensors='pt').to(self.torch_device)
 
612
  def get_clustered_sections(self, clustered_lines):
613
  clusters_dict = {}
614
  for i, cluster in clustered_lines.items():
615
+ # self.print_fn(cluster)
616
  try:
617
  clusters_dict[self.generate_title(str(" ".join(cluster)))] = self.abstractive_summary(
618
  str(" ".join(cluster)).lower())
 
651
  for section in p['sections']:
652
  if kh in section['heading']:
653
  khtext.extend(section['highlights'])
654
+ # self.print_fn(khtext)
655
  corpus_known_sections[kh] = khtext
656
  return corpus_known_sections
657
 
 
659
  known = ['abstract', 'introduction', 'discussion', 'relatedwork', 'contribution', 'analysis', 'experiments',
660
  'conclusion']
661
  for p in papers:
662
+ # self.print_fn("================================")
663
  headings = [section['heading'] for section in p['sections'] if len(section['heading'].split()) < 3]
664
+ # self.print_fn("id: "+ str(p['id'])+"\nHeadings: \n"+str('\n'.join(headings)))
665
  for kh in known:
666
  for section in p['sections']:
667
  if len(section['heading'].split()) < 3:
668
+ # self.print_fn(section['heading'])
669
  if kh in ''.join(filter(str.isalpha, section['heading'].replace(' ', '').lower())):
670
+ # self.print_fn("orig head: "+ section['heading'] +", plain head:" + kh)
671
  section['heading'] = kh
672
  return papers
673
 
 
681
  if pid == p['id']:
682
  corpus[pid] = p['abstract'] + str(' '.join(ph))
683
  '''
684
+ self.print_fn("================== final corpus ====================")
685
+ self.print_fn('\n'.join([str("paper: "+ get_by_pid(pid, papers_meta)['title']+" \nhighlight count: " + str(len(phs))) for pid, phs in corpus.items()]))
686
+ self.print_fn("======== sample point ========")
687
  p = random.choice(list(papers))
688
+ self.print_fn("paper: "+ p['title']+" \nhighlights: " + str(corpus[p['id']]))
689
+ self.print_fn("======== sample meta point ========")
690
  p = random.choice(list(papers_meta))
691
+ self.print_fn("meta paper: "+ p['title']+" \nhighlights: " + str(corpus[p['id']]))
692
  '''
693
  return corpus
694
 
 
700
  def build_meta_corpus(self, papers):
701
  meta_corpus = {}
702
  for p in papers:
703
+ # pself.print_fn(p)
704
  pid = p['id']
705
  ptext = p['title'] + ". " + p['abstract']
706
  doc = self.nlp(ptext)
707
  phs, _, _ = self.extractive_highlights([str(sent) for sent in list(doc.sents)])
708
  meta_corpus[pid] = str(' '.join(phs))
709
  '''
710
+ self.print_fn("================== meta corpus ====================")
711
+ self.print_fn('\n'.join([str("paper: "+ get_by_pid(pid, papers)['title']+" \nhighlight count: " + str(len(phs))) for pid, phs in meta_corpus.items()]))
712
+ self.print_fn("======== sample point ========")
713
  p = random.choice(list(papers))
714
+ self.print_fn("paper: "+ p['title']+" \nhighlights: " + str(meta_corpus[p['id']]))
715
  '''
716
  return meta_corpus
717
 
718
  def select_papers(self, papers, query, num_papers=20):
719
  import numpy as np
720
+ # self.print_fn("paper sample: ")
721
+ # self.print_fn(papers)
722
  meta_corpus = self.build_meta_corpus(papers)
723
  scores = []
724
  pids = []
 
726
  score = self.text_para_similarity(query, highlights)
727
  scores.append(score)
728
  pids.append(id)
729
+ self.print_fn("corpus item: " + str(self.get_by_pid(id, papers)['title']))
730
 
731
  idx = np.argsort(scores)[:num_papers]
732
  #for i in range(len(scores)):
733
+ # self.print_fn("paper: " + str(self.get_by_pid(pids[i], papers)['title']))
734
+ # self.print_fn("score: " + str(scores[i]))
735
+ # self.print_fn("argsort ids("+str(num_papers)+" papers): "+ str(idx))
736
  idx = [pids[i] for i in idx]
737
+ # self.print_fn("argsort pids("+str(num_papers)+" papers): "+ str(idx))
738
  papers_selected = [p for p in papers if p['id'] in idx]
739
  # assert(len(papers_selected)==num_papers)
740
+ self.print_fn("num papers selected: " + str(len(papers_selected)))
741
  for p in papers_selected:
742
+ self.print_fn("Selected Paper: " + p['title'])
743
 
744
+ self.print_fn("constrast with natural selection: forward")
745
  for p in papers[:4]:
746
+ self.print_fn("Selected Paper: " + p['title'])
747
+ self.print_fn("constrast with natural selection: backward")
748
  for p in papers[-4:]:
749
+ self.print_fn("Selected Paper: " + p['title'])
750
  # arxiv search producing better relevnce
751
  return papers_selected
752
 
753
  def extractive_summary(self, text):
754
+ if 'cuda' in self.torch_device:
755
+ torch.cuda.empty_cache()
756
  with torch.no_grad():
757
  res = self.model(text, ratio=0.5)
758
  res_doc = self.nlp(res)
 
762
  # text = " ".join(lines)
763
  # text_doc = nlp(" ".join([l.lower() for l in lines]))
764
  # text = ' '.join([ str(sent) for sent in list(text_doc.sents)])
765
+ if 'cuda' in self.torch_device:
766
+ torch.cuda.empty_cache()
767
  with torch.no_grad():
768
  res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, )
769
  res_doc = self.nlp(res)
770
  res_lines = set([str(sent) for sent in list(res_doc.sents)])
771
+ # self.print_fn("\n".join(res_sents))
772
  with torch.no_grad():
773
  keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english')
774
  keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])),
 
794
  return papers
795
 
796
  def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False):
797
+ self.print_fn("\nextracting sections.. ")
798
  papers, ids_none = self.extract_parts(papers, txt_dir, dump_dir)
799
 
800
+ self.print_fn("\nextracting images.. for future correlation use-cases ")
801
  papers = self.extract_images(papers, pdf_dir, img_dir)
802
 
803
  if tables:
804
+ self.print_fn("\nextracting tables.. for future correlation use-cases ")
805
  papers = self.extract_tables(papers, pdf_dir, tab_dir)
806
 
807
  return papers, ids_none
 
828
  '''
829
  for f, h in headings_all.items():
830
  if len(h) < 4:
831
+ self.print_fn("=================headings almost undetected================")
832
+ self.print_fn(f)
833
+ self.print_fn(h)
834
  '''
835
  # from pprint import pprint
836
+ # pself.print_fn({f: len(h) for f,h in headings_all.items()})
837
  papers_none = [p for p in papers if p['id'] in ids_none]
838
  for p in papers_none:
839
  os.remove(txt_dir + '/'+ p['id'] + '.txt')
 
860
  start = headings[i]
861
  end = headings[i + 1]
862
  section = self.get_section(start, end, lines)
863
+ # self.print_fn(start + " : "+ str(len(section)) +" lines")
864
  '''
865
  if i > 0:
866
  old = headings[i-1]
 
896
  start = [i for i in range(len(lines)) if first is lines[i]][0]
897
  end = [i for i in range(len(lines)) if last is lines[i]][0]
898
  section_lines = lines[start + 1:end]
899
+ # self.print_fn("heading: " + str(first))
900
+ # self.print_fn("section_lines: "+ str(section_lines))
901
+ # self.print_fn(section_lines)
902
  return section_lines
903
  except ValueError:
904
+ self.print_fn("value error :")
905
+ self.print_fn("first heading :" + str(first) + ", second heading :" + str(last))
906
+ self.print_fn("first index :" + str(start) + ", second index :" + str(end))
907
  return ""
908
 
909
  def check_list_elems_in_list(self, headings, lines):
910
  import numpy as np
911
+ # [self.print_fn(head) for head in headings if head not in lines ]
912
  return np.all([True if head in lines else False for head in headings])
913
 
914
  def check_first_char_upper(self, text):
 
928
  assert (self.check_list_elems_in_list(headings, refined))
929
  headings = self.check_duplicates(headings)
930
 
931
+ # self.print_fn('===========================================')
932
+ # self.print_fn(txt_file +": first scan: \n"+str(len(headings))+" headings")
933
+ # self.print_fn('\n'.join(headings))
934
 
935
  # scan_failed - rescan with first match for abstract hook
936
  if len(headings) == 0:
937
+ # self.print_fn('===================')
938
+ # self.print_fn("run 1 failed")
939
  abs_cans = [line for line in lines if 'abstract' in re.sub("\s+", "", line.strip().lower())]
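  # fallback pass: if any line contains 'abstract' (whitespace ignored), rescan using that line as the anchor heading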
940
  if len(abs_cans) != 0:
941
  abs_head = abs_cans[0]
942
  refined, headings = self.scan_text(lines, abs_head=abs_head)
943
  self.check_list_elems_in_list(headings, refined)
944
  headings = self.check_duplicates(headings)
945
+ # self.print_fn('===================')
946
+ # self.print_fn(txt_file +": second scan: \n"+str(len(headings))+" headings")
947
 
948
  # if len(headings) == 0:
949
+ # self.print_fn("heading scan failed completely")
950
 
951
  return refined, headings
952
 
 
956
  if len(dups) > 0:
957
  [my_finallist.append(n) for n in my_list if n not in my_finallist]
958
 
959
+ # self.print_fn("original: "+str(len(my_list))+" new: "+str(len(my_finallist)))
960
  return my_finallist
961
 
962
  def clean_lines(self, text):
 
985
 
986
  def scan_text(self, lines, abs_head=None):
987
  import re
988
+ # self.print_fn('\n'.join(lines))
989
  record = False
990
  headings = []
991
  refined = []
 
1006
  refined.append(line)
1007
  break
1008
  refined, headings = self.scanline(record, headings, refined, i, lines)
1009
+ # self.print_fn('=========in scan_text loop i : '+str(i)+' heading count : '+str(len(headings))+' =========')
1010
  return refined, headings
1011
 
1012
  def scanline(self, record, headings, refined, id, lines):
 
1015
  line = lines[id]
1016
 
1017
  if not len(line) == 0:
1018
+ # self.print_fn("in scanline")
1019
+ # self.print_fn(line)
1020
  if record:
1021
  refined.append(line)
1022
  if len(lines[id - 1]) == 0 or len(lines[id + 1]) == 0 or re.match(
1023
  "^[1-9XVIABCD]{0,4}(\.{0,1}[1-9XVIABCD]{0,4}){0, 3}\s{0,2}[A-Z][a-zA-Z\:\-\s]*$",
1024
  line) and self.char_length(line) > 7:
1025
+ # self.print_fn("candidate")
1026
+ # self.print_fn(line)
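  # a candidate becomes a heading only if the following lines are long on average (> 40 chars), it starts uppercase, uses plain heading characters and has fewer than 10 words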
1027
  if np.mean([len(s) for s in lines[id + 2:id + 6]]) > 40 and self.check_first_char_upper(
1028
  line) and re.match("^[a-zA-Z1-9\.\:\-\s]*$", line) and len(line.split()) < 10:
1029
  # if len(line) < 20 and np.mean([len(s) for s in lines[i+1:i+5]]) > 30 :
1030
  headings.append(line)
1031
  assert (line in refined)
1032
+ # self.print_fn("selected")
1033
+ # self.print_fn(line)
1034
  else:
1035
  known_headings = ['introduction', 'conclusion', 'abstract', 'references', 'bibliography']
1036
  missing = [h for h in known_headings if not np.any([True for head in headings if h in head])]
 
1057
  for p in papers:
1058
  if p['id'] == pid:
1059
  return p
1060
+ self.print_fn("\npaper not found by file, \nfile: "+file+"\nall papers: "+', '.join([p['id'] for p in papers]))
1061
 
1062
 
1063
  def alpha_length(self, s):
 
1078
 
1079
  def extract_images(self, papers, pdf_dir, img_dir):
1080
  import fitz
1081
+ # self.print_fn("in images")
1082
  for p in papers:
1083
  file = pdf_dir + p['id'] + ".pdf"
1084
  pdf_file = fitz.open(file)
 
1088
  images.extend(page.getImageList())
1089
  images_files = [self.save_image(pdf_file.extractImage(img[0]), i, p['id'], img_dir) for i, img in
1090
  enumerate(set(images)) if img[0]]
1091
+ # self.print_fn(len(images_per_paper))
1092
  p['images'] = images_files
1093
+ # self.print_fn(len(p.keys()))
1094
+ # self.print_fn(papers[0].keys())
1095
  return papers
1096
 
1097
 
 
1117
  # save it to local disk
1118
  fname = img_dir + "/" + str(pid) + "_" + str(img_index + 1) + "." + image_ext
1119
  image.save(open(f"{fname}", "wb"))
1120
+ # self.print_fn(fname)
1121
  return fname
1122
 
1123
  def save_tables(self, dfs, pid, tab_dir):
 
1137
  for p in papers:
1138
  dfs = tabula.read_pdf(pdf_dir + p['id'] + ".pdf", pages='all', multiple_tables=True, silent=True)
1139
  p['tables'] = self.save_tables(dfs, p['id'], tab_dir)
1140
+ # self.print_fn(papers[0].keys())
1141
  return papers
1142
 
1143
  def extract_tables_from_file(self, pdf_file_name, tab_dir):
 
1191
  else:
1192
  discarded_ids.append(urlparse(result.entry_id).path.split('/')[-1].split('v')[0])
1193
 
1194
+ self.print_fn("\nPapers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))
1195
 
1196
  return results, searched_papers
1197
 
 
1199
  import arxiv
1200
  from urllib.parse import urlparse
1201
  ids = [p['id'] for p in papers]
1202
+ self.print_fn("\ndownloading below selected papers: ")
1203
+ self.print_fn(ids)
1204
  # assert(False)
1205
  papers_filtered = arxiv.Search(id_list=ids).get()
1206
  for p in papers_filtered:
 
1213
  import arxiv
1214
  from urllib.parse import urlparse
1215
  ids = [p['id'] for p in papers]
1216
+ self.print_fn(ids)
1217
  # assert(False)
1218
  papers_filtered = arxiv.Search(id_list=ids).get()
1219
  for p in papers_filtered:
 
1242
 
1243
 
1244
  cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
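  # cites maps each paper id to the other corpus papers it references (see the loop below); extraction runs in parallel across all CPU cores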
1245
+ self.print_fn("\ncitation-network: ")
1246
+ self.print_fn(cites)
1247
 
1248
  for p in papers:
1249
  p['cites'] = cites[p['id']]
 
1254
  from scholarly import scholarly
1255
  import operator
1256
  # Retrieve the author's data, fill-in, and print
1257
+ self.print_fn("Searching Author: " + author_query)
1258
  search_result = next(scholarly.search_author(author_query), None)
1259
 
1260
  if search_result is not None:
 
1274
  'url_picture': author['url_picture'],
1275
  }
1276
  else:
1277
+ self.print_fn("author not found")
1278
  author_stats = {
1279
  'name': author_query,
1280
  'affiliation': "",
 
1288
  'url_picture': "",
1289
  }
1290
 
1291
+ # self.print_fn(author_stats)
1292
  return author_stats
1293
 
1294
  def author_stats(self, papers):
 
1326
  start_positions = torch.tensor([1])
1327
  end_positions = torch.tensor([3])
1328
  outputs = self.qamodel(**inputs, start_positions=start_positions, end_positions=end_positions)
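  # start_positions/end_positions are fixed placeholder labels; with a transformers QA head this makes outputs include a loss alongside the start/end span logits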
1329
+ self.print_fn("context: " + text)
1330
+ self.print_fn("question: " + question)
1331
+ self.print_fn("outputs: " + outputs)
1332
  return outputs
1333
 
1334
  def zip_outputs(self, dump_dir, query):
 
1354
  if not num_papers:
1355
  num_papers = self.DEFAULTS['num_papers']
1356
  # arxiv api relevance search and data preparation
1357
+ self.print_fn("\nsearching arXiv for top 100 papers.. ")
1358
  results, searched_papers = self.search(query, max_search=max_search)
1359
  joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
1360
+ self.print_fn("\nfound " + str(len(searched_papers)) + " papers")
1361
 
1362
  # paper selection by scibert vector embedding relevance scores
1363
  # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
 
1370
 
1371
  joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')
1372
 
1373
+ self.print_fn("\nStandardizing known section headings per paper.. ")
1374
  papers_standardized = self.standardize_headings(papers_highlighted)
1375
  joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')
1376
 
1377
+ self.print_fn("\nBuilding paper-wise corpus.. ")
1378
  corpus = self.build_corpus(papers_highlighted, searched_papers)
1379
  joblib.dump(corpus, self.dump_dir + 'corpus.dmp')
1380
 
1381
+ self.print_fn("\nBuilding section-wise corpus.. ")
1382
  corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
1383
  joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')
1384
 
1385
+ self.print_fn("\nBuilding basic research highlights.. ")
1386
  research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
1387
  joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')
1388
 
1389
+ self.print_fn("\nReducing corpus to lines.. ")
1390
  corpus_lines = self.get_corpus_lines(corpus)
1391
  joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')
1392
 
 
1402
  '''
1403
 
1404
  '''
1405
+ self.print_fn("papers_highlighted types:"+ str(np.unique([str(type(p['sections'][0]['highlights'])) for p in papers_highlighted])))
1406
+ self.print_fn("papers_highlighted example:")
1407
+ self.print_fn(random.sample(list(papers_highlighted), 1)[0]['sections'][0]['highlights'])
1408
+ self.print_fn("corpus types:"+ str(np.unique([str(type(txt)) for k,txt in corpus.items()])))
1409
+ self.print_fn("corpus example:")
1410
+ self.print_fn(random.sample(list(corpus.items()), 1)[0])
1411
+ self.print_fn("corpus_lines types:"+ str(np.unique([str(type(txt)) for txt in corpus_lines])))
1412
+ self.print_fn("corpus_lines example:")
1413
+ self.print_fn(random.sample(list(corpus_lines), 1)[0])
1414
+ self.print_fn("corpus_sectionwise types:"+ str(np.unique([str(type(txt)) for k,txt in corpus_sectionwise.items()])))
1415
+ self.print_fn("corpus_sectionwise example:")
1416
+ self.print_fn(random.sample(list(corpus_sectionwise.items()), 1)[0])
1417
+ self.print_fn("research_blocks types:"+ str(np.unique([str(type(txt)) for k,txt in research_blocks.items()])))
1418
+ self.print_fn("research_blocks example:")
1419
+ self.print_fn(random.sample(list(research_blocks.items()), 1)[0])
1420
  '''
1421
+ # self.print_fn("corpus types:"+ str(np.unique([type(txt) for k,txt in corpus.items()])))
1422
 
1423
+ self.print_fn("\nBuilding abstract.. ")
1424
  abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
1425
  joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
1426
  '''
1427
+ self.print_fn("abstract_block type:"+ str(type(abstract_block)))
1428
+ self.print_fn("abstract_block:")
1429
+ self.print_fn(abstract_block)
1430
  '''
1431
 
1432
+ self.print_fn("\nBuilding introduction.. ")
1433
  intro_block = self.get_intro(corpus_sectionwise, research_blocks)
1434
  joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
1435
  '''
1436
+ self.print_fn("intro_block type:"+ str(type(intro_block)))
1437
+ self.print_fn("intro_block:")
1438
+ self.print_fn(intro_block)
1439
  '''
1440
+ self.print_fn("\nBuilding custom sections.. ")
1441
  clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
1442
  joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
1443
  joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')
1444
 
1445
  '''
1446
+ self.print_fn("clusters extracted")
1447
+ self.print_fn("clustered_sentences types:"+ str(np.unique([str(type(txt)) for k,txt in clustered_sentences.items()])))
1448
+ self.print_fn("clustered_sentences example:")
1449
+ self.print_fn(random.sample(list(clustered_sentences.items()), 1)[0])
1450
+ self.print_fn("clustered_sections types:"+ str(np.unique([str(type(txt)) for k,txt in clustered_sections.items()])))
1451
+ self.print_fn("clustered_sections example:")
1452
+ self.print_fn(random.sample(list(clustered_sections.items()), 1)[0])
1453
  '''
1454
  clustered_sections['abstract'] = abstract_block
1455
  clustered_sections['introduction'] = intro_block
1456
  joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')
1457
 
1458
+ self.print_fn("\nBuilding conclusion.. ")
1459
  conclusion_block = self.get_conclusion(clustered_sections)
1460
  joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
1461
  clustered_sections['conclusion'] = conclusion_block
1462
  '''
1463
+ self.print_fn("conclusion_block type:"+ str(type(conclusion_block)))
1464
+ self.print_fn("conclusion_block:")
1465
+ self.print_fn(conclusion_block)
1466
  '''
1467
 
1468
  survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
 
1472
  shutil.copy(self.dump_dir + survey_file, survey_file)
1473
  assert (os.path.exists(survey_file))
1474
  output_zip = self.zip_outputs(self.dump_dir, query)
1475
+ self.print_fn("\nSurvey complete.. \nSurvey file path :" + os.path.abspath(
1476
  survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))
1477
 
1478
  return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)