sidphbot committed on
Commit 3e308f4
1 Parent(s): a2d09f1
Files changed (1)
  1. src/Surveyor.py +24 -24
src/Surveyor.py CHANGED
@@ -159,7 +159,7 @@ class Surveyor:
             if not no_save_models:
                 self.embedder.save(models_dir + "/embedder")
         else:
-            self.print_fn("\n-Initializing from previously saved models at" + models_dir)
+            self.print_fn("\n- Initializing from previously saved models at" + models_dir)
             self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
             self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
             self.title_model.eval()
@@ -245,7 +245,7 @@ class Surveyor:
         papers = papers_meta[:self.num_papers]
         selected_papers = papers
         ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
-        self.print_fn("\n-First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
+        self.print_fn("\n- First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
         new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
         # _ = self.get_freq_cited(cites)
         '''
@@ -257,14 +257,14 @@ class Surveyor:
         '''
         selected_papers.extend(new_papers)
         _, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
-        self.print_fn("\n-Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
+        self.print_fn("\n- Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
         papers.extend(new_papers)

         joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp')
         copy_tree(img_dir, dump_dir + os.path.basename(img_dir))
         copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir))

-        self.print_fn("\n-Extracting section-wise highlights.. ")
+        self.print_fn("\n- Extracting section-wise highlights.. ")
         papers = self.extract_highlights(papers)

         return papers, selected_papers, cites
@@ -339,11 +339,11 @@ class Surveyor:
     def build_doc(self, research_sections, papers, query=None, filename='survey.txt'):

         import arxiv2bib
-        self.print_fn("\n-building bibliography entries.. ")
+        self.print_fn("\n- building bibliography entries.. ")
         bibentries = arxiv2bib.arxiv2bib([p['id'] for p in papers])
         bibentries = [r.bibtex() for r in bibentries]

-        self.print_fn("\n-building final survey file .. at "+ filename)
+        self.print_fn("\n- building final survey file .. at "+ filename)
         file = open(filename, 'w+')
         if query is None:
             query = 'Internal(existing) research'
@@ -772,7 +772,7 @@ class Surveyor:
         res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, )
         res_doc = self.nlp(res)
         res_lines = set([str(sent) for sent in list(res_doc.sents)])
-        # self.print_fn("\n-".join(res_sents))
+        # self.print_fn("\n- ".join(res_sents))
         with torch.no_grad():
             keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english')
             keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])),
@@ -798,14 +798,14 @@ class Surveyor:
         return papers

     def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False):
-        self.print_fn("\n-extracting sections.. ")
+        self.print_fn("\n- extracting sections.. ")
         papers, ids_none = self.extract_parts(papers, txt_dir, dump_dir)

-        self.print_fn("\n-extracting images.. for future correlation use-cases ")
+        self.print_fn("\n- extracting images.. for future correlation use-cases ")
         papers = self.extract_images(papers, pdf_dir, img_dir)

         if tables:
-            self.print_fn("\n-extracting tables.. for future correlation use-cases ")
+            self.print_fn("\n- extracting tables.. for future correlation use-cases ")
             papers = self.extract_tables(papers, pdf_dir, tab_dir)

         return papers, ids_none
@@ -1061,7 +1061,7 @@ class Surveyor:
         for p in papers:
             if p['id'] == pid:
                 return p
-        self.print_fn("\n-paper not found by file, \nfile: "+file+"\nall papers: "+', '.join([p['id'] for p in papers]))
+        self.print_fn("\n- paper not found by file, \nfile: "+file+"\nall papers: "+', '.join([p['id'] for p in papers]))


     def alpha_length(self, s):
@@ -1195,7 +1195,7 @@ class Surveyor:
             else:
                 discarded_ids.append(urlparse(result.entry_id).path.split('/')[-1].split('v')[0])

-        self.print_fn("\n-Papers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))
+        self.print_fn("\n- Papers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))

         return results, searched_papers

@@ -1354,10 +1354,10 @@ class Surveyor:
         if (query is None) and (id_list is None):
             raise ValueError('please provide a base to survey on: list of arxiv IDs or a few research keywords')
         # arxiv api relevance search and data preparation
-        self.print_fn("\n-searching arXiv for top 100 papers.. ")
+        self.print_fn("\n- searching arXiv for top 100 papers.. ")
         results, searched_papers = self.search(query, id_list, max_search=max_search)
         joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
-        self.print_fn("\n-found " + str(len(searched_papers)) + " papers")
+        self.print_fn("\n- found " + str(len(searched_papers)) + " papers")

         # paper selection by scibert vector embedding relevance scores
         # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
@@ -1370,23 +1370,23 @@ class Surveyor:

         joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')

-        self.print_fn("\n-Standardizing known section headings per paper.. ")
+        self.print_fn("\n- Standardizing known section headings per paper.. ")
         papers_standardized = self.standardize_headings(papers_highlighted)
         joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')

-        self.print_fn("\n-Building paper-wise corpus.. ")
+        self.print_fn("\n- Building paper-wise corpus.. ")
         corpus = self.build_corpus(papers_highlighted, searched_papers)
         joblib.dump(corpus, self.dump_dir + 'corpus.dmp')

-        self.print_fn("\n-Building section-wise corpus.. ")
+        self.print_fn("\n- Building section-wise corpus.. ")
         corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
         joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')

-        self.print_fn("\n-Building basic research highlights.. ")
+        self.print_fn("\n- Building basic research highlights.. ")
         research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
         joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')

-        self.print_fn("\n-Reducing corpus to lines.. ")
+        self.print_fn("\n- Reducing corpus to lines.. ")
         corpus_lines = self.get_corpus_lines(corpus)
         joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')

@@ -1420,7 +1420,7 @@ class Surveyor:
         '''
         # self.print_fn("corpus types:"+ str(np.unique([type(txt) for k,txt in corpus.items()])))

-        self.print_fn("\n-Building abstract.. ")
+        self.print_fn("\n- Building abstract.. ")
         abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
         joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
         '''
@@ -1429,7 +1429,7 @@ class Surveyor:
         self.print_fn(abstract_block)
         '''

-        self.print_fn("\n-Building introduction.. ")
+        self.print_fn("\n- Building introduction.. ")
         intro_block = self.get_intro(corpus_sectionwise, research_blocks)
         joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
         '''
@@ -1437,7 +1437,7 @@ class Surveyor:
         self.print_fn("intro_block:")
         self.print_fn(intro_block)
         '''
-        self.print_fn("\n-Building custom sections.. ")
+        self.print_fn("\n- Building custom sections.. ")
         clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
         joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
         joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')
@@ -1455,7 +1455,7 @@ class Surveyor:
         clustered_sections['introduction'] = intro_block
         joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')

-        self.print_fn("\n-Building conclusion.. ")
+        self.print_fn("\n- Building conclusion.. ")
         conclusion_block = self.get_conclusion(clustered_sections)
         joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
         clustered_sections['conclusion'] = conclusion_block
@@ -1477,7 +1477,7 @@ class Surveyor:
         shutil.copy(self.dump_dir + survey_file, survey_file)
         assert (os.path.exists(survey_file))
         output_zip = self.zip_outputs(self.dump_dir, query)
-        self.print_fn("\n-Survey complete.. \nSurvey file path :" + os.path.abspath(
+        self.print_fn("\n- Survey complete.. \nSurvey file path :" + os.path.abspath(
             survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))

         return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)
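All 24 changed lines make the same cosmetic fix: a space after the leading hyphen in each progress message, so the console log reads as a bulleted list. A minimal before/after sketch, assuming print_fn is a plain print wrapper (an assumption here) and using a hypothetical models_dir value:

# Sketch of the change's effect on log output.
print_fn = print                       # assumption: plain print wrapper
models_dir = "saved_models"            # hypothetical value for illustration

# Before this commit: the hyphen runs into the first word.
print_fn("\n-Initializing from previously saved models at" + models_dir)
# Output: -Initializing from previously saved models atsaved_models

# After this commit: the "- " prefix renders as a list bullet. The missing
# space before the path ("at" + models_dir) is untouched by this commit.
print_fn("\n- Initializing from previously saved models at" + models_dir)
# Output: - Initializing from previously saved models atsaved_models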
 