sidphbot committed
Commit 440f11d
1 parent: adf54e4
Files changed (2)
  1. app.py +1 -1
  2. src/Surveyor.py +30 -31
app.py CHANGED
@@ -28,7 +28,6 @@ def run_survey(surveyor, research_keywords, max_search, num_papers):
 
 
 def survey_space(surveyor):
-    st.container().title('Auto-Research V0.1 - Automated Survey generation from research keywords')
     form = st.sidebar.form(key='survey_form')
     research_keywords = form.text_input("What would you like to research in today?")
     max_search = form.number_input("num_papers_to_search", help="maximium number of papers to glance through - defaults to 20",
@@ -42,6 +41,7 @@ def survey_space(surveyor):
 
 
 if __name__ == '__main__':
+    st.container().title('Auto-Research V0.1 - Automated Survey generation from research keywords')
     global surveyor
     surveyor_obj = Surveyor(print_fn=st.write)
     survey_space(surveyor_obj)
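Net effect in app.py: the title is now drawn once in `__main__`, before `Surveyor(print_fn=st.write)` is constructed, so the status lines the constructor writes to the page (e.g. the Torch_device message in the Surveyor.py hunks below) land under the title rather than above it. A minimal sketch of that run order, with a hypothetical `slow_init` standing in for the Surveyor constructor:

```python
import streamlit as st

# Render the page title first, once per script run, as the reworked app.py does.
st.container().title('My App')

def slow_init():
    # Stand-in for Surveyor.__init__, which reports progress via print_fn;
    # with print_fn=st.write these lines land below the already-drawn title.
    st.write('Torch_device: cpu')

if __name__ == '__main__':
    slow_init()
```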
src/Surveyor.py CHANGED
@@ -75,11 +75,10 @@ class Surveyor:
         self.print_fn = print
         if print_fn is not None:
             self.print_fn = print_fn
-
+
         self.torch_device = 'cpu'
         self.print_fn("\nTorch_device: " + self.torch_device)
         if torch.cuda.is_available():
-            self.print_fn("\nloading defaults for gpu")
             self.torch_device = 'cuda'
             spacy.require_gpu()
 
@@ -153,7 +152,7 @@ class Surveyor:
         if not no_save_models:
             self.embedder.save(models_dir + "/embedder")
         else:
-            self.print_fn("\nInitializing from previously saved models at" + models_dir)
+            self.print_fn("\n-Initializing from previously saved models at" + models_dir)
             self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
             self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
             self.title_model.eval()
@@ -235,9 +234,9 @@ class Surveyor:
 
         papers = papers_meta[:self.num_papers]
         selected_papers = papers
-        self.print_fn("\nFirst stage paper collection...")
+        self.print_fn("\n-First stage paper collection...")
         ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
-        self.print_fn("\nFirst stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
+        self.print_fn("\n-First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
         new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
         _ = self.get_freq_cited(cites)
         '''
@@ -248,16 +247,16 @@ class Surveyor:
         new_papers.extend(new_searched_papers)
         '''
         selected_papers.extend(new_papers)
-        self.print_fn("\nSecond stage paper collection...")
+        self.print_fn("\n-Second stage paper collection...")
         _, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
-        self.print_fn("\nSecond stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
+        self.print_fn("\n-Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
         papers.extend(new_papers)
 
         joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp')
         copy_tree(img_dir, dump_dir + os.path.basename(img_dir))
         copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir))
 
-        self.print_fn("\nExtracting section-wise highlights.. ")
+        self.print_fn("\n-Extracting section-wise highlights.. ")
         papers = self.extract_highlights(papers)
 
         return papers, selected_papers
@@ -270,7 +269,7 @@ class Surveyor:
             [cites_list.append(val) for val in v]
         cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
         sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
-        self.print_fn("\nThe most cited paper ids are:\n" + str(sorted_cites))
+        self.print_fn("\n-The most cited paper ids are:\n" + str(sorted_cites))
 
         return sorted_cites.keys()
 
@@ -333,11 +332,11 @@ class Surveyor:
     def build_doc(self, research_sections, papers, query=None, filename='survey.txt'):
 
         import arxiv2bib
-        self.print_fn("\nbuilding bibliography entries.. ")
+        self.print_fn("\n-building bibliography entries.. ")
         bibentries = arxiv2bib.arxiv2bib([p['id'] for p in papers])
         bibentries = [r.bibtex() for r in bibentries]
 
-        self.print_fn("\nbuilding final survey file .. at "+ filename)
+        self.print_fn("\n-building final survey file .. at "+ filename)
         file = open(filename, 'w+')
         if query is None:
             query = 'Internal(existing) research'
@@ -768,7 +767,7 @@ class Surveyor:
         res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, )
         res_doc = self.nlp(res)
         res_lines = set([str(sent) for sent in list(res_doc.sents)])
-        # self.print_fn("\n".join(res_sents))
+        # self.print_fn("\n-".join(res_sents))
         with torch.no_grad():
             keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english')
             keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])),
@@ -794,14 +793,14 @@ class Surveyor:
         return papers
 
     def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False):
-        self.print_fn("\nextracting sections.. ")
+        self.print_fn("\n-extracting sections.. ")
         papers, ids_none = self.extract_parts(papers, txt_dir, dump_dir)
 
-        self.print_fn("\nextracting images.. for future correlation use-cases ")
+        self.print_fn("\n-extracting images.. for future correlation use-cases ")
         papers = self.extract_images(papers, pdf_dir, img_dir)
 
         if tables:
-            self.print_fn("\nextracting tables.. for future correlation use-cases ")
+            self.print_fn("\n-extracting tables.. for future correlation use-cases ")
             papers = self.extract_tables(papers, pdf_dir, tab_dir)
 
         return papers, ids_none
@@ -1057,7 +1056,7 @@ class Surveyor:
         for p in papers:
             if p['id'] == pid:
                 return p
-        self.print_fn("\npaper not found by file, \nfile: "+file+"\nall papers: "+', '.join([p['id'] for p in papers]))
+        self.print_fn("\n-paper not found by file, \nfile: "+file+"\nall papers: "+', '.join([p['id'] for p in papers]))
 
 
     def alpha_length(self, s):
@@ -1191,7 +1190,7 @@ class Surveyor:
             else:
                 discarded_ids.append(urlparse(result.entry_id).path.split('/')[-1].split('v')[0])
 
-        self.print_fn("\nPapers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))
+        self.print_fn("\n-Papers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))
 
         return results, searched_papers
 
@@ -1199,7 +1198,7 @@ class Surveyor:
         import arxiv
         from urllib.parse import urlparse
         ids = [p['id'] for p in papers]
-        self.print_fn("\ndownloading below selected papers: ")
+        self.print_fn("\n-downloading below selected papers: ")
         self.print_fn(ids)
         # asert(False)
         papers_filtered = arxiv.Search(id_list=ids).get()
@@ -1242,7 +1241,7 @@ class Surveyor:
 
 
         cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
-        self.print_fn("\ncitation-network: ")
+        self.print_fn("\n-citation-network: ")
         self.print_fn(cites)
 
         for p in papers:
@@ -1354,10 +1353,10 @@ class Surveyor:
         if not num_papers:
            num_papers = self.DEFAULTS['num_papers']
         # arxiv api relevance search and data preparation
-        self.print_fn("\nsearching arXiv for top 100 papers.. ")
+        self.print_fn("\n-searching arXiv for top 100 papers.. ")
         results, searched_papers = self.search(query, max_search=max_search)
         joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
-        self.print_fn("\nfound " + str(len(searched_papers)) + " papers")
+        self.print_fn("\n-found " + str(len(searched_papers)) + " papers")
 
         # paper selection by scibert vector embedding relevance scores
         # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
@@ -1370,23 +1369,23 @@ class Surveyor:
 
         joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')
 
-        self.print_fn("\nStandardizing known section headings per paper.. ")
+        self.print_fn("\n-Standardizing known section headings per paper.. ")
         papers_standardized = self.standardize_headings(papers_highlighted)
         joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')
 
-        self.print_fn("\nBuilding paper-wise corpus.. ")
+        self.print_fn("\n-Building paper-wise corpus.. ")
         corpus = self.build_corpus(papers_highlighted, searched_papers)
         joblib.dump(corpus, self.dump_dir + 'corpus.dmp')
 
-        self.print_fn("\nBuilding section-wise corpus.. ")
+        self.print_fn("\n-Building section-wise corpus.. ")
         corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
         joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')
 
-        self.print_fn("\nBuilding basic research highlights.. ")
+        self.print_fn("\n-Building basic research highlights.. ")
         research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
         joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')
 
-        self.print_fn("\nReducing corpus to lines.. ")
+        self.print_fn("\n-Reducing corpus to lines.. ")
         corpus_lines = self.get_corpus_lines(corpus)
         joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')
 
@@ -1420,7 +1419,7 @@ class Surveyor:
         '''
         # self.print_fn("corpus types:"+ str(np.unique([type(txt) for k,txt in corpus.items()])))
 
-        self.print_fn("\nBuilding abstract.. ")
+        self.print_fn("\n-Building abstract.. ")
         abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
         joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
         '''
@@ -1429,7 +1428,7 @@ class Surveyor:
         self.print_fn(abstract_block)
         '''
 
-        self.print_fn("\nBuilding introduction.. ")
+        self.print_fn("\n-Building introduction.. ")
         intro_block = self.get_intro(corpus_sectionwise, research_blocks)
         joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
         '''
@@ -1437,7 +1436,7 @@ class Surveyor:
         self.print_fn("intro_block:")
         self.print_fn(intro_block)
         '''
-        self.print_fn("\nBuilding custom sections.. ")
+        self.print_fn("\n-Building custom sections.. ")
         clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
         joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
         joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')
@@ -1455,7 +1454,7 @@ class Surveyor:
         clustered_sections['introduction'] = intro_block
         joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')
 
-        self.print_fn("\nBuilding conclusion.. ")
+        self.print_fn("\n-Building conclusion.. ")
         conclusion_block = self.get_conclusion(clustered_sections)
         joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
         clustered_sections['conclusion'] = conclusion_block
@@ -1472,7 +1471,7 @@ class Surveyor:
         shutil.copy(self.dump_dir + survey_file, survey_file)
         assert (os.path.exists(survey_file))
         output_zip = self.zip_outputs(self.dump_dir, query)
-        self.print_fn("\nSurvey complete.. \nSurvey file path :" + os.path.abspath(
+        self.print_fn("\n-Survey complete.. \nSurvey file path :" + os.path.abspath(
             survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))
 
         return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)
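Taken together, the Surveyor.py hunks are one mechanical change: each status message gains a "-" right after its leading newline, presumably as a visual marker so the messages read as dashed progress lines when routed through the injected print_fn (wired to st.write in app.py). A minimal sketch of that injection pattern, with a hypothetical Reporter class standing in for Surveyor (the real constructor takes many more arguments; only the fallback logic below appears in the diff):

```python
# Minimal sketch of the print_fn injection that the "\n-" prefixes rely on.
class Reporter:
    def __init__(self, print_fn=None):
        # Default to the built-in print for CLI runs ...
        self.print_fn = print
        # ... but accept any callable, e.g. st.write from app.py.
        if print_fn is not None:
            self.print_fn = print_fn

    def run(self):
        # Status messages carry the new "\n-" prefix from this commit.
        self.print_fn("\n-searching arXiv for top 100 papers.. ")
        self.print_fn("\n-found 20 papers")

if __name__ == '__main__':
    Reporter().run()  # stdout here; Surveyor(print_fn=st.write) routes the same messages to the page
```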