sidphbot committed
Commit
5e33e5d
1 Parent(s): 8aa52c4

UI changes

Files changed (1):
  src/Surveyor.py (+13 -19)
src/Surveyor.py CHANGED
@@ -244,11 +244,10 @@ class Surveyor:
 
         papers = papers_meta[:self.num_papers]
         selected_papers = papers
-        self.print_fn("\n-First stage paper collection...")
         ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
         self.print_fn("\n-First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
         new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
-        _ = self.get_freq_cited(cites)
+        # _ = self.get_freq_cited(cites)
         '''
         filtered_idlist = []
         for c in self.get_freq_cited(cites):
@@ -257,7 +256,6 @@ class Surveyor:
         new_papers.extend(new_searched_papers)
         '''
         selected_papers.extend(new_papers)
-        self.print_fn("\n-Second stage paper collection...")
         _, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
         self.print_fn("\n-Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
         papers.extend(new_papers)
@@ -269,7 +267,7 @@ class Surveyor:
         self.print_fn("\n-Extracting section-wise highlights.. ")
         papers = self.extract_highlights(papers)
 
-        return papers, selected_papers
+        return papers, selected_papers, cites
 
 
     def get_freq_cited(self, cites_dict, k=5):
@@ -279,7 +277,6 @@ class Surveyor:
             [cites_list.append(val) for val in v]
         cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
         sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
-        self.print_fn("\n-The most cited paper ids are:\n" + str(sorted_cites))
 
         return sorted_cites.keys()
 
@@ -732,7 +729,7 @@ class Surveyor:
             score = self.text_para_similarity(query, highlights)
             scores.append(score)
             pids.append(id)
-            self.print_fn("corpus item: " + str(self.get_by_pid(id, papers)['title']))
+            # self.print_fn("corpus item: " + str(self.get_by_pid(id, papers)['title']))
 
         idx = np.argsort(scores)[:num_papers]
         #for i in range(len(scores)):
@@ -747,12 +744,12 @@ class Surveyor:
         for p in papers_selected:
             self.print_fn("Selected Paper: " + p['title'])
 
-        self.print_fn("constrast with natural selection: forward")
-        for p in papers[:4]:
-            self.print_fn("Selected Paper: " + p['title'])
-        self.print_fn("constrast with natural selection: backward")
-        for p in papers[-4:]:
-            self.print_fn("Selected Paper: " + p['title'])
+        #self.print_fn("constrast with natural selection: forward")
+        #for p in papers[:4]:
+        #    self.print_fn("Selected Paper: " + p['title'])
+        #self.print_fn("constrast with natural selection: backward")
+        #for p in papers[-4:]:
+        #    self.print_fn("Selected Paper: " + p['title'])
         # arxiv search producing better relevnce
         return papers_selected
 
@@ -1205,8 +1202,6 @@ class Surveyor:
         import arxiv
         from urllib.parse import urlparse
         ids = [p['id'] for p in papers]
-        self.print_fn("\n-downloading below selected papers: ")
-        self.print_fn(ids)
         # asert(False)
         papers_filtered = arxiv.Search(id_list=ids).get()
         for p in papers_filtered:
@@ -1219,7 +1214,6 @@ class Surveyor:
         import arxiv
         from urllib.parse import urlparse
         ids = [p['id'] for p in papers]
-        self.print_fn(ids)
         # asert(False)
         papers_filtered = arxiv.Search(id_list=ids).get()
         for p in papers_filtered:
@@ -1246,10 +1240,7 @@ class Surveyor:
     def cocitation_network(self, papers, txt_dir):
         import multiprocessing
 
-
         cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
-        self.print_fn("\n-citation-network: ")
-        self.print_fn(cites)
 
         for p in papers:
             p['cites'] = cites[p['id']]
@@ -1370,7 +1361,7 @@ class Surveyor:
         # paper selection by scibert vector embedding relevance scores
         # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
 
-        papers_highlighted, papers_selected = self.pdf_route(self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir,
+        papers_highlighted, papers_selected, cites = self.pdf_route(self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir,
                                                              searched_papers)
 
         if weigh_authors:
@@ -1478,6 +1469,9 @@ class Surveyor:
         survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
         self.build_doc(clustered_sections, papers_standardized, query=query, filename=self.dump_dir + survey_file)
 
+        self.survey_print_fn("\n-citation-network: ")
+        self.survey_print_fn(cites)
+
         shutil.copytree('arxiv_data/', self.dump_dir + '/arxiv_data/')
         shutil.copy(self.dump_dir + survey_file, survey_file)
         assert (os.path.exists(survey_file))
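Note on the get_freq_cited hunk (old line 282): counting with list.count inside a dict comprehension is quadratic in the number of citations, and the k parameter is ignored in favour of the hardcoded [:5]. A minimal equivalent sketch using collections.Counter, written as a free function for illustration; only the name, arguments, and return shape come from the diff:

from collections import Counter

def get_freq_cited(cites_dict, k=5):
    # Flatten every citation list and count occurrences in a single pass.
    freqs = Counter(cite for cited in cites_dict.values() for cite in cited)
    # most_common(k) honours k, unlike the hardcoded [:5] in the original.
    return [cite for cite, _ in freqs.most_common(k)]

# 'x' is cited twice, so it ranks first.
assert get_freq_cited({'a': ['x', 'y'], 'b': ['x']}, k=2) == ['x', 'y']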
 
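Note on the selection hunk (old line 735): idx = np.argsort(scores)[:num_papers] keeps the papers with the lowest scores, which is right only if text_para_similarity returns a distance. If it returns a similarity (higher = more relevant; the diff does not show which), the slice needs reversing. A self-contained illustration with made-up scores:

import numpy as np

scores = np.array([0.12, 0.87, 0.45, 0.91])  # hypothetical scores
num_papers = 2

lowest = np.argsort(scores)[:num_papers]         # what the code keeps: [0 2]
highest = np.argsort(scores)[::-1][:num_papers]  # top-k if higher is better: [3 1]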
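Taken together with the commit message, the hunks reroute the co-citation output from mid-run console logging to one UI-facing dump at the end: pdf_route now returns cites, the caller unpacks it, and the network is printed through survey_print_fn after the survey file is built. A flow sketch with bodies elided; method names other than pdf_route, survey_print_fn, and cocitation_network are not shown in the diff and are assumptions:

class Surveyor:
    def pdf_route(self, pdf_dir, txt_dir, img_dir, tab_dir, dump_dir, papers_meta):
        ...  # fetch papers, extract highlights, build the co-citation network
        return papers, selected_papers, cites  # cites now flows back to the caller

    def run(self, searched_papers):  # hypothetical caller
        papers_highlighted, papers_selected, cites = self.pdf_route(
            self.pdf_dir, self.txt_dir, self.img_dir,
            self.tab_dir, self.dump_dir, searched_papers)
        ...  # cluster sections, build the survey document
        self.survey_print_fn("\n-citation-network: ")  # surfaced in the UI at the end
        self.survey_print_fn(cites)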