Spaces:
Build error
Build error
UI changes
Browse files- src/Surveyor.py +13 -19
src/Surveyor.py
CHANGED
@@ -244,11 +244,10 @@ class Surveyor:
|
|
244 |
|
245 |
papers = papers_meta[:self.num_papers]
|
246 |
selected_papers = papers
|
247 |
-
self.print_fn("\n-First stage paper collection...")
|
248 |
ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
|
249 |
self.print_fn("\n-First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
|
250 |
new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
|
251 |
-
_ = self.get_freq_cited(cites)
|
252 |
'''
|
253 |
filtered_idlist = []
|
254 |
for c in self.get_freq_cited(cites):
|
@@ -257,7 +256,6 @@ class Surveyor:
|
|
257 |
new_papers.extend(new_searched_papers)
|
258 |
'''
|
259 |
selected_papers.extend(new_papers)
|
260 |
-
self.print_fn("\n-Second stage paper collection...")
|
261 |
_, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
|
262 |
self.print_fn("\n-Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
|
263 |
papers.extend(new_papers)
|
@@ -269,7 +267,7 @@ class Surveyor:
|
|
269 |
self.print_fn("\n-Extracting section-wise highlights.. ")
|
270 |
papers = self.extract_highlights(papers)
|
271 |
|
272 |
-
return papers, selected_papers
|
273 |
|
274 |
|
275 |
def get_freq_cited(self, cites_dict, k=5):
|
@@ -279,7 +277,6 @@ class Surveyor:
|
|
279 |
[cites_list.append(val) for val in v]
|
280 |
cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
|
281 |
sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
|
282 |
-
self.print_fn("\n-The most cited paper ids are:\n" + str(sorted_cites))
|
283 |
|
284 |
return sorted_cites.keys()
|
285 |
|
@@ -732,7 +729,7 @@ class Surveyor:
|
|
732 |
score = self.text_para_similarity(query, highlights)
|
733 |
scores.append(score)
|
734 |
pids.append(id)
|
735 |
-
self.print_fn("corpus item: " + str(self.get_by_pid(id, papers)['title']))
|
736 |
|
737 |
idx = np.argsort(scores)[:num_papers]
|
738 |
#for i in range(len(scores)):
|
@@ -747,12 +744,12 @@ class Surveyor:
|
|
747 |
for p in papers_selected:
|
748 |
self.print_fn("Selected Paper: " + p['title'])
|
749 |
|
750 |
-
self.print_fn("constrast with natural selection: forward")
|
751 |
-
for p in papers[:4]:
|
752 |
-
|
753 |
-
self.print_fn("constrast with natural selection: backward")
|
754 |
-
for p in papers[-4:]:
|
755 |
-
|
756 |
# arxiv search producing better relevance
|
757 |
return papers_selected
|
758 |
|
@@ -1205,8 +1202,6 @@ class Surveyor:
|
|
1205 |
import arxiv
|
1206 |
from urllib.parse import urlparse
|
1207 |
ids = [p['id'] for p in papers]
|
1208 |
-
self.print_fn("\n-downloading below selected papers: ")
|
1209 |
-
self.print_fn(ids)
|
1210 |
# asert(False)
|
1211 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
1212 |
for p in papers_filtered:
|
@@ -1219,7 +1214,6 @@ class Surveyor:
|
|
1219 |
import arxiv
|
1220 |
from urllib.parse import urlparse
|
1221 |
ids = [p['id'] for p in papers]
|
1222 |
-
self.print_fn(ids)
|
1223 |
# asert(False)
|
1224 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
1225 |
for p in papers_filtered:
|
@@ -1246,10 +1240,7 @@ class Surveyor:
|
|
1246 |
def cocitation_network(self, papers, txt_dir):
|
1247 |
import multiprocessing
|
1248 |
|
1249 |
-
|
1250 |
cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
|
1251 |
-
self.print_fn("\n-citation-network: ")
|
1252 |
-
self.print_fn(cites)
|
1253 |
|
1254 |
for p in papers:
|
1255 |
p['cites'] = cites[p['id']]
|
@@ -1370,7 +1361,7 @@ class Surveyor:
|
|
1370 |
# paper selection by scibert vector embedding relevance scores
|
1371 |
# papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
|
1372 |
|
1373 |
-
papers_highlighted, papers_selected = self.pdf_route(self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir,
|
1374 |
searched_papers)
|
1375 |
|
1376 |
if weigh_authors:
|
@@ -1478,6 +1469,9 @@ class Surveyor:
|
|
1478 |
survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
|
1479 |
self.build_doc(clustered_sections, papers_standardized, query=query, filename=self.dump_dir + survey_file)
|
1480 |
|
|
|
|
|
|
|
1481 |
shutil.copytree('arxiv_data/', self.dump_dir + '/arxiv_data/')
|
1482 |
shutil.copy(self.dump_dir + survey_file, survey_file)
|
1483 |
assert (os.path.exists(survey_file))
|
|
|
244 |
|
245 |
papers = papers_meta[:self.num_papers]
|
246 |
selected_papers = papers
|
|
|
247 |
ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
|
248 |
self.print_fn("\n-First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
|
249 |
new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
|
250 |
+
# _ = self.get_freq_cited(cites)
|
251 |
'''
|
252 |
filtered_idlist = []
|
253 |
for c in self.get_freq_cited(cites):
|
|
|
256 |
new_papers.extend(new_searched_papers)
|
257 |
'''
|
258 |
selected_papers.extend(new_papers)
|
|
|
259 |
_, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
|
260 |
self.print_fn("\n-Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
|
261 |
papers.extend(new_papers)
|
|
|
267 |
self.print_fn("\n-Extracting section-wise highlights.. ")
|
268 |
papers = self.extract_highlights(papers)
|
269 |
|
270 |
+
return papers, selected_papers, cites
|
271 |
|
272 |
|
273 |
def get_freq_cited(self, cites_dict, k=5):
|
|
|
277 |
[cites_list.append(val) for val in v]
|
278 |
cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
|
279 |
sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
|
|
|
280 |
|
281 |
return sorted_cites.keys()
|
282 |
|
|
|
729 |
score = self.text_para_similarity(query, highlights)
|
730 |
scores.append(score)
|
731 |
pids.append(id)
|
732 |
+
# self.print_fn("corpus item: " + str(self.get_by_pid(id, papers)['title']))
|
733 |
|
734 |
idx = np.argsort(scores)[:num_papers]
|
735 |
#for i in range(len(scores)):
|
|
|
744 |
for p in papers_selected:
|
745 |
self.print_fn("Selected Paper: " + p['title'])
|
746 |
|
747 |
+
#self.print_fn("constrast with natural selection: forward")
|
748 |
+
#for p in papers[:4]:
|
749 |
+
# self.print_fn("Selected Paper: " + p['title'])
|
750 |
+
#self.print_fn("constrast with natural selection: backward")
|
751 |
+
#for p in papers[-4:]:
|
752 |
+
# self.print_fn("Selected Paper: " + p['title'])
|
753 |
# arxiv search producing better relevance
|
754 |
return papers_selected
|
755 |
|
|
|
1202 |
import arxiv
|
1203 |
from urllib.parse import urlparse
|
1204 |
ids = [p['id'] for p in papers]
|
|
|
|
|
1205 |
# asert(False)
|
1206 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
1207 |
for p in papers_filtered:
|
|
|
1214 |
import arxiv
|
1215 |
from urllib.parse import urlparse
|
1216 |
ids = [p['id'] for p in papers]
|
|
|
1217 |
# asert(False)
|
1218 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
1219 |
for p in papers_filtered:
|
|
|
1240 |
def cocitation_network(self, papers, txt_dir):
|
1241 |
import multiprocessing
|
1242 |
|
|
|
1243 |
cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
|
|
|
|
|
1244 |
|
1245 |
for p in papers:
|
1246 |
p['cites'] = cites[p['id']]
|
|
|
1361 |
# paper selection by scibert vector embedding relevance scores
|
1362 |
# papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
|
1363 |
|
1364 |
+
papers_highlighted, papers_selected, cites = self.pdf_route(self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir,
|
1365 |
searched_papers)
|
1366 |
|
1367 |
if weigh_authors:
|
|
|
1469 |
survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
|
1470 |
self.build_doc(clustered_sections, papers_standardized, query=query, filename=self.dump_dir + survey_file)
|
1471 |
|
1472 |
+
self.survey_print_fn("\n-citation-network: ")
|
1473 |
+
self.survey_print_fn(cites)
|
1474 |
+
|
1475 |
shutil.copytree('arxiv_data/', self.dump_dir + '/arxiv_data/')
|
1476 |
shutil.copy(self.dump_dir + survey_file, survey_file)
|
1477 |
assert (os.path.exists(survey_file))
|