log fix

src/Surveyor.py  +24 -24  CHANGED
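The change is uniform across the file: every `self.print_fn` progress message that began with "\n-" now begins with "\n- ", so the leading dash reads as a bullet instead of fusing with the first word of the message. A minimal sketch of the effect, assuming `print_fn` simply forwards to Python's built-in `print` (an assumption; this diff does not show how `print_fn` is wired):

# Sketch only: `log = print` is an assumed stand-in for Surveyor's
# print_fn, which is not defined in this diff.
log = print

log("\n-searching arXiv for top 100 papers.. ")   # before: "-searching arXiv..."
log("\n- searching arXiv for top 100 papers.. ")  # after:  "- searching arXiv..."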
@@ -159,7 +159,7 @@ class Surveyor:
         if not no_save_models:
             self.embedder.save(models_dir + "/embedder")
         else:
-            self.print_fn("\n-Initializing from previously saved models at" + models_dir)
+            self.print_fn("\n- Initializing from previously saved models at" + models_dir)
             self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
             self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
             self.title_model.eval()
@@ -245,7 +245,7 @@ class Surveyor:
         papers = papers_meta[:self.num_papers]
         selected_papers = papers
         ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
-        self.print_fn("\n-First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
+        self.print_fn("\n- First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
         new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
         # _ = self.get_freq_cited(cites)
         '''
@@ -257,14 +257,14 @@ class Surveyor:
         '''
         selected_papers.extend(new_papers)
         _, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
-        self.print_fn("\n-Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
+        self.print_fn("\n- Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
         papers.extend(new_papers)

         joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp')
         copy_tree(img_dir, dump_dir + os.path.basename(img_dir))
         copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir))

-        self.print_fn("\n-Extracting section-wise highlights.. ")
+        self.print_fn("\n- Extracting section-wise highlights.. ")
         papers = self.extract_highlights(papers)

         return papers, selected_papers, cites
@@ -339,11 +339,11 @@ class Surveyor:
     def build_doc(self, research_sections, papers, query=None, filename='survey.txt'):

         import arxiv2bib
-        self.print_fn("\n-building bibliography entries.. ")
+        self.print_fn("\n- building bibliography entries.. ")
         bibentries = arxiv2bib.arxiv2bib([p['id'] for p in papers])
         bibentries = [r.bibtex() for r in bibentries]

-        self.print_fn("\n-building final survey file .. at "+ filename)
+        self.print_fn("\n- building final survey file .. at "+ filename)
         file = open(filename, 'w+')
         if query is None:
             query = 'Internal(existing) research'
@@ -772,7 +772,7 @@ class Surveyor:
         res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, )
         res_doc = self.nlp(res)
         res_lines = set([str(sent) for sent in list(res_doc.sents)])
-        # self.print_fn("\n-".join(res_sents))
+        # self.print_fn("\n- ".join(res_sents))
         with torch.no_grad():
             keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english')
             keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])),
@@ -798,14 +798,14 @@ class Surveyor:
         return papers

     def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False):
-        self.print_fn("\n-extracting sections.. ")
+        self.print_fn("\n- extracting sections.. ")
         papers, ids_none = self.extract_parts(papers, txt_dir, dump_dir)

-        self.print_fn("\n-extracting images.. for future correlation use-cases ")
+        self.print_fn("\n- extracting images.. for future correlation use-cases ")
         papers = self.extract_images(papers, pdf_dir, img_dir)

         if tables:
-            self.print_fn("\n-extracting tables.. for future correlation use-cases ")
+            self.print_fn("\n- extracting tables.. for future correlation use-cases ")
             papers = self.extract_tables(papers, pdf_dir, tab_dir)

         return papers, ids_none
@@ -1061,7 +1061,7 @@ class Surveyor:
         for p in papers:
             if p['id'] == pid:
                 return p
-        self.print_fn("\n-paper not found by file, \nfile: "+file+"\nall papers: "+', '.join([p['id'] for p in papers]))
+        self.print_fn("\n- paper not found by file, \nfile: "+file+"\nall papers: "+', '.join([p['id'] for p in papers]))


     def alpha_length(self, s):
@@ -1195,7 +1195,7 @@ class Surveyor:
         else:
             discarded_ids.append(urlparse(result.entry_id).path.split('/')[-1].split('v')[0])

-        self.print_fn("\n-Papers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))
+        self.print_fn("\n- Papers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))

         return results, searched_papers

@@ -1354,10 +1354,10 @@ class Surveyor:
         if (query is None) and (id_list is None):
             raise ValueError('please provide a base to survey on: list of arxiv IDs or a few research keywords')
         # arxiv api relevance search and data preparation
-        self.print_fn("\n-searching arXiv for top 100 papers.. ")
+        self.print_fn("\n- searching arXiv for top 100 papers.. ")
         results, searched_papers = self.search(query, id_list, max_search=max_search)
         joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
-        self.print_fn("\n-found " + str(len(searched_papers)) + " papers")
+        self.print_fn("\n- found " + str(len(searched_papers)) + " papers")

         # paper selection by scibert vector embedding relevance scores
         # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
@@ -1370,23 +1370,23 @@ class Surveyor:

         joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')

-        self.print_fn("\n-Standardizing known section headings per paper.. ")
+        self.print_fn("\n- Standardizing known section headings per paper.. ")
         papers_standardized = self.standardize_headings(papers_highlighted)
         joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')

-        self.print_fn("\n-Building paper-wise corpus.. ")
+        self.print_fn("\n- Building paper-wise corpus.. ")
         corpus = self.build_corpus(papers_highlighted, searched_papers)
         joblib.dump(corpus, self.dump_dir + 'corpus.dmp')

-        self.print_fn("\n-Building section-wise corpus.. ")
+        self.print_fn("\n- Building section-wise corpus.. ")
         corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
         joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')

-        self.print_fn("\n-Building basic research highlights.. ")
+        self.print_fn("\n- Building basic research highlights.. ")
         research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
         joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')

-        self.print_fn("\n-Reducing corpus to lines.. ")
+        self.print_fn("\n- Reducing corpus to lines.. ")
         corpus_lines = self.get_corpus_lines(corpus)
         joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')

@@ -1420,7 +1420,7 @@ class Surveyor:
         '''
         # self.print_fn("corpus types:"+ str(np.unique([type(txt) for k,txt in corpus.items()])))

-        self.print_fn("\n-Building abstract.. ")
+        self.print_fn("\n- Building abstract.. ")
         abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
         joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
         '''
@@ -1429,7 +1429,7 @@ class Surveyor:
         self.print_fn(abstract_block)
         '''

-        self.print_fn("\n-Building introduction.. ")
+        self.print_fn("\n- Building introduction.. ")
         intro_block = self.get_intro(corpus_sectionwise, research_blocks)
         joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
         '''
@@ -1437,7 +1437,7 @@ class Surveyor:
         self.print_fn("intro_block:")
         self.print_fn(intro_block)
         '''
-        self.print_fn("\n-Building custom sections.. ")
+        self.print_fn("\n- Building custom sections.. ")
         clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
         joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
         joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')
@@ -1455,7 +1455,7 @@ class Surveyor:
         clustered_sections['introduction'] = intro_block
         joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')

-        self.print_fn("\n-Building conclusion.. ")
+        self.print_fn("\n- Building conclusion.. ")
         conclusion_block = self.get_conclusion(clustered_sections)
         joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
         clustered_sections['conclusion'] = conclusion_block
@@ -1477,7 +1477,7 @@ class Surveyor:
         shutil.copy(self.dump_dir + survey_file, survey_file)
         assert (os.path.exists(survey_file))
         output_zip = self.zip_outputs(self.dump_dir, query)
-        self.print_fn("\n-Survey complete.. \nSurvey file path :" + os.path.abspath(
+        self.print_fn("\n- Survey complete.. \nSurvey file path :" + os.path.abspath(
             survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))

         return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)