Commit: stdout capture

Files changed:
- app.py (+6 −34)
- src/Surveyor.py (+219 −207)
app.py
CHANGED
@@ -4,38 +4,11 @@ import numpy as np
 
 from src.Surveyor import Surveyor
 
-import contextlib
-from functools import wraps
-from io import StringIO
-
-def capture_output(func):
-    """Capture output from running a function and write using streamlit."""
-
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        # Redirect output to string buffers
-        stdout, stderr = StringIO(), StringIO()
-        try:
-            with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stderr):
-                return func(*args, **kwargs)
-        except Exception as err:
-            st.write(f"Failure while executing: {err}")
-        finally:
-            if _stdout := stdout.getvalue():
-                st.write("Execution stdout:")
-                st.code(_stdout)
-            if _stderr := stderr.getvalue():
-                st.write("Execution stderr:")
-                st.code(_stderr)
-
-    return wrapper
-
 def run_survey(surveyor, research_keywords, max_search, num_papers):
-    [four deleted lines; contents not captured in the page export]
-    )
+    zip_file_name, survey_file_name = surveyor.survey(research_keywords,
+                                                      max_search=max_search,
+                                                      num_papers=num_papers
+                                                      )
 
     with open(str(zip_file_name), "rb") as file:
         btn = st.download_button(
@@ -55,7 +28,7 @@ def run_survey(surveyor, research_keywords, max_search, num_papers):
 
 
 def survey_space(surveyor):
-    st.
+    st.container().title('Auto-Research V0.1 - Automated Survey generation from research keywords')
     form = st.sidebar.form(key='survey_form')
     research_keywords = form.text_input("What would you like to research in today?")
     max_search = form.number_input("num_papers_to_search", help="maximium number of papers to glance through - defaults to 20",
@@ -65,11 +38,10 @@ def survey_space(surveyor):
     submit = form.form_submit_button('Submit')
 
     if submit:
-        st.write("hello")
         run_survey(surveyor, research_keywords, max_search, num_papers)
 
 
 if __name__ == '__main__':
     global surveyor
-    surveyor_obj = Surveyor()
+    surveyor_obj = Surveyor(print_fn=st.write)
    survey_space(surveyor_obj)
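The substance of the app.py change: the deleted capture_output decorator buffered everything the wrapped call wrote to stdout/stderr and only replayed it via st.write/st.code after the call returned, so nothing surfaced while a long survey() ran. Passing print_fn=st.write instead lets the Surveyor stream each progress message to the page the moment it is emitted. A minimal sketch of the injected-logger pattern (Worker is a hypothetical stand-in, not the real Surveyor; only the fallback-to-print logic mirrors Surveyor.__init__ below):

class Worker:
    def __init__(self, print_fn=None):
        # default to the builtin print, as Surveyor.__init__ does below
        self.print_fn = print_fn if print_fn is not None else print

    def run(self, task):
        self.print_fn("starting: " + task)   # goes wherever the caller chose
        self.print_fn("finished: " + task)

Worker().run("demo")                          # messages go to stdout
collected = []
Worker(print_fn=collected.append).run("demo")  # messages land in a list, the
                                               # same way st.write would render
                                               # them on the Streamlit page

Because the logging target is injected, the same Surveyor code logs to the console when run from a shell and to the page when run under Streamlit, with no stream redirection involved.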
src/Surveyor.py
CHANGED
@@ -44,7 +44,8 @@ class Surveyor:
|
|
44 |
kw_model_name=None,
|
45 |
high_gpu=False,
|
46 |
refresh_models=False,
|
47 |
-
no_save_models=False
|
|
|
48 |
):
|
49 |
'''
|
50 |
Initializes models and directory structure for the surveyor
|
@@ -71,10 +72,14 @@ class Surveyor:
|
|
71 |
- num_papers: int maximium number of papers to download and analyse - defaults to 25
|
72 |
|
73 |
'''
|
|
|
|
|
|
|
|
|
74 |
self.torch_device = 'cpu'
|
75 |
-
|
76 |
if torch.cuda.is_available():
|
77 |
-
|
78 |
self.torch_device = 'cuda'
|
79 |
spacy.require_gpu()
|
80 |
|
@@ -109,7 +114,7 @@ class Surveyor:
|
|
109 |
similarity_nlp_name = self.DEFAULTS["similarity_nlp_name"]
|
110 |
|
111 |
if refresh_models or not models_found:
|
112 |
-
|
113 |
if not no_save_models:
|
114 |
self.clean_dirs([models_dir])
|
115 |
|
@@ -148,7 +153,7 @@ class Surveyor:
|
|
148 |
if not no_save_models:
|
149 |
self.embedder.save(models_dir + "/embedder")
|
150 |
else:
|
151 |
-
|
152 |
self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
|
153 |
self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
|
154 |
self.title_model.eval()
|
@@ -230,9 +235,9 @@ class Surveyor:
|
|
230 |
|
231 |
papers = papers_meta[:self.num_papers]
|
232 |
selected_papers = papers
|
233 |
-
|
234 |
ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
|
235 |
-
|
236 |
new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
|
237 |
_ = self.get_freq_cited(cites)
|
238 |
'''
|
@@ -243,16 +248,16 @@ class Surveyor:
|
|
243 |
new_papers.extend(new_searched_papers)
|
244 |
'''
|
245 |
selected_papers.extend(new_papers)
|
246 |
-
|
247 |
_, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
|
248 |
-
|
249 |
papers.extend(new_papers)
|
250 |
|
251 |
joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp')
|
252 |
copy_tree(img_dir, dump_dir + os.path.basename(img_dir))
|
253 |
copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir))
|
254 |
|
255 |
-
|
256 |
papers = self.extract_highlights(papers)
|
257 |
|
258 |
return papers, selected_papers
|
@@ -265,7 +270,7 @@ class Surveyor:
|
|
265 |
[cites_list.append(val) for val in v]
|
266 |
cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
|
267 |
sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
|
268 |
-
|
269 |
|
270 |
return sorted_cites.keys()
|
271 |
|
@@ -275,7 +280,7 @@ class Surveyor:
|
|
275 |
|
276 |
if repeat:
|
277 |
with tempfile.TemporaryDirectory() as dirpath:
|
278 |
-
|
279 |
# full text preparation of selected papers
|
280 |
self.download_pdfs(papers, dirpath)
|
281 |
dirpath_pdfs = os.listdir(dirpath)
|
@@ -283,22 +288,22 @@ class Surveyor:
|
|
283 |
full_file_name = os.path.join(dirpath, file_name)
|
284 |
if os.path.isfile(full_file_name):
|
285 |
shutil.copy(full_file_name, pdf_dir)
|
286 |
-
|
287 |
self.convert_pdfs(dirpath, txt_dir)
|
288 |
else:
|
289 |
-
|
290 |
# full text preparation of selected papers
|
291 |
self.download_pdfs(papers, pdf_dir)
|
292 |
-
|
293 |
self.convert_pdfs(pdf_dir, txt_dir)
|
294 |
# plugging citations to our papers object
|
295 |
-
|
296 |
papers, cites = self.cocitation_network(papers, txt_dir)
|
297 |
joblib.dump(papers, dump_dir + 'papers_selected_pdf_route.dmp')
|
298 |
from distutils.dir_util import copy_tree
|
299 |
copy_tree(txt_dir, dump_dir + os.path.basename(txt_dir))
|
300 |
copy_tree(pdf_dir, dump_dir + os.path.basename(pdf_dir))
|
301 |
-
|
302 |
papers, ids_none = self.extract_structure(papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir)
|
303 |
return ids_none, papers, cites
|
304 |
|
@@ -328,92 +333,92 @@ class Surveyor:
|
|
328 |
def build_doc(self, research_sections, papers, query=None, filename='survey.txt'):
|
329 |
|
330 |
import arxiv2bib
|
331 |
-
|
332 |
bibentries = arxiv2bib.arxiv2bib([p['id'] for p in papers])
|
333 |
bibentries = [r.bibtex() for r in bibentries]
|
334 |
|
335 |
-
|
336 |
file = open(filename, 'w+')
|
337 |
if query is None:
|
338 |
query = 'Internal(existing) research'
|
339 |
file.write("----------------------------------------------------------------------")
|
340 |
file.write("Title: A survey on " + query)
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
file.write("Author: Auto-Research (github.com/sidphbot/Auto-Research)")
|
345 |
-
|
346 |
file.write("Dev: Auto-Research (github.com/sidphbot/Auto-Research)")
|
347 |
-
|
348 |
file.write("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+
|
349 |
"\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+
|
350 |
"\nmined with proper citations. As All of the text is practically quoted texted, hence to "+
|
351 |
"\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+
|
352 |
"\nentries(only to avoid LaTex overhead). ")
|
353 |
-
|
354 |
"\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+
|
355 |
"\nmined with proper citations. As All of the text is practically quoted texted, hence to "+
|
356 |
"\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+
|
357 |
"\nentries(only to avoid LaTex overhead). ")
|
358 |
file.write("----------------------------------------------------------------------")
|
359 |
-
|
360 |
file.write("")
|
361 |
-
|
362 |
file.write('ABSTRACT')
|
363 |
-
|
364 |
-
|
365 |
file.write("=================================================")
|
366 |
file.write("")
|
367 |
-
|
368 |
file.write(research_sections['abstract'])
|
369 |
-
|
370 |
file.write("")
|
371 |
-
|
372 |
file.write('INTRODUCTION')
|
373 |
-
|
374 |
-
|
375 |
file.write("=================================================")
|
376 |
file.write("")
|
377 |
-
|
378 |
file.write(research_sections['introduction'])
|
379 |
-
|
380 |
file.write("")
|
381 |
-
|
382 |
for k, v in research_sections.items():
|
383 |
if k not in ['abstract', 'introduction', 'conclusion']:
|
384 |
file.write(k.upper())
|
385 |
-
|
386 |
-
|
387 |
file.write("=================================================")
|
388 |
file.write("")
|
389 |
-
|
390 |
file.write(v)
|
391 |
-
|
392 |
file.write("")
|
393 |
-
|
394 |
file.write('CONCLUSION')
|
395 |
-
|
396 |
-
|
397 |
file.write("=================================================")
|
398 |
file.write("")
|
399 |
-
|
400 |
file.write(research_sections['conclusion'])
|
401 |
-
|
402 |
file.write("")
|
403 |
-
|
404 |
|
405 |
file.write('REFERENCES')
|
406 |
-
|
407 |
-
|
408 |
file.write("=================================================")
|
409 |
file.write("")
|
410 |
-
|
411 |
for entry in bibentries:
|
412 |
file.write(entry)
|
413 |
-
|
414 |
file.write("")
|
415 |
-
|
416 |
-
|
417 |
file.write("========================XXX=========================")
|
418 |
file.close()
|
419 |
|
@@ -421,14 +426,15 @@ class Surveyor:
|
|
421 |
|
422 |
research_blocks = {}
|
423 |
for head, textarr in corpus_known_sections.items():
|
424 |
-
|
425 |
-
|
|
|
426 |
with torch.no_grad():
|
427 |
summtext = self.model(" ".join([l.lower() for l in textarr]), ratio=0.5)
|
428 |
res = self.nlp(summtext)
|
429 |
res = set([str(sent) for sent in list(res.sents)])
|
430 |
summtext = ''.join([line for line in res])
|
431 |
-
#
|
432 |
research_blocks[head] = summtext
|
433 |
|
434 |
return research_blocks
|
@@ -444,7 +450,8 @@ class Surveyor:
|
|
444 |
sequences = ledmodel.generate(input_ids, global_attention_mask=global_attention_mask).sequences
|
445 |
summary = ledtokenizer.batch_decode(sequences)
|
446 |
'''
|
447 |
-
|
|
|
448 |
inputs = self.ledtokenizer.prepare_seq2seq_batch(longtext, truncation=True, padding='longest',
|
449 |
return_tensors='pt').to(self.torch_device)
|
450 |
with torch.no_grad():
|
@@ -454,7 +461,7 @@ class Surveyor:
|
|
454 |
res = self.nlp(summary[0])
|
455 |
res = set([str(sent) for sent in list(res.sents)])
|
456 |
summtext = ''.join([line for line in res])
|
457 |
-
#
|
458 |
return summtext
|
459 |
|
460 |
def get_abstract(self, abs_lines, corpus_known_sections, research_blocks):
|
@@ -463,7 +470,7 @@ class Surveyor:
|
|
463 |
abs_lines = ""
|
464 |
abs_lines += " ".join([l.lower() for l in corpus_known_sections['abstract']])
|
465 |
abs_lines += research_blocks['abstract']
|
466 |
-
#
|
467 |
|
468 |
try:
|
469 |
return self.abstractive_summary(abs_lines)
|
@@ -475,12 +482,12 @@ class Surveyor:
|
|
475 |
abs_lines = []
|
476 |
types = set()
|
477 |
for k, v in corpus.items():
|
478 |
-
#
|
479 |
types.add(type(v))
|
480 |
abstext = k + '. ' + v.replace('\n', ' ')
|
481 |
abstext = self.nlp(abstext)
|
482 |
abs_lines.extend([str(sent).lower() for sent in list(abstext.sents)])
|
483 |
-
#
|
484 |
# abs_lines = '\n'.join([str(sent) for sent in abs_lines.sents])
|
485 |
return abs_lines
|
486 |
|
@@ -501,9 +508,9 @@ class Surveyor:
|
|
501 |
if p['id'] not in selected_pids:
|
502 |
meta_abs.append(self.generate_title(p['abstract']))
|
503 |
docs.extend(meta_abs)
|
504 |
-
#
|
505 |
-
#
|
506 |
-
#
|
507 |
#assert (len(meta_abs) + len(selected_pids) == len(papers_meta))
|
508 |
assert ('str' in str(type(random.sample(docs, 1)[0])))
|
509 |
return [doc for doc in docs if doc != '']
|
@@ -513,7 +520,8 @@ class Surveyor:
|
|
513 |
from sklearn.cluster import KMeans
|
514 |
# from bertopic import BERTopic
|
515 |
# topic_model = BERTopic(embedding_model=embedder)
|
516 |
-
|
|
|
517 |
corpus_embeddings = self.embedder.encode(abs_lines)
|
518 |
# Normalize the embeddings to unit length
|
519 |
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
|
@@ -533,17 +541,17 @@ class Surveyor:
|
|
533 |
clustered_sentences[cluster_id] = []
|
534 |
'''
|
535 |
if dummy_count < 5:
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
dummy_count += 1
|
540 |
'''
|
541 |
clustered_sentences[cluster_id].append(abs_lines[sentence_id])
|
542 |
|
543 |
# for i, cluster in clustered_sentences.items():
|
544 |
-
#
|
545 |
-
#
|
546 |
-
#
|
547 |
|
548 |
return self.get_clustered_sections(clustered_sentences), clustered_sentences
|
549 |
|
@@ -552,7 +560,8 @@ class Surveyor:
|
|
552 |
from sklearn.cluster import KMeans
|
553 |
# from bertopic import BERTopic
|
554 |
# topic_model = BERTopic(embedding_model=embedder)
|
555 |
-
|
|
|
556 |
abs_lines = self.get_sectioned_docs(papers, papers_meta)
|
557 |
corpus_embeddings = self.embedder.encode(abs_lines)
|
558 |
# Normalize the embeddings to unit length
|
@@ -573,22 +582,23 @@ class Surveyor:
|
|
573 |
clustered_sentences[cluster_id] = []
|
574 |
'''
|
575 |
if dummy_count < 5:
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
dummy_count += 1
|
580 |
'''
|
581 |
clustered_sentences[cluster_id].append(abs_lines[sentence_id])
|
582 |
|
583 |
# for i, cluster in clustered_sentences.items():
|
584 |
-
#
|
585 |
-
#
|
586 |
-
#
|
587 |
|
588 |
return self.get_clustered_sections(clustered_sentences), clustered_sentences
|
589 |
|
590 |
def generate_title(self, longtext):
|
591 |
-
|
|
|
592 |
|
593 |
inputs = self.title_tokenizer.prepare_seq2seq_batch(longtext, truncation=True, padding='longest',
|
594 |
return_tensors='pt').to(self.torch_device)
|
@@ -602,7 +612,7 @@ class Surveyor:
|
|
602 |
def get_clustered_sections(self, clustered_lines):
|
603 |
clusters_dict = {}
|
604 |
for i, cluster in clustered_lines.items():
|
605 |
-
#
|
606 |
try:
|
607 |
clusters_dict[self.generate_title(str(" ".join(cluster)))] = self.abstractive_summary(
|
608 |
str(" ".join(cluster)).lower())
|
@@ -641,7 +651,7 @@ class Surveyor:
|
|
641 |
for section in p['sections']:
|
642 |
if kh in section['heading']:
|
643 |
khtext.extend(section['highlights'])
|
644 |
-
#
|
645 |
corpus_known_sections[kh] = khtext
|
646 |
return corpus_known_sections
|
647 |
|
@@ -649,15 +659,15 @@ class Surveyor:
|
|
649 |
known = ['abstract', 'introduction', 'discussion', 'relatedwork', 'contribution', 'analysis', 'experiments',
|
650 |
'conclusion']
|
651 |
for p in papers:
|
652 |
-
#
|
653 |
headings = [section['heading'] for section in p['sections'] if len(section['heading'].split()) < 3]
|
654 |
-
#
|
655 |
for kh in known:
|
656 |
for section in p['sections']:
|
657 |
if len(section['heading'].split()) < 3:
|
658 |
-
#
|
659 |
if kh in ''.join(filter(str.isalpha, section['heading'].replace(' ', '').lower())):
|
660 |
-
#
|
661 |
section['heading'] = kh
|
662 |
return papers
|
663 |
|
@@ -671,14 +681,14 @@ class Surveyor:
|
|
671 |
if pid == p['id']:
|
672 |
corpus[pid] = p['abstract'] + str(' '.join(ph))
|
673 |
'''
|
674 |
-
|
675 |
-
|
676 |
-
|
677 |
p = random.choice(list(papers))
|
678 |
-
|
679 |
-
|
680 |
p = random.choice(list(papers_meta))
|
681 |
-
|
682 |
'''
|
683 |
return corpus
|
684 |
|
@@ -690,25 +700,25 @@ class Surveyor:
|
|
690 |
def build_meta_corpus(self, papers):
|
691 |
meta_corpus = {}
|
692 |
for p in papers:
|
693 |
-
#
|
694 |
pid = p['id']
|
695 |
ptext = p['title'] + ". " + p['abstract']
|
696 |
doc = self.nlp(ptext)
|
697 |
phs, _, _ = self.extractive_highlights([str(sent) for sent in list(doc.sents)])
|
698 |
meta_corpus[pid] = str(' '.join(phs))
|
699 |
'''
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
p = random.choice(list(papers))
|
704 |
-
|
705 |
'''
|
706 |
return meta_corpus
|
707 |
|
708 |
def select_papers(self, papers, query, num_papers=20):
|
709 |
import numpy as np
|
710 |
-
#
|
711 |
-
#
|
712 |
meta_corpus = self.build_meta_corpus(papers)
|
713 |
scores = []
|
714 |
pids = []
|
@@ -716,32 +726,33 @@ class Surveyor:
|
|
716 |
score = self.text_para_similarity(query, highlights)
|
717 |
scores.append(score)
|
718 |
pids.append(id)
|
719 |
-
|
720 |
|
721 |
idx = np.argsort(scores)[:num_papers]
|
722 |
#for i in range(len(scores)):
|
723 |
-
#
|
724 |
-
#
|
725 |
-
#
|
726 |
idx = [pids[i] for i in idx]
|
727 |
-
#
|
728 |
papers_selected = [p for p in papers if p['id'] in idx]
|
729 |
# assert(len(papers_selected)==num_papers)
|
730 |
-
|
731 |
for p in papers_selected:
|
732 |
-
|
733 |
|
734 |
-
|
735 |
for p in papers[:4]:
|
736 |
-
|
737 |
-
|
738 |
for p in papers[-4:]:
|
739 |
-
|
740 |
# arxiv search producing better relevnce
|
741 |
return papers_selected
|
742 |
|
743 |
def extractive_summary(self, text):
|
744 |
-
|
|
|
745 |
with torch.no_grad():
|
746 |
res = self.model(text, ratio=0.5)
|
747 |
res_doc = self.nlp(res)
|
@@ -751,12 +762,13 @@ class Surveyor:
|
|
751 |
# text = " ".join(lines)
|
752 |
# text_doc = nlp(" ".join([l.lower() for l in lines]))
|
753 |
# text = ' '.join([ str(sent) for sent in list(text_doc.sents)])
|
754 |
-
|
|
|
755 |
with torch.no_grad():
|
756 |
res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, )
|
757 |
res_doc = self.nlp(res)
|
758 |
res_lines = set([str(sent) for sent in list(res_doc.sents)])
|
759 |
-
#
|
760 |
with torch.no_grad():
|
761 |
keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english')
|
762 |
keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])),
|
@@ -782,14 +794,14 @@ class Surveyor:
|
|
782 |
return papers
|
783 |
|
784 |
def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False):
|
785 |
-
|
786 |
papers, ids_none = self.extract_parts(papers, txt_dir, dump_dir)
|
787 |
|
788 |
-
|
789 |
papers = self.extract_images(papers, pdf_dir, img_dir)
|
790 |
|
791 |
if tables:
|
792 |
-
|
793 |
papers = self.extract_tables(papers, pdf_dir, tab_dir)
|
794 |
|
795 |
return papers, ids_none
|
@@ -816,12 +828,12 @@ class Surveyor:
|
|
816 |
'''
|
817 |
for f, h in headings_all.items():
|
818 |
if len(h) < 4:
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
'''
|
823 |
# from pprint import pprint
|
824 |
-
#
|
825 |
papers_none = [p for p in papers if p['id'] in ids_none]
|
826 |
for p in papers_none:
|
827 |
os.remove(txt_dir + '/'+ p['id'] + '.txt')
|
@@ -848,7 +860,7 @@ class Surveyor:
|
|
848 |
start = headings[i]
|
849 |
end = headings[i + 1]
|
850 |
section = self.get_section(start, end, lines)
|
851 |
-
#
|
852 |
'''
|
853 |
if i > 0:
|
854 |
old = headings[i-1]
|
@@ -884,19 +896,19 @@ class Surveyor:
|
|
884 |
start = [i for i in range(len(lines)) if first is lines[i]][0]
|
885 |
end = [i for i in range(len(lines)) if last is lines[i]][0]
|
886 |
section_lines = lines[start + 1:end]
|
887 |
-
#
|
888 |
-
#
|
889 |
-
#
|
890 |
return section_lines
|
891 |
except ValueError:
|
892 |
-
|
893 |
-
|
894 |
-
|
895 |
return ""
|
896 |
|
897 |
def check_list_elems_in_list(self, headings, lines):
|
898 |
import numpy as np
|
899 |
-
# [
|
900 |
return np.all([True if head in lines else False for head in headings])
|
901 |
|
902 |
def check_first_char_upper(self, text):
|
@@ -916,25 +928,25 @@ class Surveyor:
|
|
916 |
assert (self.check_list_elems_in_list(headings, refined))
|
917 |
headings = self.check_duplicates(headings)
|
918 |
|
919 |
-
#
|
920 |
-
#
|
921 |
-
#
|
922 |
|
923 |
# scan_failed - rescan with first match for abstract hook
|
924 |
if len(headings) == 0:
|
925 |
-
#
|
926 |
-
#
|
927 |
abs_cans = [line for line in lines if 'abstract' in re.sub("\s+", "", line.strip().lower())]
|
928 |
if len(abs_cans) != 0:
|
929 |
abs_head = abs_cans[0]
|
930 |
refined, headings = self.scan_text(lines, abs_head=abs_head)
|
931 |
self.check_list_elems_in_list(headings, refined)
|
932 |
headings = self.check_duplicates(headings)
|
933 |
-
#
|
934 |
-
#
|
935 |
|
936 |
# if len(headings) == 0:
|
937 |
-
#
|
938 |
|
939 |
return refined, headings
|
940 |
|
@@ -944,7 +956,7 @@ class Surveyor:
|
|
944 |
if len(dups) > 0:
|
945 |
[my_finallist.append(n) for n in my_list if n not in my_finallist]
|
946 |
|
947 |
-
#
|
948 |
return my_finallist
|
949 |
|
950 |
def clean_lines(self, text):
|
@@ -973,7 +985,7 @@ class Surveyor:
|
|
973 |
|
974 |
def scan_text(self, lines, abs_head=None):
|
975 |
import re
|
976 |
-
#
|
977 |
record = False
|
978 |
headings = []
|
979 |
refined = []
|
@@ -994,7 +1006,7 @@ class Surveyor:
|
|
994 |
refined.append(line)
|
995 |
break
|
996 |
refined, headings = self.scanline(record, headings, refined, i, lines)
|
997 |
-
#
|
998 |
return refined, headings
|
999 |
|
1000 |
def scanline(self, record, headings, refined, id, lines):
|
@@ -1003,22 +1015,22 @@ class Surveyor:
|
|
1003 |
line = lines[id]
|
1004 |
|
1005 |
if not len(line) == 0:
|
1006 |
-
#
|
1007 |
-
#
|
1008 |
if record:
|
1009 |
refined.append(line)
|
1010 |
if len(lines[id - 1]) == 0 or len(lines[id + 1]) == 0 or re.match(
|
1011 |
"^[1-9XVIABCD]{0,4}(\.{0,1}[1-9XVIABCD]{0,4}){0, 3}\s{0,2}[A-Z][a-zA-Z\:\-\s]*$",
|
1012 |
line) and self.char_length(line) > 7:
|
1013 |
-
#
|
1014 |
-
#
|
1015 |
if np.mean([len(s) for s in lines[id + 2:id + 6]]) > 40 and self.check_first_char_upper(
|
1016 |
line) and re.match("^[a-zA-Z1-9\.\:\-\s]*$", line) and len(line.split()) < 10:
|
1017 |
# if len(line) < 20 and np.mean([len(s) for s in lines[i+1:i+5]]) > 30 :
|
1018 |
headings.append(line)
|
1019 |
assert (line in refined)
|
1020 |
-
#
|
1021 |
-
#
|
1022 |
else:
|
1023 |
known_headings = ['introduction', 'conclusion', 'abstract', 'references', 'bibliography']
|
1024 |
missing = [h for h in known_headings if not np.any([True for head in headings if h in head])]
|
@@ -1045,7 +1057,7 @@ class Surveyor:
|
|
1045 |
for p in papers:
|
1046 |
if p['id'] == pid:
|
1047 |
return p
|
1048 |
-
|
1049 |
|
1050 |
|
1051 |
def alpha_length(self, s):
|
@@ -1066,7 +1078,7 @@ class Surveyor:
|
|
1066 |
|
1067 |
def extract_images(self, papers, pdf_dir, img_dir):
|
1068 |
import fitz
|
1069 |
-
#
|
1070 |
for p in papers:
|
1071 |
file = pdf_dir + p['id'] + ".pdf"
|
1072 |
pdf_file = fitz.open(file)
|
@@ -1076,10 +1088,10 @@ class Surveyor:
|
|
1076 |
images.extend(page.getImageList())
|
1077 |
images_files = [self.save_image(pdf_file.extractImage(img[0]), i, p['id'], img_dir) for i, img in
|
1078 |
enumerate(set(images)) if img[0]]
|
1079 |
-
#
|
1080 |
p['images'] = images_files
|
1081 |
-
#
|
1082 |
-
#
|
1083 |
return papers
|
1084 |
|
1085 |
|
@@ -1105,7 +1117,7 @@ class Surveyor:
|
|
1105 |
# save it to local disk
|
1106 |
fname = img_dir + "/" + str(pid) + "_" + str(img_index + 1) + "." + image_ext
|
1107 |
image.save(open(f"{fname}", "wb"))
|
1108 |
-
#
|
1109 |
return fname
|
1110 |
|
1111 |
def save_tables(self, dfs, pid, tab_dir):
|
@@ -1125,7 +1137,7 @@ class Surveyor:
|
|
1125 |
for p in papers:
|
1126 |
dfs = tabula.read_pdf(pdf_dir + p['id'] + ".pdf", pages='all', multiple_tables=True, silent=True)
|
1127 |
p['tables'] = self.save_tables(dfs, p['id'], tab_dir)
|
1128 |
-
#
|
1129 |
return papers
|
1130 |
|
1131 |
def extract_tables_from_file(self, pdf_file_name, tab_dir):
|
@@ -1179,7 +1191,7 @@ class Surveyor:
|
|
1179 |
else:
|
1180 |
discarded_ids.append(urlparse(result.entry_id).path.split('/')[-1].split('v')[0])
|
1181 |
|
1182 |
-
|
1183 |
|
1184 |
return results, searched_papers
|
1185 |
|
@@ -1187,8 +1199,8 @@ class Surveyor:
|
|
1187 |
import arxiv
|
1188 |
from urllib.parse import urlparse
|
1189 |
ids = [p['id'] for p in papers]
|
1190 |
-
|
1191 |
-
|
1192 |
# asert(False)
|
1193 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
1194 |
for p in papers_filtered:
|
@@ -1201,7 +1213,7 @@ class Surveyor:
|
|
1201 |
import arxiv
|
1202 |
from urllib.parse import urlparse
|
1203 |
ids = [p['id'] for p in papers]
|
1204 |
-
|
1205 |
# asert(False)
|
1206 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
1207 |
for p in papers_filtered:
|
@@ -1230,8 +1242,8 @@ class Surveyor:
|
|
1230 |
|
1231 |
|
1232 |
cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
|
1233 |
-
|
1234 |
-
|
1235 |
|
1236 |
for p in papers:
|
1237 |
p['cites'] = cites[p['id']]
|
@@ -1242,7 +1254,7 @@ class Surveyor:
|
|
1242 |
from scholarly import scholarly
|
1243 |
import operator
|
1244 |
# Retrieve the author's data, fill-in, and print
|
1245 |
-
|
1246 |
search_result = next(scholarly.search_author(author_query), None)
|
1247 |
|
1248 |
if search_result is not None:
|
@@ -1262,7 +1274,7 @@ class Surveyor:
|
|
1262 |
'url_picture': author['url_picture'],
|
1263 |
}
|
1264 |
else:
|
1265 |
-
|
1266 |
author_stats = {
|
1267 |
'name': author_query,
|
1268 |
'affiliation': "",
|
@@ -1276,7 +1288,7 @@ class Surveyor:
|
|
1276 |
'url_picture': "",
|
1277 |
}
|
1278 |
|
1279 |
-
#
|
1280 |
return author_stats
|
1281 |
|
1282 |
def author_stats(self, papers):
|
@@ -1314,9 +1326,9 @@ class Surveyor:
|
|
1314 |
start_positions = torch.tensor([1])
|
1315 |
end_positions = torch.tensor([3])
|
1316 |
outputs = self.qamodel(**inputs, start_positions=start_positions, end_positions=end_positions)
|
1317 |
-
|
1318 |
-
|
1319 |
-
|
1320 |
return outputs
|
1321 |
|
1322 |
def zip_outputs(self, dump_dir, query):
|
@@ -1342,10 +1354,10 @@ class Surveyor:
|
|
1342 |
if not num_papers:
|
1343 |
num_papers = self.DEFAULTS['num_papers']
|
1344 |
# arxiv api relevance search and data preparation
|
1345 |
-
|
1346 |
results, searched_papers = self.search(query, max_search=max_search)
|
1347 |
joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
|
1348 |
-
|
1349 |
|
1350 |
# paper selection by scibert vector embedding relevance scores
|
1351 |
# papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
|
@@ -1358,23 +1370,23 @@ class Surveyor:
|
|
1358 |
|
1359 |
joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')
|
1360 |
|
1361 |
-
|
1362 |
papers_standardized = self.standardize_headings(papers_highlighted)
|
1363 |
joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')
|
1364 |
|
1365 |
-
|
1366 |
corpus = self.build_corpus(papers_highlighted, searched_papers)
|
1367 |
joblib.dump(corpus, self.dump_dir + 'corpus.dmp')
|
1368 |
|
1369 |
-
|
1370 |
corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
|
1371 |
joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')
|
1372 |
|
1373 |
-
|
1374 |
research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
|
1375 |
joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')
|
1376 |
|
1377 |
-
|
1378 |
corpus_lines = self.get_corpus_lines(corpus)
|
1379 |
joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')
|
1380 |
|
@@ -1390,67 +1402,67 @@ class Surveyor:
|
|
1390 |
'''
|
1391 |
|
1392 |
'''
|
1393 |
-
|
1394 |
-
|
1395 |
-
|
1396 |
-
|
1397 |
-
|
1398 |
-
|
1399 |
-
|
1400 |
-
|
1401 |
-
|
1402 |
-
|
1403 |
-
|
1404 |
-
|
1405 |
-
|
1406 |
-
|
1407 |
-
|
1408 |
'''
|
1409 |
-
#
|
1410 |
|
1411 |
-
|
1412 |
abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
|
1413 |
joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
|
1414 |
'''
|
1415 |
-
|
1416 |
-
|
1417 |
-
|
1418 |
'''
|
1419 |
|
1420 |
-
|
1421 |
intro_block = self.get_intro(corpus_sectionwise, research_blocks)
|
1422 |
joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
|
1423 |
'''
|
1424 |
-
|
1425 |
-
|
1426 |
-
|
1427 |
'''
|
1428 |
-
|
1429 |
clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
|
1430 |
joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
|
1431 |
joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')
|
1432 |
|
1433 |
'''
|
1434 |
-
|
1435 |
-
|
1436 |
-
|
1437 |
-
|
1438 |
-
|
1439 |
-
|
1440 |
-
|
1441 |
'''
|
1442 |
clustered_sections['abstract'] = abstract_block
|
1443 |
clustered_sections['introduction'] = intro_block
|
1444 |
joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')
|
1445 |
|
1446 |
-
|
1447 |
conclusion_block = self.get_conclusion(clustered_sections)
|
1448 |
joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
|
1449 |
clustered_sections['conclusion'] = conclusion_block
|
1450 |
'''
|
1451 |
-
|
1452 |
-
|
1453 |
-
|
1454 |
'''
|
1455 |
|
1456 |
survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
|
@@ -1460,7 +1472,7 @@ class Surveyor:
|
|
1460 |
shutil.copy(self.dump_dir + survey_file, survey_file)
|
1461 |
assert (os.path.exists(survey_file))
|
1462 |
output_zip = self.zip_outputs(self.dump_dir, query)
|
1463 |
-
|
1464 |
survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))
|
1465 |
|
1466 |
return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)
|
|
|
44 |
kw_model_name=None,
|
45 |
high_gpu=False,
|
46 |
refresh_models=False,
|
47 |
+
no_save_models=False,
|
48 |
+
print_fn=None
|
49 |
):
|
50 |
'''
|
51 |
Initializes models and directory structure for the surveyor
|
|
|
72 |
- num_papers: int maximium number of papers to download and analyse - defaults to 25
|
73 |
|
74 |
'''
|
75 |
+
self.print_fn = print
|
76 |
+
if print_fn is not None:
|
77 |
+
self.print_fn = print_fn
|
78 |
+
|
79 |
self.torch_device = 'cpu'
|
80 |
+
self.print_fn("\nTorch_device: " + self.torch_device)
|
81 |
if torch.cuda.is_available():
|
82 |
+
self.print_fn("\nloading defaults for gpu")
|
83 |
self.torch_device = 'cuda'
|
84 |
spacy.require_gpu()
|
85 |
|
|
|
114 |
similarity_nlp_name = self.DEFAULTS["similarity_nlp_name"]
|
115 |
|
116 |
if refresh_models or not models_found:
|
117 |
+
self.print_fn(f'\nInitializing models {"and saving (about 5GB)" if not no_save_models else ""}')
|
118 |
if not no_save_models:
|
119 |
self.clean_dirs([models_dir])
|
120 |
|
|
|
153 |
if not no_save_models:
|
154 |
self.embedder.save(models_dir + "/embedder")
|
155 |
else:
|
156 |
+
self.print_fn("\nInitializing from previously saved models at" + models_dir)
|
157 |
self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
|
158 |
self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
|
159 |
self.title_model.eval()
|
|
|
235 |
|
236 |
papers = papers_meta[:self.num_papers]
|
237 |
selected_papers = papers
|
238 |
+
self.print_fn("\nFirst stage paper collection...")
|
239 |
ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
|
240 |
+
self.print_fn("\nFirst stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
|
241 |
new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
|
242 |
_ = self.get_freq_cited(cites)
|
243 |
'''
|
|
|
248 |
new_papers.extend(new_searched_papers)
|
249 |
'''
|
250 |
selected_papers.extend(new_papers)
|
251 |
+
self.print_fn("\nSecond stage paper collection...")
|
252 |
_, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
|
253 |
+
self.print_fn("\nSecond stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
|
254 |
papers.extend(new_papers)
|
255 |
|
256 |
joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp')
|
257 |
copy_tree(img_dir, dump_dir + os.path.basename(img_dir))
|
258 |
copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir))
|
259 |
|
260 |
+
self.print_fn("\nExtracting section-wise highlights.. ")
|
261 |
papers = self.extract_highlights(papers)
|
262 |
|
263 |
return papers, selected_papers
|
|
|
270 |
[cites_list.append(val) for val in v]
|
271 |
cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
|
272 |
sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
|
273 |
+
self.print_fn("\nThe most cited paper ids are:\n" + str(sorted_cites))
|
274 |
|
275 |
return sorted_cites.keys()
|
276 |
|
|
|
280 |
|
281 |
if repeat:
|
282 |
with tempfile.TemporaryDirectory() as dirpath:
|
283 |
+
self.print_fn("\n- downloading extra pdfs.. ")
|
284 |
# full text preparation of selected papers
|
285 |
self.download_pdfs(papers, dirpath)
|
286 |
dirpath_pdfs = os.listdir(dirpath)
|
|
|
288 |
full_file_name = os.path.join(dirpath, file_name)
|
289 |
if os.path.isfile(full_file_name):
|
290 |
shutil.copy(full_file_name, pdf_dir)
|
291 |
+
self.print_fn("\n- converting extra pdfs.. ")
|
292 |
self.convert_pdfs(dirpath, txt_dir)
|
293 |
else:
|
294 |
+
self.print_fn("\n- downloading pdfs.. ")
|
295 |
# full text preparation of selected papers
|
296 |
self.download_pdfs(papers, pdf_dir)
|
297 |
+
self.print_fn("\n- converting pdfs.. ")
|
298 |
self.convert_pdfs(pdf_dir, txt_dir)
|
299 |
# plugging citations to our papers object
|
300 |
+
self.print_fn("\n- plugging in citation network.. ")
|
301 |
papers, cites = self.cocitation_network(papers, txt_dir)
|
302 |
joblib.dump(papers, dump_dir + 'papers_selected_pdf_route.dmp')
|
303 |
from distutils.dir_util import copy_tree
|
304 |
copy_tree(txt_dir, dump_dir + os.path.basename(txt_dir))
|
305 |
copy_tree(pdf_dir, dump_dir + os.path.basename(pdf_dir))
|
306 |
+
self.print_fn("\n- extracting structure.. ")
|
307 |
papers, ids_none = self.extract_structure(papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir)
|
308 |
return ids_none, papers, cites
|
309 |
|
|
|
333 |
def build_doc(self, research_sections, papers, query=None, filename='survey.txt'):
|
334 |
|
335 |
import arxiv2bib
|
336 |
+
self.print_fn("\nbuilding bibliography entries.. ")
|
337 |
bibentries = arxiv2bib.arxiv2bib([p['id'] for p in papers])
|
338 |
bibentries = [r.bibtex() for r in bibentries]
|
339 |
|
340 |
+
self.print_fn("\nbuilding final survey file .. at "+ filename)
|
341 |
file = open(filename, 'w+')
|
342 |
if query is None:
|
343 |
query = 'Internal(existing) research'
|
344 |
file.write("----------------------------------------------------------------------")
|
345 |
file.write("Title: A survey on " + query)
|
346 |
+
self.print_fn("")
|
347 |
+
self.print_fn("----------------------------------------------------------------------")
|
348 |
+
self.print_fn("Title: A survey on " + query)
|
349 |
file.write("Author: Auto-Research (github.com/sidphbot/Auto-Research)")
|
350 |
+
self.print_fn("Author: Auto-Research (github.com/sidphbot/Auto-Research)")
|
351 |
file.write("Dev: Auto-Research (github.com/sidphbot/Auto-Research)")
|
352 |
+
self.print_fn("Dev: Auto-Research (github.com/sidphbot/Auto-Research)")
|
353 |
file.write("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+
|
354 |
"\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+
|
355 |
"\nmined with proper citations. As All of the text is practically quoted texted, hence to "+
|
356 |
"\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+
|
357 |
"\nentries(only to avoid LaTex overhead). ")
|
358 |
+
self.print_fn("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+
|
359 |
"\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+
|
360 |
"\nmined with proper citations. As All of the text is practically quoted texted, hence to "+
|
361 |
"\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+
|
362 |
"\nentries(only to avoid LaTex overhead). ")
|
363 |
file.write("----------------------------------------------------------------------")
|
364 |
+
self.print_fn("----------------------------------------------------------------------")
|
365 |
file.write("")
|
366 |
+
self.print_fn("")
|
367 |
file.write('ABSTRACT')
|
368 |
+
self.print_fn('ABSTRACT')
|
369 |
+
self.print_fn("=================================================")
|
370 |
file.write("=================================================")
|
371 |
file.write("")
|
372 |
+
self.print_fn("")
|
373 |
file.write(research_sections['abstract'])
|
374 |
+
self.print_fn(research_sections['abstract'])
|
375 |
file.write("")
|
376 |
+
self.print_fn("")
|
377 |
file.write('INTRODUCTION')
|
378 |
+
self.print_fn('INTRODUCTION')
|
379 |
+
self.print_fn("=================================================")
|
380 |
file.write("=================================================")
|
381 |
file.write("")
|
382 |
+
self.print_fn("")
|
383 |
file.write(research_sections['introduction'])
|
384 |
+
self.print_fn(research_sections['introduction'])
|
385 |
file.write("")
|
386 |
+
self.print_fn("")
|
387 |
for k, v in research_sections.items():
|
388 |
if k not in ['abstract', 'introduction', 'conclusion']:
|
389 |
file.write(k.upper())
|
390 |
+
self.print_fn(k.upper())
|
391 |
+
self.print_fn("=================================================")
|
392 |
file.write("=================================================")
|
393 |
file.write("")
|
394 |
+
self.print_fn("")
|
395 |
file.write(v)
|
396 |
+
self.print_fn(v)
|
397 |
file.write("")
|
398 |
+
self.print_fn("")
|
399 |
file.write('CONCLUSION')
|
400 |
+
self.print_fn('CONCLUSION')
|
401 |
+
self.print_fn("=================================================")
|
402 |
file.write("=================================================")
|
403 |
file.write("")
|
404 |
+
self.print_fn("")
|
405 |
file.write(research_sections['conclusion'])
|
406 |
+
self.print_fn(research_sections['conclusion'])
|
407 |
file.write("")
|
408 |
+
self.print_fn("")
|
409 |
|
410 |
file.write('REFERENCES')
|
411 |
+
self.print_fn('REFERENCES')
|
412 |
+
self.print_fn("=================================================")
|
413 |
file.write("=================================================")
|
414 |
file.write("")
|
415 |
+
self.print_fn("")
|
416 |
for entry in bibentries:
|
417 |
file.write(entry)
|
418 |
+
self.print_fn(entry)
|
419 |
file.write("")
|
420 |
+
self.print_fn("")
|
421 |
+
self.print_fn("========================XXX=========================")
|
422 |
file.write("========================XXX=========================")
|
423 |
file.close()
|
424 |
|
|
|
426 |
|
427 |
research_blocks = {}
|
428 |
for head, textarr in corpus_known_sections.items():
|
429 |
+
if 'cuda' in self.torch_device:
|
430 |
+
torch.cuda.empty_cache()
|
431 |
+
# self.print_fn(head.upper())
|
432 |
with torch.no_grad():
|
433 |
summtext = self.model(" ".join([l.lower() for l in textarr]), ratio=0.5)
|
434 |
res = self.nlp(summtext)
|
435 |
res = set([str(sent) for sent in list(res.sents)])
|
436 |
summtext = ''.join([line for line in res])
|
437 |
+
# pself.print_fn(summtext)
|
438 |
research_blocks[head] = summtext
|
439 |
|
440 |
return research_blocks
|
|
|
450 |
sequences = ledmodel.generate(input_ids, global_attention_mask=global_attention_mask).sequences
|
451 |
summary = ledtokenizer.batch_decode(sequences)
|
452 |
'''
|
453 |
+
if 'cuda' in self.torch_device:
|
454 |
+
torch.cuda.empty_cache()
|
455 |
inputs = self.ledtokenizer.prepare_seq2seq_batch(longtext, truncation=True, padding='longest',
|
456 |
return_tensors='pt').to(self.torch_device)
|
457 |
with torch.no_grad():
|
|
|
461 |
res = self.nlp(summary[0])
|
462 |
res = set([str(sent) for sent in list(res.sents)])
|
463 |
summtext = ''.join([line for line in res])
|
464 |
+
#self.print_fn("abstractive summary type:" + str(type(summary)))
|
465 |
return summtext
|
466 |
|
467 |
def get_abstract(self, abs_lines, corpus_known_sections, research_blocks):
|
|
|
470 |
abs_lines = ""
|
471 |
abs_lines += " ".join([l.lower() for l in corpus_known_sections['abstract']])
|
472 |
abs_lines += research_blocks['abstract']
|
473 |
+
# self.print_fn(abs_lines)
|
474 |
|
475 |
try:
|
476 |
return self.abstractive_summary(abs_lines)
|
|
|
482 |
abs_lines = []
|
483 |
types = set()
|
484 |
for k, v in corpus.items():
|
485 |
+
# self.print_fn(v)
|
486 |
types.add(type(v))
|
487 |
abstext = k + '. ' + v.replace('\n', ' ')
|
488 |
abstext = self.nlp(abstext)
|
489 |
abs_lines.extend([str(sent).lower() for sent in list(abstext.sents)])
|
490 |
+
#self.print_fn("unique corpus value types:" + str(types))
|
491 |
# abs_lines = '\n'.join([str(sent) for sent in abs_lines.sents])
|
492 |
return abs_lines
|
493 |
|
|
|
508 |
if p['id'] not in selected_pids:
|
509 |
meta_abs.append(self.generate_title(p['abstract']))
|
510 |
docs.extend(meta_abs)
|
511 |
+
#self.print_fn("meta_abs num"+str(len(meta_abs)))
|
512 |
+
#self.print_fn("selected_pids num"+str(len(selected_pids)))
|
513 |
+
#self.print_fn("papers_meta num"+str(len(papers_meta)))
|
514 |
#assert (len(meta_abs) + len(selected_pids) == len(papers_meta))
|
515 |
assert ('str' in str(type(random.sample(docs, 1)[0])))
|
516 |
return [doc for doc in docs if doc != '']
|
|
|
520 |
from sklearn.cluster import KMeans
|
521 |
# from bertopic import BERTopic
|
522 |
# topic_model = BERTopic(embedding_model=embedder)
|
523 |
+
if 'cuda' in self.torch_device:
|
524 |
+
torch.cuda.empty_cache()
|
525 |
corpus_embeddings = self.embedder.encode(abs_lines)
|
526 |
# Normalize the embeddings to unit length
|
527 |
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
|
|
|
541 |
clustered_sentences[cluster_id] = []
|
542 |
'''
|
543 |
if dummy_count < 5:
|
544 |
+
self.print_fn("abs_line: "+abs_lines[sentence_id])
|
545 |
+
self.print_fn("cluster_ID: "+str(cluster_id))
|
546 |
+
self.print_fn("embedding: "+str(corpus_embeddings[sentence_id]))
|
547 |
dummy_count += 1
|
548 |
'''
|
549 |
clustered_sentences[cluster_id].append(abs_lines[sentence_id])
|
550 |
|
551 |
# for i, cluster in clustered_sentences.items():
|
552 |
+
# self.print_fn("Cluster ", i+1)
|
553 |
+
# self.print_fn(cluster)
|
554 |
+
# self.print_fn("")
|
555 |
|
556 |
return self.get_clustered_sections(clustered_sentences), clustered_sentences
|
557 |
|
|
|
560 |
from sklearn.cluster import KMeans
|
561 |
# from bertopic import BERTopic
|
562 |
# topic_model = BERTopic(embedding_model=embedder)
|
563 |
+
if 'cuda' in self.torch_device:
|
564 |
+
torch.cuda.empty_cache()
|
565 |
abs_lines = self.get_sectioned_docs(papers, papers_meta)
|
566 |
corpus_embeddings = self.embedder.encode(abs_lines)
|
567 |
# Normalize the embeddings to unit length
|
|
|
582 |
clustered_sentences[cluster_id] = []
|
583 |
'''
|
584 |
if dummy_count < 5:
|
585 |
+
self.print_fn("abs_line: "+abs_lines[sentence_id])
|
586 |
+
self.print_fn("cluster_ID: "+str(cluster_id))
|
587 |
+
self.print_fn("embedding: "+str(corpus_embeddings[sentence_id]))
|
588 |
dummy_count += 1
|
589 |
'''
|
590 |
clustered_sentences[cluster_id].append(abs_lines[sentence_id])
|
591 |
|
592 |
# for i, cluster in clustered_sentences.items():
|
593 |
+
# self.print_fn("Cluster ", i+1)
|
594 |
+
# self.print_fn(cluster)
|
595 |
+
# self.print_fn("")
|
596 |
|
597 |
return self.get_clustered_sections(clustered_sentences), clustered_sentences
|
598 |
|
599 |
def generate_title(self, longtext):
|
600 |
+
if 'cuda' in self.torch_device:
|
601 |
+
torch.cuda.empty_cache()
|
602 |
|
603 |
inputs = self.title_tokenizer.prepare_seq2seq_batch(longtext, truncation=True, padding='longest',
|
604 |
return_tensors='pt').to(self.torch_device)
|
|
|
612 |
def get_clustered_sections(self, clustered_lines):
|
613 |
clusters_dict = {}
|
614 |
for i, cluster in clustered_lines.items():
|
615 |
+
# self.print_fn(cluster)
|
616 |
try:
|
617 |
clusters_dict[self.generate_title(str(" ".join(cluster)))] = self.abstractive_summary(
|
618 |
str(" ".join(cluster)).lower())
|
|
|
651 |
for section in p['sections']:
|
652 |
if kh in section['heading']:
|
653 |
khtext.extend(section['highlights'])
|
654 |
+
# self.print_fn(khtext)
|
655 |
corpus_known_sections[kh] = khtext
|
656 |
return corpus_known_sections
|
657 |
|
|
|
659 |
known = ['abstract', 'introduction', 'discussion', 'relatedwork', 'contribution', 'analysis', 'experiments',
|
660 |
'conclusion']
|
661 |
for p in papers:
|
662 |
+
# self.print_fn("================================")
|
663 |
headings = [section['heading'] for section in p['sections'] if len(section['heading'].split()) < 3]
|
664 |
+
# self.print_fn("id: "+ str(p['id'])+"\nHeadings: \n"+str('\n'.join(headings)))
|
665 |
for kh in known:
|
666 |
for section in p['sections']:
|
667 |
if len(section['heading'].split()) < 3:
|
668 |
+
# self.print_fn(section['heading'])
|
669 |
if kh in ''.join(filter(str.isalpha, section['heading'].replace(' ', '').lower())):
|
670 |
+
# self.print_fn("orig head: "+ section['heading'] +", plain head:" + kh)
|
671 |
section['heading'] = kh
|
672 |
return papers
|
673 |
|
|
|
681 |
if pid == p['id']:
|
682 |
corpus[pid] = p['abstract'] + str(' '.join(ph))
|
683 |
'''
|
684 |
+
self.print_fn("================== final corpus ====================")
|
685 |
+
self.print_fn('\n'.join([str("paper: "+ get_by_pid(pid, papers_meta)['title']+" \nhighlight count: " + str(len(phs))) for pid, phs in corpus.items()]))
|
686 |
+
self.print_fn("======== sample point ========")
|
687 |
p = random.choice(list(papers))
|
688 |
+
self.print_fn("paper: "+ p['title']+" \nhighlights: " + str(corpus[p['id']]))
|
689 |
+
self.print_fn("======== sample meta point ========")
|
690 |
p = random.choice(list(papers_meta))
|
691 |
+
self.print_fn("meta paper: "+ p['title']+" \nhighlights: " + str(corpus[p['id']]))
|
692 |
'''
|
693 |
return corpus
|
694 |
|
|
|
700 |
def build_meta_corpus(self, papers):
|
701 |
meta_corpus = {}
|
702 |
for p in papers:
|
703 |
+
# pself.print_fn(p)
|
704 |
pid = p['id']
|
705 |
ptext = p['title'] + ". " + p['abstract']
|
706 |
doc = self.nlp(ptext)
|
707 |
phs, _, _ = self.extractive_highlights([str(sent) for sent in list(doc.sents)])
|
708 |
meta_corpus[pid] = str(' '.join(phs))
|
709 |
'''
|
710 |
+
self.print_fn("================== meta corpus ====================")
|
711 |
+
self.print_fn('\n'.join([str("paper: "+ get_by_pid(pid, papers)['title']+" \nhighlight count: " + str(len(phs))) for pid, phs in meta_corpus.items()]))
|
712 |
+
self.print_fn("======== sample point ========")
|
713 |
p = random.choice(list(papers))
|
714 |
+
self.print_fn("paper: "+ p['title']+" \nhighlights: " + str(meta_corpus[p['id']]))
|
715 |
'''
|
716 |
return meta_corpus
|
717 |
|
718 |
def select_papers(self, papers, query, num_papers=20):
|
719 |
import numpy as np
|
720 |
+
# self.print_fn("paper sample: ")
|
721 |
+
# self.print_fn(papers)
|
722 |
meta_corpus = self.build_meta_corpus(papers)
|
723 |
scores = []
|
724 |
pids = []
|
|
|
726 |
score = self.text_para_similarity(query, highlights)
|
727 |
scores.append(score)
|
728 |
pids.append(id)
|
729 |
+
self.print_fn("corpus item: " + str(self.get_by_pid(id, papers)['title']))
|
730 |
|
731 |
idx = np.argsort(scores)[:num_papers]
|
732 |
#for i in range(len(scores)):
|
733 |
+
# self.print_fn("paper: " + str(self.get_by_pid(pids[i], papers)['title']))
|
734 |
+
# self.print_fn("score: " + str(scores[i]))
|
735 |
+
# self.print_fn("argsort ids("+str(num_papers)+" papers): "+ str(idx))
|
736 |
idx = [pids[i] for i in idx]
|
737 |
+
# self.print_fn("argsort pids("+str(num_papers)+" papers): "+ str(idx))
|
738 |
papers_selected = [p for p in papers if p['id'] in idx]
|
739 |
# assert(len(papers_selected)==num_papers)
|
740 |
+
self.print_fn("num papers selected: " + str(len(papers_selected)))
|
741 |
for p in papers_selected:
|
742 |
+
self.print_fn("Selected Paper: " + p['title'])
|
743 |
|
744 |
+
self.print_fn("constrast with natural selection: forward")
|
745 |
for p in papers[:4]:
|
746 |
+
self.print_fn("Selected Paper: " + p['title'])
|
747 |
+
self.print_fn("constrast with natural selection: backward")
|
748 |
for p in papers[-4:]:
|
749 |
+
self.print_fn("Selected Paper: " + p['title'])
|
750 |
# arxiv search producing better relevnce
|
751 |
return papers_selected
|
752 |
|
753 |
def extractive_summary(self, text):
|
754 |
+
if 'cuda' in self.torch_device:
|
755 |
+
torch.cuda.empty_cache()
|
756 |
with torch.no_grad():
|
757 |
res = self.model(text, ratio=0.5)
|
758 |
res_doc = self.nlp(res)
|
|
|
762 |
# text = " ".join(lines)
|
763 |
# text_doc = nlp(" ".join([l.lower() for l in lines]))
|
764 |
# text = ' '.join([ str(sent) for sent in list(text_doc.sents)])
|
765 |
+
if 'cuda' in self.torch_device:
|
766 |
+
torch.cuda.empty_cache()
|
767 |
with torch.no_grad():
|
768 |
res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, )
|
769 |
res_doc = self.nlp(res)
|
770 |
res_lines = set([str(sent) for sent in list(res_doc.sents)])
|
771 |
+
# self.print_fn("\n".join(res_sents))
|
772 |
with torch.no_grad():
|
773 |
keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english')
|
774 |
keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])),
|
|
|
794 |
return papers
|
795 |
|
796 |
def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False):
|
797 |
+
self.print_fn("\nextracting sections.. ")
|
798 |
papers, ids_none = self.extract_parts(papers, txt_dir, dump_dir)
|
799 |
|
800 |
+
self.print_fn("\nextracting images.. for future correlation use-cases ")
|
801 |
papers = self.extract_images(papers, pdf_dir, img_dir)
|
802 |
|
803 |
if tables:
|
804 |
+
self.print_fn("\nextracting tables.. for future correlation use-cases ")
|
805 |
papers = self.extract_tables(papers, pdf_dir, tab_dir)
|
806 |
|
807 |
return papers, ids_none
|
|
|
828 |
'''
|
829 |
for f, h in headings_all.items():
|
830 |
if len(h) < 4:
|
831 |
+
self.print_fn("=================headings almost undetected================")
|
832 |
+
self.print_fn(f)
|
833 |
+
self.print_fn(h)
|
834 |
'''
|
835 |
# from pprint import pprint
|
836 |
+
# pself.print_fn({f: len(h) for f,h in headings_all.items()})
|
837 |
papers_none = [p for p in papers if p['id'] in ids_none]
|
838 |
for p in papers_none:
|
839 |
os.remove(txt_dir + '/'+ p['id'] + '.txt')
|
|
|
860 |
start = headings[i]
|
861 |
end = headings[i + 1]
|
862 |
section = self.get_section(start, end, lines)
|
863 |
+
# self.print_fn(start + " : "+ str(len(section)) +" lines")
|
864 |
'''
|
865 |
if i > 0:
|
866 |
old = headings[i-1]
|
|
|
896 |
start = [i for i in range(len(lines)) if first is lines[i]][0]
|
897 |
end = [i for i in range(len(lines)) if last is lines[i]][0]
|
898 |
section_lines = lines[start + 1:end]
|
899 |
+
# self.print_fn("heading: " + str(first))
|
900 |
+
# self.print_fn("section_lines: "+ str(section_lines))
|
901 |
+
# self.print_fn(section_lines)
|
902 |
return section_lines
|
903 |
except ValueError:
|
904 |
+
self.print_fn("value error :")
|
905 |
+
self.print_fn("first heading :" + str(first) + ", second heading :" + str(last))
|
906 |
+
self.print_fn("first index :" + str(start) + ", second index :" + str(end))
|
907 |
return ""
|
908 |
|
909 |
def check_list_elems_in_list(self, headings, lines):
|
910 |
import numpy as np
|
911 |
+
# [self.print_fn(head) for head in headings if head not in lines ]
|
912 |
return np.all([True if head in lines else False for head in headings])
|
913 |
|
914 |
def check_first_char_upper(self, text):
|
|
|
928 |
assert (self.check_list_elems_in_list(headings, refined))
|
929 |
headings = self.check_duplicates(headings)
|
930 |
|
931 |
+
# self.print_fn('===========================================')
|
932 |
+
# self.print_fn(txt_file +": first scan: \n"+str(len(headings))+" headings")
|
933 |
+
# self.print_fn('\n'.join(headings))
|
934 |
|
935 |
# scan_failed - rescan with first match for abstract hook
|
936 |
if len(headings) == 0:
|
937 |
+
# self.print_fn('===================')
|
938 |
+
# self.print_fn("run 1 failed")
|
939 |
abs_cans = [line for line in lines if 'abstract' in re.sub("\s+", "", line.strip().lower())]
|
940 |
if len(abs_cans) != 0:
|
941 |
abs_head = abs_cans[0]
|
942 |
refined, headings = self.scan_text(lines, abs_head=abs_head)
|
943 |
self.check_list_elems_in_list(headings, refined)
|
944 |
headings = self.check_duplicates(headings)
|
945 |
+
# self.print_fn('===================')
|
946 |
+
# self.print_fn(txt_file +": second scan: \n"+str(len(headings))+" headings")
|
947 |
|
948 |
# if len(headings) == 0:
|
949 |
+
# self.print_fn("heading scan failed completely")
|
950 |
|
951 |
return refined, headings
|
952 |
|
|
|
956 |
if len(dups) > 0:
|
957 |
[my_finallist.append(n) for n in my_list if n not in my_finallist]
|
958 |
|
959 |
+
# self.print_fn("original: "+str(len(my_list))+" new: "+str(len(my_finallist)))
|
960 |
return my_finallist
|
961 |
|
962 |
def clean_lines(self, text):
|
|
|
985 |
|
986 |
def scan_text(self, lines, abs_head=None):
|
987 |
import re
|
988 |
+
# self.print_fn('\n'.join(lines))
|
989 |
record = False
|
990 |
headings = []
|
991 |
refined = []
|
|
|
1006 |
refined.append(line)
|
1007 |
break
|
1008 |
refined, headings = self.scanline(record, headings, refined, i, lines)
|
1009 |
+
# self.print_fn('=========in scan_text loop i : '+str(i)+' heading count : '+str(len(headings))+' =========')
|
1010 |
return refined, headings
|
1011 |
|
1012 |
def scanline(self, record, headings, refined, id, lines):
|
|
|
1015 |
line = lines[id]
|
1016 |
|
1017 |
if not len(line) == 0:
|
1018 |
+
# self.print_fn("in scanline")
|
1019 |
+
# self.print_fn(line)
|
1020 |
if record:
|
1021 |
refined.append(line)
|
1022 |
if len(lines[id - 1]) == 0 or len(lines[id + 1]) == 0 or re.match(
|
1023 |
"^[1-9XVIABCD]{0,4}(\.{0,1}[1-9XVIABCD]{0,4}){0, 3}\s{0,2}[A-Z][a-zA-Z\:\-\s]*$",
|
1024 |
line) and self.char_length(line) > 7:
|
1025 |
+
# self.print_fn("candidate")
|
1026 |
+
# self.print_fn(line)
|
1027 |
if np.mean([len(s) for s in lines[id + 2:id + 6]]) > 40 and self.check_first_char_upper(
|
1028 |
line) and re.match("^[a-zA-Z1-9\.\:\-\s]*$", line) and len(line.split()) < 10:
|
1029 |
# if len(line) < 20 and np.mean([len(s) for s in lines[i+1:i+5]]) > 30 :
|
1030 |
headings.append(line)
|
1031 |
assert (line in refined)
|
1032 |
+
# self.print_fn("selected")
|
1033 |
+
# self.print_fn(line)
|
1034 |
else:
|
1035 |
known_headings = ['introduction', 'conclusion', 'abstract', 'references', 'bibliography']
|
1036 |
missing = [h for h in known_headings if not np.any([True for head in headings if h in head])]
|
|
|
1057 |
for p in papers:
|
1058 |
if p['id'] == pid:
|
1059 |
return p
|
1060 |
+
self.print_fn("\npaper not found by file, \nfile: "+file+"\nall papers: "+', '.join([p['id'] for p in papers]))
|
1061 |
|
1062 |
|
1063 |
def alpha_length(self, s):
|
|
|
1078 |
|
1079 |
def extract_images(self, papers, pdf_dir, img_dir):
|
1080 |
import fitz
|
1081 |
+
# self.print_fn("in images")
|
1082 |
for p in papers:
|
1083 |
file = pdf_dir + p['id'] + ".pdf"
|
1084 |
pdf_file = fitz.open(file)
|
|
|
1088 |
images.extend(page.getImageList())
|
1089 |
images_files = [self.save_image(pdf_file.extractImage(img[0]), i, p['id'], img_dir) for i, img in
|
1090 |
enumerate(set(images)) if img[0]]
|
1091 |
+
# self.print_fn(len(images_per_paper))
|
1092 |
p['images'] = images_files
|
1093 |
+
# self.print_fn(len(p.keys()))
|
1094 |
+
# self.print_fn(papers[0].keys())
|
1095 |
return papers
|
1096 |
|
1097 |
|
|
|
1117 |
# save it to local disk
|
1118 |
fname = img_dir + "/" + str(pid) + "_" + str(img_index + 1) + "." + image_ext
|
1119 |
image.save(open(f"{fname}", "wb"))
|
1120 |
+
# self.print_fn(fname)
|
1121 |
return fname
|
1122 |
|
1123 |
def save_tables(self, dfs, pid, tab_dir):
|
|
|
1137 |
for p in papers:
|
1138 |
dfs = tabula.read_pdf(pdf_dir + p['id'] + ".pdf", pages='all', multiple_tables=True, silent=True)
|
1139 |
p['tables'] = self.save_tables(dfs, p['id'], tab_dir)
|
1140 |
+
# self.print_fn(papers[0].keys())
|
1141 |
return papers
|
1142 |
|
1143 |
def extract_tables_from_file(self, pdf_file_name, tab_dir):
|
|
|
@@ lines 1191-1197 @@
             else:
                 discarded_ids.append(urlparse(result.entry_id).path.split('/')[-1].split('v')[0])

+        self.print_fn("\nPapers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))

         return results, searched_papers

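The expression used to collect discarded_ids strips the URL prefix and version suffix from an arXiv entry id, e.g. (standalone illustration; the id value is made up):

    from urllib.parse import urlparse

    entry_id = "http://arxiv.org/abs/2106.01345v2"
    pid = urlparse(entry_id).path.split('/')[-1].split('v')[0]
    print(pid)  # 2106.01345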
@@ lines 1199-1206 @@
         import arxiv
         from urllib.parse import urlparse
         ids = [p['id'] for p in papers]
+        self.print_fn("\ndownloading below selected papers: ")
+        self.print_fn(ids)
         # asert(False)
         papers_filtered = arxiv.Search(id_list=ids).get()
         for p in papers_filtered:
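Note: Search.get() comes from the pre-1.0 arxiv.py API. With current releases (assuming arxiv >= 1.4) the equivalent fetch-and-download flow is roughly:

    import arxiv

    search = arxiv.Search(id_list=["2106.01345"])  # illustrative id
    for result in arxiv.Client().results(search):
        result.download_pdf(dirpath="./pdfs")  # directory name is illustrative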
@@ lines 1213-1219 @@
         import arxiv
         from urllib.parse import urlparse
         ids = [p['id'] for p in papers]
+        self.print_fn(ids)
         # asert(False)
         papers_filtered = arxiv.Search(id_list=ids).get()
         for p in papers_filtered:
@@ lines 1242-1249 @@


         cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
+        self.print_fn("\ncitation-network: ")
+        self.print_fn(cites)

         for p in papers:
             p['cites'] = cites[p['id']]
@@ lines 1254-1260 @@
         from scholarly import scholarly
         import operator
         # Retrieve the author's data, fill-in, and print
+        self.print_fn("Searching Author: " + author_query)
         search_result = next(scholarly.search_author(author_query), None)

         if search_result is not None:
@@ lines 1274-1280 @@
                 'url_picture': author['url_picture'],
             }
         else:
+            self.print_fn("author not found")
             author_stats = {
                 'name': author_query,
                 'affiliation': "",
@@ lines 1288-1294 @@
                 'url_picture': "",
             }

+        # self.print_fn(author_stats)
         return author_stats

     def author_stats(self, papers):
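For context, scholarly.search_author returns a lazy generator of partial author records, which is why the code takes next(..., None); the full profile (including fields like url_picture read above) is populated on demand. A minimal lookup sketch (the query string is illustrative):

    from scholarly import scholarly

    result = next(scholarly.search_author("Yoshua Bengio"), None)
    if result is not None:
        author = scholarly.fill(result)  # fetch the complete profile
        print(author["name"], author.get("citedby", 0))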
@@ lines 1326-1334 @@
         start_positions = torch.tensor([1])
         end_positions = torch.tensor([3])
         outputs = self.qamodel(**inputs, start_positions=start_positions, end_positions=end_positions)
+        self.print_fn("context: " + text)
+        self.print_fn("question: " + question)
+        self.print_fn("outputs: " + str(outputs))
         return outputs

     def zip_outputs(self, dump_dir, query):
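If the caller wants the predicted answer text rather than the raw model outputs logged above, the standard decode step for an extractive QA head looks like this (self-contained sketch; the model name is an assumption, not necessarily the one this class loads):

    import torch
    from transformers import AutoModelForQuestionAnswering, AutoTokenizer

    name = "distilbert-base-cased-distilled-squad"
    tok = AutoTokenizer.from_pretrained(name)
    model = AutoModelForQuestionAnswering.from_pretrained(name)

    inputs = tok("What is generated?", "A survey is generated from research papers.",
                 return_tensors="pt")
    with torch.no_grad():
        out = model(**inputs)
    start = int(out.start_logits.argmax())
    end = int(out.end_logits.argmax()) + 1
    print(tok.decode(inputs["input_ids"][0][start:end]))  # e.g. "a survey"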
@@ lines 1354-1363 @@
         if not num_papers:
             num_papers = self.DEFAULTS['num_papers']
         # arxiv api relevance search and data preparation
+        self.print_fn("\nsearching arXiv for top 100 papers.. ")
         results, searched_papers = self.search(query, max_search=max_search)
         joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
+        self.print_fn("\nfound " + str(len(searched_papers)) + " papers")

         # paper selection by scibert vector embedding relevance scores
         # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
@@ lines 1370-1392 @@

         joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')

+        self.print_fn("\nStandardizing known section headings per paper.. ")
         papers_standardized = self.standardize_headings(papers_highlighted)
         joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')

+        self.print_fn("\nBuilding paper-wise corpus.. ")
         corpus = self.build_corpus(papers_highlighted, searched_papers)
         joblib.dump(corpus, self.dump_dir + 'corpus.dmp')

+        self.print_fn("\nBuilding section-wise corpus.. ")
         corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
         joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')

+        self.print_fn("\nBuilding basic research highlights.. ")
         research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
         joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')

+        self.print_fn("\nReducing corpus to lines.. ")
         corpus_lines = self.get_corpus_lines(corpus)
         joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')

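Since every stage above is checkpointed with joblib right after it completes, a crashed run can be resumed from the last dump instead of recomputing everything, e.g. (sketch; dump_dir stands for the instance's dump directory):

    import joblib

    corpus = joblib.load(dump_dir + 'corpus.dmp')  # pick up a finished stage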
@@ lines 1402-1468 @@
         '''

         '''
+        self.print_fn("papers_highlighted types:"+ str(np.unique([str(type(p['sections'][0]['highlights'])) for p in papers_highlighted])))
+        self.print_fn("papers_highlighted example:")
+        self.print_fn(random.sample(list(papers_highlighted), 1)[0]['sections'][0]['highlights'])
+        self.print_fn("corpus types:"+ str(np.unique([str(type(txt)) for k,txt in corpus.items()])))
+        self.print_fn("corpus example:")
+        self.print_fn(random.sample(list(corpus.items()), 1)[0])
+        self.print_fn("corpus_lines types:"+ str(np.unique([str(type(txt)) for txt in corpus_lines])))
+        self.print_fn("corpus_lines example:")
+        self.print_fn(random.sample(list(corpus_lines), 1)[0])
+        self.print_fn("corpus_sectionwise types:"+ str(np.unique([str(type(txt)) for k,txt in corpus_sectionwise.items()])))
+        self.print_fn("corpus_sectionwise example:")
+        self.print_fn(random.sample(list(corpus_sectionwise.items()), 1)[0])
+        self.print_fn("research_blocks types:"+ str(np.unique([str(type(txt)) for k,txt in research_blocks.items()])))
+        self.print_fn("research_blocks example:")
+        self.print_fn(random.sample(list(research_blocks.items()), 1)[0])
         '''
+        # self.print_fn("corpus types:"+ str(np.unique([type(txt) for k,txt in corpus.items()])))

+        self.print_fn("\nBuilding abstract.. ")
         abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
         joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
         '''
+        self.print_fn("abstract_block type:"+ str(type(abstract_block)))
+        self.print_fn("abstract_block:")
+        self.print_fn(abstract_block)
         '''

+        self.print_fn("\nBuilding introduction.. ")
         intro_block = self.get_intro(corpus_sectionwise, research_blocks)
         joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
         '''
+        self.print_fn("intro_block type:"+ str(type(intro_block)))
+        self.print_fn("intro_block:")
+        self.print_fn(intro_block)
         '''
+        self.print_fn("\nBuilding custom sections.. ")
         clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
         joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
         joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')

         '''
+        self.print_fn("clusters extracted")
+        self.print_fn("clustered_sentences types:"+ str(np.unique([str(type(txt)) for k,txt in clustered_sentences.items()])))
+        self.print_fn("clustered_sentences example:")
+        self.print_fn(random.sample(list(clustered_sections.items()), 1)[0])
+        self.print_fn("clustered_sections types:"+ str(np.unique([str(type(txt)) for k,txt in clustered_sections.items()])))
+        self.print_fn("clustered_sections example:")
+        self.print_fn(random.sample(list(clustered_sections.items()), 1)[0])
         '''
         clustered_sections['abstract'] = abstract_block
         clustered_sections['introduction'] = intro_block
         joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')

+        self.print_fn("\nBuilding conclusion.. ")
         conclusion_block = self.get_conclusion(clustered_sections)
         joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
         clustered_sections['conclusion'] = conclusion_block
         '''
+        self.print_fn("conclusion_block type:"+ str(type(conclusion_block)))
+        self.print_fn("conclusion_block:")
+        self.print_fn(conclusion_block)
         '''

         survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
@@ lines 1472-1478 @@
         shutil.copy(self.dump_dir + survey_file, survey_file)
         assert (os.path.exists(survey_file))
         output_zip = self.zip_outputs(self.dump_dir, query)
+        self.print_fn("\nSurvey complete.. \nSurvey file path :" + os.path.abspath(
+            survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))

         return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)
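End to end, survey() hands back two absolute paths: the zipped dump directory and the generated survey text file. A typical call site (sketch; surveyor is an initialized Surveyor instance and the keyword string is illustrative):

    zip_path, survey_path = surveyor.survey("vision transformers",
                                            max_search=20, num_papers=8)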