sidphbot committed on
Commit
8b54034
1 Parent(s): 32196eb

temp dir for each survey

Browse files
Files changed (2) hide show
  1. app.py +26 -19
  2. src/Surveyor.py +36 -41
app.py CHANGED
@@ -9,25 +9,32 @@ from pathlib import Path
9
  from src.Surveyor import Surveyor
10
 
11
 
12
-
13
- def get_surveyor_instance(td, _print_fn, _survey_print_fn):
14
  with st.spinner('Loading The-Researcher ...'):
15
- survey_root = Path(td)
16
- dir_args = {f'{dname}_dir': survey_root / dname for dname in ['pdf', 'txt', 'img', 'tab', 'dump']}
17
- for d in dir_args.values():
18
- d.mkdir(exist_ok=True, parents=True)
19
- dir_args = {k: str(v.resolve()) for k, v in dir_args.items()}
20
- return Surveyor(print_fn=_print_fn, survey_print_fn=_survey_print_fn, high_gpu=True, **dir_args)
21
-
22
-
23
- def run_survey(_print_fn, _survey_print_fn, download_placeholder, research_keywords=None, arxiv_ids=None, max_search=None, num_papers=None):
24
- with tempfile.TemporaryDirectory() as td:
25
- zip_file_name, survey_file_name = get_surveyor_instance(td, _print_fn, _survey_print_fn).survey(research_keywords,
26
- arxiv_ids,
27
- max_search=max_search,
28
- num_papers=num_papers
29
- )
30
- show_survey_download(zip_file_name, survey_file_name, download_placeholder)
 
 
 
 
 
 
 
31
 
32
 
33
  def show_survey_download(zip_file_name, survey_file_name, download_placeholder):
@@ -81,7 +88,7 @@ if __name__ == '__main__':
81
  submit = st.form_submit_button(label="Submit")
82
  st.sidebar.write('#### execution log:')
83
 
84
- run_kwargs = {'_print_fn':st.sidebar.write, '_survey_print_fn':st.write,
85
  'download_placeholder':download_placeholder}
86
  if submit:
87
  if session_data['research_keywords'] != '':
 
9
  from src.Surveyor import Surveyor
10
 
11
 
12
+ @st.experimental_singleton(suppress_st_warning=True)
13
+ def get_surveyor_instance(_print_fn, _survey_print_fn):
14
  with st.spinner('Loading The-Researcher ...'):
15
+ return Surveyor(print_fn=_print_fn, survey_print_fn=_survey_print_fn, high_gpu=True)
16
+
17
+
18
+ def run_survey(surveyor, download_placeholder, research_keywords=None, arxiv_ids=None, max_search=None, num_papers=None):
19
+ import hashlib
20
+ import time
21
+
22
+ hash = hashlib.sha1()
23
+ hash.update(str(time.time()))
24
+ temp_hash = hash.hexdigest()
25
+ survey_root = Path(temp_hash).resolve()
26
+ dir_args = {f'{dname}_dir': survey_root / dname for dname in ['pdf', 'txt', 'img', 'tab', 'dump']}
27
+ for d in dir_args.values():
28
+ d.mkdir(exist_ok=True, parents=True)
29
+ print(survey_root)
30
+ print(dir_args)
31
+ dir_args = {k: str(v.resolve()) for k, v in dir_args.items()}
32
+ zip_file_name, survey_file_name = surveyor.survey(research_keywords,
33
+ arxiv_ids,
34
+ max_search=max_search,
35
+ num_papers=num_papers
36
+ **dir_args)
37
+ show_survey_download(zip_file_name, survey_file_name, download_placeholder)
38
 
39
 
40
  def show_survey_download(zip_file_name, survey_file_name, download_placeholder):
 
88
  submit = st.form_submit_button(label="Submit")
89
  st.sidebar.write('#### execution log:')
90
 
91
+ run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write),
92
  'download_placeholder':download_placeholder}
93
  if submit:
94
  if session_data['research_keywords'] != '':
src/Surveyor.py CHANGED
@@ -30,11 +30,6 @@ class Surveyor:
30
 
31
  def __init__(
32
  self,
33
- pdf_dir=None,
34
- txt_dir=None,
35
- img_dir=None,
36
- tab_dir=None,
37
- dump_dir=None,
38
  models_dir=None,
39
  title_model_name=None,
40
  ex_summ_model_name=None,
@@ -53,11 +48,6 @@ class Surveyor:
53
  Initializes models and directory structure for the surveyor
54
 
55
  Optional Params:
56
- - pdf_dir: String, pdf paper storage directory - defaults to arxiv_data/tarpdfs/
57
- - txt_dir: String, text-converted paper storage directory - defaults to arxiv_data/fulltext/
58
- - img_dir: String, image image storage directory - defaults to arxiv_data/images/
59
- - tab_dir: String, tables storage directory - defaults to arxiv_data/tables/
60
- - dump_dir: String, all_output_dir - defaults to arxiv_dumps/
61
  - models_dir: String, directory to save to huge models
62
  - title_model_name: String, title model name/tag in hugging-face, defaults to `Callidior/bert2bert-base-arxiv-titlegen`
63
  - ex_summ_model_name: String, extractive summary model name/tag in hugging-face, defaults to `allenai/scibert_scivocab_uncased`
@@ -192,41 +182,41 @@ class Surveyor:
192
  self.similarity_nlp = spacy.load(similarity_nlp_name)
193
  self.kw_model = KeyBERT(kw_model_name)
194
 
195
- self.define_structure(pdf_dir=pdf_dir, txt_dir=txt_dir, img_dir=img_dir, tab_dir=tab_dir, dump_dir=dump_dir)
196
 
197
  def define_structure(self, pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None):
198
 
199
  if pdf_dir:
200
- self.pdf_dir = pdf_dir
201
  else:
202
- self.pdf_dir = self.DEFAULTS["pdf_dir"]
203
 
204
  if txt_dir:
205
- self.txt_dir = txt_dir
206
  else:
207
- self.txt_dir = self.DEFAULTS["txt_dir"]
208
 
209
  if img_dir:
210
- self.img_dir = img_dir
211
  else:
212
- self.img_dir = self.DEFAULTS["img_dir"]
213
 
214
  if tab_dir:
215
- self.tab_dir = tab_dir
216
  else:
217
- self.tab_dir = self.DEFAULTS["tab_dir"]
218
 
219
  if dump_dir:
220
- self.dump_dir = dump_dir
221
  else:
222
- self.dump_dir = self.DEFAULTS["dump_dir"]
223
 
224
- dirs = [self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir]
225
  if sum([True for dir in dirs if 'arxiv_data/' in dir]):
226
  base = os.path.dirname("arxiv_data/")
227
  if not os.path.exists(base):
228
  os.mkdir(base)
229
  self.clean_dirs(dirs)
 
230
 
231
  def clean_dirs(self, dirs):
232
  import shutil
@@ -1345,9 +1335,14 @@ class Surveyor:
1345
  zipf = zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED)
1346
  zipdir(dump_dir, zipf)
1347
 
1348
- def survey(self, query=None, id_list=None, max_search=None, num_papers=None, debug=False, weigh_authors=False):
 
1349
  import joblib
1350
  import os, shutil
 
 
 
 
1351
  if not max_search:
1352
  max_search = self.DEFAULTS['max_search']
1353
  if not num_papers:
@@ -1357,39 +1352,39 @@ class Surveyor:
1357
  # arxiv api relevance search and data preparation
1358
  self.print_fn("\n- searching arXiv for top 100 papers.. ")
1359
  results, searched_papers = self.search(query, id_list, max_search=max_search)
1360
- joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
1361
  self.print_fn("\n- found " + str(len(searched_papers)) + " papers")
1362
 
1363
  # paper selection by scibert vector embedding relevance scores
1364
  # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
1365
 
1366
- papers_highlighted, papers_selected, cites = self.pdf_route(self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir,
1367
  searched_papers)
1368
 
1369
  if weigh_authors:
1370
  authors = self.author_stats(papers_highlighted)
1371
 
1372
- joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')
1373
 
1374
  self.print_fn("\n- Standardizing known section headings per paper.. ")
1375
  papers_standardized = self.standardize_headings(papers_highlighted)
1376
- joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')
1377
 
1378
  self.print_fn("\n- Building paper-wise corpus.. ")
1379
  corpus = self.build_corpus(papers_highlighted, searched_papers)
1380
- joblib.dump(corpus, self.dump_dir + 'corpus.dmp')
1381
 
1382
  self.print_fn("\n- Building section-wise corpus.. ")
1383
  corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
1384
- joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')
1385
 
1386
  self.print_fn("\n- Building basic research highlights.. ")
1387
  research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
1388
- joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')
1389
 
1390
  self.print_fn("\n- Reducing corpus to lines.. ")
1391
  corpus_lines = self.get_corpus_lines(corpus)
1392
- joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')
1393
 
1394
  # temp
1395
  # searched_papers = joblib.load(dump_dir + 'papers_metadata.dmp')
@@ -1423,7 +1418,7 @@ class Surveyor:
1423
 
1424
  self.print_fn("\n- Building abstract.. ")
1425
  abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
1426
- joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
1427
  '''
1428
  self.print_fn("abstract_block type:"+ str(type(abstract_block)))
1429
  self.print_fn("abstract_block:")
@@ -1432,7 +1427,7 @@ class Surveyor:
1432
 
1433
  self.print_fn("\n- Building introduction.. ")
1434
  intro_block = self.get_intro(corpus_sectionwise, research_blocks)
1435
- joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
1436
  '''
1437
  self.print_fn("intro_block type:"+ str(type(intro_block)))
1438
  self.print_fn("intro_block:")
@@ -1440,8 +1435,8 @@ class Surveyor:
1440
  '''
1441
  self.print_fn("\n- Building custom sections.. ")
1442
  clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
1443
- joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
1444
- joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')
1445
 
1446
  '''
1447
  self.print_fn("clusters extracted")
@@ -1454,11 +1449,11 @@ class Surveyor:
1454
  '''
1455
  clustered_sections['abstract'] = abstract_block
1456
  clustered_sections['introduction'] = intro_block
1457
- joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')
1458
 
1459
  self.print_fn("\n- Building conclusion.. ")
1460
  conclusion_block = self.get_conclusion(clustered_sections)
1461
- joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
1462
  clustered_sections['conclusion'] = conclusion_block
1463
  '''
1464
  self.print_fn("conclusion_block type:"+ str(type(conclusion_block)))
@@ -1469,18 +1464,18 @@ class Surveyor:
1469
  query = self.generate_title(' '.join([v for v in clustered_sections.values()]))
1470
 
1471
  survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
1472
- survey_file = Path(self.dump_dir).resolve() / survey_file
1473
  self.build_doc(clustered_sections, papers_standardized, query=query, filename=str(survey_file))
1474
 
1475
  self.survey_print_fn("\n-citation-network: ")
1476
  self.survey_print_fn(cites)
1477
 
1478
- shutil.copytree('arxiv_data/', self.dump_dir + '/arxiv_data/')
1479
  assert (os.path.exists(survey_file))
1480
 
1481
  zip_name = 'arxiv_dumps_'+query.replace(' ', '_')+'.zip'
1482
- zip_name = Path(self.dump_dir).parent.resolve() / zip_name
1483
- self.zip_outputs(self.dump_dir, str(zip_name))
1484
  self.print_fn("\n- Survey complete.. \nSurvey file path :" + str(survey_file) +
1485
  "\nAll outputs zip path :" + str(zip_name))
1486
 
 
30
 
31
  def __init__(
32
  self,
 
 
 
 
 
33
  models_dir=None,
34
  title_model_name=None,
35
  ex_summ_model_name=None,
 
48
  Initializes models and directory structure for the surveyor
49
 
50
  Optional Params:
 
 
 
 
 
51
  - models_dir: String, directory to save to huge models
52
  - title_model_name: String, title model name/tag in hugging-face, defaults to `Callidior/bert2bert-base-arxiv-titlegen`
53
  - ex_summ_model_name: String, extractive summary model name/tag in hugging-face, defaults to `allenai/scibert_scivocab_uncased`
 
182
  self.similarity_nlp = spacy.load(similarity_nlp_name)
183
  self.kw_model = KeyBERT(kw_model_name)
184
 
 
185
 
186
  def define_structure(self, pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None):
187
 
188
  if pdf_dir:
189
+ survey_pdf_dir = pdf_dir
190
  else:
191
+ survey_pdf_dir = self.DEFAULTS["pdf_dir"]
192
 
193
  if txt_dir:
194
+ survey_txt_dir = txt_dir
195
  else:
196
+ survey_txt_dir = self.DEFAULTS["txt_dir"]
197
 
198
  if img_dir:
199
+ survey_img_dir = img_dir
200
  else:
201
+ survey_img_dir = self.DEFAULTS["img_dir"]
202
 
203
  if tab_dir:
204
+ survey_tab_dir = tab_dir
205
  else:
206
+ survey_tab_dir = self.DEFAULTS["tab_dir"]
207
 
208
  if dump_dir:
209
+ survey_dump_dir = dump_dir
210
  else:
211
+ survey_dump_dir = self.DEFAULTS["dump_dir"]
212
 
213
+ dirs = [survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir]
214
  if sum([True for dir in dirs if 'arxiv_data/' in dir]):
215
  base = os.path.dirname("arxiv_data/")
216
  if not os.path.exists(base):
217
  os.mkdir(base)
218
  self.clean_dirs(dirs)
219
+ return dirs
220
 
221
  def clean_dirs(self, dirs):
222
  import shutil
 
1335
  zipf = zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED)
1336
  zipdir(dump_dir, zipf)
1337
 
1338
+ def survey(self, query=None, id_list=None, max_search=None, num_papers=None, debug=False, weigh_authors=False,
1339
+ pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None):
1340
  import joblib
1341
  import os, shutil
1342
+
1343
+ dirs = self.define_structure(pdf_dir=pdf_dir, txt_dir=txt_dir, img_dir=img_dir, tab_dir=tab_dir, dump_dir=dump_dir)
1344
+ [survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir] = dirs
1345
+
1346
  if not max_search:
1347
  max_search = self.DEFAULTS['max_search']
1348
  if not num_papers:
 
1352
  # arxiv api relevance search and data preparation
1353
  self.print_fn("\n- searching arXiv for top 100 papers.. ")
1354
  results, searched_papers = self.search(query, id_list, max_search=max_search)
1355
+ joblib.dump(searched_papers, survey_dump_dir + 'papers_metadata.dmp')
1356
  self.print_fn("\n- found " + str(len(searched_papers)) + " papers")
1357
 
1358
  # paper selection by scibert vector embedding relevance scores
1359
  # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
1360
 
1361
+ papers_highlighted, papers_selected, cites = self.pdf_route(survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir,
1362
  searched_papers)
1363
 
1364
  if weigh_authors:
1365
  authors = self.author_stats(papers_highlighted)
1366
 
1367
+ joblib.dump(papers_highlighted, survey_dump_dir + 'papers_highlighted.dmp')
1368
 
1369
  self.print_fn("\n- Standardizing known section headings per paper.. ")
1370
  papers_standardized = self.standardize_headings(papers_highlighted)
1371
+ joblib.dump(papers_standardized, survey_dump_dir + 'papers_standardized.dmp')
1372
 
1373
  self.print_fn("\n- Building paper-wise corpus.. ")
1374
  corpus = self.build_corpus(papers_highlighted, searched_papers)
1375
+ joblib.dump(corpus, survey_dump_dir + 'corpus.dmp')
1376
 
1377
  self.print_fn("\n- Building section-wise corpus.. ")
1378
  corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
1379
+ joblib.dump(corpus_sectionwise, survey_dump_dir + 'corpus_sectionwise.dmp')
1380
 
1381
  self.print_fn("\n- Building basic research highlights.. ")
1382
  research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
1383
+ joblib.dump(research_blocks, survey_dump_dir + 'research_blocks.dmp')
1384
 
1385
  self.print_fn("\n- Reducing corpus to lines.. ")
1386
  corpus_lines = self.get_corpus_lines(corpus)
1387
+ joblib.dump(corpus_lines, survey_dump_dir + 'corpus_lines.dmp')
1388
 
1389
  # temp
1390
  # searched_papers = joblib.load(dump_dir + 'papers_metadata.dmp')
 
1418
 
1419
  self.print_fn("\n- Building abstract.. ")
1420
  abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
1421
+ joblib.dump(abstract_block, survey_dump_dir + 'abstract_block.dmp')
1422
  '''
1423
  self.print_fn("abstract_block type:"+ str(type(abstract_block)))
1424
  self.print_fn("abstract_block:")
 
1427
 
1428
  self.print_fn("\n- Building introduction.. ")
1429
  intro_block = self.get_intro(corpus_sectionwise, research_blocks)
1430
+ joblib.dump(intro_block, survey_dump_dir + 'intro_block.dmp')
1431
  '''
1432
  self.print_fn("intro_block type:"+ str(type(intro_block)))
1433
  self.print_fn("intro_block:")
 
1435
  '''
1436
  self.print_fn("\n- Building custom sections.. ")
1437
  clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
1438
+ joblib.dump(clustered_sections, survey_dump_dir + 'clustered_sections.dmp')
1439
+ joblib.dump(clustered_sentences, survey_dump_dir + 'clustered_sentences.dmp')
1440
 
1441
  '''
1442
  self.print_fn("clusters extracted")
 
1449
  '''
1450
  clustered_sections['abstract'] = abstract_block
1451
  clustered_sections['introduction'] = intro_block
1452
+ joblib.dump(clustered_sections, survey_dump_dir + 'research_sections.dmp')
1453
 
1454
  self.print_fn("\n- Building conclusion.. ")
1455
  conclusion_block = self.get_conclusion(clustered_sections)
1456
+ joblib.dump(conclusion_block, survey_dump_dir + 'conclusion_block.dmp')
1457
  clustered_sections['conclusion'] = conclusion_block
1458
  '''
1459
  self.print_fn("conclusion_block type:"+ str(type(conclusion_block)))
 
1464
  query = self.generate_title(' '.join([v for v in clustered_sections.values()]))
1465
 
1466
  survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
1467
+ survey_file = Path(survey_dump_dir).resolve() / survey_file
1468
  self.build_doc(clustered_sections, papers_standardized, query=query, filename=str(survey_file))
1469
 
1470
  self.survey_print_fn("\n-citation-network: ")
1471
  self.survey_print_fn(cites)
1472
 
1473
+ shutil.copytree('arxiv_data/', survey_dump_dir + '/arxiv_data/')
1474
  assert (os.path.exists(survey_file))
1475
 
1476
  zip_name = 'arxiv_dumps_'+query.replace(' ', '_')+'.zip'
1477
+ zip_name = Path(survey_dump_dir).parent.resolve() / zip_name
1478
+ self.zip_outputs(survey_dump_dir, str(zip_name))
1479
  self.print_fn("\n- Survey complete.. \nSurvey file path :" + str(survey_file) +
1480
  "\nAll outputs zip path :" + str(zip_name))
1481