sidphbot committed
Commit f310b8b
1 Parent(s): 3eee270

arxiv id list support

Files changed (5):
1. app.py +52 -23
2. arxiv_public_data/config.py +1 -1
3. requirements.txt +2 -0
4. src/Surveyor.py +4 -69
5. survey.py +6 -3
app.py CHANGED
@@ -1,43 +1,70 @@
 import streamlit as st
 import pandas as pd
 import numpy as np

 from src.Surveyor import Surveyor
+from streamlit_tags import st_tags_sidebar
+
+
+@st.experimental_singleton
+def get_surveyor_instance(_print_fn, _survey_print_fn):
+    with st.spinner('Loading The-Surveyor ...'):
+        return Surveyor(_print_fn, _survey_print_fn, refresh_models=True)

-def run_survey(surveyor, research_keywords, max_search, num_papers):
+
+def run_survey(surveyor, download_placeholder, research_keywords=None, arxiv_ids=None, max_search=None, num_papers=None):
     zip_file_name, survey_file_name = surveyor.survey(research_keywords,
+                                                      arxiv_ids,
                                                       max_search=max_search,
                                                       num_papers=num_papers
                                                       )
+    show_survey_download(zip_file_name, survey_file_name, download_placeholder)

-    with open(str(zip_file_name), "rb") as file:
-        btn = st.download_button(
-            label="Download extracted topic-clustered-highlights, images and tables as zip",
-            data=file,
-            file_name=str(zip_file_name)
-        )
+
+def show_survey_download(zip_file_name, survey_file_name, download_placeholder):
+    download_placeholder.empty()
+    with download_placeholder.container():
+        with open(str(zip_file_name), "rb") as file:
+            btn = st.download_button(
+                label="Download extracted topic-clustered-highlights, images and tables as zip",
+                data=file,
+                file_name=str(zip_file_name)
+            )

-    with open(str(survey_file_name), "rb") as file:
-        btn = st.download_button(
-            label="Download detailed generated survey file",
-            data=file,
-            file_name=str(survey_file_name)
-        )
-        for line in file.readlines():
-            st.write(line)
+        with open(str(survey_file_name), "rb") as file:
+            btn = st.download_button(
+                label="Download detailed generated survey file",
+                data=file,
+                file_name=str(survey_file_name)
+            )


-def survey_space(surveyor):
+def survey_space(surveyor, download_placeholder):
+
     form = st.sidebar.form(key='survey_form')
-    research_keywords = form.text_input("What would you like to research in today?")
+    research_keywords = form.text_input("What would you like to research in today?", key='research_keywords')
     max_search = form.number_input("num_papers_to_search", help="maximum number of papers to glance through - defaults to 20",
-                                   min_value=1, max_value=60, value=10, step=1, key='max_search')
+                                   min_value=1, max_value=50, value=10, step=1, key='max_search')
     num_papers = form.number_input("num_papers_to_select", help="maximum number of papers to select and analyse - defaults to 8",
-                                   min_value=1, max_value=25, value=2, step=1, key='num_papers')
+                                   min_value=1, max_value=8, value=2, step=1, key='num_papers')
     submit = form.form_submit_button('Submit')

+    st.sidebar.write('or')
+
+    arxiv_ids = st_tags_sidebar(
+        label='# Enter arXiv IDs:',
+        value=[],
+        text='Press enter to add more',
+        maxtags=6,
+        key='arxiv_ids')
+
     if submit:
-        run_survey(surveyor, research_keywords, max_search, num_papers)
+        run_survey(surveyor, download_placeholder, research_keywords,
+                   max_search=max_search, num_papers=num_papers)
+    elif len(arxiv_ids):
+        run_survey(surveyor, download_placeholder, arxiv_ids=arxiv_ids)
+
+


 if __name__ == '__main__':
@@ -45,6 +72,6 @@ if __name__ == '__main__':
     std_col, survey_col = st.columns(2)
     std_col.header('execution log:')
     survey_col.header('Generated_survey:')
-    with st.spinner('Loading The-Surveyor ...'):
-        surveyor_obj = Surveyor(print_fn=std_col.write, survey_print_fn=survey_col.write, refresh_models=True)
-        survey_space(surveyor_obj)
+    download_placeholder = survey_col.empty()
+    surveyor_obj = get_surveyor_instance(_print_fn=std_col.write, _survey_print_fn=survey_col.write)
+    survey_space(surveyor_obj, download_placeholder)
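Note on the caching pattern above: `@st.experimental_singleton` (renamed `st.cache_resource` in later Streamlit releases) builds the heavyweight `Surveyor` once per server process and reuses it across script reruns, and the leading underscores in `_print_fn` / `_survey_print_fn` tell Streamlit to skip hashing those unhashable callables. A minimal standalone sketch of the same pattern — `ExpensiveModel` is a hypothetical stand-in, not part of this repo:

```python
import streamlit as st


class ExpensiveModel:
    """Hypothetical stand-in for a heavyweight object such as Surveyor."""
    def __init__(self):
        # imagine multi-GB model downloads happening here
        self.ready = True


@st.experimental_singleton
def get_model(_log_fn):
    # runs once per process; later reruns reuse the same instance, and the
    # underscore prefix excludes the callable from argument hashing
    _log_fn('loading model ...')
    return ExpensiveModel()


model = get_model(_log_fn=st.write)
st.write('model ready:', model.ready)
```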
arxiv_public_data/config.py CHANGED
@@ -9,7 +9,7 @@ logging.basicConfig(
 baselog = logging.getLogger('arxivdata')
 logger = baselog.getChild('config')

-DEFAULT_PATH = os.path.join(os.path.abspath('../'), 'arxiv-data')
+DEFAULT_PATH = os.path.join(os.path.abspath('.'), 'arxiv-data')
 JSONFILE = './config.json'
 KEY = 'ARXIV_DATA'
 
requirements.txt CHANGED
@@ -3,6 +3,7 @@ arxiv
 arxiv2bib
 boto3==1.9.118
 bert-extractive-summarizer
+fitz==0.0.1.dev2
 joblib
 keybert
 numpy
@@ -22,6 +23,7 @@ scispacy
 https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz
 https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz
 streamlit
+streamlit-tags
 summarizer
 tabula
 tabula_py
src/Surveyor.py CHANGED
@@ -1355,16 +1355,18 @@ class Surveyor:
         zipdir(dump_dir, zipf)
         return zip_name

-    def survey(self, query, max_search=None, num_papers=None, debug=False, weigh_authors=False):
+    def survey(self, query=None, id_list=None, max_search=None, num_papers=None, debug=False, weigh_authors=False):
         import joblib
         import os, shutil
         if not max_search:
             max_search = self.DEFAULTS['max_search']
         if not num_papers:
             num_papers = self.DEFAULTS['num_papers']
+        if (query is None) and (id_list is None):
+            raise ValueError('please provide a base to survey on: list of arxiv IDs or a few research keywords')
         # arxiv api relevance search and data preparation
         self.print_fn("\n-searching arXiv for top 100 papers.. ")
-        results, searched_papers = self.search(query, max_search=max_search)
+        results, searched_papers = self.search(query, id_list, max_search=max_search)
         joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
         self.print_fn("\n-found " + str(len(searched_papers)) + " papers")

@@ -1485,70 +1487,3 @@ class Surveyor:
             survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))

         return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)
-
-
-if __name__ == '__main__':
-    import argparse
-
-    parser = argparse.ArgumentParser(description='Generate a survey just from a query !!')
-    parser.add_argument('query', metavar='query_string', type=str,
-                        help='your research query/keywords')
-    parser.add_argument('--max_search', metavar='max_metadata_papers', type=int, default=None,
-                        help='maximum number of papers to gaze at - defaults to 100')
-    parser.add_argument('--num_papers', metavar='max_num_papers', type=int, default=None,
-                        help='maximum number of papers to download and analyse - defaults to 25')
-    parser.add_argument('--pdf_dir', metavar='pdf_dir', type=str, default=None,
-                        help='pdf paper storage directory - defaults to arxiv_data/tarpdfs/')
-    parser.add_argument('--txt_dir', metavar='txt_dir', type=str, default=None,
-                        help='text-converted paper storage directory - defaults to arxiv_data/fulltext/')
-    parser.add_argument('--img_dir', metavar='img_dir', type=str, default=None,
-                        help='image storage directory - defaults to arxiv_data/images/')
-    parser.add_argument('--tab_dir', metavar='tab_dir', type=str, default=None,
-                        help='tables storage directory - defaults to arxiv_data/tables/')
-    parser.add_argument('--dump_dir', metavar='dump_dir', type=str, default=None,
-                        help='all_output_dir - defaults to arxiv_dumps/')
-    parser.add_argument('--models_dir', metavar='save_models_dir', type=str, default=None,
-                        help='directory to save models (> 5GB) - defaults to saved_models/')
-    parser.add_argument('--title_model_name', metavar='title_model_name', type=str, default=None,
-                        help='title model name/tag in hugging-face, defaults to \'Callidior/bert2bert-base-arxiv-titlegen\'')
-    parser.add_argument('--ex_summ_model_name', metavar='extractive_summ_model_name', type=str, default=None,
-                        help='extractive summary model name/tag in hugging-face, defaults to \'allenai/scibert_scivocab_uncased\'')
-    parser.add_argument('--ledmodel_name', metavar='ledmodel_name', type=str, default=None,
-                        help='led model (for abstractive summary) name/tag in hugging-face, defaults to \'allenai/led-large-16384-arxiv\'')
-    parser.add_argument('--embedder_name', metavar='sentence_embedder_name', type=str, default=None,
-                        help='sentence embedder name/tag in hugging-face, defaults to \'paraphrase-MiniLM-L6-v2\'')
-    parser.add_argument('--nlp_name', metavar='spacy_model_name', type=str, default=None,
-                        help='spacy model name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to \'en_core_sci_scibert\'')
-    parser.add_argument('--similarity_nlp_name', metavar='similarity_nlp_name', type=str, default=None,
-                        help='spacy downstream model (for similarity) name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to \'en_core_sci_lg\'')
-    parser.add_argument('--kw_model_name', metavar='kw_model_name', type=str, default=None,
-                        help='keyword extraction model name/tag in hugging-face, defaults to \'distilbert-base-nli-mean-tokens\'')
-    parser.add_argument('--refresh_models', metavar='refresh_models', type=str, default=None,
-                        help='refresh model downloads with given names (needs at least one model name param above), defaults to False')
-    parser.add_argument('--high_gpu', metavar='high_gpu', type=str, default=None,
-                        help='high GPU usage permitted, defaults to False')
-
-    args = parser.parse_args()
-
-    surveyor = Surveyor(
-        pdf_dir=args.pdf_dir,
-        txt_dir=args.txt_dir,
-        img_dir=args.img_dir,
-        tab_dir=args.tab_dir,
-        dump_dir=args.dump_dir,
-        models_dir=args.models_dir,
-        title_model_name=args.title_model_name,
-        ex_summ_model_name=args.ex_summ_model_name,
-        ledmodel_name=args.ledmodel_name,
-        embedder_name=args.embedder_name,
-        nlp_name=args.nlp_name,
-        similarity_nlp_name=args.similarity_nlp_name,
-        kw_model_name=args.kw_model_name,
-        refresh_models=args.refresh_models,
-        high_gpu=args.high_gpu
-
-    )
-
-    surveyor.survey(args.query, max_search=args.max_search, num_papers=args.num_papers,
-                    debug=False, weigh_authors=False)
-
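With the reworked signature, `Surveyor.survey()` accepts research keywords, a curated arXiv ID list, or both, and raises a `ValueError` when given neither; the ID list is simply forwarded to `self.search`. A hedged usage sketch — the keywords and IDs below are illustrative, and it assumes a default-constructed Surveyor, as survey.py builds when no flags are given:

```python
from src.Surveyor import Surveyor

surveyor = Surveyor()  # all defaults, as in survey.py

# keyword-driven survey (existing behaviour)
zip_path, survey_path = surveyor.survey('vision transformers',
                                        max_search=20, num_papers=8)

# ID-driven survey (new in this commit); IDs are illustrative examples
zip_path, survey_path = surveyor.survey(id_list=['1706.03762', '2010.11929'])

# surveyor.survey() with neither argument raises the new ValueError
```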
survey.py CHANGED
@@ -9,8 +9,11 @@ if __name__ == '__main__':
     import argparse

     parser = argparse.ArgumentParser(description='Generate a survey just from a query !!')
-    parser.add_argument('query', metavar='query_string', type=str,
-                        help='your research query/keywords')
+
+    data = parser.add_mutually_exclusive_group(required=True)
+    data.add_argument('--query', type=str, help='your research query/keywords')
+    data.add_argument('--arxiv_ids', nargs='+', help='arxiv ids for your curated set of papers')
+
     parser.add_argument('--max_search', metavar='max_metadata_papers', type=int, default=None,
                         help='maximum number of papers to gaze at - defaults to 100')
     parser.add_argument('--num_papers', metavar='max_num_papers', type=int, default=None,
@@ -67,6 +70,6 @@ if __name__ == '__main__':

     )

-    surveyor.survey(args.query, max_search=args.max_search, num_papers=args.num_papers,
+    surveyor.survey(query=args.query, id_list=args.arxiv_ids, max_search=args.max_search, num_papers=args.num_papers,
                     debug=False, weigh_authors=False)
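The CLI now uses `argparse.add_mutually_exclusive_group(required=True)`, so exactly one of `--query` or `--arxiv_ids` must be supplied and argparse enforces the either/or at parse time. A minimal standalone demonstration of the same pattern:

```python
import argparse

parser = argparse.ArgumentParser()
data = parser.add_mutually_exclusive_group(required=True)
data.add_argument('--query', type=str)
data.add_argument('--arxiv_ids', nargs='+')

print(parser.parse_args(['--query', 'transformers']))
# -> Namespace(arxiv_ids=None, query='transformers')

print(parser.parse_args(['--arxiv_ids', '1706.03762', '2010.11929']))
# -> Namespace(arxiv_ids=['1706.03762', '2010.11929'], query=None)

# supplying both flags, or neither, makes argparse exit with a usage error
```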