sidphbot committed on
Commit
92027c7
1 Parent(s): d38185d

streamlit init

Browse files
app.py CHANGED
@@ -27,7 +27,7 @@ def run_survey(surveyor, research_keywords, max_search, num_papers):
27
  st.write(line)
28
 
29
 
30
- def survey_space():
31
 
32
  st.title('Automated Survey generation from research keywords - Auto-Research V0.1')
33
 
@@ -41,12 +41,10 @@ def survey_space():
41
 
42
  if submit:
43
  st.write("hello")
44
- if surveyor_obj is None:
45
- surveyor_obj = Surveyor()
46
- run_survey(surveyor_obj, research_keywords, max_search, num_papers)
47
 
48
 
49
  if __name__ == '__main__':
50
- global surveyor_obj
51
- surveyor_obj = None
52
- survey_space()
 
27
  st.write(line)
28
 
29
 
30
+ def survey_space(surveyor):
31
 
32
  st.title('Automated Survey generation from research keywords - Auto-Research V0.1')
33
 
 
41
 
42
  if submit:
43
  st.write("hello")
44
+ run_survey(surveyor, research_keywords, max_search, num_papers)
 
 
45
 
46
 
47
  if __name__ == '__main__':
48
+ global surveyor
49
+ surveyor_obj = Surveyor()
50
+ survey_space(surveyor_obj)
arxiv_public_data/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (148 Bytes)
 
arxiv_public_data/__pycache__/config.cpython-310.pyc DELETED
Binary file (1.44 kB)
 
arxiv_public_data/__pycache__/fixunicode.cpython-310.pyc DELETED
Binary file (2.46 kB)
 
arxiv_public_data/__pycache__/fulltext.cpython-310.pyc DELETED
Binary file (8.32 kB)
 
arxiv_public_data/__pycache__/internal_citations.cpython-310.pyc DELETED
Binary file (4.27 kB)
 
arxiv_public_data/__pycache__/pdfstamp.cpython-310.pyc DELETED
Binary file (1.73 kB)
 
arxiv_public_data/__pycache__/regex_arxiv.cpython-310.pyc DELETED
Binary file (4.4 kB)
 
arxiv_public_data/config.py CHANGED
@@ -9,7 +9,7 @@ logging.basicConfig(
9
  baselog = logging.getLogger('arxivdata')
10
  logger = baselog.getChild('config')
11
 
12
- DEFAULT_PATH = os.path.join(os.path.abspath('/'), 'arxiv-data')
13
  JSONFILE = './config.json'
14
  KEY = 'ARXIV_DATA'
15
 
 
9
  baselog = logging.getLogger('arxivdata')
10
  logger = baselog.getChild('config')
11
 
12
+ DEFAULT_PATH = os.path.join(os.path.abspath('../'), 'arxiv-data')
13
  JSONFILE = './config.json'
14
  KEY = 'ARXIV_DATA'
15
 
pyproject.toml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = [
3
+ "setuptools>=42",
4
+ "wheel"
5
+ ]
6
+ build-backend = "setuptools.build_meta"
src/Surveyor.py CHANGED
@@ -16,7 +16,7 @@ except:
16
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, AutoModel, LEDTokenizer, \
17
  LEDForConditionalGeneration
18
 
19
- from src.defaults import DEFAULTS
20
 
21
 
22
  class Surveyor:
@@ -70,18 +70,20 @@ class Surveyor:
70
  - num_papers: int maximium number of papers to download and analyse - defaults to 25
71
 
72
  '''
73
- self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
74
  print("\nTorch_device: " + self.torch_device)
75
- if 'cuda' in self.torch_device:
76
- print("\nloading spacy for gpu")
 
77
  spacy.require_gpu()
78
 
 
 
 
 
 
79
  if not kw_model_name:
80
  kw_model_name = DEFAULTS["kw_model_name"]
81
- if not high_gpu:
82
- self.high_gpu = DEFAULTS["high_gpu"]
83
- else:
84
- self.high_gpu = high_gpu
85
  self.num_papers = DEFAULTS['num_papers']
86
  self.max_search = DEFAULTS['max_search']
87
  if not models_dir:
@@ -110,8 +112,8 @@ class Surveyor:
110
  if not no_save_models:
111
  self.clean_dirs([models_dir])
112
 
113
- self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
114
- self.title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name).to(self.torch_device)
115
  self.title_model.eval()
116
  if not no_save_models:
117
  self.title_model.save_pretrained(models_dir + "/title_model")
@@ -142,7 +144,7 @@ class Surveyor:
142
  self.embedder.save(models_dir + "/embedder")
143
  else:
144
  print("\nInitializing from previously saved models at" + models_dir)
145
- self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
146
  self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
147
  self.title_model.eval()
148
 
@@ -615,7 +617,11 @@ class Surveyor:
615
  paper_body = ""
616
  for k, v in research_sections.items():
617
  paper_body += v
618
- return self.abstractive_summary(paper_body)
 
 
 
 
619
 
620
  def build_corpus_sectionwise(self, papers):
621
  known = ['abstract', 'introduction', 'conclusion']
 
16
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, AutoModel, LEDTokenizer, \
17
  LEDForConditionalGeneration
18
 
19
+ from src.defaults import DEFAULTS_CPU_COMPAT, DEFAULTS_HIGH_GPU
20
 
21
 
22
  class Surveyor:
 
70
  - num_papers: int maximium number of papers to download and analyse - defaults to 25
71
 
72
  '''
73
+ self.torch_device = 'cpu'
74
  print("\nTorch_device: " + self.torch_device)
75
+ if torch.cuda.is_available():
76
+ print("\nloading defaults for gpu")
77
+ self.torch_device = 'cuda'
78
  spacy.require_gpu()
79
 
80
+ self.high_gpu = high_gpu
81
+ DEFAULTS = DEFAULTS_CPU_COMPAT
82
+ if self.high_gpu:
83
+ DEFAULTS = DEFAULTS_HIGH_GPU
84
+
85
  if not kw_model_name:
86
  kw_model_name = DEFAULTS["kw_model_name"]
 
 
 
 
87
  self.num_papers = DEFAULTS['num_papers']
88
  self.max_search = DEFAULTS['max_search']
89
  if not models_dir:
 
112
  if not no_save_models:
113
  self.clean_dirs([models_dir])
114
 
115
+ self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name, trust_remote_code=True)
116
+ self.title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name, trust_remote_code=True).to(self.torch_device)
117
  self.title_model.eval()
118
  if not no_save_models:
119
  self.title_model.save_pretrained(models_dir + "/title_model")
 
144
  self.embedder.save(models_dir + "/embedder")
145
  else:
146
  print("\nInitializing from previously saved models at" + models_dir)
147
+ self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name).to(self.torch_device)
148
  self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
149
  self.title_model.eval()
150
 
 
617
  paper_body = ""
618
  for k, v in research_sections.items():
619
  paper_body += v
620
+
621
+ try:
622
+ return self.abstractive_summary(paper_body)
623
+ except:
624
+ return self.abstractive_summary(self.extractive_summary(paper_body))
625
 
626
  def build_corpus_sectionwise(self, papers):
627
  known = ['abstract', 'introduction', 'conclusion']
src/__pycache__/Surveyor.cpython-310.pyc DELETED
Binary file (47.8 kB)
 
src/__pycache__/defaults.cpython-310.pyc DELETED
Binary file (835 Bytes)
 
src/defaults.py CHANGED
@@ -1,5 +1,5 @@
1
  # defaults for arxiv
2
- DEFAULTS = {
3
  "max_search": 100,
4
  "num_papers": 20,
5
  "high_gpu": False,
@@ -16,5 +16,23 @@ DEFAULTS = {
16
  "nlp_name": "en_core_sci_scibert",
17
  "similarity_nlp_name": "en_core_sci_lg",
18
  "kw_model_name": "distilbert-base-nli-mean-tokens",
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  }
 
1
  # defaults for arxiv
2
+ DEFAULTS_HIGH_GPU = {
3
  "max_search": 100,
4
  "num_papers": 20,
5
  "high_gpu": False,
 
16
  "nlp_name": "en_core_sci_scibert",
17
  "similarity_nlp_name": "en_core_sci_lg",
18
  "kw_model_name": "distilbert-base-nli-mean-tokens",
19
+ }
20
 
21
+ DEFAULTS_CPU_COMPAT = {
22
+ "max_search": 100,
23
+ "num_papers": 20,
24
+ "high_gpu": False,
25
+ "pdf_dir": "arxiv_data/tarpdfs/",
26
+ "txt_dir": "arxiv_data/fulltext/",
27
+ "img_dir": "arxiv_data/images/",
28
+ "tab_dir": "arxiv_data/tables/",
29
+ "dump_dir": "arxiv_dumps/",
30
+ "models_dir": "saved_models/",
31
+ "title_model_name": "ccdv/lsg-bart-base-4096-arxiv",
32
+ "ex_summ_model_name": "allenai/scibert_scivocab_uncased",
33
+ "ledmodel_name": "bhuvaneswari/t5-small-text_summarization",
34
+ "embedder_name": "paraphrase-MiniLM-L6-v2",
35
+ "nlp_name": "en_core_sci_scibert",
36
+ "similarity_nlp_name": "en_core_sci_lg",
37
+ "kw_model_name": "distilbert-base-nli-mean-tokens",
38
  }