streamlit init

Files changed:
- app.py +5 -7
- arxiv_public_data/__pycache__/__init__.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/config.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/fixunicode.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/fulltext.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/internal_citations.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/pdfstamp.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/regex_arxiv.cpython-310.pyc +0 -0
- arxiv_public_data/config.py +1 -1
- pyproject.toml +6 -0
- src/Surveyor.py +18 -12
- src/__pycache__/Surveyor.cpython-310.pyc +0 -0
- src/__pycache__/defaults.cpython-310.pyc +0 -0
- src/defaults.py +19 -1
app.py
CHANGED

@@ -27,7 +27,7 @@ def run_survey(surveyor, research_keywords, max_search, num_papers):
         st.write(line)


-def survey_space():
+def survey_space(surveyor):

     st.title('Automated Survey generation from research keywords - Auto-Research V0.1')

@@ -41,12 +41,10 @@ def survey_space():

     if submit:
         st.write("hello")
-
-        surveyor_obj = Surveyor()
-        run_survey(surveyor_obj, research_keywords, max_search, num_papers)
+        run_survey(surveyor, research_keywords, max_search, num_papers)


 if __name__ == '__main__':
-    global
-    surveyor_obj =
-    survey_space()
+    global surveyor
+    surveyor_obj = Surveyor()
+    survey_space(surveyor_obj)
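The net effect of the app.py change is that the Surveyor instance is now built once in the __main__ block and passed into survey_space, rather than being constructed inside the submit branch. A minimal, self-contained sketch of that pattern (the widget names and the FakeSurveyor stand-in are illustrative, not part of the repo):

import streamlit as st


class FakeSurveyor:
    """Stand-in for src.Surveyor.Surveyor, which loads several large models."""
    def survey(self, keywords):
        yield "survey for: " + keywords


def run_survey(surveyor, research_keywords):
    # stream the generated survey lines to the page, as app.py does
    for line in surveyor.survey(research_keywords):
        st.write(line)


def survey_space(surveyor):
    st.title('Automated Survey generation from research keywords - Auto-Research V0.1')
    research_keywords = st.text_input('Research keywords')  # illustrative widget
    if st.button('Submit'):
        run_survey(surveyor, research_keywords)


if __name__ == '__main__':
    surveyor_obj = FakeSurveyor()  # built up front, not inside the submit branch
    survey_space(surveyor_obj)     # and threaded through as an argument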
arxiv_public_data/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (148 Bytes)

arxiv_public_data/__pycache__/config.cpython-310.pyc
DELETED
Binary file (1.44 kB)

arxiv_public_data/__pycache__/fixunicode.cpython-310.pyc
DELETED
Binary file (2.46 kB)

arxiv_public_data/__pycache__/fulltext.cpython-310.pyc
DELETED
Binary file (8.32 kB)

arxiv_public_data/__pycache__/internal_citations.cpython-310.pyc
DELETED
Binary file (4.27 kB)

arxiv_public_data/__pycache__/pdfstamp.cpython-310.pyc
DELETED
Binary file (1.73 kB)

arxiv_public_data/__pycache__/regex_arxiv.cpython-310.pyc
DELETED
Binary file (4.4 kB)
arxiv_public_data/config.py
CHANGED

@@ -9,7 +9,7 @@ logging.basicConfig(
 baselog = logging.getLogger('arxivdata')
 logger = baselog.getChild('config')

-DEFAULT_PATH = os.path.join(os.path.abspath('
+DEFAULT_PATH = os.path.join(os.path.abspath('../'), 'arxiv-data')
 JSONFILE = './config.json'
 KEY = 'ARXIV_DATA'

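For reference, the restored DEFAULT_PATH now points at an arxiv-data directory one level above the working directory; a minimal check of what it resolves to (the printed path is illustrative and depends on where the Space runs):

import os

# mirrors the updated line in arxiv_public_data/config.py
DEFAULT_PATH = os.path.join(os.path.abspath('../'), 'arxiv-data')
print(DEFAULT_PATH)  # e.g. /home/user/arxiv-data when run from /home/user/app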
pyproject.toml
ADDED

@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"
src/Surveyor.py
CHANGED

@@ -16,7 +16,7 @@ except:
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, AutoModel, LEDTokenizer, \
     LEDForConditionalGeneration

-from src.defaults import
+from src.defaults import DEFAULTS_CPU_COMPAT, DEFAULTS_HIGH_GPU


 class Surveyor:
@@ -70,18 +70,20 @@ class Surveyor:
         - num_papers: int maximium number of papers to download and analyse - defaults to 25

         '''
-        self.torch_device = '
+        self.torch_device = 'cpu'
         print("\nTorch_device: " + self.torch_device)
-        if
-            print("\nloading
+        if torch.cuda.is_available():
+            print("\nloading defaults for gpu")
+            self.torch_device = 'cuda'
             spacy.require_gpu()

+        self.high_gpu = high_gpu
+        DEFAULTS = DEFAULTS_CPU_COMPAT
+        if self.high_gpu:
+            DEFAULTS = DEFAULTS_HIGH_GPU
+
         if not kw_model_name:
             kw_model_name = DEFAULTS["kw_model_name"]
-        if not high_gpu:
-            self.high_gpu = DEFAULTS["high_gpu"]
-        else:
-            self.high_gpu = high_gpu
         self.num_papers = DEFAULTS['num_papers']
         self.max_search = DEFAULTS['max_search']
         if not models_dir:
@@ -110,8 +112,8 @@ class Surveyor:
         if not no_save_models:
             self.clean_dirs([models_dir])

-        self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
-        self.title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name).to(self.torch_device)
+        self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name, trust_remote_code=True)
+        self.title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name, trust_remote_code=True).to(self.torch_device)
         self.title_model.eval()
         if not no_save_models:
             self.title_model.save_pretrained(models_dir + "/title_model")
@@ -142,7 +144,7 @@ class Surveyor:
             self.embedder.save(models_dir + "/embedder")
         else:
             print("\nInitializing from previously saved models at" + models_dir)
-            self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
+            self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name).to(self.torch_device)
             self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
             self.title_model.eval()

@@ -615,7 +617,11 @@ class Surveyor:
         paper_body = ""
         for k, v in research_sections.items():
             paper_body += v
-
+
+        try:
+            return self.abstractive_summary(paper_body)
+        except:
+            return self.abstractive_summary(self.extractive_summary(paper_body))

     def build_corpus_sectionwise(self, papers):
         known = ['abstract', 'introduction', 'conclusion']
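Condensed, the constructor changes above amount to two selections (torch device, then defaults profile) plus a summarization fallback at the end of the paper-body loop. A standalone sketch of that logic, assuming the DEFAULTS_CPU_COMPAT / DEFAULTS_HIGH_GPU dictionaries added in src/defaults.py below and a GPU-enabled spaCy install whenever CUDA is available:

import torch
import spacy

from src.defaults import DEFAULTS_CPU_COMPAT, DEFAULTS_HIGH_GPU


def select_runtime(high_gpu=False):
    """Mirror of the device/defaults selection now done in Surveyor.__init__."""
    torch_device = 'cpu'
    if torch.cuda.is_available():
        torch_device = 'cuda'
        spacy.require_gpu()  # route spaCy pipelines to the GPU as well
    defaults = DEFAULTS_HIGH_GPU if high_gpu else DEFAULTS_CPU_COMPAT
    return torch_device, defaults


def summarize_body(surveyor, paper_body):
    """Fallback pattern from the end of the paper-body loop: if abstractive
    summarization fails on the full body (for example, it is too long for the
    model), compress it extractively first and summarize that instead."""
    try:
        return surveyor.abstractive_summary(paper_body)
    except Exception:
        return surveyor.abstractive_summary(surveyor.extractive_summary(paper_body))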
src/__pycache__/Surveyor.cpython-310.pyc
DELETED
Binary file (47.8 kB)

src/__pycache__/defaults.cpython-310.pyc
DELETED
Binary file (835 Bytes)
src/defaults.py
CHANGED

@@ -1,5 +1,5 @@
 # defaults for arxiv
-DEFAULTS = {
+DEFAULTS_HIGH_GPU = {
     "max_search": 100,
     "num_papers": 20,
     "high_gpu": False,
@@ -16,5 +16,23 @@ DEFAULTS = {
     "nlp_name": "en_core_sci_scibert",
     "similarity_nlp_name": "en_core_sci_lg",
     "kw_model_name": "distilbert-base-nli-mean-tokens",
+}

+DEFAULTS_CPU_COMPAT = {
+    "max_search": 100,
+    "num_papers": 20,
+    "high_gpu": False,
+    "pdf_dir": "arxiv_data/tarpdfs/",
+    "txt_dir": "arxiv_data/fulltext/",
+    "img_dir": "arxiv_data/images/",
+    "tab_dir": "arxiv_data/tables/",
+    "dump_dir": "arxiv_dumps/",
+    "models_dir": "saved_models/",
+    "title_model_name": "ccdv/lsg-bart-base-4096-arxiv",
+    "ex_summ_model_name": "allenai/scibert_scivocab_uncased",
+    "ledmodel_name": "bhuvaneswari/t5-small-text_summarization",
+    "embedder_name": "paraphrase-MiniLM-L6-v2",
+    "nlp_name": "en_core_sci_scibert",
+    "similarity_nlp_name": "en_core_sci_lg",
+    "kw_model_name": "distilbert-base-nli-mean-tokens",
 }
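Since Surveyor.__init__ reads whichever profile is selected with the same keys (num_papers, max_search, kw_model_name, and so on), the two dictionaries need identical key sets. A small sanity check along those lines (illustrative only, not part of the repo):

from src.defaults import DEFAULTS_CPU_COMPAT, DEFAULTS_HIGH_GPU

# a key present in one profile but missing from the other would surface as a
# KeyError at Surveyor construction time, so keep the key sets in sync
out_of_sync = set(DEFAULTS_CPU_COMPAT) ^ set(DEFAULTS_HIGH_GPU)
assert not out_of_sync, f"profiles out of sync: {out_of_sync}"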