Spaces:
Running
Running
sc_ma
committed on
Commit
•
3a7ead9
1
Parent(s):
d1feb02
Move from ArXiv API to Semantic Scholar API.
Browse files- app.py +3 -1
- auto_backgrounds.py +13 -5
- section_generator.py +2 -2
- utils/prompts.py +5 -0
- utils/references.py +113 -12
- utils/tex_processing.py +3 -1
app.py
CHANGED
@@ -4,8 +4,8 @@ import openai
|
|
4 |
from auto_backgrounds import generate_backgrounds, fake_generator, generate_draft
|
5 |
from utils.file_operations import hash_name
|
6 |
|
|
|
7 |
# todo:
|
8 |
-
# 2. update QQ group and Organization cards
|
9 |
# 4. add auto_polishing function
|
10 |
# 5. Use some simple method for simple tasks (including: writing abstract, conclusion, generate keywords, generate figures...)
|
11 |
# 5.1 Use GPT 3.5 for abstract, conclusion, ... (or may not)
|
@@ -13,6 +13,8 @@ from utils.file_operations import hash_name
|
|
13 |
# 5.3 Use embedding to find most related papers (find a paper dataset)
|
14 |
# 5.4 Use Semantic Scholar API instead of Arxiv API.
|
15 |
# 6. get logs when the procedure is not completed.
|
|
|
|
|
16 |
|
17 |
openai_key = os.getenv("OPENAI_API_KEY")
|
18 |
access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
|
|
|
4 |
from auto_backgrounds import generate_backgrounds, fake_generator, generate_draft
|
5 |
from utils.file_operations import hash_name
|
6 |
|
7 |
+
# note: App白屏bug:允许第三方cookie
|
8 |
# todo:
|
|
|
9 |
# 4. add auto_polishing function
|
10 |
# 5. Use some simple method for simple tasks (including: writing abstract, conclusion, generate keywords, generate figures...)
|
11 |
# 5.1 Use GPT 3.5 for abstract, conclusion, ... (or may not)
|
|
|
13 |
# 5.3 Use embedding to find most related papers (find a paper dataset)
|
14 |
# 5.4 Use Semantic Scholar API instead of Arxiv API.
|
15 |
# 6. get logs when the procedure is not completed.
|
16 |
+
# 7. 自己的文件库; 更多的prompts
|
17 |
+
# 8. Change prompts to langchain
|
18 |
|
19 |
openai_key = os.getenv("OPENAI_API_KEY")
|
20 |
access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
|
auto_backgrounds.py
CHANGED
@@ -30,7 +30,8 @@ def log_usage(usage, generating_target, print_out=True):
|
|
30 |
print(message)
|
31 |
logging.info(message)
|
32 |
|
33 |
-
def _generation_setup(title, description="", template="ICLR2022", model="gpt-4"
|
|
|
34 |
'''
|
35 |
todo: use `model` to control which model to use; may use another method to generate keywords or collect references
|
36 |
'''
|
@@ -44,12 +45,12 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4")
|
|
44 |
# Generate keywords and references
|
45 |
print("Initialize the paper information ...")
|
46 |
input_dict = {"title": title, "description": description}
|
47 |
-
keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo")
|
48 |
print(f"keywords: {keywords}")
|
49 |
log_usage(usage, "keywords")
|
50 |
|
51 |
ref = References(load_papers="")
|
52 |
-
ref.collect_papers(keywords, method=
|
53 |
all_paper_ids = ref.to_bibtex(bibtex_path) # todo: this will used to check if all citations are in this list
|
54 |
|
55 |
print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
|
@@ -90,8 +91,8 @@ def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
|
|
90 |
return make_archive("sample-output.pdf", filename)
|
91 |
|
92 |
|
93 |
-
def generate_draft(title, description="", template="ICLR2022", model="gpt-4"):
|
94 |
-
paper, destination_folder, _ = _generation_setup(title, description, template, model)
|
95 |
|
96 |
# todo: `list_of_methods` failed to be generated; find a solution ...
|
97 |
# print("Generating figures ...")
|
@@ -125,3 +126,10 @@ def generate_draft(title, description="", template="ICLR2022", model="gpt-4"):
|
|
125 |
input_dict = {"title": title, "description": description, "generator": "generate_draft"}
|
126 |
filename = hash_name(input_dict) + ".zip"
|
127 |
return make_archive(destination_folder, filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
print(message)
|
31 |
logging.info(message)
|
32 |
|
33 |
+
def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
|
34 |
+
search_engine="ss", tldr=False, max_kw_refs=10):
|
35 |
'''
|
36 |
todo: use `model` to control which model to use; may use another method to generate keywords or collect references
|
37 |
'''
|
|
|
45 |
# Generate keywords and references
|
46 |
print("Initialize the paper information ...")
|
47 |
input_dict = {"title": title, "description": description}
|
48 |
+
keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
|
49 |
print(f"keywords: {keywords}")
|
50 |
log_usage(usage, "keywords")
|
51 |
|
52 |
ref = References(load_papers="")
|
53 |
+
ref.collect_papers(keywords, method=search_engine, tldr=tldr)
|
54 |
all_paper_ids = ref.to_bibtex(bibtex_path) # todo: this will used to check if all citations are in this list
|
55 |
|
56 |
print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
|
|
|
91 |
return make_archive("sample-output.pdf", filename)
|
92 |
|
93 |
|
94 |
+
def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=False, max_kw_refs=5):
|
95 |
+
paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
|
96 |
|
97 |
# todo: `list_of_methods` failed to be generated; find a solution ...
|
98 |
# print("Generating figures ...")
|
|
|
126 |
input_dict = {"title": title, "description": description, "generator": "generate_draft"}
|
127 |
filename = hash_name(input_dict) + ".zip"
|
128 |
return make_archive(destination_folder, filename)
|
129 |
+
|
130 |
+
|
131 |
+
if __name__ == "__main__":
|
132 |
+
title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
|
133 |
+
description = ""
|
134 |
+
output = generate_draft(title, description, search_engine="ss", tldr=True, max_kw_refs=10)
|
135 |
+
print(output)
|
section_generator.py
CHANGED
@@ -76,11 +76,11 @@ def section_generation(paper, section, save_to_path, model):
|
|
76 |
print(f"{section} has been generated. Saved to {tex_file}.")
|
77 |
return usage
|
78 |
|
79 |
-
def keywords_generation(input_dict, model):
|
80 |
title = input_dict.get("title")
|
81 |
description = input_dict.get("description", "")
|
82 |
if title is not None:
|
83 |
-
prompts = generate_keywords_prompts(title, description)
|
84 |
gpt_response, usage = get_responses(prompts, model)
|
85 |
keywords = extract_keywords(gpt_response)
|
86 |
return keywords, usage
|
|
|
76 |
print(f"{section} has been generated. Saved to {tex_file}.")
|
77 |
return usage
|
78 |
|
79 |
+
def keywords_generation(input_dict, model, max_kw_refs = 10):
|
80 |
title = input_dict.get("title")
|
81 |
description = input_dict.get("description", "")
|
82 |
if title is not None:
|
83 |
+
prompts = generate_keywords_prompts(title, description, max_kw_refs)
|
84 |
gpt_response, usage = get_responses(prompts, model)
|
85 |
keywords = extract_keywords(gpt_response)
|
86 |
return keywords, usage
|
utils/prompts.py
CHANGED
@@ -10,6 +10,11 @@ INSTRUCTIONS = {"introduction": "Please include five paragraph: Establishing the
|
|
10 |
"conclusion": "Please read the paper I have written and write the conclusion section.",
|
11 |
"abstract": "Please read the paper I have written and write the abstract."}
|
12 |
|
|
|
|
|
|
|
|
|
|
|
13 |
BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
|
14 |
"related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
|
15 |
"backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
|
|
|
10 |
"conclusion": "Please read the paper I have written and write the conclusion section.",
|
11 |
"abstract": "Please read the paper I have written and write the abstract."}
|
12 |
|
13 |
+
INSTRUCTIONS["related works"] = r"Please discuss three to five main related fields to this paper. For each field, select " \
|
14 |
+
r"five to ten key publications from references. For each reference, analyze its strengths and weaknesses in one or two sentences. " \
|
15 |
+
r"Do not use \section{...} or \subsection{...}; use \paragraph{...} to list related fields. "
|
16 |
+
|
17 |
+
|
18 |
BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
|
19 |
"related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
|
20 |
"backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
|
utils/references.py
CHANGED
@@ -8,10 +8,107 @@
|
|
8 |
import requests
|
9 |
import re
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
# Build the arXiv API query URL with the given keyword and other parameters
|
16 |
def build_query_url(keyword, results_limit=3, sort_by="relevance", sort_order="descending"):
|
17 |
base_url = "http://export.arxiv.org/api/query?"
|
@@ -37,6 +134,7 @@ def _collect_papers_arxiv(keyword, counts=3):
|
|
37 |
title = entry.find(f"{namespace}title").text
|
38 |
link = entry.find(f"{namespace}id").text
|
39 |
summary = entry.find(f"{namespace}summary").text
|
|
|
40 |
|
41 |
# Extract the authors
|
42 |
authors = entry.findall(f"{namespace}author")
|
@@ -86,7 +184,7 @@ class References:
|
|
86 |
else:
|
87 |
self.papers = []
|
88 |
|
89 |
-
def collect_papers(self, keywords_dict, method="arxiv"):
|
90 |
"""
|
91 |
keywords_dict:
|
92 |
{"machine learning": 5, "language model": 2};
|
@@ -95,10 +193,12 @@ class References:
|
|
95 |
match method:
|
96 |
case "arxiv":
|
97 |
process =_collect_papers_arxiv
|
|
|
|
|
98 |
case _:
|
99 |
raise NotImplementedError("Other sources have not been not supported yet.")
|
100 |
for key, counts in keywords_dict.items():
|
101 |
-
self.papers = self.papers + process(key, counts)
|
102 |
|
103 |
seen = set()
|
104 |
papers = []
|
@@ -149,12 +249,13 @@ class References:
|
|
149 |
if __name__ == "__main__":
|
150 |
refs = References()
|
151 |
keywords_dict = {
|
152 |
-
"Deep Q-Networks":
|
153 |
-
"Policy Gradient Methods":
|
154 |
"Actor-Critic Algorithms": 4,
|
155 |
-
"Model-Based Reinforcement Learning":
|
156 |
-
"Exploration-Exploitation Trade-off":
|
157 |
}
|
158 |
-
refs.collect_papers(keywords_dict)
|
159 |
for p in refs.papers:
|
160 |
-
print(p["paper_id"])
|
|
|
|
8 |
import requests
|
9 |
import re
|
10 |
|
11 |
+
#########################################################
|
12 |
+
# Some basic tools
|
13 |
+
#########################################################
|
14 |
+
def remove_newlines(serie):
|
15 |
+
serie = serie.replace('\n', ' ')
|
16 |
+
serie = serie.replace('\\n', ' ')
|
17 |
+
serie = serie.replace(' ', ' ')
|
18 |
+
serie = serie.replace(' ', ' ')
|
19 |
+
return serie
|
20 |
+
|
21 |
+
#########################################################
|
22 |
+
# Semantic Scholar (SS) API
|
23 |
+
#########################################################
|
24 |
+
def ss_search(keywords, limit=20, fields=None):
|
25 |
+
# space between the query to be removed and replaced with +
|
26 |
+
if fields is None:
|
27 |
+
fields = ["title", "abstract", "venue", "year", "authors", "tldr", "embedding", "externalIds"]
|
28 |
+
keywords = keywords.lower()
|
29 |
+
keywords = keywords.replace(" ", "+")
|
30 |
+
url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={keywords}&limit={limit}&fields={",".join(fields)}'
|
31 |
+
# headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY}
|
32 |
+
headers = {"Accept": "*/*"}
|
33 |
+
|
34 |
+
response = requests.get(url, headers=headers, timeout=30)
|
35 |
+
return response.json()
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
def _collect_papers_ss(keyword, counts=3, tldr=False):
|
40 |
+
def externalIds2link(externalIds):
|
41 |
+
# externalIds is similar to "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
|
42 |
+
if externalIds:
|
43 |
+
# Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
|
44 |
+
# priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
|
45 |
+
# DBLP
|
46 |
+
dblp_id = externalIds.get('DBLP')
|
47 |
+
if dblp_id is not None:
|
48 |
+
dblp_link = f"dblp.org/rec/{dblp_id}"
|
49 |
+
return dblp_link
|
50 |
+
# arXiv
|
51 |
+
arxiv_id = externalIds.get('ArXiv')
|
52 |
+
if arxiv_id is not None:
|
53 |
+
arxiv_link = f"arxiv.org/abs/{arxiv_id}"
|
54 |
+
return arxiv_link
|
55 |
+
return ""
|
56 |
+
else:
|
57 |
+
# if this is an empty dictionary, return an empty string
|
58 |
+
return ""
|
59 |
+
|
60 |
+
def extract_paper_id(last_name, year_str, title):
|
61 |
+
return last_name + year_str + title.split(' ', 1)[0]
|
62 |
+
|
63 |
+
def extract_author_info(raw_authors):
|
64 |
+
authors = [author['name'] for author in raw_authors]
|
65 |
+
|
66 |
+
authors_str = " and ".join(authors)
|
67 |
+
last_name = authors[0].split()[-1]
|
68 |
+
return authors_str, last_name
|
69 |
+
|
70 |
+
def parse_search_results(search_results):
|
71 |
+
# turn the search result to a list of paper dictionary.
|
72 |
+
papers = []
|
73 |
+
for raw_paper in search_results:
|
74 |
+
if raw_paper["abstract"] is None:
|
75 |
+
continue
|
76 |
+
|
77 |
+
authors_str, last_name = extract_author_info(raw_paper['authors'])
|
78 |
+
year_str = str(raw_paper['year'])
|
79 |
+
title = raw_paper['title']
|
80 |
+
journal = raw_paper['venue']
|
81 |
+
if not journal:
|
82 |
+
journal = "arXiv preprint"
|
83 |
+
paper_id = extract_paper_id(last_name, year_str, title).lower()
|
84 |
+
link = externalIds2link(raw_paper['externalIds'])
|
85 |
+
if tldr and raw_paper['tldr'] is not None:
|
86 |
+
abstract = raw_paper['tldr']['text']
|
87 |
+
else:
|
88 |
+
abstract = remove_newlines(raw_paper['abstract'])
|
89 |
+
result = {
|
90 |
+
"paper_id": paper_id,
|
91 |
+
"title": title,
|
92 |
+
"abstract": abstract, # todo: compare results with tldr
|
93 |
+
"link": link,
|
94 |
+
"authors": authors_str,
|
95 |
+
"year": year_str,
|
96 |
+
"journal": journal
|
97 |
+
}
|
98 |
+
papers.append(result)
|
99 |
+
return papers
|
100 |
+
raw_results = ss_search(keyword, limit=counts)
|
101 |
+
if raw_results is not None:
|
102 |
+
search_results = raw_results['data']
|
103 |
+
else:
|
104 |
+
search_results = []
|
105 |
+
results = parse_search_results(search_results)
|
106 |
+
return results
|
107 |
+
|
108 |
+
#########################################################
|
109 |
+
# ArXiv API
|
110 |
+
#########################################################
|
111 |
+
def _collect_papers_arxiv(keyword, counts=3, tldr=False):
|
112 |
# Build the arXiv API query URL with the given keyword and other parameters
|
113 |
def build_query_url(keyword, results_limit=3, sort_by="relevance", sort_order="descending"):
|
114 |
base_url = "http://export.arxiv.org/api/query?"
|
|
|
134 |
title = entry.find(f"{namespace}title").text
|
135 |
link = entry.find(f"{namespace}id").text
|
136 |
summary = entry.find(f"{namespace}summary").text
|
137 |
+
summary = remove_newlines(summary)
|
138 |
|
139 |
# Extract the authors
|
140 |
authors = entry.findall(f"{namespace}author")
|
|
|
184 |
else:
|
185 |
self.papers = []
|
186 |
|
187 |
+
def collect_papers(self, keywords_dict, method="arxiv", tldr=False):
|
188 |
"""
|
189 |
keywords_dict:
|
190 |
{"machine learning": 5, "language model": 2};
|
|
|
193 |
match method:
|
194 |
case "arxiv":
|
195 |
process =_collect_papers_arxiv
|
196 |
+
case "ss":
|
197 |
+
process = _collect_papers_ss
|
198 |
case _:
|
199 |
raise NotImplementedError("Other sources have not been not supported yet.")
|
200 |
for key, counts in keywords_dict.items():
|
201 |
+
self.papers = self.papers + process(key, counts, tldr)
|
202 |
|
203 |
seen = set()
|
204 |
papers = []
|
|
|
249 |
if __name__ == "__main__":
|
250 |
refs = References()
|
251 |
keywords_dict = {
|
252 |
+
"Deep Q-Networks": 15,
|
253 |
+
"Policy Gradient Methods": 24,
|
254 |
"Actor-Critic Algorithms": 4,
|
255 |
+
"Model-Based Reinforcement Learning": 13,
|
256 |
+
"Exploration-Exploitation Trade-off": 7
|
257 |
}
|
258 |
+
refs.collect_papers(keywords_dict, method="ss", tldr=True)
|
259 |
for p in refs.papers:
|
260 |
+
print(p["paper_id"])
|
261 |
+
print(len(refs.papers))
|
utils/tex_processing.py
CHANGED
@@ -24,4 +24,6 @@ def replace_title(save_to_path, title):
|
|
24 |
# check if citations are in bibtex.
|
25 |
|
26 |
|
27 |
-
# replace citations
|
|
|
|
|
|
24 |
# check if citations are in bibtex.
|
25 |
|
26 |
|
27 |
+
# replace citations
|
28 |
+
|
29 |
+
# sometimes the output may include thebibliography and bibitem entries; remove all of them.
|