sc_ma committed
Commit 3a7ead9
1 Parent(s): d1feb02

Move from ArXiv API to Semantic Scholar API.

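For context, the endpoint this commit migrates to is the Semantic Scholar Graph API paper search, wrapped below by `ss_search` in utils/references.py. A minimal sketch of the request follows; the URL, parameters, and fields are taken from the diff, while the query string itself is just an example. Unauthenticated requests work within rate limits; an API key could be supplied via the "x-api-key" header, which the new code keeps commented out.

import requests

# Query Semantic Scholar for papers matching an example keyword.
query = "language model".replace(" ", "+")
fields = "title,abstract,venue,year,authors,tldr,externalIds"
url = f"https://api.semanticscholar.org/graph/v1/paper/search?query={query}&limit=3&fields={fields}"
response = requests.get(url, headers={"Accept": "*/*"}, timeout=30)
for paper in response.json().get("data", []):
    print(paper["title"])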
app.py CHANGED
@@ -4,8 +4,8 @@ import openai
 from auto_backgrounds import generate_backgrounds, fake_generator, generate_draft
 from utils.file_operations import hash_name
 
+# note: app blank-screen bug: allow third-party cookies
 # todo:
-# 2. update QQ group and Organization cards
 # 4. add auto_polishing function
 # 5. Use some simple method for simple tasks (including: writing abstract, conclusion, generate keywords, generate figures...)
 # 5.1 Use GPT 3.5 for abstract, conclusion, ... (or may not)
@@ -13,6 +13,8 @@ from utils.file_operations import hash_name
 # 5.3 Use embedding to find most related papers (find a paper dataset)
 # 5.4 Use Semantic Scholar API instead of Arxiv API.
 # 6. get logs when the procedure is not completed.
+# 7. own file library; more prompts
+# 8. Change prompts to langchain
 
 openai_key = os.getenv("OPENAI_API_KEY")
 access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
auto_backgrounds.py CHANGED
@@ -30,7 +30,8 @@ def log_usage(usage, generating_target, print_out=True):
     print(message)
     logging.info(message)
 
-def _generation_setup(title, description="", template="ICLR2022", model="gpt-4"):
+def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
+                      search_engine="ss", tldr=False, max_kw_refs=10):
     '''
     todo: use `model` to control which model to use; may use another method to generate keywords or collect references
     '''
@@ -44,12 +45,12 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4")
     # Generate keywords and references
     print("Initialize the paper information ...")
     input_dict = {"title": title, "description": description}
-    keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo")
+    keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
     print(f"keywords: {keywords}")
     log_usage(usage, "keywords")
 
     ref = References(load_papers="")
-    ref.collect_papers(keywords, method="arxiv")
+    ref.collect_papers(keywords, method=search_engine, tldr=tldr)
     all_paper_ids = ref.to_bibtex(bibtex_path)  # todo: this will be used to check if all citations are in this list
 
     print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
@@ -90,8 +91,8 @@ def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
     return make_archive("sample-output.pdf", filename)
 
 
-def generate_draft(title, description="", template="ICLR2022", model="gpt-4"):
-    paper, destination_folder, _ = _generation_setup(title, description, template, model)
+def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=False, max_kw_refs=5):
+    paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
 
     # todo: `list_of_methods` failed to be generated; find a solution ...
     # print("Generating figures ...")
@@ -125,3 +126,10 @@ def generate_draft(title, description="", template="ICLR2022", model="gpt-4"):
     input_dict = {"title": title, "description": description, "generator": "generate_draft"}
     filename = hash_name(input_dict) + ".zip"
     return make_archive(destination_folder, filename)
+
+
+if __name__ == "__main__":
+    title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
+    description = ""
+    output = generate_draft(title, description, search_engine="ss", tldr=True, max_kw_refs=10)
+    print(output)
section_generator.py CHANGED
@@ -76,11 +76,11 @@ def section_generation(paper, section, save_to_path, model):
     print(f"{section} has been generated. Saved to {tex_file}.")
     return usage
 
-def keywords_generation(input_dict, model):
+def keywords_generation(input_dict, model, max_kw_refs=10):
     title = input_dict.get("title")
     description = input_dict.get("description", "")
     if title is not None:
-        prompts = generate_keywords_prompts(title, description)
+        prompts = generate_keywords_prompts(title, description, max_kw_refs)
         gpt_response, usage = get_responses(prompts, model)
         keywords = extract_keywords(gpt_response)
         return keywords, usage
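The new third positional argument threads `max_kw_refs` into the prompt itself. `generate_keywords_prompts` is not shown in this diff; the following is a purely hypothetical sketch of how such a builder could consume the argument while producing the `{"keyword": count}` dictionary format that `References.collect_papers` expects. The actual wording in utils/prompts.py may differ.

# Hypothetical sketch only; the real implementation lives in utils/prompts.py.
def generate_keywords_prompts(title, description="", max_kw_refs=10):
    return f'I am writing a machine learning paper titled "{title}". {description} ' \
           f'Propose three to five search keywords for related work, and assign each ' \
           f'a number of references to collect, at most {max_kw_refs} per keyword. ' \
           f'Reply in JSON, e.g. {{"keyword one": 5, "keyword two": 3}}.'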
utils/prompts.py CHANGED
@@ -10,6 +10,11 @@ INSTRUCTIONS = {"introduction": "Please include five paragraph: Establishing the
                 "conclusion": "Please read the paper I have written and write the conclusion section.",
                 "abstract": "Please read the paper I have written and write the abstract."}
 
+INSTRUCTIONS["related works"] = r"Please discuss three to five main related fields to this paper. For each field, select " \
+                                r"five to ten key publications from references. For each reference, analyze its strengths and weaknesses in one or two sentences. " \
+                                r"Do not use \section{...} or \subsection{...}; use \paragraph{...} to list related fields. "
+
+
 BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
                    "related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
                    "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
utils/references.py CHANGED
@@ -8,10 +8,107 @@
 import requests
 import re
 
-def _collect_papers_arxiv(keyword, counts=3):
-    #
-    # The following codes are used to generate the most related papers
-    #
+#########################################################
+# Some basic tools
+#########################################################
+def remove_newlines(serie):
+    serie = serie.replace('\n', ' ')
+    serie = serie.replace('\\n', ' ')
+    serie = serie.replace('  ', ' ')
+    serie = serie.replace('  ', ' ')
+    return serie
+
+#########################################################
+# Semantic Scholar (SS) API
+#########################################################
+def ss_search(keywords, limit=20, fields=None):
+    # spaces in the query are replaced with +
+    if fields is None:
+        fields = ["title", "abstract", "venue", "year", "authors", "tldr", "embedding", "externalIds"]
+    keywords = keywords.lower()
+    keywords = keywords.replace(" ", "+")
+    url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={keywords}&limit={limit}&fields={",".join(fields)}'
+    # headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY}
+    headers = {"Accept": "*/*"}
+
+    response = requests.get(url, headers=headers, timeout=30)
+    return response.json()
+
+
+def _collect_papers_ss(keyword, counts=3, tldr=False):
+    def externalIds2link(externalIds):
+        # externalIds looks like "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
+        if externalIds:
+            # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
+            # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
+            # DBLP
+            dblp_id = externalIds.get('DBLP')
+            if dblp_id is not None:
+                dblp_link = f"dblp.org/rec/{dblp_id}"
+                return dblp_link
+            # arXiv
+            arxiv_id = externalIds.get('ArXiv')
+            if arxiv_id is not None:
+                arxiv_link = f"arxiv.org/abs/{arxiv_id}"
+                return arxiv_link
+            return ""
+        else:
+            # if this is an empty dictionary, return an empty string
+            return ""
+
+    def extract_paper_id(last_name, year_str, title):
+        return last_name + year_str + title.split(' ', 1)[0]
+
+    def extract_author_info(raw_authors):
+        authors = [author['name'] for author in raw_authors]
+
+        authors_str = " and ".join(authors)
+        last_name = authors[0].split()[-1]
+        return authors_str, last_name
+
+    def parse_search_results(search_results):
+        # turn the search results into a list of paper dictionaries.
+        papers = []
+        for raw_paper in search_results:
+            if raw_paper["abstract"] is None:
+                continue
+
+            authors_str, last_name = extract_author_info(raw_paper['authors'])
+            year_str = str(raw_paper['year'])
+            title = raw_paper['title']
+            journal = raw_paper['venue']
+            if not journal:
+                journal = "arXiv preprint"
+            paper_id = extract_paper_id(last_name, year_str, title).lower()
+            link = externalIds2link(raw_paper['externalIds'])
+            if tldr and raw_paper['tldr'] is not None:
+                abstract = raw_paper['tldr']['text']
+            else:
+                abstract = remove_newlines(raw_paper['abstract'])
+            result = {
+                "paper_id": paper_id,
+                "title": title,
+                "abstract": abstract,  # todo: compare results with tldr
+                "link": link,
+                "authors": authors_str,
+                "year": year_str,
+                "journal": journal
+            }
+            papers.append(result)
+        return papers
+
+    raw_results = ss_search(keyword, limit=counts)
+    if raw_results is not None:
+        search_results = raw_results['data']
+    else:
+        search_results = []
+    results = parse_search_results(search_results)
+    return results
+
+#########################################################
+# ArXiv API
+#########################################################
+def _collect_papers_arxiv(keyword, counts=3, tldr=False):
     # Build the arXiv API query URL with the given keyword and other parameters
     def build_query_url(keyword, results_limit=3, sort_by="relevance", sort_order="descending"):
         base_url = "http://export.arxiv.org/api/query?"
@@ -37,6 +134,7 @@ def _collect_papers_arxiv(keyword, counts=3):
         title = entry.find(f"{namespace}title").text
         link = entry.find(f"{namespace}id").text
         summary = entry.find(f"{namespace}summary").text
+        summary = remove_newlines(summary)
 
         # Extract the authors
         authors = entry.findall(f"{namespace}author")
@@ -86,7 +184,7 @@ class References:
         else:
             self.papers = []
 
-    def collect_papers(self, keywords_dict, method="arxiv"):
+    def collect_papers(self, keywords_dict, method="arxiv", tldr=False):
         """
         keywords_dict:
             {"machine learning": 5, "language model": 2};
@@ -95,10 +193,12 @@ class References:
         match method:
             case "arxiv":
                 process = _collect_papers_arxiv
+            case "ss":
+                process = _collect_papers_ss
             case _:
                 raise NotImplementedError("Other sources have not been supported yet.")
         for key, counts in keywords_dict.items():
-            self.papers = self.papers + process(key, counts)
+            self.papers = self.papers + process(key, counts, tldr)
 
         seen = set()
         papers = []
@@ -149,12 +249,13 @@
 if __name__ == "__main__":
     refs = References()
     keywords_dict = {
-        "Deep Q-Networks": 5,
-        "Policy Gradient Methods": 4,
+        "Deep Q-Networks": 15,
+        "Policy Gradient Methods": 24,
         "Actor-Critic Algorithms": 4,
-        "Model-Based Reinforcement Learning": 3,
-        "Exploration-Exploitation Trade-off": 2
+        "Model-Based Reinforcement Learning": 13,
+        "Exploration-Exploitation Trade-off": 7
     }
-    refs.collect_papers(keywords_dict)
+    refs.collect_papers(keywords_dict, method="ss", tldr=True)
     for p in refs.papers:
-        print(p["paper_id"])
+        print(p["paper_id"])
+    print(len(refs.papers))
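A quick way to exercise the new Semantic Scholar path in isolation (network access required; the keyword and count here are arbitrary). Note that `collect_papers` uses `match`/`case`, so Python 3.10+ is required.

from utils.references import References

# Collect a few Semantic Scholar results, preferring TLDR summaries
# over full abstracts, then inspect the generated paper ids.
refs = References()
refs.collect_papers({"reinforcement learning": 3}, method="ss", tldr=True)
for paper in refs.papers:
    print(paper["paper_id"], "-", paper["title"])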
utils/tex_processing.py CHANGED
@@ -24,4 +24,6 @@ def replace_title(save_to_path, title):
 # check if citations are in bibtex.
 
 
-# replace citations
+# replace citations
+
+# sometimes the output may include \thebibliography and \bibitem; remove all of them.
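That last todo is not implemented in this commit. One possible sketch, assuming the generated LaTeX is available in memory as a string; the helper name and regexes are illustrative, not part of the repo.

import re

def strip_inline_bibliography(tex_source):
    # Drop any \begin{thebibliography}...\end{thebibliography} environment
    # the model emitted, then remove stray \bibitem lines, so citations
    # resolve only against the generated .bib file.
    tex_source = re.sub(r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}",
                        "", tex_source, flags=re.DOTALL)
    tex_source = re.sub(r"^\s*\\bibitem.*$", "", tex_source, flags=re.MULTILINE)
    return tex_source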