shaocongma committed on
Commit
8ef9348
1 Parent(s): 3afc671

Fix some TeX compiler errors.

Browse files
Files changed (4) hide show
  1. .idea/.gitignore +2 -0
  2. app.py +32 -26
  3. auto_backgrounds.py +1 -1
  4. utils/references.py +29 -15
.idea/.gitignore CHANGED
@@ -6,3 +6,5 @@
6
  /dataSources.local.xml
7
  # Editor-based HTTP Client requests
8
  /httpRequests/
 
 
 
6
  /dataSources.local.xml
7
  # Editor-based HTTP Client requests
8
  /httpRequests/
9
+ **/__pycache__
10
+ **/.idea
app.py CHANGED
@@ -6,18 +6,18 @@ from utils.file_operations import hash_name
6
 
7
  # note: App白屏bug:允许第三方cookie
8
  # todo:
9
- # 4. add auto_polishing function
10
- # 5. Use some simple method for simple tasks (including: writing abstract, conclusion, generate keywords, generate figures...)
11
  # 5.1 Use GPT 3.5 for abstract, conclusion, ... (or may not)
12
  # 5.2 Use local LLM to generate keywords, figures, ...
13
  # 5.3 Use embedding to find most related papers (find a paper dataset)
14
  # 6. get logs when the procedure is not completed.
15
  # 7. 自己的文件库; 更多的prompts
16
- # 8. Change prompts to langchain
17
- # 9. some references include &: journal={IEEE Power & Energy Society General Meeting}. Check them when generating it.
18
- # 10. some paper ids have : or - in the first word of title; remove them when generating paper id.
19
  # 11. distinguish citep and citet
20
- # 12. Change link to more appealing color
 
 
 
21
 
22
  openai_key = os.getenv("OPENAI_API_KEY")
23
  access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
@@ -40,14 +40,13 @@ else:
40
  IS_OPENAI_API_KEY_AVAILABLE = False
41
 
42
 
43
-
44
  def clear_inputs(text1, text2):
45
  return "", ""
46
 
47
 
48
- def wrapped_generator(title, description, openai_key = None,
49
- template = "ICLR2022",
50
- cache_mode = IS_CACHE_AVAILABLE, generator=None):
51
  # if `cache_mode` is True, then follow the following steps:
52
  # check if "title"+"description" have been generated before
53
  # if so, download from the cloud storage, return it
@@ -57,15 +56,16 @@ def wrapped_generator(title, description, openai_key = None,
57
  # generator = generate_backgrounds
58
  generator = generate_draft
59
  # generator = fake_generator
60
- if openai_key is not None:
61
- openai.api_key = openai_key
62
  openai.Model.list()
63
 
64
  if cache_mode:
65
  from utils.storage import list_all_files, download_file, upload_file
66
  # check if "title"+"description" have been generated before
67
 
68
- input_dict = {"title": title, "description": description, "generator": "generate_draft"} #todo: modify here also
 
69
  file_name = hash_name(input_dict) + ".zip"
70
  file_list = list_all_files()
71
  # print(f"{file_name} will be generated. Check the file list {file_list}")
@@ -75,21 +75,23 @@ def wrapped_generator(title, description, openai_key = None,
75
  return file_name
76
  else:
77
  # generate the result.
78
- # output = fake_generate_backgrounds(title, description, openai_key) # todo: use `generator` to control which function to use.
79
- output = generator(title, description, template, "gpt-4")
 
80
  upload_file(output)
81
  return output
82
  else:
83
  # output = fake_generate_backgrounds(title, description, openai_key)
84
- output = generator(title, description, template, "gpt-4")
85
  return output
86
 
87
 
88
- theme = gr.themes.Monochrome(font=gr.themes.GoogleFont("Questrial")).set(
89
- background_fill_primary='#E5E4E2',
90
- background_fill_secondary = '#F6F6F6',
91
- button_primary_background_fill="#281A39"
92
- )
 
93
 
94
  with gr.Blocks(theme=theme) as demo:
95
  gr.Markdown('''
@@ -107,16 +109,20 @@ with gr.Blocks(theme=theme) as demo:
107
  ''')
108
  with gr.Row():
109
  with gr.Column(scale=2):
110
- key = gr.Textbox(value=openai_key, lines=1, max_lines=1, label="OpenAI Key", visible=not IS_OPENAI_API_KEY_AVAILABLE)
111
- # generator = gr.Dropdown(choices=["学术论文", "文献总结"], value="文献总结", label="Selection", info="目前支持生成'学术论文'和'文献总结'.", interactive=True)
112
- title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1, label="Title", info="论文标题")
 
 
 
113
  description = gr.Textbox(lines=5, label="Description (Optional)", visible=False)
114
 
115
  with gr.Row():
116
  clear_button = gr.Button("Clear")
117
- submit_button = gr.Button("Submit")
118
  with gr.Column(scale=1):
119
- style_mapping = {True: "color:white;background-color:green", False: "color:white;background-color:red"} #todo: to match website's style
 
120
  availability_mapping = {True: "AVAILABLE", False: "NOT AVAILABLE"}
121
  gr.Markdown(f'''## Huggingface Space Status
122
  当`OpenAI API`显示AVAILABLE的时候这个Space可以直接使用.
 
6
 
7
  # note: App白屏bug:允许第三方cookie
8
  # todo:
9
+ # 5. Use some simple method for simple tasks
10
+ # (including: writing abstract, conclusion, generate keywords, generate figures...)
11
  # 5.1 Use GPT 3.5 for abstract, conclusion, ... (or may not)
12
  # 5.2 Use local LLM to generate keywords, figures, ...
13
  # 5.3 Use embedding to find most related papers (find a paper dataset)
14
  # 6. get logs when the procedure is not completed.
15
  # 7. 自己的文件库; 更多的prompts
 
 
 
16
  # 11. distinguish citep and citet
17
+ # future:
18
+ # 8. Change prompts to langchain
19
+ # 4. add auto_polishing function
20
+ # 12. Change link to more appealing color # after the website is built;
21
 
22
  openai_key = os.getenv("OPENAI_API_KEY")
23
  access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
 
40
  IS_OPENAI_API_KEY_AVAILABLE = False
41
 
42
 
 
43
  def clear_inputs(text1, text2):
44
  return "", ""
45
 
46
 
47
+ def wrapped_generator(paper_title, paper_description, openai_api_key=None,
48
+ template="ICLR2022",
49
+ cache_mode=IS_CACHE_AVAILABLE, generator=None):
50
  # if `cache_mode` is True, then follow the following steps:
51
  # check if "title"+"description" have been generated before
52
  # if so, download from the cloud storage, return it
 
56
  # generator = generate_backgrounds
57
  generator = generate_draft
58
  # generator = fake_generator
59
+ if openai_api_key is not None:
60
+ openai.api_key = openai_api_key
61
  openai.Model.list()
62
 
63
  if cache_mode:
64
  from utils.storage import list_all_files, download_file, upload_file
65
  # check if "title"+"description" have been generated before
66
 
67
+ input_dict = {"title": paper_title, "description": paper_description,
68
+ "generator": "generate_draft"} # todo: modify here also
69
  file_name = hash_name(input_dict) + ".zip"
70
  file_list = list_all_files()
71
  # print(f"{file_name} will be generated. Check the file list {file_list}")
 
75
  return file_name
76
  else:
77
  # generate the result.
78
+ # output = fake_generate_backgrounds(title, description, openai_key)
79
+ # todo: use `generator` to control which function to use.
80
+ output = generator(paper_title, paper_description, template, "gpt-4")
81
  upload_file(output)
82
  return output
83
  else:
84
  # output = fake_generate_backgrounds(title, description, openai_key)
85
+ output = generator(paper_title, paper_description, template, "gpt-4")
86
  return output
87
 
88
 
89
+ theme = gr.themes.Default(font=gr.themes.GoogleFont("Questrial"))
90
+ # .set(
91
+ # background_fill_primary='#E5E4E2',
92
+ # background_fill_secondary = '#F6F6F6',
93
+ # button_primary_background_fill="#281A39"
94
+ # )
95
 
96
  with gr.Blocks(theme=theme) as demo:
97
  gr.Markdown('''
 
109
  ''')
110
  with gr.Row():
111
  with gr.Column(scale=2):
112
+ key = gr.Textbox(value=openai_key, lines=1, max_lines=1, label="OpenAI Key",
113
+ visible=not IS_OPENAI_API_KEY_AVAILABLE)
114
+ # generator = gr.Dropdown(choices=["学术论文", "文献总结"], value="文献总结",
115
+ # label="Selection", info="目前支持生成'学术论文'和'文献总结'.", interactive=True)
116
+ title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
117
+ label="Title", info="论文标题")
118
  description = gr.Textbox(lines=5, label="Description (Optional)", visible=False)
119
 
120
  with gr.Row():
121
  clear_button = gr.Button("Clear")
122
+ submit_button = gr.Button("Submit", variant="primary")
123
  with gr.Column(scale=1):
124
+ style_mapping = {True: "color:white;background-color:green",
125
+ False: "color:white;background-color:red"} # todo: to match website's style
126
  availability_mapping = {True: "AVAILABLE", False: "NOT AVAILABLE"}
127
  gr.Markdown(f'''## Huggingface Space Status
128
  当`OpenAI API`显示AVAILABLE的时候这个Space可以直接使用.
auto_backgrounds.py CHANGED
@@ -91,7 +91,7 @@ def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
91
  return make_archive("sample-output.pdf", filename)
92
 
93
 
94
- def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=True, max_kw_refs=12):
95
  paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
96
 
97
  # todo: `list_of_methods` failed to be generated; find a solution ...
 
91
  return make_archive("sample-output.pdf", filename)
92
 
93
 
94
+ def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=True, max_kw_refs=14):
95
  paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
96
 
97
  # todo: `list_of_methods` failed to be generated; find a solution ...
utils/references.py CHANGED
@@ -8,6 +8,7 @@
8
  import requests
9
  import re
10
 
 
11
  #########################################################
12
  # Some basic tools
13
  #########################################################
@@ -18,6 +19,7 @@ def remove_newlines(serie):
18
  serie = serie.replace(' ', ' ')
19
  return serie
20
 
 
21
  #########################################################
22
  # Semantic Scholar (SS) API
23
  #########################################################
@@ -35,10 +37,10 @@ def ss_search(keywords, limit=20, fields=None):
35
  return response.json()
36
 
37
 
38
-
39
  def _collect_papers_ss(keyword, counts=3, tldr=False):
40
  def externalIds2link(externalIds):
41
- # externalIds is similar to "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
 
42
  if externalIds:
43
  # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
44
  # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
@@ -58,7 +60,10 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
58
  return ""
59
 
60
  def extract_paper_id(last_name, year_str, title):
61
- return last_name + year_str + title.split(' ', 1)[0]
 
 
 
62
 
63
  def extract_author_info(raw_authors):
64
  authors = [author['name'] for author in raw_authors]
@@ -67,17 +72,18 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
67
  last_name = authors[0].split()[-1]
68
  return authors_str, last_name
69
 
70
- def parse_search_results(search_results):
71
  # turn the search result to a list of paper dictionary.
72
  papers = []
73
- for raw_paper in search_results:
74
  if raw_paper["abstract"] is None:
75
  continue
76
 
77
  authors_str, last_name = extract_author_info(raw_paper['authors'])
78
  year_str = str(raw_paper['year'])
79
  title = raw_paper['title']
80
- journal = raw_paper['venue']
 
81
  if not journal:
82
  journal = "arXiv preprint"
83
  paper_id = extract_paper_id(last_name, year_str, title).lower()
@@ -97,6 +103,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
97
  }
98
  papers.append(result)
99
  return papers
 
100
  raw_results = ss_search(keyword, limit=counts)
101
  if raw_results is not None:
102
  search_results = raw_results['data']
@@ -105,6 +112,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
105
  results = parse_search_results(search_results)
106
  return results
107
 
 
108
  #########################################################
109
  # ArXiv API
110
  #########################################################
@@ -174,9 +182,14 @@ def _collect_papers_arxiv(keyword, counts=3, tldr=False):
174
  results = parse_results(content)
175
  return results
176
 
 
 
 
 
 
177
  # Each `paper` is a dictionary containing (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal
178
  class References:
179
- def __init__(self, load_papers = ""):
180
  if load_papers:
181
  # todo: read a json file from the given path
182
  # this could be used to support pre-defined references
@@ -192,7 +205,7 @@ class References:
192
  """
193
  match method:
194
  case "arxiv":
195
- process =_collect_papers_arxiv
196
  case "ss":
197
  process = _collect_papers_ss
198
  case _:
@@ -246,16 +259,17 @@ class References:
246
  prompts[paper["paper_id"]] = paper["abstract"]
247
  return prompts
248
 
 
249
  if __name__ == "__main__":
250
  refs = References()
251
  keywords_dict = {
252
- "Deep Q-Networks": 15,
253
- "Policy Gradient Methods": 24,
254
- "Actor-Critic Algorithms": 4,
255
- "Model-Based Reinforcement Learning": 13,
256
- "Exploration-Exploitation Trade-off": 7
257
- }
258
  refs.collect_papers(keywords_dict, method="ss", tldr=True)
259
  for p in refs.papers:
260
  print(p["paper_id"])
261
- print(len(refs.papers))
 
8
  import requests
9
  import re
10
 
11
+
12
  #########################################################
13
  # Some basic tools
14
  #########################################################
 
19
  serie = serie.replace(' ', ' ')
20
  return serie
21
 
22
+
23
  #########################################################
24
  # Semantic Scholar (SS) API
25
  #########################################################
 
37
  return response.json()
38
 
39
 
 
40
  def _collect_papers_ss(keyword, counts=3, tldr=False):
41
  def externalIds2link(externalIds):
42
+ # Sample externalIds:
43
+ # "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
44
  if externalIds:
45
  # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
46
  # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
 
60
  return ""
61
 
62
  def extract_paper_id(last_name, year_str, title):
63
+ pattern = r'^\w+'
64
+ words = re.findall(pattern, title)
65
+ # return last_name + year_str + title.split(' ', 1)[0]
66
+ return last_name + year_str + words[0]
67
 
68
  def extract_author_info(raw_authors):
69
  authors = [author['name'] for author in raw_authors]
 
72
  last_name = authors[0].split()[-1]
73
  return authors_str, last_name
74
 
75
+ def parse_search_results(search_results_ss):
76
  # turn the search result to a list of paper dictionary.
77
  papers = []
78
+ for raw_paper in search_results_ss:
79
  if raw_paper["abstract"] is None:
80
  continue
81
 
82
  authors_str, last_name = extract_author_info(raw_paper['authors'])
83
  year_str = str(raw_paper['year'])
84
  title = raw_paper['title']
85
+ # Some journal names contain "&", which must be escaped, e.g. journal={IEEE Power & Energy Society General Meeting}
86
+ journal = raw_paper['venue'].replace("&", "\\&")
87
  if not journal:
88
  journal = "arXiv preprint"
89
  paper_id = extract_paper_id(last_name, year_str, title).lower()
 
103
  }
104
  papers.append(result)
105
  return papers
106
+
107
  raw_results = ss_search(keyword, limit=counts)
108
  if raw_results is not None:
109
  search_results = raw_results['data']
 
112
  results = parse_search_results(search_results)
113
  return results
114
 
115
+
116
  #########################################################
117
  # ArXiv API
118
  #########################################################
 
182
  results = parse_results(content)
183
  return results
184
 
185
+
186
+ #########################################################
187
+ # References Class
188
+ #########################################################
189
+
190
  # Each `paper` is a dictionary containing (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal
191
  class References:
192
+ def __init__(self, load_papers=""):
193
  if load_papers:
194
  # todo: read a json file from the given path
195
  # this could be used to support pre-defined references
 
205
  """
206
  match method:
207
  case "arxiv":
208
+ process = _collect_papers_arxiv
209
  case "ss":
210
  process = _collect_papers_ss
211
  case _:
 
259
  prompts[paper["paper_id"]] = paper["abstract"]
260
  return prompts
261
 
262
+
263
  if __name__ == "__main__":
264
  refs = References()
265
  keywords_dict = {
266
+ "Deep Q-Networks": 15,
267
+ "Policy Gradient Methods": 24,
268
+ "Actor-Critic Algorithms": 4,
269
+ "Model-Based Reinforcement Learning": 13,
270
+ "Exploration-Exploitation Trade-off": 7
271
+ }
272
  refs.collect_papers(keywords_dict, method="ss", tldr=True)
273
  for p in refs.papers:
274
  print(p["paper_id"])
275
+ print(len(refs.papers))