shaocongma committed on
Commit
c9efba3
1 Parent(s): 70e35a5

Re-format prompts using Langchain.

Browse files
app.py CHANGED
@@ -16,6 +16,7 @@ from utils.file_operations import hash_name
16
  # 8. Re-build some components using `langchain`
17
  # - in `references.py`, use PromptTemplates.format -> str
18
  # - in `gpt_interaction`, use LLM
 
19
  # future:
20
  # 4. add auto_polishing function
21
  # 12. Change link to more appealing color # after the website is built;
@@ -104,7 +105,7 @@ with gr.Blocks(theme=theme) as demo:
104
 
105
  ***2023-05-03 Update***: 在公开版本中为大家提供了输入OpenAI API Key的地址, 如果有GPT-4的API KEY的话可以在这里体验!
106
 
107
- 在这个Huggingface Organization里也提供一定额度的免费体验: [AUTO-ACADEMIC](https://huggingface.co/organizations/auto-academic/share/HPjgazDSlkwLNCWKiAiZoYtXaJIatkWDYM).
108
 
109
  如果有更多想法和建议欢迎加入QQ群里交流, 如果我在Space里更新了Key我会第一时间通知大家. 群号: ***249738228***.
110
 
 
16
  # 8. Re-build some components using `langchain`
17
  # - in `references.py`, use PromptTemplates.format -> str
18
  # - in `gpt_interaction`, use LLM
19
+ # 5. 从提供的bib文件中 找到cite和citedby的文章, 计算embeddings; 从整个paper list中 根据cos距离进行排序; 选取max_refs的文章
20
  # future:
21
  # 4. add auto_polishing function
22
  # 12. Change link to more appealing color # after the website is built;
 
105
 
106
  ***2023-05-03 Update***: 在公开版本中为大家提供了输入OpenAI API Key的地址, 如果有GPT-4的API KEY的话可以在这里体验!
107
 
108
+ 在这个Huggingface Organization里也提供一定额度的免费体验: [AUTO-ACADEMIC](https://huggingface.co/auto-academic).
109
 
110
  如果有更多想法和建议欢迎加入QQ群里交流, 如果我在Space里更新了Key我会第一时间通知大家. 群号: ***249738228***.
111
 
auto_backgrounds.py CHANGED
@@ -1,5 +1,5 @@
1
  import os.path
2
-
3
  from utils.references import References
4
  from utils.file_operations import hash_name, make_archive, copy_templates
5
  from section_generator import section_generation_bg, keywords_generation, figures_generation, section_generation
@@ -25,16 +25,14 @@ def log_usage(usage, generating_target, print_out=True):
25
  TOTAL_COMPLETION_TOKENS += completion_tokens
26
 
27
  message = f"For generating {generating_target}, {total_tokens} tokens have been used ({prompts_tokens} for prompts; {completion_tokens} for completion). " \
28
- f"{TOTAL_TOKENS} tokens have been used in total."
29
  if print_out:
30
  print(message)
31
  logging.info(message)
32
 
33
  def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
34
- search_engine="ss", tldr=False, max_kw_refs=10):
35
- '''
36
- todo: use `model` to control which model to use; may use another method to generate keywords or collect references
37
- '''
38
  paper = {}
39
  paper_body = {}
40
 
@@ -45,13 +43,25 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
45
  # Generate keywords and references
46
  print("Initialize the paper information ...")
47
  input_dict = {"title": title, "description": description}
48
- keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
 
49
  print(f"keywords: {keywords}")
50
  log_usage(usage, "keywords")
51
 
52
- ref = References(load_papers="")
53
- ref.collect_papers(keywords, method=search_engine, tldr=tldr)
54
- all_paper_ids = ref.to_bibtex(bibtex_path) # todo: this will used to check if all citations are in this list
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
57
 
@@ -91,37 +101,29 @@ def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
91
  return make_archive("sample-output.pdf", filename)
92
 
93
 
94
- def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=True, max_kw_refs=10):
95
- paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
96
-
97
  # todo: `list_of_methods` failed to be generated; find a solution ...
98
  # print("Generating figures ...")
99
  # usage = figures_generation(paper, destination_folder, model="gpt-3.5-turbo")
100
  # log_usage(usage, "figures")
101
 
102
  # for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
103
- for section in ["introduction", "related works", "backgrounds", "abstract"]:
104
- try:
105
- usage = section_generation(paper, section, destination_folder, model=model)
106
- log_usage(usage, section)
107
- except Exception as e:
108
- message = f"Failed to generate {section}. {type(e).__name__} was raised: {e}"
109
- print(message)
110
- logging.info(message)
111
- max_attempts = 2
112
- # todo: make this part more compact
113
- # re-try `max_attempts` time
114
- for i in range(max_attempts):
 
115
  time.sleep(20)
116
- try:
117
- usage = section_generation(paper, section, destination_folder, model=model)
118
- log_usage(usage, section)
119
- e = None
120
- except Exception as e:
121
- pass
122
- if e is None:
123
- break
124
-
125
 
126
  input_dict = {"title": title, "description": description, "generator": "generate_draft"}
127
  filename = hash_name(input_dict) + ".zip"
@@ -129,7 +131,10 @@ def generate_draft(title, description="", template="ICLR2022", model="gpt-4", se
129
 
130
 
131
  if __name__ == "__main__":
 
 
 
132
  title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
133
  description = ""
134
- output = generate_draft(title, description, search_engine="ss", tldr=True, max_kw_refs=10)
135
  print(output)
 
1
  import os.path
2
+ import json
3
  from utils.references import References
4
  from utils.file_operations import hash_name, make_archive, copy_templates
5
  from section_generator import section_generation_bg, keywords_generation, figures_generation, section_generation
 
25
  TOTAL_COMPLETION_TOKENS += completion_tokens
26
 
27
  message = f"For generating {generating_target}, {total_tokens} tokens have been used ({prompts_tokens} for prompts; {completion_tokens} for completion). " \
28
+ f"{TOTAL_TOKENS} tokens have been used in total.\n\n"
29
  if print_out:
30
  print(message)
31
  logging.info(message)
32
 
33
  def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
34
+ tldr=False, max_kw_refs=4, max_num_refs=10):
35
+ print("Generation setup...")
 
 
36
  paper = {}
37
  paper_body = {}
38
 
 
43
  # Generate keywords and references
44
  print("Initialize the paper information ...")
45
  input_dict = {"title": title, "description": description}
46
+ # keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
47
+ keywords, usage = keywords_generation(input_dict) #todo: handle format error here
48
  print(f"keywords: {keywords}")
49
  log_usage(usage, "keywords")
50
 
51
+ # generate keywords dictionary
52
+ keywords = {keyword:max_kw_refs for keyword in keywords}
53
+ # tmp = {}
54
+ # for keyword in json.loads(keywords):
55
+ # tmp[keyword] = max_kw_refs
56
+ # keywords = tmp
57
+ print(f"keywords: {keywords}")
58
+
59
+ ref = References()
60
+ ref.collect_papers(keywords, tldr=tldr)
61
+ # todo: use `all_paper_ids` to check if all citations are in this list
62
+ # in tex_processing, remove all duplicated ids
63
+ # find most relevant papers; max_num_refs
64
+ all_paper_ids = ref.to_bibtex(bibtex_path)
65
 
66
  print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
67
 
 
101
  return make_archive("sample-output.pdf", filename)
102
 
103
 
104
+ def generate_draft(title, description="", template="ICLR2022", model="gpt-4", tldr=True, max_kw_refs=4):
105
+ paper, destination_folder, _ = _generation_setup(title, description, template, model, tldr, max_kw_refs)
106
+ raise
107
  # todo: `list_of_methods` failed to be generated; find a solution ...
108
  # print("Generating figures ...")
109
  # usage = figures_generation(paper, destination_folder, model="gpt-3.5-turbo")
110
  # log_usage(usage, "figures")
111
 
112
  # for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
113
+ for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
114
+ max_attempts = 4
115
+ attempts_count = 0
116
+ while attempts_count < max_attempts:
117
+ try:
118
+ usage = section_generation(paper, section, destination_folder, model=model)
119
+ log_usage(usage, section)
120
+ break
121
+ except Exception as e:
122
+ message = f"Failed to generate {section}. {type(e).__name__} was raised: {e}"
123
+ print(message)
124
+ logging.info(message)
125
+ attempts_count += 1
126
  time.sleep(20)
 
 
 
 
 
 
 
 
 
127
 
128
  input_dict = {"title": title, "description": description, "generator": "generate_draft"}
129
  filename = hash_name(input_dict) + ".zip"
 
131
 
132
 
133
  if __name__ == "__main__":
134
+ import openai
135
+ openai.api_key = os.getenv("OPENAI_API_KEY")
136
+
137
  title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
138
  description = ""
139
+ output = generate_draft(title, description, tldr=True, max_kw_refs=10)
140
  print(output)
latex_templates/ICLR2022/fig.png ADDED
latex_templates/ICLR2022/template.tex CHANGED
@@ -6,7 +6,8 @@
6
  \input{math_commands.tex}
7
  \usepackage{hyperref}
8
  \usepackage{url}
9
- \usepackage{algorithmicx}
 
10
 
11
  \title{TITLE}
12
  \author{GPT-4}
 
6
  \input{math_commands.tex}
7
  \usepackage{hyperref}
8
  \usepackage{url}
9
+ \usepackage{algorithm}
10
+ \usepackage{algorithmic}
11
 
12
  \title{TITLE}
13
  \author{GPT-4}
section_generator.py CHANGED
@@ -3,6 +3,9 @@ from utils.gpt_interaction import get_responses, extract_responses, extract_keyw
3
  from utils.figures import generate_random_figures
4
  import time
5
  import os
 
 
 
6
 
7
  # three GPT-based content generator:
8
  # 1. section_generation: used to generate main content of the paper
@@ -23,7 +26,7 @@ def section_generation_bg(paper, section, save_to_path, model):
23
  print(f"Generating {section}...")
24
  prompts = generate_bg_summary_prompts(paper, section)
25
  gpt_response, usage = get_responses(prompts, model)
26
- output = extract_responses(gpt_response)
27
  paper["body"][section] = output
28
  tex_file = os.path.join(save_to_path, f"{section}.tex")
29
  # tex_file = save_to_path + f"/{section}.tex"
@@ -56,36 +59,46 @@ def section_generation(paper, section, save_to_path, model):
56
  print(f"Generating {section}...")
57
  prompts = generate_paper_prompts(paper, section)
58
  gpt_response, usage = get_responses(prompts, model)
59
- output = extract_responses(gpt_response)
60
  paper["body"][section] = output
61
  tex_file = os.path.join(save_to_path, f"{section}.tex")
62
  # tex_file = save_to_path + f"/{section}.tex"
63
  if section == "abstract":
64
  with open(tex_file, "w") as f:
65
- f.write(r"\begin{abstract}")
66
- with open(tex_file, "a") as f:
67
  f.write(output)
68
- with open(tex_file, "a") as f:
69
- f.write(r"\end{abstract}")
70
  else:
71
  with open(tex_file, "w") as f:
72
- f.write(f"\section{{{section.upper()}}}\n")
73
- with open(tex_file, "a") as f:
74
  f.write(output)
75
  time.sleep(5)
76
  print(f"{section} has been generated. Saved to {tex_file}.")
77
  return usage
78
 
79
- def keywords_generation(input_dict, model, max_kw_refs = 10):
 
 
 
 
 
 
 
 
 
 
 
80
  title = input_dict.get("title")
81
- description = input_dict.get("description", "")
82
- if title is not None:
83
- prompts = generate_keywords_prompts(title, description, max_kw_refs)
84
- gpt_response, usage = get_responses(prompts, model)
85
- keywords = extract_keywords(gpt_response)
86
- return keywords, usage
87
- else:
88
- raise ValueError("`input_dict` must include the key 'title'.")
 
 
 
 
 
89
 
90
  def figures_generation(paper, save_to_path, model):
91
  prompts = generate_experiments_prompts(paper)
 
3
  from utils.figures import generate_random_figures
4
  import time
5
  import os
6
+ from utils.prompts import KEYWORDS_SYSTEM
7
+ from utils.gpt_interaction import get_gpt_responses
8
+ import json
9
 
10
  # three GPT-based content generator:
11
  # 1. section_generation: used to generate main content of the paper
 
26
  print(f"Generating {section}...")
27
  prompts = generate_bg_summary_prompts(paper, section)
28
  gpt_response, usage = get_responses(prompts, model)
29
+ output = gpt_response # extract_responses(gpt_response)
30
  paper["body"][section] = output
31
  tex_file = os.path.join(save_to_path, f"{section}.tex")
32
  # tex_file = save_to_path + f"/{section}.tex"
 
59
  print(f"Generating {section}...")
60
  prompts = generate_paper_prompts(paper, section)
61
  gpt_response, usage = get_responses(prompts, model)
62
+ output = gpt_response # extract_responses(gpt_response)
63
  paper["body"][section] = output
64
  tex_file = os.path.join(save_to_path, f"{section}.tex")
65
  # tex_file = save_to_path + f"/{section}.tex"
66
  if section == "abstract":
67
  with open(tex_file, "w") as f:
 
 
68
  f.write(output)
 
 
69
  else:
70
  with open(tex_file, "w") as f:
 
 
71
  f.write(output)
72
  time.sleep(5)
73
  print(f"{section} has been generated. Saved to {tex_file}.")
74
  return usage
75
 
76
+ # def keywords_generation(input_dict, model, max_kw_refs = 10):
77
+ # title = input_dict.get("title")
78
+ # description = input_dict.get("description", "")
79
+ # if title is not None:
80
+ # prompts = generate_keywords_prompts(title, description, max_kw_refs)
81
+ # gpt_response, usage = get_responses(prompts, model)
82
+ # keywords = extract_keywords(gpt_response)
83
+ # return keywords, usage
84
+ # else:
85
+ # raise ValueError("`input_dict` must include the key 'title'.")
86
+
87
+ def keywords_generation(input_dict):
88
  title = input_dict.get("title")
89
+ max_attempts = 10
90
+ attempts_count = 0
91
+ while attempts_count < max_attempts:
92
+ try:
93
+ keywords, usage= get_gpt_responses(KEYWORDS_SYSTEM.format(min_refs_num=3, max_refs_num=5), title,
94
+ model="gpt-3.5-turbo", temperature=0.4)
95
+ print(keywords)
96
+ output = json.loads(keywords)
97
+ return output, usage
98
+ except json.decoder.JSONDecodeError:
99
+ attempts_count += 1
100
+ time.sleep(20)
101
+ raise RuntimeError("Fail to generate keywords.")
102
 
103
  def figures_generation(paper, save_to_path, model):
104
  prompts = generate_experiments_prompts(paper)
utils/gpt_interaction.py CHANGED
@@ -76,6 +76,22 @@ def get_responses(user_message, model="gpt-4", temperature=0.4, openai_key=None)
76
  log.info(assistant_message)
77
  return assistant_message, usage
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  if __name__ == "__main__":
81
  test_strings = [r"f.write(r'hello world')", r"f.write(r'''hello world''')", r"f.write(r'''hello world",
 
76
  log.info(assistant_message)
77
  return assistant_message, usage
78
 
79
+ def get_gpt_responses(systems, prompts, model="gpt-4", temperature=0.4):
80
+ conversation_history = [
81
+ {"role": "system", "content": systems},
82
+ {"role": "user", "content": prompts}
83
+ ]
84
+ response = openai.ChatCompletion.create(
85
+ model=model,
86
+ messages=conversation_history,
87
+ n=1, # Number of responses you want to generate
88
+ temperature=temperature, # Controls the creativity of the generated response
89
+ )
90
+ assistant_message = response['choices'][0]["message"]["content"]
91
+ usage = response['usage']
92
+ log.info(assistant_message)
93
+ return assistant_message, usage
94
+
95
 
96
  if __name__ == "__main__":
97
  test_strings = [r"f.write(r'hello world')", r"f.write(r'''hello world''')", r"f.write(r'''hello world",
utils/prompts.py CHANGED
@@ -1,24 +1,13 @@
1
  import logging
2
- log = logging.getLogger(__name__)
3
 
4
- INSTRUCTIONS = {"introduction": "Please include five paragraph: Establishing the motivation for the research. Explaining its importance and relevance to the AI community. Clearly state the problem you're addressing, your proposed solution, and the specific research questions or objectives. Briefly mention key related work for context. Explain the main differences from your work. ",
5
- "related works": r"Please discuss key publications, methods, and techniques in your research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
6
- "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",
7
- "methodology": "Please read the paper I have written and write the methodology section with three subsections: Concisely describe the techniques, algorithms, and procedures employed to address the research problem (use as many as formulas written in LaTeX). Explain the rationale behind choosing these methods, and provide sufficient detail for replication (use as many as formulas written in LaTeX). Do not make any list steps; instead, just put them in the same paragraph with sufficient explainations. Do not include \section{...} but you can have \subsection{...}. ",
8
- "results": "Please write the theoretical results section using LaTeX. Include theorem and corollary to support this paper (with formulas). Explain what assumptions are used and why they are standard and necessary. Do not include \section{...}. ",
9
- "experiments": "Please write the experiment section using LaTeX. Include a table to compare with other methods and bold our method. Include one figure comparison.png; this figure compares the loss curve with other methods. Do not include \section{...}. ",
10
- "conclusion": "Please read the paper I have written and write the conclusion section.",
11
- "abstract": "Please read the paper I have written and write the abstract."}
12
-
13
- INSTRUCTIONS["related works"] = r"Please discuss three to five main related fields to this paper. For each field, select " \
14
- r"five to ten key publications from references. For each reference, analyze its strengths and weaknesses in one or two sentences. " \
15
- r"Do not use \section{...} or \subsection{...}; use \paragraph{...} to list related fields. "
16
 
 
17
 
18
- BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
19
- "related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
20
- "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
21
 
 
 
 
22
  def generate_keywords_prompts(title, description="", num_refs=5):
23
  prompts = f"I am writing a machine learning paper with the title '{title}'. {description}\n" \
24
  f"Generate three to five keywords. For each keyword, rate it from 1 to {num_refs}; the larger number means more important." \
@@ -39,6 +28,84 @@ def generate_experiments_prompts(paper_info):
39
 
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def generate_paper_prompts(paper_info, section):
43
  title = paper_info["title"]
44
  description = paper_info["description"]
@@ -47,34 +114,57 @@ def generate_paper_prompts(paper_info, section):
47
 
48
  # fundamental_subprompt - describe the basic information of paper
49
  # instruction_subprompt - tell AI what to do
50
- # references_subprompt - give AI references
51
  # self_subprompt - give AI existing written parts
52
  # output_subprompt - tell AI how to output
53
-
54
- fundamental_subprompt = f"I am writing a machine learning paper with the title '{title}'. {description}\n"
55
- instruction_subprompt = f"You need to write the {section} section. {INSTRUCTIONS[section]}\n"
56
- # references_subprompt = f"Please read the following references: \n{references}\n"\
57
- # f"Every time you use information from the references, you need to cite its id after the sentence; " \
58
- # f"for example, the sentence where you use information from 1905.09788 \cite{{1905.09788}}. " \
59
- # f"Please avoid citing the same reference in the same paragraph. \n"
60
- references_subprompt = f"Please read the following references: \n{references}\n"\
61
- f"Every time you use information from the references, you need to appropriately cite it (using \citep or \citet)." \
62
- f"For example of \citep, the sentence where you use information from lei2022adaptive \citep{{lei2022adaptive}}. " \
63
- f"For example of \citet, \citet{{lei2022adaptive}} claims some information. \n" \
64
- f"Please avoid citing the same reference in the same paragraph. \n"
65
- self_subprompt = f"Here is the paper that I have written: {paper}.\n"
66
- output_subprompt = r"Put your response (do not include \section{...}) in the following Python script:" \
67
- f"with open(\"{section}.tex\", \"w\") as f: f.write(r'''your_response''')"
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  if section in ["introduction", "related works", "backgrounds"]:
70
  # title + references + instruction
71
- prompts = fundamental_subprompt + instruction_subprompt + references_subprompt + output_subprompt
72
- elif section in ["experiments"]:
73
- # only title and instruction
74
- prompts = fundamental_subprompt + instruction_subprompt + output_subprompt
75
- elif section in ["methodology", "abstract", "conclusion"]:
 
 
 
 
 
 
 
 
76
  # title + instruction + paper
77
- prompts = fundamental_subprompt + instruction_subprompt + self_subprompt + output_subprompt
 
 
 
 
78
  else:
79
  raise NotImplementedError
80
 
@@ -82,6 +172,16 @@ def generate_paper_prompts(paper_info, section):
82
  return prompts
83
 
84
 
 
 
 
 
 
 
 
 
 
 
85
  def generate_bg_summary_prompts(paper_info, section):
86
  title = paper_info["title"]
87
  description = paper_info["description"]
 
1
  import logging
2
+ from langchain import PromptTemplate
3
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ log = logging.getLogger(__name__)
6
 
 
 
 
7
 
8
+ ######################################################################################################################
9
+ # Some basic functions
10
+ ######################################################################################################################
11
  def generate_keywords_prompts(title, description="", num_refs=5):
12
  prompts = f"I am writing a machine learning paper with the title '{title}'. {description}\n" \
13
  f"Generate three to five keywords. For each keyword, rate it from 1 to {num_refs}; the larger number means more important." \
 
28
 
29
 
30
 
31
+ ######################################################################################################################
32
+ # System Message
33
+ ######################################################################################################################
34
+
35
+ # two parameters: min_refs_num, max_refs_num
36
+ keywords_system_template = """You are an assistant designed to provide accurate and informative keywords of searching academic papers.
37
+ Instructions
38
+ - Your response should always be a Python list; e.g. ["keyword1", "keyword2", "keyword3"]
39
+ - The length of list should between {min_refs_num} and {max_refs_num}
40
+ - Use specific phrases as keywords and avoid using too general words (e.g. machine learning)"""
41
+ # keywords_system_template = """You are an assistant designed to provide related research fields of academic papers.
42
+ # Instructions:
43
+ # - Your response should follow the following output format: ["field1", "field2", "field3"]\n
44
+ # - The length of this Python list should between {min_refs_num} and {max_refs_num}\n
45
+ # - Use specific phrases instead of using too general words (e.g. machine learning)"""
46
+
47
+ # two parameters: min_refs_num, max_refs_num
48
+ exp_methods_system_template = """You are an assistant designed to provide most related algorithms or methods to a given paper title.
49
+ Instructions
50
+ - Your response should always be a Python list; e.g. ["method_name_1", "method_name_2", "method_name_3"]
51
+ - The length of list should between {min_exps_num} and {max_exps_num}
52
+ - Use abbreviation to make each method's name have 5 characters or less."""
53
+
54
+ # one parameter: research_field
55
+ section_generation_system_template = r"""You are an assistant designed to write academic papers in the field of {research_field} using LaTeX.
56
+ Instructions
57
+ - Your response should be professional and in academic tone.
58
+ - Always give a high-level overview at the beginning of each section or subsection.
59
+ """
60
+
61
+ KEYWORDS_SYSTEM = PromptTemplate(input_variables=["min_refs_num", "max_refs_num"],
62
+ template=keywords_system_template)
63
+ EXP_METHODS_SYSTEM = PromptTemplate(input_variables=["min_exps_num", "max_exps_num"],
64
+ template=exp_methods_system_template)
65
+ SECTION_GENERATION_SYSTEM = PromptTemplate(input_variables=["research_field"],
66
+ template=section_generation_system_template)
67
+
68
+
69
+ ######################################################################################################################
70
+ # Academic Paper
71
+ ######################################################################################################################
72
+
73
+ INSTRUCTIONS = {"introduction":
74
+ "- Include five paragraph: Establishing the motivation for the research. Explaining its importance and relevance to the AI community. Clearly state the problem you're addressing, your proposed solution, and the specific research questions or objectives. Briefly mention key related works for context and explain the main differences from this work. List three novel contributions of this paper.",
75
+ "results":
76
+ "Write the theoretical results section using LaTeX. Include theorem and corollary to support this paper (with formulas). Explain what assumptions are used and why they are standard and necessary. Do not include \section{...}. ",
77
+ "conclusion":
78
+ "- Read the existing parts of paper and write the conclusion section.",
79
+ "abstract":
80
+ "- Read the existing parts of paper and write the abstract."}
81
+
82
+
83
+ INSTRUCTIONS["backgrounds"] = "- Start from one high-level paragraph to state the central problem in this field with detailed examples in industrial applications and theoretical challenges. \n" \
84
+ "- Followed by two to three subsections: Explain the foundational concepts and notations that underpin your research using as many as mathematical formulas (written in LaTeX). " \
85
+ "Introduce more necessary mathematical notations, equations, or algorithms that are connected to this work. Present detailed discussions on how these concepts are applied in this paper."
86
+
87
+
88
+ INSTRUCTIONS["related works"] = r"- Discuss three to five main related fields to this paper. " \
89
+ r"For each field, select five to ten key publications from references. " \
90
+ r"For each reference, analyze its strengths and weaknesses in one or two sentences. " \
91
+ r"Present the related works in a logical manner, often chronologically. " \
92
+ r"Consider using a taxonomy or categorization to structure the discussion. " \
93
+ r"Do not use \section{...} or \subsection{...}; use \paragraph{...} to list related fields. "
94
+
95
+ INSTRUCTIONS["methodology"] = "- Provide a high-level overview of the proposed method at the beginning of this section. \n " \
96
+ "- Assume you have some figures ('fig1.png', 'fig2.png', ...); they can be any figures you need (e.g. flow chart, model architecture, sample output, simulation result, or others you need). Insert figures you need with informative caption. \n" \
97
+ "- Use one subsection to give a detailed formulation of the proposed method and explain how it overcomes the weakness of existing methods mentioned in this paper. " \
98
+ " If necessary, write pseudo codes wrapped by \\begin{{algorithm}} ... \\end{{algorithm}} to explain the detailed steps instead of simply listing them. \n" \
99
+ "- Use one follow-up subsection to highlight the key concepts in the proposed method. " \
100
+ " Elaborate the novelty of these key concepts using formulas and inserting appropriate figures. \n" \
101
+ "- Ensure the name of each subsection to be specific. \n"
102
+
103
+ INSTRUCTIONS["experiments"] = "- Provide a high-level overview at the beginning of this section.\n " \
104
+ "- If necessary, include a table to compare with other methods and bold our method.\n" \
105
+ "- Assume you have some figures ('exp1.png', 'exp2.png', ...); they can be any figures you need (e.g. loss curves, comparison with other methods, visualization, or others you need). Insert figures you need with informative caption. \n" \
106
+ "- If necessary, use different subsections to distinguish different experimental setup."
107
+
108
+
109
  def generate_paper_prompts(paper_info, section):
110
  title = paper_info["title"]
111
  description = paper_info["description"]
 
114
 
115
  # fundamental_subprompt - describe the basic information of paper
116
  # instruction_subprompt - tell AI what to do
117
+ # ref_instruction_subprompt - give AI references
118
  # self_subprompt - give AI existing written parts
119
  # output_subprompt - tell AI how to output
120
+ fundamental_subprompt = "Your task is to write the {section} section of the machine learning paper with the title '{title}'. {description}\n"
121
+ instruction_subprompt = "\n" \
122
+ "Your response should follow the following instructions:\n" \
123
+ "{instruction}\n" \
124
+ "- Start with \section{{{section}}}\n"
125
+ ref_instruction_subprompt = "- Read references. " \
126
+ "Every time you use information from the references, you need to appropriately cite it (using \citep or \citet)." \
127
+ "For example of \citep, the sentence where you use information from lei2022adaptive \citep{{lei2022adaptive}}. " \
128
+ "For example of \citet, \citet{{lei2022adaptive}} claims some information.\n" \
129
+ "- Avoid citing the same reference in a same paragraph.\n" \
130
+ "\n" \
131
+ "References:\n" \
132
+ "{references}"
133
+ self_subprompt = "The existing parts of this paper is provided here: {paper}.\n"
134
+ output_subprompt = "Your response should start with \section{{{section}}}. Ensure that it can be directly compiled by LaTeX."
135
+ abstract_output_subprompt = "Your response should start with \\begin{{abstract}} and should end with \\end{{abstract}}. Ensure that it can be directly compiled by LaTeX."
136
+
137
+ reivew_prompts = PromptTemplate(
138
+ input_variables=["title", "description", "instruction", "section", "references"],
139
+ template=fundamental_subprompt + instruction_subprompt + ref_instruction_subprompt + output_subprompt)
140
+ summarization_prompts = PromptTemplate(
141
+ input_variables=["title", "description", "instruction", "section", "paper"],
142
+ template=fundamental_subprompt + instruction_subprompt + self_subprompt + output_subprompt)
143
+ abstract_prompts = PromptTemplate(
144
+ input_variables=["title", "description", "instruction", "section", "paper"],
145
+ template=fundamental_subprompt + instruction_subprompt + self_subprompt + abstract_output_subprompt)
146
 
147
  if section in ["introduction", "related works", "backgrounds"]:
148
  # title + references + instruction
149
+ prompts = reivew_prompts.format(title=title,
150
+ description=description,
151
+ instruction=INSTRUCTIONS[section],
152
+ section=section,
153
+ references=references)
154
+ elif section in ["abstract"]:
155
+ # title + instruction + paper
156
+ prompts = abstract_prompts.format(title=title,
157
+ description=description,
158
+ instruction=INSTRUCTIONS[section],
159
+ section=section,
160
+ paper=paper)
161
+ elif section in ["methodology", "experiments", "conclusion"]:
162
  # title + instruction + paper
163
+ prompts = summarization_prompts.format(title=title,
164
+ description=description,
165
+ instruction=INSTRUCTIONS[section],
166
+ section=section,
167
+ paper=paper)
168
  else:
169
  raise NotImplementedError
170
 
 
172
  return prompts
173
 
174
 
175
+ ######################################################################################################################
176
+ # Literature Review
177
+ ######################################################################################################################
178
+
179
+ BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
180
+ "related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
181
+ "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
182
+
183
+
184
+
185
  def generate_bg_summary_prompts(paper_info, section):
186
  title = paper_info["title"]
187
  description = paper_info["description"]
utils/references.py CHANGED
@@ -39,7 +39,7 @@ def remove_newlines(serie):
39
 
40
  def search_paper_abstract(title):
41
  pg = ProxyGenerator()
42
- success = pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155") # todo: change this to env. var. for protection.
43
  if success:
44
  scholarly.use_proxy(pg)
45
  # input the title of a paper, return its abstract
@@ -136,6 +136,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
136
  authors_str = " and ".join(authors)
137
  try:
138
  last_name = authors[0].split()[-1]
 
139
  except IndexError:
140
  last_name = "ma"
141
  # pattern = r'^\w+'
@@ -149,6 +150,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
149
  # turn the search result to a list of paper dictionary.
150
  papers_ss = []
151
  for raw_paper in search_results_ss:
 
152
  if raw_paper["abstract"] is None:
153
  continue
154
 
@@ -168,7 +170,11 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
168
  abstract = raw_paper['tldr']['text']
169
  else:
170
  abstract = remove_newlines(raw_paper['abstract'])
171
- embeddings = raw_paper['embedding']['vector']
 
 
 
 
172
  result = {
173
  "paper_id": paper_id,
174
  "title": title,
@@ -224,8 +230,6 @@ class References:
224
  for key, counts in keywords_dict.items():
225
  self.papers[key] = _collect_papers_ss(key, counts, tldr)
226
 
227
- # Remove duplicated references # todo: remove duplicated references in tex_processing procedure.
228
-
229
  def find_relevant(self, max_refs=30):
230
  # todo: use embeddings to evaluate
231
  pass
@@ -242,7 +246,12 @@ class References:
242
 
243
  bibtex_entries = []
244
  paper_ids = []
 
245
  for paper in papers:
 
 
 
 
246
  bibtex_entry = f"""@article{{{paper["paper_id"]},
247
  title = {{{paper["title"]}}},
248
  author = {{{paper["authors"]}}},
@@ -287,49 +296,40 @@ class References:
287
 
288
 
289
  if __name__ == "__main__":
290
- # r = ss_search("Deep Q-Networks")['data']
291
- # print(r)
292
- # papers_json = {}
293
- # # for i in range(len(r)):
294
- # # r[i]
295
- # #
296
- # # with open("Output.txt", "w") as text_file:
297
- # # text_file.write("Purchase Amount: %s" % TotalAmount)
298
- # embeddings = r[0]['embedding']['vector']
299
- # print(embeddings)
300
 
 
301
  refs = References()
302
- keywords_dict = {
303
- "Deep Q-Networks": 5,
304
- "Actor-Critic Algorithms": 4,
305
- "Exploration-Exploitation Trade-off": 3
306
- }
307
- refs.collect_papers(keywords_dict, method="ss", tldr=True)
308
- for k in refs.papers:
309
- papers = refs.papers[k]
310
- print("keyword: ", k)
311
- for paper in papers:
312
- print(paper["paper_id"])
313
-
314
- refs.to_json()
315
- refs.to_bibtex()
316
- refs.to_prompts()
317
- # print(refs.papers)
318
-
319
- # todo: test load_papers
320
- # write test covering `references.py`. / fix this as a stable version
321
-
322
- # for p in refs.papers:
323
- # print(p["paper_id"])
324
- # print(len(refs.papers))
325
  #
326
- # papers_json = refs.to_json()
327
- # # print(papers_json)
328
  # with open("papers.json", "w", encoding='utf-8') as text_file:
329
  # text_file.write(f"{papers_json}")
 
 
 
330
 
 
 
 
331
 
332
- # bib = "D:\\Projects\\auto-draft\\latex_templates\\pre_refs.bib"
333
- # papers = load_papers_from_bibtex(bib)
 
334
  # for paper in papers:
335
  # print(paper)
 
39
 
40
  def search_paper_abstract(title):
41
  pg = ProxyGenerator()
42
+ success = pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155")
43
  if success:
44
  scholarly.use_proxy(pg)
45
  # input the title of a paper, return its abstract
 
136
  authors_str = " and ".join(authors)
137
  try:
138
  last_name = authors[0].split()[-1]
139
+ last_name = last_name.replace("'", "")
140
  except IndexError:
141
  last_name = "ma"
142
  # pattern = r'^\w+'
 
150
  # turn the search result to a list of paper dictionary.
151
  papers_ss = []
152
  for raw_paper in search_results_ss:
153
+ print(raw_paper['title'])
154
  if raw_paper["abstract"] is None:
155
  continue
156
 
 
170
  abstract = raw_paper['tldr']['text']
171
  else:
172
  abstract = remove_newlines(raw_paper['abstract'])
173
+ embeddings_dict = raw_paper.get('embedding')
174
+ if embeddings_dict is None:
175
+ continue
176
+ else:
177
+ embeddings = raw_paper['embedding']['vector']
178
  result = {
179
  "paper_id": paper_id,
180
  "title": title,
 
230
  for key, counts in keywords_dict.items():
231
  self.papers[key] = _collect_papers_ss(key, counts, tldr)
232
 
 
 
233
  def find_relevant(self, max_refs=30):
234
  # todo: use embeddings to evaluate
235
  pass
 
246
 
247
  bibtex_entries = []
248
  paper_ids = []
249
+ seen = set()
250
  for paper in papers:
251
+ if paper["paper_id"] in seen:
252
+ continue
253
+ else:
254
+ seen.add(paper["paper_id"])
255
  bibtex_entry = f"""@article{{{paper["paper_id"]},
256
  title = {{{paper["title"]}}},
257
  author = {{{paper["authors"]}}},
 
296
 
297
 
298
  if __name__ == "__main__":
299
+ # testing search results
300
+ r = ss_search("Deep Q-Networks", limit=1) # a list of raw papers
301
+ if r['total'] > 0:
302
+ paper = r['data'][0]
303
+ # print(paper)
 
 
 
 
 
304
 
305
+ # resting References
306
  refs = References()
307
+ # keywords_dict = {
308
+ # "Deep Q-Networks": 5,
309
+ # "Actor-Critic Algorithms": 4,
310
+ # "Exploration-Exploitation Trade-off": 3
311
+ # }
312
+ # refs.collect_papers(keywords_dict, tldr=True)
313
+ # for k in refs.papers:
314
+ # papers = refs.papers[k] # for each keyword, there is a list of papers
315
+ # print("keyword: ", k)
316
+ # for paper in papers:
317
+ # print(paper["paper_id"])
 
 
 
 
 
 
 
 
 
 
 
 
318
  #
319
+ # refs.to_bibtex()
320
+ # papers_json = refs.to_json() # this json can be used to find the most relevant papers
321
  # with open("papers.json", "w", encoding='utf-8') as text_file:
322
  # text_file.write(f"{papers_json}")
323
+ #
324
+ # prompts = refs.to_prompts()
325
+ # print(prompts)
326
 
327
+ bib = "test.bib"
328
+ refs.load_papers(bib, "variance-reduction rl")
329
+ print(refs.papers)
330
 
331
+ prompts = refs.to_prompts()
332
+ for k in prompts:
333
+ print(f"{k}: {prompts[k]}\n")
334
  # for paper in papers:
335
  # print(paper)