shaocongma committed on
Commit
c9efba3
1 Parent(s): 70e35a5

Re-format prompts using Langchain.

Browse files
app.py CHANGED
@@ -16,6 +16,7 @@ from utils.file_operations import hash_name
16
  # 8. Re-build some components using `langchain`
17
  # - in `references.py`, use PromptTemplates.format -> str
18
  # - in `gpt_interaction`, use LLM
 
19
  # future:
20
  # 4. add auto_polishing function
21
  # 12. Change link to more appealing color # after the website is built;
@@ -104,7 +105,7 @@ with gr.Blocks(theme=theme) as demo:
104
 
105
  ***2023-05-03 Update***: 在公开版本中为大家提供了输入OpenAI API Key的地址, 如果有GPT-4的API KEY的话可以在这里体验!
106
 
107
- 在这个Huggingface Organization里也提供一定额度的免费体验: [AUTO-ACADEMIC](https://huggingface.co/organizations/auto-academic/share/HPjgazDSlkwLNCWKiAiZoYtXaJIatkWDYM).
108
 
109
  如果有更多想法和建议欢迎加入QQ群里交流, 如果我在Space里更新了Key我会第一时间通知大家. 群号: ***249738228***.
110
 
 
16
  # 8. Re-build some components using `langchain`
17
  # - in `references.py`, use PromptTemplates.format -> str
18
  # - in `gpt_interaction`, use LLM
19
+ # 5. 从提供的bib文件中 找到cite和citedby的文章, 计算embeddings; 从整个paper list中 根据cos距离进行排序; 选取max_refs的文章
20
  # future:
21
  # 4. add auto_polishing function
22
  # 12. Change link to more appealing color # after the website is built;
 
105
 
106
  ***2023-05-03 Update***: 在公开版本中为大家提供了输入OpenAI API Key的地址, 如果有GPT-4的API KEY的话可以在这里体验!
107
 
108
+ 在这个Huggingface Organization里也提供一定额度的免费体验: [AUTO-ACADEMIC](https://huggingface.co/auto-academic).
109
 
110
  如果有更多想法和建议欢迎加入QQ群里交流, 如果我在Space里更新了Key我会第一时间通知大家. 群号: ***249738228***.
111
 
auto_backgrounds.py CHANGED
@@ -1,5 +1,5 @@
1
  import os.path
2
-
3
  from utils.references import References
4
  from utils.file_operations import hash_name, make_archive, copy_templates
5
  from section_generator import section_generation_bg, keywords_generation, figures_generation, section_generation
@@ -25,16 +25,14 @@ def log_usage(usage, generating_target, print_out=True):
25
  TOTAL_COMPLETION_TOKENS += completion_tokens
26
 
27
  message = f"For generating {generating_target}, {total_tokens} tokens have been used ({prompts_tokens} for prompts; {completion_tokens} for completion). " \
28
- f"{TOTAL_TOKENS} tokens have been used in total."
29
  if print_out:
30
  print(message)
31
  logging.info(message)
32
 
33
  def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
34
- search_engine="ss", tldr=False, max_kw_refs=10):
35
- '''
36
- todo: use `model` to control which model to use; may use another method to generate keywords or collect references
37
- '''
38
  paper = {}
39
  paper_body = {}
40
 
@@ -45,13 +43,25 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
45
  # Generate keywords and references
46
  print("Initialize the paper information ...")
47
  input_dict = {"title": title, "description": description}
48
- keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
 
49
  print(f"keywords: {keywords}")
50
  log_usage(usage, "keywords")
51
 
52
- ref = References(load_papers="")
53
- ref.collect_papers(keywords, method=search_engine, tldr=tldr)
54
- all_paper_ids = ref.to_bibtex(bibtex_path) # todo: this will used to check if all citations are in this list
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
57
 
@@ -91,37 +101,29 @@ def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
91
  return make_archive("sample-output.pdf", filename)
92
 
93
 
94
- def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=True, max_kw_refs=10):
95
- paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
96
-
97
  # todo: `list_of_methods` failed to be generated; find a solution ...
98
  # print("Generating figures ...")
99
  # usage = figures_generation(paper, destination_folder, model="gpt-3.5-turbo")
100
  # log_usage(usage, "figures")
101
 
102
  # for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
103
- for section in ["introduction", "related works", "backgrounds", "abstract"]:
104
- try:
105
- usage = section_generation(paper, section, destination_folder, model=model)
106
- log_usage(usage, section)
107
- except Exception as e:
108
- message = f"Failed to generate {section}. {type(e).__name__} was raised: {e}"
109
- print(message)
110
- logging.info(message)
111
- max_attempts = 2
112
- # todo: make this part more compact
113
- # re-try `max_attempts` time
114
- for i in range(max_attempts):
 
115
  time.sleep(20)
116
- try:
117
- usage = section_generation(paper, section, destination_folder, model=model)
118
- log_usage(usage, section)
119
- e = None
120
- except Exception as e:
121
- pass
122
- if e is None:
123
- break
124
-
125
 
126
  input_dict = {"title": title, "description": description, "generator": "generate_draft"}
127
  filename = hash_name(input_dict) + ".zip"
@@ -129,7 +131,10 @@ def generate_draft(title, description="", template="ICLR2022", model="gpt-4", se
129
 
130
 
131
  if __name__ == "__main__":
 
 
 
132
  title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
133
  description = ""
134
- output = generate_draft(title, description, search_engine="ss", tldr=True, max_kw_refs=10)
135
  print(output)
 
1
  import os.path
2
+ import json
3
  from utils.references import References
4
  from utils.file_operations import hash_name, make_archive, copy_templates
5
  from section_generator import section_generation_bg, keywords_generation, figures_generation, section_generation
 
25
  TOTAL_COMPLETION_TOKENS += completion_tokens
26
 
27
  message = f"For generating {generating_target}, {total_tokens} tokens have been used ({prompts_tokens} for prompts; {completion_tokens} for completion). " \
28
+ f"{TOTAL_TOKENS} tokens have been used in total.\n\n"
29
  if print_out:
30
  print(message)
31
  logging.info(message)
32
 
33
  def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
34
+ tldr=False, max_kw_refs=4, max_num_refs=10):
35
+ print("Generation setup...")
 
 
36
  paper = {}
37
  paper_body = {}
38
 
 
43
  # Generate keywords and references
44
  print("Initialize the paper information ...")
45
  input_dict = {"title": title, "description": description}
46
+ # keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
47
+ keywords, usage = keywords_generation(input_dict) #todo: handle format error here
48
  print(f"keywords: {keywords}")
49
  log_usage(usage, "keywords")
50
 
51
+ # generate keywords dictionary
52
+ keywords = {keyword:max_kw_refs for keyword in keywords}
53
+ # tmp = {}
54
+ # for keyword in json.loads(keywords):
55
+ # tmp[keyword] = max_kw_refs
56
+ # keywords = tmp
57
+ print(f"keywords: {keywords}")
58
+
59
+ ref = References()
60
+ ref.collect_papers(keywords, tldr=tldr)
61
+ # todo: use `all_paper_ids` to check if all citations are in this list
62
+ # in tex_processing, remove all duplicated ids
63
+ # find most relevant papers; max_num_refs
64
+ all_paper_ids = ref.to_bibtex(bibtex_path)
65
 
66
  print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
67
 
 
101
  return make_archive("sample-output.pdf", filename)
102
 
103
 
104
+ def generate_draft(title, description="", template="ICLR2022", model="gpt-4", tldr=True, max_kw_refs=4):
105
+ paper, destination_folder, _ = _generation_setup(title, description, template, model, tldr, max_kw_refs)
106
+ raise
107
  # todo: `list_of_methods` failed to be generated; find a solution ...
108
  # print("Generating figures ...")
109
  # usage = figures_generation(paper, destination_folder, model="gpt-3.5-turbo")
110
  # log_usage(usage, "figures")
111
 
112
  # for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
113
+ for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
114
+ max_attempts = 4
115
+ attempts_count = 0
116
+ while attempts_count < max_attempts:
117
+ try:
118
+ usage = section_generation(paper, section, destination_folder, model=model)
119
+ log_usage(usage, section)
120
+ break
121
+ except Exception as e:
122
+ message = f"Failed to generate {section}. {type(e).__name__} was raised: {e}"
123
+ print(message)
124
+ logging.info(message)
125
+ attempts_count += 1
126
  time.sleep(20)
 
 
 
 
 
 
 
 
 
127
 
128
  input_dict = {"title": title, "description": description, "generator": "generate_draft"}
129
  filename = hash_name(input_dict) + ".zip"
 
131
 
132
 
133
  if __name__ == "__main__":
134
+ import openai
135
+ openai.api_key = os.getenv("OPENAI_API_KEY")
136
+
137
  title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
138
  description = ""
139
+ output = generate_draft(title, description, tldr=True, max_kw_refs=10)
140
  print(output)
latex_templates/ICLR2022/fig.png ADDED
latex_templates/ICLR2022/template.tex CHANGED
@@ -6,7 +6,8 @@
6
  \input{math_commands.tex}
7
  \usepackage{hyperref}
8
  \usepackage{url}
9
- \usepackage{algorithmicx}
 
10
 
11
  \title{TITLE}
12
  \author{GPT-4}
 
6
  \input{math_commands.tex}
7
  \usepackage{hyperref}
8
  \usepackage{url}
9
+ \usepackage{algorithm}
10
+ \usepackage{algorithmic}
11
 
12
  \title{TITLE}
13
  \author{GPT-4}
section_generator.py CHANGED
@@ -3,6 +3,9 @@ from utils.gpt_interaction import get_responses, extract_responses, extract_keyw
3
  from utils.figures import generate_random_figures
4
  import time
5
  import os
 
 
 
6
 
7
  # three GPT-based content generator:
8
  # 1. section_generation: used to generate main content of the paper
@@ -23,7 +26,7 @@ def section_generation_bg(paper, section, save_to_path, model):
23
  print(f"Generating {section}...")
24
  prompts = generate_bg_summary_prompts(paper, section)
25
  gpt_response, usage = get_responses(prompts, model)
26
- output = extract_responses(gpt_response)
27
  paper["body"][section] = output
28
  tex_file = os.path.join(save_to_path, f"{section}.tex")
29
  # tex_file = save_to_path + f"/{section}.tex"
@@ -56,36 +59,46 @@ def section_generation(paper, section, save_to_path, model):
56
  print(f"Generating {section}...")
57
  prompts = generate_paper_prompts(paper, section)
58
  gpt_response, usage = get_responses(prompts, model)
59
- output = extract_responses(gpt_response)
60
  paper["body"][section] = output
61
  tex_file = os.path.join(save_to_path, f"{section}.tex")
62
  # tex_file = save_to_path + f"/{section}.tex"
63
  if section == "abstract":
64
  with open(tex_file, "w") as f:
65
- f.write(r"\begin{abstract}")
66
- with open(tex_file, "a") as f:
67
  f.write(output)
68
- with open(tex_file, "a") as f:
69
- f.write(r"\end{abstract}")
70
  else:
71
  with open(tex_file, "w") as f:
72
- f.write(f"\section{{{section.upper()}}}\n")
73
- with open(tex_file, "a") as f:
74
  f.write(output)
75
  time.sleep(5)
76
  print(f"{section} has been generated. Saved to {tex_file}.")
77
  return usage
78
 
79
- def keywords_generation(input_dict, model, max_kw_refs = 10):
 
 
 
 
 
 
 
 
 
 
 
80
  title = input_dict.get("title")
81
- description = input_dict.get("description", "")
82
- if title is not None:
83
- prompts = generate_keywords_prompts(title, description, max_kw_refs)
84
- gpt_response, usage = get_responses(prompts, model)
85
- keywords = extract_keywords(gpt_response)
86
- return keywords, usage
87
- else:
88
- raise ValueError("`input_dict` must include the key 'title'.")
 
 
 
 
 
89
 
90
  def figures_generation(paper, save_to_path, model):
91
  prompts = generate_experiments_prompts(paper)
 
3
  from utils.figures import generate_random_figures
4
  import time
5
  import os
6
+ from utils.prompts import KEYWORDS_SYSTEM
7
+ from utils.gpt_interaction import get_gpt_responses
8
+ import json
9
 
10
  # three GPT-based content generator:
11
  # 1. section_generation: used to generate main content of the paper
 
26
  print(f"Generating {section}...")
27
  prompts = generate_bg_summary_prompts(paper, section)
28
  gpt_response, usage = get_responses(prompts, model)
29
+ output = gpt_response # extract_responses(gpt_response)
30
  paper["body"][section] = output
31
  tex_file = os.path.join(save_to_path, f"{section}.tex")
32
  # tex_file = save_to_path + f"/{section}.tex"
 
59
  print(f"Generating {section}...")
60
  prompts = generate_paper_prompts(paper, section)
61
  gpt_response, usage = get_responses(prompts, model)
62
+ output = gpt_response # extract_responses(gpt_response)
63
  paper["body"][section] = output
64
  tex_file = os.path.join(save_to_path, f"{section}.tex")
65
  # tex_file = save_to_path + f"/{section}.tex"
66
  if section == "abstract":
67
  with open(tex_file, "w") as f:
 
 
68
  f.write(output)
 
 
69
  else:
70
  with open(tex_file, "w") as f:
 
 
71
  f.write(output)
72
  time.sleep(5)
73
  print(f"{section} has been generated. Saved to {tex_file}.")
74
  return usage
75
 
76
+ # def keywords_generation(input_dict, model, max_kw_refs = 10):
77
+ # title = input_dict.get("title")
78
+ # description = input_dict.get("description", "")
79
+ # if title is not None:
80
+ # prompts = generate_keywords_prompts(title, description, max_kw_refs)
81
+ # gpt_response, usage = get_responses(prompts, model)
82
+ # keywords = extract_keywords(gpt_response)
83
+ # return keywords, usage
84
+ # else:
85
+ # raise ValueError("`input_dict` must include the key 'title'.")
86
+
87
+ def keywords_generation(input_dict):
88
  title = input_dict.get("title")
89
+ max_attempts = 10
90
+ attempts_count = 0
91
+ while attempts_count < max_attempts:
92
+ try:
93
+ keywords, usage= get_gpt_responses(KEYWORDS_SYSTEM.format(min_refs_num=3, max_refs_num=5), title,
94
+ model="gpt-3.5-turbo", temperature=0.4)
95
+ print(keywords)
96
+ output = json.loads(keywords)
97
+ return output, usage
98
+ except json.decoder.JSONDecodeError:
99
+ attempts_count += 1
100
+ time.sleep(20)
101
+ raise RuntimeError("Fail to generate keywords.")
102
 
103
  def figures_generation(paper, save_to_path, model):
104
  prompts = generate_experiments_prompts(paper)
utils/gpt_interaction.py CHANGED
@@ -76,6 +76,22 @@ def get_responses(user_message, model="gpt-4", temperature=0.4, openai_key=None)
76
  log.info(assistant_message)
77
  return assistant_message, usage
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  if __name__ == "__main__":
81
  test_strings = [r"f.write(r'hello world')", r"f.write(r'''hello world''')", r"f.write(r'''hello world",
 
76
  log.info(assistant_message)
77
  return assistant_message, usage
78
 
79
+ def get_gpt_responses(systems, prompts, model="gpt-4", temperature=0.4):
80
+ conversation_history = [
81
+ {"role": "system", "content": systems},
82
+ {"role": "user", "content": prompts}
83
+ ]
84
+ response = openai.ChatCompletion.create(
85
+ model=model,
86
+ messages=conversation_history,
87
+ n=1, # Number of responses you want to generate
88
+ temperature=temperature, # Controls the creativity of the generated response
89
+ )
90
+ assistant_message = response['choices'][0]["message"]["content"]
91
+ usage = response['usage']
92
+ log.info(assistant_message)
93
+ return assistant_message, usage
94
+
95
 
96
  if __name__ == "__main__":
97
  test_strings = [r"f.write(r'hello world')", r"f.write(r'''hello world''')", r"f.write(r'''hello world",
utils/prompts.py CHANGED
@@ -1,24 +1,13 @@
1
  import logging
2
- log = logging.getLogger(__name__)
3
 
4
- INSTRUCTIONS = {"introduction": "Please include five paragraph: Establishing the motivation for the research. Explaining its importance and relevance to the AI community. Clearly state the problem you're addressing, your proposed solution, and the specific research questions or objectives. Briefly mention key related work for context. Explain the main differences from your work. ",
5
- "related works": r"Please discuss key publications, methods, and techniques in your research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
6
- "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",
7
- "methodology": "Please read the paper I have written and write the methodology section with three subsections: Concisely describe the techniques, algorithms, and procedures employed to address the research problem (use as many as formulas written in LaTeX). Explain the rationale behind choosing these methods, and provide sufficient detail for replication (use as many as formulas written in LaTeX). Do not make any list steps; instead, just put them in the same paragraph with sufficient explainations. Do not include \section{...} but you can have \subsection{...}. ",
8
- "results": "Please write the theoretical results section using LaTeX. Include theorem and corollary to support this paper (with formulas). Explain what assumptions are used and why they are standard and necessary. Do not include \section{...}. ",
9
- "experiments": "Please write the experiment section using LaTeX. Include a table to compare with other methods and bold our method. Include one figure comparison.png; this figure compares the loss curve with other methods. Do not include \section{...}. ",
10
- "conclusion": "Please read the paper I have written and write the conclusion section.",
11
- "abstract": "Please read the paper I have written and write the abstract."}
12
-
13
- INSTRUCTIONS["related works"] = r"Please discuss three to five main related fields to this paper. For each field, select " \
14
- r"five to ten key publications from references. For each reference, analyze its strengths and weaknesses in one or two sentences. " \
15
- r"Do not use \section{...} or \subsection{...}; use \paragraph{...} to list related fields. "
16
 
 
17
 
18
- BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
19
- "related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
20
- "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
21
 
 
 
 
22
  def generate_keywords_prompts(title, description="", num_refs=5):
23
  prompts = f"I am writing a machine learning paper with the title '{title}'. {description}\n" \
24
  f"Generate three to five keywords. For each keyword, rate it from 1 to {num_refs}; the larger number means more important." \
@@ -39,6 +28,84 @@ def generate_experiments_prompts(paper_info):
39
 
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def generate_paper_prompts(paper_info, section):
43
  title = paper_info["title"]
44
  description = paper_info["description"]
@@ -47,34 +114,57 @@ def generate_paper_prompts(paper_info, section):
47
 
48
  # fundamental_subprompt - describe the basic information of paper
49
  # instruction_subprompt - tell AI what to do
50
- # references_subprompt - give AI references
51
  # self_subprompt - give AI existing written parts
52
  # output_subprompt - tell AI how to output
53
-
54
- fundamental_subprompt = f"I am writing a machine learning paper with the title '{title}'. {description}\n"
55
- instruction_subprompt = f"You need to write the {section} section. {INSTRUCTIONS[section]}\n"
56
- # references_subprompt = f"Please read the following references: \n{references}\n"\
57
- # f"Every time you use information from the references, you need to cite its id after the sentence; " \
58
- # f"for example, the sentence where you use information from 1905.09788 \cite{{1905.09788}}. " \
59
- # f"Please avoid citing the same reference in the same paragraph. \n"
60
- references_subprompt = f"Please read the following references: \n{references}\n"\
61
- f"Every time you use information from the references, you need to appropriately cite it (using \citep or \citet)." \
62
- f"For example of \citep, the sentence where you use information from lei2022adaptive \citep{{lei2022adaptive}}. " \
63
- f"For example of \citet, \citet{{lei2022adaptive}} claims some information. \n" \
64
- f"Please avoid citing the same reference in the same paragraph. \n"
65
- self_subprompt = f"Here is the paper that I have written: {paper}.\n"
66
- output_subprompt = r"Put your response (do not include \section{...}) in the following Python script:" \
67
- f"with open(\"{section}.tex\", \"w\") as f: f.write(r'''your_response''')"
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  if section in ["introduction", "related works", "backgrounds"]:
70
  # title + references + instruction
71
- prompts = fundamental_subprompt + instruction_subprompt + references_subprompt + output_subprompt
72
- elif section in ["experiments"]:
73
- # only title and instruction
74
- prompts = fundamental_subprompt + instruction_subprompt + output_subprompt
75
- elif section in ["methodology", "abstract", "conclusion"]:
 
 
 
 
 
 
 
 
76
  # title + instruction + paper
77
- prompts = fundamental_subprompt + instruction_subprompt + self_subprompt + output_subprompt
 
 
 
 
78
  else:
79
  raise NotImplementedError
80
 
@@ -82,6 +172,16 @@ def generate_paper_prompts(paper_info, section):
82
  return prompts
83
 
84
 
 
 
 
 
 
 
 
 
 
 
85
  def generate_bg_summary_prompts(paper_info, section):
86
  title = paper_info["title"]
87
  description = paper_info["description"]
 
1
  import logging
2
+ from langchain import PromptTemplate
3
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ log = logging.getLogger(__name__)
6
 
 
 
 
7
 
8
+ ######################################################################################################################
9
+ # Some basic functions
10
+ ######################################################################################################################
11
  def generate_keywords_prompts(title, description="", num_refs=5):
12
  prompts = f"I am writing a machine learning paper with the title '{title}'. {description}\n" \
13
  f"Generate three to five keywords. For each keyword, rate it from 1 to {num_refs}; the larger number means more important." \
 
28
 
29
 
30
 
31
+ ######################################################################################################################
32
+ # System Message
33
+ ######################################################################################################################
34
+
35
+ # two parameters: min_refs_num, max_refs_num
36
+ keywords_system_template = """You are an assistant designed to provide accurate and informative keywords of searching academic papers.
37
+ Instructions
38
+ - Your response should always be a Python list; e.g. ["keyword1", "keyword2", "keyword3"]
39
+ - The length of list should between {min_refs_num} and {max_refs_num}
40
+ - Use specific phrases as keywords and avoid using too general words (e.g. machine learning)"""
41
+ # keywords_system_template = """You are an assistant designed to provide related research fields of academic papers.
42
+ # Instructions:
43
+ # - Your response should follow the following output format: ["field1", "field2", "field3"]\n
44
+ # - The length of this Python list should between {min_refs_num} and {max_refs_num}\n
45
+ # - Use specific phrases instead of using too general words (e.g. machine learning)"""
46
+
47
+ # two parameters: min_refs_num, max_refs_num
48
+ exp_methods_system_template = """You are an assistant designed to provide most related algorithms or methods to a given paper title.
49
+ Instructions
50
+ - Your response should always be a Python list; e.g. ["method_name_1", "method_name_2", "method_name_3"]
51
+ - The length of list should between {min_exps_num} and {max_exps_num}
52
+ - Use abbreviation to make each method's name have 5 characters or less."""
53
+
54
+ # one parameter: research_field
55
+ section_generation_system_template = r"""You are an assistant designed to write academic papers in the field of {research_field} using LaTeX.
56
+ Instructions
57
+ - Your response should be professional and in academic tone.
58
+ - Always give a high-level overview at the beginning of each section or subsection.
59
+ """
60
+
61
+ KEYWORDS_SYSTEM = PromptTemplate(input_variables=["min_refs_num", "max_refs_num"],
62
+ template=keywords_system_template)
63
+ EXP_METHODS_SYSTEM = PromptTemplate(input_variables=["min_exps_num", "max_exps_num"],
64
+ template=exp_methods_system_template)
65
+ SECTION_GENERATION_SYSTEM = PromptTemplate(input_variables=["research_field"],
66
+ template=section_generation_system_template)
67
+
68
+
69
+ ######################################################################################################################
70
+ # Academic Paper
71
+ ######################################################################################################################
72
+
73
+ INSTRUCTIONS = {"introduction":
74
+ "- Include five paragraph: Establishing the motivation for the research. Explaining its importance and relevance to the AI community. Clearly state the problem you're addressing, your proposed solution, and the specific research questions or objectives. Briefly mention key related works for context and explain the main differences from this work. List three novel contributions of this paper.",
75
+ "results":
76
+ "Write the theoretical results section using LaTeX. Include theorem and corollary to support this paper (with formulas). Explain what assumptions are used and why they are standard and necessary. Do not include \section{...}. ",
77
+ "conclusion":
78
+ "- Read the existing parts of paper and write the conclusion section.",
79
+ "abstract":
80
+ "- Read the existing parts of paper and write the abstract."}
81
+
82
+
83
+ INSTRUCTIONS["backgrounds"] = "- Start from one high-level paragraph to state the central problem in this field with detailed examples in industrial applications and theoretical challenges. \n" \
84
+ "- Followed by two to three subsections: Explain the foundational concepts and notations that underpin your research using as many as mathematical formulas (written in LaTeX). " \
85
+ "Introduce more necessary mathematical notations, equations, or algorithms that are connected to this work. Present detailed discussions on how these concepts are applied in this paper."
86
+
87
+
88
+ INSTRUCTIONS["related works"] = r"- Discuss three to five main related fields to this paper. " \
89
+ r"For each field, select five to ten key publications from references. " \
90
+ r"For each reference, analyze its strengths and weaknesses in one or two sentences. " \
91
+ r"Present the related works in a logical manner, often chronologically. " \
92
+ r"Consider using a taxonomy or categorization to structure the discussion. " \
93
+ r"Do not use \section{...} or \subsection{...}; use \paragraph{...} to list related fields. "
94
+
95
+ INSTRUCTIONS["methodology"] = "- Provide a high-level overview of the proposed method at the beginning of this section. \n " \
96
+ "- Assume you have some figures ('fig1.png', 'fig2.png', ...); they can be any figures you need (e.g. flow chart, model architecture, sample output, simulation result, or others you need). Insert figures you need with informative caption. \n" \
97
+ "- Use one subsection to give a detailed formulation of the proposed method and explain how it overcomes the weakness of existing methods mentioned in this paper. " \
98
+ " If necessary, write pseudo codes wrapped by \\begin{{algorithm}} ... \\end{{algorithm}} to explain the detailed steps instead of simply listing them. \n" \
99
+ "- Use one follow-up subsection to highlight the key concepts in the proposed method. " \
100
+ " Elaborate the novelty of these key concepts using formulas and inserting appropriate figures. \n" \
101
+ "- Ensure the name of each subsection to be specific. \n"
102
+
103
+ INSTRUCTIONS["experiments"] = "- Provide a high-level overview at the beginning of this section.\n " \
104
+ "- If necessary, include a table to compare with other methods and bold our method.\n" \
105
+ "- Assume you have some figures ('exp1.png', 'exp2.png', ...); they can be any figures you need (e.g. loss curves, comparison with other methods, visualization, or others you need). Insert figures you need with informative caption. \n" \
106
+ "- If necessary, use different subsections to distinguish different experimental setup."
107
+
108
+
109
  def generate_paper_prompts(paper_info, section):
110
  title = paper_info["title"]
111
  description = paper_info["description"]
 
114
 
115
  # fundamental_subprompt - describe the basic information of paper
116
  # instruction_subprompt - tell AI what to do
117
+ # ref_instruction_subprompt - give AI references
118
  # self_subprompt - give AI existing written parts
119
  # output_subprompt - tell AI how to output
120
+ fundamental_subprompt = "Your task is to write the {section} section of the machine learning paper with the title '{title}'. {description}\n"
121
+ instruction_subprompt = "\n" \
122
+ "Your response should follow the following instructions:\n" \
123
+ "{instruction}\n" \
124
+ "- Start with \section{{{section}}}\n"
125
+ ref_instruction_subprompt = "- Read references. " \
126
+ "Every time you use information from the references, you need to appropriately cite it (using \citep or \citet)." \
127
+ "For example of \citep, the sentence where you use information from lei2022adaptive \citep{{lei2022adaptive}}. " \
128
+ "For example of \citet, \citet{{lei2022adaptive}} claims some information.\n" \
129
+ "- Avoid citing the same reference in a same paragraph.\n" \
130
+ "\n" \
131
+ "References:\n" \
132
+ "{references}"
133
+ self_subprompt = "The existing parts of this paper is provided here: {paper}.\n"
134
+ output_subprompt = "Your response should start with \section{{{section}}}. Ensure that it can be directly compiled by LaTeX."
135
+ abstract_output_subprompt = "Your response should start with \\begin{{abstract}} and should end with \\end{{abstract}}. Ensure that it can be directly compiled by LaTeX."
136
+
137
+ reivew_prompts = PromptTemplate(
138
+ input_variables=["title", "description", "instruction", "section", "references"],
139
+ template=fundamental_subprompt + instruction_subprompt + ref_instruction_subprompt + output_subprompt)
140
+ summarization_prompts = PromptTemplate(
141
+ input_variables=["title", "description", "instruction", "section", "paper"],
142
+ template=fundamental_subprompt + instruction_subprompt + self_subprompt + output_subprompt)
143
+ abstract_prompts = PromptTemplate(
144
+ input_variables=["title", "description", "instruction", "section", "paper"],
145
+ template=fundamental_subprompt + instruction_subprompt + self_subprompt + abstract_output_subprompt)
146
 
147
  if section in ["introduction", "related works", "backgrounds"]:
148
  # title + references + instruction
149
+ prompts = reivew_prompts.format(title=title,
150
+ description=description,
151
+ instruction=INSTRUCTIONS[section],
152
+ section=section,
153
+ references=references)
154
+ elif section in ["abstract"]:
155
+ # title + instruction + paper
156
+ prompts = abstract_prompts.format(title=title,
157
+ description=description,
158
+ instruction=INSTRUCTIONS[section],
159
+ section=section,
160
+ paper=paper)
161
+ elif section in ["methodology", "experiments", "conclusion"]:
162
  # title + instruction + paper
163
+ prompts = summarization_prompts.format(title=title,
164
+ description=description,
165
+ instruction=INSTRUCTIONS[section],
166
+ section=section,
167
+ paper=paper)
168
  else:
169
  raise NotImplementedError
170
 
 
172
  return prompts
173
 
174
 
175
+ ######################################################################################################################
176
+ # Literature Review
177
+ ######################################################################################################################
178
+
179
+ BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
180
+ "related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
181
+ "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
182
+
183
+
184
+
185
  def generate_bg_summary_prompts(paper_info, section):
186
  title = paper_info["title"]
187
  description = paper_info["description"]
utils/references.py CHANGED
@@ -39,7 +39,7 @@ def remove_newlines(serie):
39
 
40
  def search_paper_abstract(title):
41
  pg = ProxyGenerator()
42
- success = pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155") # todo: change this to env. var. for protection.
43
  if success:
44
  scholarly.use_proxy(pg)
45
  # input the title of a paper, return its abstract
@@ -136,6 +136,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
136
  authors_str = " and ".join(authors)
137
  try:
138
  last_name = authors[0].split()[-1]
 
139
  except IndexError:
140
  last_name = "ma"
141
  # pattern = r'^\w+'
@@ -149,6 +150,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
149
  # turn the search result to a list of paper dictionary.
150
  papers_ss = []
151
  for raw_paper in search_results_ss:
 
152
  if raw_paper["abstract"] is None:
153
  continue
154
 
@@ -168,7 +170,11 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
168
  abstract = raw_paper['tldr']['text']
169
  else:
170
  abstract = remove_newlines(raw_paper['abstract'])
171
- embeddings = raw_paper['embedding']['vector']
 
 
 
 
172
  result = {
173
  "paper_id": paper_id,
174
  "title": title,
@@ -224,8 +230,6 @@ class References:
224
  for key, counts in keywords_dict.items():
225
  self.papers[key] = _collect_papers_ss(key, counts, tldr)
226
 
227
- # Remove duplicated references # todo: remove duplicated references in tex_processing procedure.
228
-
229
  def find_relevant(self, max_refs=30):
230
  # todo: use embeddings to evaluate
231
  pass
@@ -242,7 +246,12 @@ class References:
242
 
243
  bibtex_entries = []
244
  paper_ids = []
 
245
  for paper in papers:
 
 
 
 
246
  bibtex_entry = f"""@article{{{paper["paper_id"]},
247
  title = {{{paper["title"]}}},
248
  author = {{{paper["authors"]}}},
@@ -287,49 +296,40 @@ class References:
287
 
288
 
289
  if __name__ == "__main__":
290
- # r = ss_search("Deep Q-Networks")['data']
291
- # print(r)
292
- # papers_json = {}
293
- # # for i in range(len(r)):
294
- # # r[i]
295
- # #
296
- # # with open("Output.txt", "w") as text_file:
297
- # # text_file.write("Purchase Amount: %s" % TotalAmount)
298
- # embeddings = r[0]['embedding']['vector']
299
- # print(embeddings)
300
 
 
301
  refs = References()
302
- keywords_dict = {
303
- "Deep Q-Networks": 5,
304
- "Actor-Critic Algorithms": 4,
305
- "Exploration-Exploitation Trade-off": 3
306
- }
307
- refs.collect_papers(keywords_dict, method="ss", tldr=True)
308
- for k in refs.papers:
309
- papers = refs.papers[k]
310
- print("keyword: ", k)
311
- for paper in papers:
312
- print(paper["paper_id"])
313
-
314
- refs.to_json()
315
- refs.to_bibtex()
316
- refs.to_prompts()
317
- # print(refs.papers)
318
-
319
- # todo: test load_papers
320
- # write test covering `references.py`. / fix this as a stable version
321
-
322
- # for p in refs.papers:
323
- # print(p["paper_id"])
324
- # print(len(refs.papers))
325
  #
326
- # papers_json = refs.to_json()
327
- # # print(papers_json)
328
  # with open("papers.json", "w", encoding='utf-8') as text_file:
329
  # text_file.write(f"{papers_json}")
 
 
 
330
 
 
 
 
331
 
332
- # bib = "D:\\Projects\\auto-draft\\latex_templates\\pre_refs.bib"
333
- # papers = load_papers_from_bibtex(bib)
 
334
  # for paper in papers:
335
  # print(paper)
 
39
 
40
  def search_paper_abstract(title):
41
  pg = ProxyGenerator()
42
+ success = pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155")
43
  if success:
44
  scholarly.use_proxy(pg)
45
  # input the title of a paper, return its abstract
 
136
  authors_str = " and ".join(authors)
137
  try:
138
  last_name = authors[0].split()[-1]
139
+ last_name = last_name.replace("'", "")
140
  except IndexError:
141
  last_name = "ma"
142
  # pattern = r'^\w+'
 
150
  # turn the search result to a list of paper dictionary.
151
  papers_ss = []
152
  for raw_paper in search_results_ss:
153
+ print(raw_paper['title'])
154
  if raw_paper["abstract"] is None:
155
  continue
156
 
 
170
  abstract = raw_paper['tldr']['text']
171
  else:
172
  abstract = remove_newlines(raw_paper['abstract'])
173
+ embeddings_dict = raw_paper.get('embedding')
174
+ if embeddings_dict is None:
175
+ continue
176
+ else:
177
+ embeddings = raw_paper['embedding']['vector']
178
  result = {
179
  "paper_id": paper_id,
180
  "title": title,
 
230
  for key, counts in keywords_dict.items():
231
  self.papers[key] = _collect_papers_ss(key, counts, tldr)
232
 
 
 
233
  def find_relevant(self, max_refs=30):
234
  # todo: use embeddings to evaluate
235
  pass
 
246
 
247
  bibtex_entries = []
248
  paper_ids = []
249
+ seen = set()
250
  for paper in papers:
251
+ if paper["paper_id"] in seen:
252
+ continue
253
+ else:
254
+ seen.add(paper["paper_id"])
255
  bibtex_entry = f"""@article{{{paper["paper_id"]},
256
  title = {{{paper["title"]}}},
257
  author = {{{paper["authors"]}}},
 
296
 
297
 
298
  if __name__ == "__main__":
299
+ # testing search results
300
+ r = ss_search("Deep Q-Networks", limit=1) # a list of raw papers
301
+ if r['total'] > 0:
302
+ paper = r['data'][0]
303
+ # print(paper)
 
 
 
 
 
304
 
305
+ # resting References
306
  refs = References()
307
+ # keywords_dict = {
308
+ # "Deep Q-Networks": 5,
309
+ # "Actor-Critic Algorithms": 4,
310
+ # "Exploration-Exploitation Trade-off": 3
311
+ # }
312
+ # refs.collect_papers(keywords_dict, tldr=True)
313
+ # for k in refs.papers:
314
+ # papers = refs.papers[k] # for each keyword, there is a list of papers
315
+ # print("keyword: ", k)
316
+ # for paper in papers:
317
+ # print(paper["paper_id"])
 
 
 
 
 
 
 
 
 
 
 
 
318
  #
319
+ # refs.to_bibtex()
320
+ # papers_json = refs.to_json() # this json can be used to find the most relevant papers
321
  # with open("papers.json", "w", encoding='utf-8') as text_file:
322
  # text_file.write(f"{papers_json}")
323
+ #
324
+ # prompts = refs.to_prompts()
325
+ # print(prompts)
326
 
327
+ bib = "test.bib"
328
+ refs.load_papers(bib, "variance-reduction rl")
329
+ print(refs.papers)
330
 
331
+ prompts = refs.to_prompts()
332
+ for k in prompts:
333
+ print(f"{k}: {prompts[k]}\n")
334
  # for paper in papers:
335
  # print(paper)