shaocongma committed on
Commit 94dc00e
1 Parent(s): 328e8d0

Add a generator wrapper driven by a configuration file. Edit the logic of searching references. Add a Gradio UI for testing the knowledge database.

api_wrapper.py DELETED
@@ -1,42 +0,0 @@
-'''
-This script is used to wrap all generation methods together.
-
-todo:
-A worker keeps running on the server. Monitor the Amazon SQS. Once receive a new message, do the following:
-    Download the corresponding configuration files on S3.
-    Change Task status from Pending to Running.
-    Call `generator_wrapper` and wait for the outputs.
-    If `generator_wrapper` returns results:
-        evaluate the results; compile it; upload results to S3 ... Change Task status from Running to Completed.
-        If anything goes wrong, raise Error.
-    If `generator_wrapper` returns nothing or Timeout, or raise any error:
-        Change Task status from Running to Failed.
-'''
-import os.path
-
-from auto_backgrounds import generate_draft
-import json, time
-from utils.file_operations import make_archive
-
-
-GENERATOR_MAPPING = {"fake": None,  # a fake generator
-                     "draft": generate_draft  # generate academic paper
-                     }
-
-def generator_wrapper(config):
-    generator = GENERATOR_MAPPING[config["generator"]]
-
-
-def generator_wrapper_from_json(path_to_config_json):
-    # Read configuration file and call corresponding function
-    with open(path_to_config_json, "r", encoding='utf-8') as f:
-        config = json.load(f)
-    print("Configuration:", config)
-    # generator = GENERATOR_MAPPING.get(config["generator"])
-    generator = None
-    if generator is None:
-        # generate a fake ZIP file and upload
-        time.sleep(150)
-        zip_path = os.path.splitext(path_to_config_json)[0] + ".zip"
-        return make_archive(path_to_config_json, zip_path)
app.py CHANGED
@@ -2,9 +2,10 @@ import uuid
 import gradio as gr
 import os
 import openai
-from auto_backgrounds import generate_backgrounds, generate_draft
+import yaml
 from utils.file_operations import list_folders, urlify
 from huggingface_hub import snapshot_download
+from wrapper import generator_wrapper
 
 # todo:
 # 6. get logs when the procedure is not completed. *
@@ -22,8 +23,10 @@ from huggingface_hub import snapshot_download
 # OPENAI_API_BASE: (Optional) Support alternative OpenAI minors
 # GPT4_ENABLE: (Optional) Set it to 1 to enable GPT-4 model.
 
-# AWS_ACCESS_KEY_ID: (Optional) Access AWS cloud storage (you need to edit `BUCKET_NAME` in `utils/storage.py` if you need to use this function)
-# AWS_SECRET_ACCESS_KEY: (Optional) Access AWS cloud storage (you need to edit `BUCKET_NAME` in `utils/storage.py` if you need to use this function)
+# AWS_ACCESS_KEY_ID: (Optional)
+#   Access AWS cloud storage (you need to edit `BUCKET_NAME` in `utils/storage.py` if you need to use this function)
+# AWS_SECRET_ACCESS_KEY: (Optional)
+#   Access AWS cloud storage (you need to edit `BUCKET_NAME` in `utils/storage.py` if you need to use this function)
 # KDB_REPO: (Optional) A Huggingface dataset hosting Knowledge Databases
 # HF_TOKEN: (Optional) Access to KDB_REPO
 
@@ -34,7 +37,7 @@ openai_key = os.getenv("OPENAI_API_KEY")
 openai_api_base = os.getenv("OPENAI_API_BASE")
 if openai_api_base is not None:
     openai.api_base = openai_api_base
-GPT4_ENABLE = os.getenv("GPT4_ENABLE")  # disable GPT-4 for public repo
+GPT4_ENABLE = os.getenv("GPT4_ENABLE")  # disable GPT-4 for public repo
 
 access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
 secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
@@ -124,7 +127,7 @@ REFERENCES = """## 一键搜索相关论文
 REFERENCES_INSTRUCTION = """### References
 这一栏用于定义AI如何选取参考文献. 目前是两种方式混合:
 1. GPT自动根据标题生成关键字,使用Semantic Scholar搜索引擎搜索文献,利用Specter获取Paper Embedding来自动选取最相关的文献作为GPT的参考资料.
-2. 用户上传bibtex文件,使用Google Scholar搜索摘要作为GPT的参考资料.
+2. 用户通过输入文章标题(用英文逗号隔开), AI会自动搜索文献作为参考资料.
 关于有希望利用本地文件来供GPT参考的功能将在未来实装.
 """
 
@@ -140,7 +143,7 @@ OUTPUTS_INSTRUCTION = """### Outputs
 这一栏用于定义输出的内容:
 * Template: 用于填装内容的LaTeX模板.
 * Models: 使用GPT-4或者GPT-3.5-Turbo生成内容.
-* Prompts模式: 不生成内容, 而是生成用于生成内容的Prompts. 可以手动复制到网页版或者其他语言模型中进行使用.
+* Prompts模式: 不生成内容, 而是生成用于生成内容的Prompts. 可以手动复制到网页版或者其他语言模型中进行使用. (放在输出的ZIP文件的prompts.json文件中)
 """
 
 OTHERS_INSTRUCTION = """### Others
@@ -164,18 +167,34 @@ def clear_inputs(*args):
 def clear_inputs_refs(*args):
     return "", 5
 
+
 def wrapped_generator(
         paper_title, paper_description,  # main input
-        openai_api_key=None, openai_url=None,  # key
-        tldr=True, max_kw_refs=10, bib_refs=None, max_tokens_ref=2048,  # references
+        openai_api_key=None,  # key
+        tldr=True, max_kw_refs=10, refs=None, max_tokens_ref=2048,  # references
        knowledge_database=None, max_tokens_kd=2048, query_counts=10,  # domain knowledge
        paper_template="ICLR2022", selected_sections=None, model="gpt-4", prompts_mode=False,  # outputs parameters
        cache_mode=IS_CACHE_AVAILABLE  # handle cache mode
 ):
-    # if `cache_mode` is True, then always upload the generated content to my S3.
     file_name_upload = urlify(paper_title) + "_" + uuid.uuid1().hex + ".zip"
-    if bib_refs is not None:
-        bib_refs = bib_refs.name
+
+    # load the default configuration file
+    with open("configurations/default.yaml", 'r') as file:
+        config = yaml.safe_load(file)
+    config["paper"]["title"] = paper_title
+    config["paper"]["description"] = paper_description
+    config["references"]["tldr"] = tldr
+    config["references"]["max_kw_refs"] = max_kw_refs
+    config["references"]["refs"] = refs
+    config["references"]["max_tokens_ref"] = max_tokens_ref
+    config["domain_knowledge"]["knowledge_database"] = knowledge_database
+    config["domain_knowledge"]["max_tokens_kd"] = max_tokens_kd
+    config["domain_knowledge"]["query_counts"] = query_counts
+    config["output"]["selected_sections"] = selected_sections
+    config["output"]["model"] = model
+    config["output"]["template"] = paper_template
+    config["output"]["prompts_mode"] = prompts_mode
+
     if openai_api_key is not None:
         openai.api_key = openai_api_key
     try:
@@ -183,12 +202,7 @@ def wrapped_generator(
     except Exception as e:
         raise gr.Error(f"Key错误. Error: {e}")
     try:
-        output = generate_draft(
-            paper_title, description=paper_description,  # main input
-            tldr=tldr, max_kw_refs=max_kw_refs, bib_refs=bib_refs, max_tokens_ref=max_tokens_ref,  # references
-            knowledge_database=knowledge_database, max_tokens_kd=max_tokens_kd, query_counts=query_counts,  # domain knowledge
-            sections=selected_sections, model=model, template=paper_template, prompts_mode=prompts_mode,  # outputs parameters
-        )
+        output = generator_wrapper(config)
         if cache_mode:
             from utils.storage import upload_file
             upload_file(output, target_name=file_name_upload)
@@ -204,8 +218,6 @@ with gr.Blocks(theme=theme) as demo:
         with gr.Column(scale=2):
             key = gr.Textbox(value=openai_key, lines=1, max_lines=1, label="OpenAI Key",
                              visible=not IS_OPENAI_API_KEY_AVAILABLE)
-            url = gr.Textbox(value=None, lines=1, max_lines=1, label="URL",
-                             visible=False)
    # 每个功能做一个tab
    with gr.Tab("学术论文"):
        gr.Markdown(ACADEMIC_PAPER)
@@ -230,8 +242,8 @@ with gr.Blocks(theme=theme) as demo:
                                           interactive=GPT4_INTERACTIVE,
                                           info="生成论文用到的语言模型.")
            prompts_mode = gr.Checkbox(value=False, visible=True, interactive=True,
-                                      label="Prompts模式",
-                                      info="只输出用于生成论文的Prompts, 可以复制到别的地方生成论文.")
+                                      label="Prompts模式",
+                                      info="只输出用于生成论文的Prompts, 可以复制到别的地方生成论文.")
 
            sections = gr.CheckboxGroup(
                choices=["introduction", "related works", "backgrounds", "methodology", "experiments",
@@ -245,21 +257,27 @@ with gr.Blocks(theme=theme) as demo:
 
        with gr.Column(scale=2):
            max_kw_ref_slider = gr.Slider(minimum=1, maximum=20, value=10, step=1,
-                                         interactive=True, label="MAX_KW_REFS",
-                                         info="每个Keyword搜索几篇参考文献", visible=False)
+                                         interactive=True, label="MAX_KW_REFS",
+                                         info="每个Keyword搜索几篇参考文献", visible=False)
 
            max_tokens_ref_slider = gr.Slider(minimum=256, maximum=8192, value=2048, step=2,
-                                             interactive=True, label="MAX_TOKENS",
-                                             info="参考文献内容占用Prompts中的Token数")
+                                             interactive=True, label="MAX_TOKENS",
+                                             info="参考文献内容占用Prompts中的Token数")
 
            tldr_checkbox = gr.Checkbox(value=True, label="TLDR;",
                                        info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
                                        interactive=True)
-           gr.Markdown('''
-           上传.bib文件提供AI需要参考的文献.
-           ''')
-           bibtex_file = gr.File(label="Upload .bib file", file_types=["text"],
-                                 interactive=True)
+
+           text_ref = gr.Textbox(lines=5, label="References (Optional)", visible=True,
+                                 info="交给AI参考的文献的标题, 用英文逗号`,`隔开.")
+
+           gr.Examples(
+               examples=["Understanding the Impact of Model Incoherence on Convergence of Incremental SGD with Random Reshuffle,"
+                         "Variance-Reduced Off-Policy TDC Learning: Non-Asymptotic Convergence Analysis,"
+                         "Greedy-GQ with Variance Reduction: Finite-time Analysis and Improved Complexity"],
+               inputs=text_ref,
+               cache_examples=False
+           )
 
    with gr.Row():
        with gr.Column(scale=1):
@@ -267,11 +285,11 @@ with gr.Blocks(theme=theme) as demo:
 
        with gr.Column(scale=2):
            query_counts_slider = gr.Slider(minimum=1, maximum=20, value=10, step=1,
-                                           interactive=True, label="QUERY_COUNTS",
-                                           info="从知识库内检索多少条内容", visible=False)
+                                           interactive=True, label="QUERY_COUNTS",
+                                           info="从知识库内检索多少条内容", visible=False)
            max_tokens_kd_slider = gr.Slider(minimum=256, maximum=8192, value=2048, step=2,
-                                            interactive=True, label="MAX_TOKENS",
-                                            info="知识库内容占用Prompts中的Token数")
+                                            interactive=True, label="MAX_TOKENS",
+                                            info="知识库内容占用Prompts中的Token数")
            domain_knowledge = gr.Dropdown(label="预载知识库",
                                           choices=ALL_DATABASES,
                                           value="(None)",
@@ -296,8 +314,8 @@ with gr.Blocks(theme=theme) as demo:
    json_output = gr.JSON(label="References")
    clear_button_pp.click(fn=clear_inputs, inputs=[title, description_pp], outputs=[title, description_pp])
    submit_button_pp.click(fn=wrapped_generator,
-                          inputs=[title, description_pp, key, url,
-                                  tldr_checkbox, max_kw_ref_slider, bibtex_file, max_tokens_ref_slider,
+                          inputs=[title, description_pp, key,
+                                  tldr_checkbox, max_kw_ref_slider, text_ref, max_tokens_ref_slider,
                                   domain_knowledge, max_tokens_kd_slider, query_counts_slider,
                                   template, sections, model_selection, prompts_mode], outputs=file_output)
assets/idealab.png DELETED
Binary file (52.1 kB)
 
auto_backgrounds.py → auto_generators.py RENAMED
@@ -40,7 +40,7 @@ def log_usage(usage, generating_target, print_out=True):
 
 
 def _generation_setup(title, description="", template="ICLR2022",
-                      tldr=False, max_kw_refs=10, bib_refs=None, max_tokens_ref=2048,  # generating references
+                      tldr=False, max_kw_refs=10, refs=None, max_tokens_ref=2048,  # generating references
                       knowledge_database=None, max_tokens_kd=2048, query_counts=10,  # querying from knowledge database
                       debug=True):
     """
@@ -115,7 +115,7 @@ def _generation_setup(title, description="", template="ICLR2022",
 
     print("Keywords: \n", keywords)
     # todo: in some rare situations, collected papers will be an empty list. handle this issue
-    ref = References(title, bib_refs)
+    ref = References(title, load_papers=refs)
     ref.collect_papers(keywords, tldr=tldr)
     references = ref.to_prompts(max_tokens=max_tokens_ref)
     all_paper_ids = ref.to_bibtex(bibtex_path)
@@ -200,7 +200,7 @@ def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-
 
 
 def generate_draft(title, description="",  # main input
-                   tldr=True, max_kw_refs=10, bib_refs=None, max_tokens_ref=2048,  # references
+                   tldr=True, max_kw_refs=10, refs=None, max_tokens_ref=2048,  # references
                    knowledge_database=None, max_tokens_kd=2048, query_counts=10,  # domain knowledge
                    sections=None, model="gpt-4", template="ICLR2022", prompts_mode=False,  # outputs parameters
                    ):
@@ -245,7 +245,7 @@ def generate_draft(title, description="",  # main input
                     "abstract"]
     else:
         sections = _filter_sections(sections)
-    paper, destination_folder, _ = _generation_setup(title, description, template, tldr, max_kw_refs, bib_refs,
+    paper, destination_folder, _ = _generation_setup(title, description, template, tldr, max_kw_refs, refs,
                                                      max_tokens_ref=max_tokens_ref, max_tokens_kd=max_tokens_kd,
                                                      query_counts=query_counts,
                                                      knowledge_database=knowledge_database)
@@ -254,11 +254,10 @@ def generate_draft(title, description="",  # main input
     prompts_dict = {}
     print(f"================PROCESSING================")
     for section in sections:
+        prompts = generate_paper_prompts(paper, section)
+        prompts_dict[section] = prompts
         if prompts_mode:
-            prompts = generate_paper_prompts(paper, section)
-            prompts_dict[section] = prompts
             continue
-
         print(f"Generate {section} part...")
         max_attempts = 4
         attempts_count = 0
@@ -274,21 +273,16 @@ def generate_draft(title, description="",  # main input
             logging.info(message)
             attempts_count += 1
             time.sleep(15)
-
     # post-processing
     print("================POST-PROCESSING================")
     create_copies(destination_folder)
-    input_dict = {"title": title, "description": description, "generator": "generate_draft"}
-    filename = hash_name(input_dict) + ".zip"
+    filename = "prompts.json"
+    with open(os.path.join(destination_folder, filename), "w") as f:
+        json.dump(prompts_dict, f)
     print("\nMission completed.\n")
-    if prompts_mode:
-        filename = hash_name(input_dict) + ".json"
-        with open(filename, "w") as f:
-            json.dump(prompts_dict, f)
-        return filename
-    else:
-        return make_archive(destination_folder, filename)
+    return destination_folder
+
+    # return make_archive(destination_folder, filename)
 
 
 if __name__ == "__main__":
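
Note on this change: `generate_draft` now always builds `prompts_dict`, writes it to `prompts.json` inside the working folder, and returns that folder instead of an archive (archiving moved to `wrapper.py`). A minimal sketch of reading the prompts back, assuming a configured OpenAI key and a completed run:

    import json
    import os
    from auto_generators import generate_draft

    # prompts_mode=True skips section generation but still writes prompts.json
    folder = generate_draft("playing atari game with deep reinforcement learning",
                            prompts_mode=True)
    with open(os.path.join(folder, "prompts.json")) as f:
        prompts = json.load(f)  # e.g. {"introduction": "...", "methodology": "..."}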
configurations/default.yaml ADDED
@@ -0,0 +1,29 @@
+date: 2023-07-11
+
+generator: "auto_draft"
+
+paper:
+  title: "playing atari game with deep reinforcement learning"
+  description: ""
+
+references:
+  tldr: True
+  max_kw_refs: 10
+  max_tokens_ref: 2048
+  refs: null
+
+domain_knowledge:
+  knowledge_database: null
+  max_tokens_kd: 2048
+  query_counts: 10
+
+output:
+  template: "default"
+  model: "gpt-4"
+  selected_sections: null
+  prompts_mode: False
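
This configuration is what `app.py` now edits in memory and what the new `wrapper.py` (added later in this commit) consumes. A sketch of the round trip, assuming the repository layout in this commit:

    import yaml
    from wrapper import generator_wrapper

    with open("configurations/default.yaml", "r") as f:
        config = yaml.safe_load(f)

    # override fields the same way wrapped_generator in app.py does
    config["paper"]["title"] = "playing atari game with deep reinforcement learning"
    config["references"]["refs"] = "Playing Atari with Deep Reinforcement Learning"  # comma-separated titles

    zip_path = generator_wrapper(config)  # returns the path of the archived draft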
cyber-supervisor-openai.py CHANGED
@@ -3,7 +3,7 @@ import openai
 import ast
 from tools import functions, TOOLS
 
-MAX_ITER = 5
+MAX_ITER = 99
 
 openai.api_key = os.getenv("OPENAI_API_KEY")
 default_model = os.getenv("DEFAULT_MODEL")
idealab.py DELETED
@@ -1,144 +0,0 @@
-import gradio as gr
-import os
-import openai
-from utils.references import References
-from utils.gpt_interaction import GPTModel
-from utils.prompts import SYSTEM
-
-openai_key = os.getenv("OPENAI_API_KEY")
-default_model = os.getenv("DEFAULT_MODEL")
-if default_model is None:
-    # default_model = "gpt-3.5-turbo-16k"
-    default_model = "gpt-4"
-
-openai.api_key = openai_key
-
-paper_system_prompt = '''You are an assistant designed to propose choices of research direction.
-The user will input questions or some keywords of a fields. You need to generate some paper titles and main contributions. Ensure follow the following instructions:
-Instruction:
-- Your response should follow the JSON format.
-- Your response should have the following structure:
-{
-  "your suggested paper title":
-  {
-    "summary": "an overview introducing what this paper will include",
-    "contributions": {
-      "contribution1": {"statement": "briefly describe this contribution", "reason": "reason why this contribution can make this paper outstanding"},
-      "contribution2": {"statement": "briefly describe this contribution", "reason": "reason why this contribution can make this paper outstanding"},
-      ...
-    }
-  }
-  "your suggested paper title":
-  {
-    "summary": "an overview introducing what this paper will include",
-    "contributions": {
-      "contribution1": {"statement": "briefly describe this contribution", "reason": "reason why this contribution can make this paper outstanding"},
-      "contribution2": {"statement": "briefly describe this contribution", "reason": "reason why this contribution can make this paper outstanding"},
-      ...
-    }
-  }
-  ...
-}
-- Please list three to five suggested title and at least three contributions for each paper.
-'''
-
-
-contribution_system_prompt = '''You are an assistant designed to criticize the contributions of a paper. You will be provided Paper's Title, References and Contributions. Ensure follow the following instructions:
-Instruction:
-- Your response should follow the JSON format.
-- Your response should have the following structure:
-{
-  "title": "the title provided by the user",
-  "comment": "your thoughts on if this title clearly reflects the key ideas of this paper and explain why"
-  "contributions": {
-    "contribution1": {"statement": "briefly describe what the contribution is",
-                      "reason": "reason why the user claims it is a contribution",
-                      "judge": "your thought about if this is a novel contribution and explain why",
-                      "suggestion": "your suggestion on how to modify the research direction to enhance the novelty "},
-    "contribution2": {"statement": "briefly describe what the contribution is",
-                      "reason": "reason why the user claims it is a contribution",
-                      "judge": "your thought about if this is a novel contribution and explain why",
-                      "suggestion": "your suggestion on how to modify the research direction to enhance the novelty "},
-    ...
-  }
-}
-- You need to carefully check if the claimed contribution has been made in the provided references, which makes the contribution not novel.
-- You also need to propose your concerns on if any of contributions could be incremental or just a mild modification on an existing work.
-'''
-
-
-ANNOUNCEMENT = """
-<h1 style="text-align: center"><img src='/file=assets/idealab.png' width=36px style="display: inline"/>灵感实验室IdeaLab</h1>
-
-<p>灵感实验室IdeaLab可以为你选择你下一篇论文的研究方向! 输入你的研究领域或者任何想法, 灵感实验室会自动生成若干个论文标题+论文的主要贡献供你选择. </p>
-
-<p>除此之外, 输入你的论文标题+主要贡献, 它会自动搜索相关文献, 来验证这个想法是不是有人做过了.</p>
-"""
-
-
-def criticize_my_idea(title, contributions, max_tokens=4096):
-    ref = References(title=title, description=f"{contributions}")
-    keywords, _ = llm(systems=SYSTEM["keywords"], prompts=title, return_json=True)
-    keywords = {keyword: 10 for keyword in keywords}
-    ref.collect_papers(keywords)
-    ref_prompt = ref.to_prompts(max_tokens=max_tokens)
-
-    prompt = f"Title: {title}\n References: {ref_prompt}\n Contributions: {contributions}"
-    output, _ = llm(systems=contribution_system_prompt, prompts=prompt, return_json=True)
-    return output, ref_prompt
-
-def paste_title(suggestions):
-    if suggestions:
-        title = suggestions['title']['new title']
-        contributions = suggestions['contributions']
-
-        return title, contributions, {}, {}, {}
-    else:
-        return "", "", {}, {}, {}
-
-def generate_choices(thoughts):
-    output, _ = llm(systems=paper_system_prompt, prompts=thoughts, return_json=True)
-    return output
-
-
-# def translate_json(json_input):
-#     system_prompt = "You are a translation bot. The user will input a JSON format string. You need to translate it into Chinese and return in the same formmat."
-#     output, _ = llm(systems=system_prompt, prompts=str(json_input), return_json=True)
-#     return output
-
-
-with gr.Blocks() as demo:
-    llm = GPTModel(model=default_model)
-
-    gr.HTML(ANNOUNCEMENT)
-    with gr.Row():
-        with gr.Tab("生成论文想法 (Generate Paper Ideas)"):
-            thoughts_input = gr.Textbox(label="Thoughts")
-            with gr.Accordion("Show prompts", open=False):
-                prompts_1 = gr.Textbox(label="Prompts", interactive=False, value=paper_system_prompt)
-
-            with gr.Row():
-                button_generate_idea = gr.Button("Make it an idea!", variant="primary")
-
-        with gr.Tab("验证想法可行性 (Validate Feasibility)"):
-            title_input = gr.Textbox(label="Title")
-            contribution_input = gr.Textbox(label="Contributions", lines=5)
-            with gr.Accordion("Show prompts", open=False):
-                prompts_2 = gr.Textbox(label="Prompts", interactive=False, value=contribution_system_prompt)
-
-            with gr.Row():
-                button_submit = gr.Button("Criticize my idea!", variant="primary")
-
-        with gr.Tab("生成论文 (Generate Paper)"):
-            gr.Markdown("...")
-
-        with gr.Column(scale=1):
-            contribution_output = gr.JSON(label="Contributions")
-            # cn_output = gr.JSON(label="主要贡献")
-            with gr.Accordion("References", open=False):
-                references_output = gr.JSON(label="References")
-
-    button_submit.click(fn=criticize_my_idea, inputs=[title_input, contribution_input], outputs=[contribution_output, references_output])
-    button_generate_idea.click(fn=generate_choices, inputs=thoughts_input, outputs=contribution_output)#.success(translate_json, contribution_output, cn_output)
-    demo.queue(concurrency_count=1, max_size=5, api_open=False)
-    demo.launch(show_error=True)
kdb_test.py CHANGED
@@ -6,11 +6,15 @@ import gradio as gr
 import os
 import json
 from models import EMBEDDINGS
+from utils.gpt_interaction import GPTModel
+from utils.prompts import SYSTEM
+import openai
 
-# todo: 功能还没做
+llm = GPTModel(model="gpt-3.5-turbo")
+openai.api_key = os.getenv("OPENAI_API_KEY")
 
-HF_TOKEN = None  # os.getenv("HF_TOKEN")
-REPO_ID = None  # os.getenv("KDB_REPO")
+HF_TOKEN = os.getenv("HF_TOKEN")
+REPO_ID = os.getenv("KDB_REPO")
 if HF_TOKEN is not None and REPO_ID is not None:
     snapshot_download(REPO_ID, repo_type="dataset", local_dir="knowledge_databases/",
                       local_dir_use_symlinks=False, token=HF_TOKEN)
@@ -50,6 +54,29 @@ def query_from_kdb(input, kdb, query_counts):
         raise RuntimeError(f"Failed to query from FAISS.")
     return domain_knowledge, ""
 
+def query_from_kdb_llm(title, contributions, kdb, query_counts):
+    if kdb == "(None)":
+        return {"knowledge_database": "(None)", "title": title, "contributions": contributions, "output": ""}, "", {}
+
+    db_path = f"knowledge_databases/{kdb}"
+    db_config_path = os.path.join(db_path, "db_meta.json")
+    db_index_path = os.path.join(db_path, "faiss_index")
+    if os.path.isdir(db_path):
+        # load configuration file
+        with open(db_config_path, "r", encoding="utf-8") as f:
+            db_config = json.load(f)
+        model_name = db_config["embedding_model"]
+        embeddings = EMBEDDINGS[model_name]
+        db = FAISS.load_local(db_index_path, embeddings)
+        knowledge = Knowledge(db=db)
+        prompts = f"Title: {title}\n Contributions: {contributions}"
+        preliminaries_kw, _ = llm(systems=SYSTEM["preliminaries"], prompts=prompts, return_json=True)
+        knowledge.collect_knowledge(preliminaries_kw, max_query=query_counts)
+        domain_knowledge = knowledge.to_json()
+    else:
+        raise RuntimeError(f"Failed to query from FAISS.")
+    return domain_knowledge, "", preliminaries_kw
+
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
@@ -76,9 +103,16 @@ with gr.Blocks() as demo:
                                            interactive=True, label="QUERY_COUNTS",
                                            info="How many contents will be retrieved from the vector database.")
 
-    retrieval_output = gr.JSON(label="Output")
+    with gr.Column():
+        retrieval_output = gr.JSON(label="Output")
+        llm_kws = gr.JSON(label="Keywords generated by LLM")
 
-    button_retrieval.click(fn=query_from_kdb, inputs=[user_input, kdb_dropdown, query_counts_slider], outputs=[retrieval_output, user_input])
+    button_retrieval.click(fn=query_from_kdb,
+                           inputs=[user_input, kdb_dropdown, query_counts_slider],
+                           outputs=[retrieval_output, user_input])
+    button_retrieval_2.click(fn=query_from_kdb_llm,
+                             inputs=[title_input, contribution_input, kdb_dropdown, query_counts_slider],
+                             outputs=[retrieval_output, user_input, llm_kws])
 
 demo.queue(concurrency_count=1, max_size=5, api_open=False)
 demo.launch(show_error=True)
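
The new `query_from_kdb_llm` path can also be exercised outside the UI; a sketch, assuming a database folder exists under `knowledge_databases/` (the name "ml_textbooks" here is hypothetical), and noting that importing `kdb_test` as-is would also launch the demo, so in practice the function would be factored out first:

    domain_knowledge, _, llm_keywords = query_from_kdb_llm(
        title="Greedy-GQ with Variance Reduction",
        contributions="Finite-time analysis of Greedy-GQ.",
        kdb="ml_textbooks",  # hypothetical knowledge database name
        query_counts=5,
    )
    # llm_keywords: preliminaries generated by GPT-3.5 from title + contributions
    # domain_knowledge: retrieved contents serialized by Knowledge.to_json()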
references_generator.py DELETED
@@ -1,86 +0,0 @@
-'''
-This script is used to generate the most relevant papers of a given title.
-- Search for as many as possible references. For 10~15 keywords, 10 references each.
-- Sort the results from most relevant to least relevant.
-- Return the most relevant using token size.
-
-Note: we do not use this function in auto-draft function. It has been integrated in that.
-'''
-
-import os.path
-import json
-from utils.references import References
-from section_generator import keywords_generation  # section_generation_bg, #, figures_generation, section_generation
-import itertools
-from gradio_client import Client
-
-
-def generate_raw_references(title, description="",
-                            bib_refs=None, tldr=False, max_kw_refs=10,
-                            save_to="ref.bib"):
-    # load pre-provided references
-    ref = References(title, bib_refs)
-
-    # generate multiple keywords for searching
-    input_dict = {"title": title, "description": description}
-    keywords, usage = keywords_generation(input_dict)
-    keywords = list(keywords)
-    comb_keywords = list(itertools.combinations(keywords, 2))
-    for comb_keyword in comb_keywords:
-        keywords.append(" ".join(comb_keyword))
-    keywords = {keyword: max_kw_refs for keyword in keywords}
-    print(f"keywords: {keywords}\n\n")
-
-    ref.collect_papers(keywords, tldr=tldr)
-    paper_json = ref.to_json()
-
-    with open(save_to, "w") as f:
-        json.dump(paper_json, f)
-
-    return save_to, ref  # paper_json
-
-def generate_top_k_references(title, description="",
-                              bib_refs=None, tldr=False, max_kw_refs=10, save_to="ref.bib", top_k=5):
-    json_path, ref_raw = generate_raw_references(title, description, bib_refs, tldr, max_kw_refs, save_to)
-    json_content = ref_raw.to_json()
-
-    client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
-    result = client.predict(
-        title,  # str in 'Title' Textbox component
-        json_path,  # str (filepath or URL to file) in 'Papers JSON (as string)' File component
-        top_k,  # int | float (numeric value between 1 and 50) in 'Top-k Relevant Papers' Slider component
-        api_name="/get_k_relevant_papers"
-    )
-    with open(result) as f:
-        result = json.load(f)
-    return result
-
-
-if __name__ == "__main__":
-    import openai
-    openai.api_key = os.getenv("OPENAI_API_KEY")
-
-    title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
-    description = ""
-    save_to = "paper.json"
-    save_to, paper_json = generate_raw_references(title, description, save_to=save_to)
-
-    print("`paper.json` has been generated. Now evaluating its similarity...")
-
-    k = 5
-    client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
-    result = client.predict(
-        title,  # str in 'Title' Textbox component
-        save_to,  # str (filepath or URL to file) in 'Papers JSON (as string)' File component
-        k,  # int | float (numeric value between 1 and 50) in 'Top-k Relevant Papers' Slider component
-        api_name="/get_k_relevant_papers"
-    )
-
-    with open(result) as f:
-        result = json.load(f)
-
-    print(result)
-
-    save_to = "paper2.json"
-    with open(save_to, "w") as f:
-        json.dump(result, f)
utils/knowledge.py CHANGED
@@ -16,7 +16,7 @@ class Knowledge:
         self.db = db
         self.contents = []
 
-    def collect_knowledge(self, keywords_dict, max_query):
+    def collect_knowledge(self, keywords_dict: dict, max_query: int):
         """
         keywords_dict:
             {"machine learning": 5, "language model": 2};
utils/references.py CHANGED
@@ -3,52 +3,68 @@
 #
 # Generate references:
 # `Reference` class:
-# 1. Read a given .bib file to collect papers; use `search_paper_abstract` method to fill missing abstract.
+# 1. Two methods to load papers:
+#    1.1. Read a given string including paper titles separated by `,`
+#    1.2. Read a .bib file
 # 2. Given some keywords; use Semantic Scholar API to find papers.
 # 3. Generate bibtex from the selected papers. --> to_bibtex()
 # 4. Generate prompts from the selected papers: --> to_prompts()
 #     A sample prompt: {"paper_id": "paper summary"}
+# 5. Generate json from the selected papers. --> to_json()
 
-# todo: (1) citations & citedby of provided papers:
-#       load the pre-defined papers; use S2 to find all related works
-#       add all citations to `bib_papers`
-#       add all citedby to `bib_papers`
-#       use Semantic Scholar to find their embeddings
-#       (2) separate references:
-#       divide references into different groups to reduce the tokens count
-#       for generating different paragraph of related works, use different set of references
-from typing import Dict, List
-import requests
+import itertools
+import json
 import re
+import uuid
+from typing import Dict, List, Optional, Union
+
+import arxiv
 import bibtexparser
-import random
-from scholarly import scholarly
-from scholarly import ProxyGenerator
-import tiktoken
-import itertools, uuid, json
-from gradio_client import Client
-import time
 import numpy as np
+import requests
+import tiktoken
 from numpy.linalg import norm
+from scholarly import ProxyGenerator
+from scholarly import scholarly
 
-
+# used to evaluate embeddings
 URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
 MAX_BATCH_SIZE = 16
 MAX_ATTEMPTS = 20
 
+# `tokenizer`: used to count how many tokens
+tokenizer_name = tiktoken.encoding_for_model('gpt-4')
+tokenizer = tiktoken.get_encoding(tokenizer_name.name)
+
+
 ######################################################################################################################
 # Some basic tools
 ######################################################################################################################
+def remove_special_characters(s):
+    return ''.join(c for c in s if c.isalnum() or c.isspace() or c == ',')
+
+
+def remove_newlines(serie):
+    # This function is applied to the abstract of each paper to reduce the length of prompts.
+    serie = serie.replace('\n', ' ')
+    serie = serie.replace('\\n', ' ')
+    serie = serie.replace('  ', ' ')
+    serie = serie.replace('  ', ' ')
+    return serie
+
+
 def evaluate_cosine_similarity(v1, v2):
     try:
-        return np.dot(v1, v2)/(norm(v1)*norm(v2))
+        return np.dot(v1, v2) / (norm(v1) * norm(v2))
     except ValueError:
         return 0.0
 
+
 def chunks(lst, chunk_size=MAX_BATCH_SIZE):
     """Splits a longer list to respect batch size"""
     for i in range(0, len(lst), chunk_size):
-        yield lst[i : i + chunk_size]
+        yield lst[i: i + chunk_size]
+
 
 def embed(papers):
     embeddings_by_paper_id: Dict[str, List[float]] = {}
@@ -64,6 +80,7 @@ def embed(papers):
 
     return embeddings_by_paper_id
 
+
 def get_embeddings(paper_title, paper_description):
     output = [{"title": paper_title, "abstract": paper_description, "paper_id": "target_paper"}]
     emb_vector = embed(output)["target_paper"]
@@ -71,9 +88,17 @@ def get_embeddings(paper_title, paper_description):
     target_paper["embeddings"] = emb_vector
     return target_paper
 
+
+def get_embeddings_vector(paper_title, paper_description):
+    output = [{"title": paper_title, "abstract": paper_description, "paper_id": "target_paper"}]
+    emb_vector = embed(output)["target_paper"]
+    return emb_vector
+
+
 def get_top_k(papers_dict, paper_title, paper_description, k=None):
+    # returns the top k papers most similar to the target paper
     target_paper = get_embeddings(paper_title, paper_description)
-    papers = papers_dict # must include embeddings
+    papers = papers_dict  # must include embeddings
 
     # if k < len(papers_json), return k most relevant papers
     # if k >= len(papers_json) or k is None, return all papers
@@ -88,7 +113,7 @@ def get_top_k(papers_dict, paper_title, paper_description, k=None):
     for k in papers:
         v = papers[k]
         embedding_vector = v["embeddings"]
-        cos_sim = evaluate_cosine_similarity(embedding_vector, target_embedding_vector)
+        cos_sim = evaluate_cosine_similarity(embedding_vector, target_embedding_vector)
         papers[k]["cos_sim"] = cos_sim
 
     # return the best k papers
@@ -97,14 +122,6 @@ def get_top_k(papers_dict, paper_title, paper_description, k=None):
         sorted_papers[key].pop("embeddings", None)
     return sorted_papers
 
-def remove_newlines(serie):
-    # This function is applied to the abstract of each paper to reduce the length of prompts.
-    serie = serie.replace('\n', ' ')
-    serie = serie.replace('\\n', ' ')
-    serie = serie.replace('  ', ' ')
-    serie = serie.replace('  ', ' ')
-    return serie
-
 
 def search_paper_abstract(title):
     pg = ProxyGenerator()
@@ -123,6 +140,159 @@ def search_paper_abstract(title):
     return remove_newlines(found_paper['bib']['abstract'])
 
 
+def tiktoken_len(text):
+    # evaluate how many tokens for the given text
+    tokens = tokenizer.encode(text, disallowed_special=())
+    return len(tokens)
+
+
+######################################################################################################################
+# Academic search tools
+######################################################################################################################
+def externalIds2link(externalIds):
+    # Sample externalIds:
+    #   "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
+    if externalIds:
+        # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
+        # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Mdeline > PubMedCentral)
+        # DBLP
+        dblp_id = externalIds.get('DBLP')
+        if dblp_id is not None:
+            dblp_link = f"dblp.org/rec/{dblp_id}"
+            return dblp_link
+        # arXiv
+        arxiv_id = externalIds.get('ArXiv')
+        if arxiv_id is not None:
+            arxiv_link = f"arxiv.org/abs/{arxiv_id}"
+            return arxiv_link
+        return ""
+    else:
+        # if this is an empty dictionary, return an empty string
+        return ""
+
+
+def search_paper_arxiv(title):
+    search = arxiv.Search(
+        query=title,
+        max_results=1,
+        sort_by=arxiv.SortCriterion.Relevance
+    )
+    try:
+        # (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal (8) embeddings
+        result = next(search.results())
+        title = result.title
+        authors = " and ".join([author.name for author in result.authors])
+        year = str(result.updated.now().year)
+        link = result.pdf_url
+        abstract = result.summary
+        journal = f"Arxiv: {result.entry_id}"
+        paper_id = result.authors[0].name.replace(" ", "")[:4] + year + title[:6].replace(" ", "")
+        paper_id = paper_id.lower()
+
+        paper = {"paper_id": paper_id,
+                 "title": title,
+                 "authors": authors,
+                 "year": year,
+                 "link": link,
+                 "abstract": abstract,
+                 "journal": journal}
+    except StopIteration:
+        paper = {}
+    return paper
+
+
+def search_paper_ss(title):
+    fields = ["title", "abstract", "venue", "year", "authors", "tldr", "externalIds"]
+    limit = 1
+    url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={title}&limit={limit}&fields={",".join(fields)}'
+    # headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY}
+    headers = {"Accept": "*/*"}
+    response = requests.get(url, headers=headers, timeout=30)
+    results = response.json()
+    if results['total'] == 0:
+        return {}
+    raw_paper = results['data'][0]
+    if raw_paper['tldr'] is not None:
+        abstract = raw_paper['tldr']['text']
+    elif raw_paper['abstract'] is not None:
+        abstract = remove_newlines(raw_paper['abstract'])
+    else:
+        abstract = ""
+
+    authors = [author['name'] for author in raw_paper['authors']]
+    authors_str = " and ".join(authors)
+    year_str = str(raw_paper['year'])
+    title = raw_paper['title']
+
+    paper_id = authors_str.replace(" ", "")[:4] + year_str + title[:6].replace(" ", "")
+
+    # some journal may contain &; replace it. e.g. journal={IEEE Power & Energy Society General Meeting}
+    journal = remove_special_characters(raw_paper['venue'])
+    if not journal:
+        journal = "arXiv preprint"
+    link = externalIds2link(raw_paper['externalIds'])
+    paper = {
+        "paper_id": paper_id,
+        "title": title,
+        "abstract": abstract,
+        "link": link,
+        "authors": authors_str,
+        "year": year_str,
+        "journal": journal
+    }
+    return paper
+
+
+def search_paper_scrape(title):
+    pg = ProxyGenerator()
+    success = pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155")
+    if success:
+        try:
+            scholarly.use_proxy(pg)
+            # input the title of a paper, return its abstract
+            search_query = scholarly.search_pubs(title)
+            found_paper = next(search_query)
+            url = found_paper['pub_url']
+
+            result = found_paper['bib']
+
+            title = result['title']
+            authors = " and ".join(result['author'])
+            year = str(result['pub_year'])
+            journal = result['pub_year']
+            abstract = result['abstract']
+
+            paper_id = authors.replace(" ", "")[:4] + year + title[:6].replace(" ", "")
+            paper = {
+                "paper_id": paper_id,
+                "title": title,
+                "abstract": abstract,
+                "link": url,
+                "authors": authors,
+                "year": year,
+                "journal": journal
+            }
+            return paper
+        except StopIteration:
+            return {}
+
+
+def search_paper(title, verbose=True):
+    if verbose:
+        print(f"Searching {title}...")
+    # try Semantic Scholar first
+    paper = search_paper_ss(title)
+    if not paper:
+        paper = search_paper_arxiv(title)
+    if not paper:
+        paper = search_paper_scrape(title)
+    if paper:
+        paper["embeddings"] = get_embeddings_vector(paper_title=paper['title'], paper_description=paper['abstract'])
+    if verbose:
+        print(f"Search result: {paper}.")
+    return paper
+
+
 def load_papers_from_bibtex(bib_file_path):
     with open(bib_file_path) as bibtex_file:
         bib_database = bibtexparser.load(bibtex_file)
@@ -154,15 +324,20 @@ def load_papers_from_bibtex(bib_file_path):
             bib_papers.append(result)
     return bib_papers
 
-# `tokenizer`: used to count how many tokens
-tokenizer_name = tiktoken.encoding_for_model('gpt-4')
-tokenizer = tiktoken.get_encoding(tokenizer_name.name)
-
 
-def tiktoken_len(text):
-    # evaluate how many tokens for the given text
-    tokens = tokenizer.encode(text, disallowed_special=())
-    return len(tokens)
+def load_papers_from_text(text):
+    # split text by comma
+    titles = [part.strip() for part in text.split(',')]
+    titles = [remove_special_characters(title) for title in titles]
+    papers = []
+    if len(titles) > 0:
+        for title in titles:
+            paper = search_paper(title)
+            if paper:
+                papers.append(paper)
+        return papers
+    else:
+        return []
 
 
 ######################################################################################################################
@@ -174,7 +349,7 @@ def ss_search(keywords, limit=20, fields=None):
         fields = ["title", "abstract", "venue", "year", "authors", "tldr", "embedding", "externalIds"]
     keywords = keywords.lower()
     keywords = keywords.replace(" ", "+")
-    url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={keywords}&limit={limit}&fields={",".join(fields)}'
+    url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={keywords}&limit={limit}&fields={",".join(fields)} '
     # headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY}
     headers = {"Accept": "*/*"}
 
@@ -183,27 +358,6 @@ def ss_search(keywords, limit=20, fields=None):
 
 
 def _collect_papers_ss(keyword, counts=3, tldr=False):
-    def externalIds2link(externalIds):
-        # Sample externalIds:
-        #   "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
-        if externalIds:
-            # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
-            # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Mdeline > PubMedCentral)
-            # DBLP
-            dblp_id = externalIds.get('DBLP')
-            if dblp_id is not None:
-                dblp_link = f"dblp.org/rec/{dblp_id}"
-                return dblp_link
-            # arXiv
-            arxiv_id = externalIds.get('ArXiv')
-            if arxiv_id is not None:
-                arxiv_link = f"arxiv.org/abs/{arxiv_id}"
-                return arxiv_link
-            return ""
-        else:
-            # if this is an empty dictionary, return an empty string
-            return ""
-
     def extract_paper_id(last_name, year_str, title):
         pattern = r'^\w+'
         words = re.findall(pattern, title)
@@ -289,24 +443,28 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
 ######################################################################################################################
 
 class References:
-    def __init__(self, title, load_papers=None, keyword="customized_refs", description=""):
+    def __init__(self,
+                 title: str,
+                 load_papers: Optional[str] = None,
+                 load_bibtex: Optional[str] = None,
+                 description: str = ""
+                 ):
+        self.papers = {}
+        if load_bibtex is not None:
+            self.papers["load_from_bibtex"] = load_papers_from_bibtex(load_bibtex)
         if load_papers is not None:
-            self.papers = {keyword: load_papers_from_bibtex(load_papers)}
-        else:
-            self.papers = {}
+            self.papers["load_from_text"] = load_papers_from_text(load_papers)
+
         self.title = title
         self.description = description
 
-    def load_papers(self, bibtex, keyword):
-        self.papers[keyword] = load_papers_from_bibtex(bibtex)
-
-    def generate_keywords_dict(self):
+    def generate_keywords_dict(self) -> Dict[str, int]:
         keywords_dict = {}
         for k in self.papers:
             keywords_dict[k] = len(self.papers[k])
         return keywords_dict
 
-    def collect_papers(self, keywords_dict, tldr=False):
+    def collect_papers(self, keywords_dict: Dict[str, int], tldr: bool = False) -> None:
         """
         Collect as many papers as possible
 
@@ -320,21 +478,15 @@ class References:
             keywords.append(" ".join(comb_keyword))
         for key in keywords:
             self.papers[key] = _collect_papers_ss(key, 10, tldr)
-        # print("Collected papers: ", papers)
-        # for key, counts in keywords_dict.items():
-        #     self.papers[key] = _collect_papers_ss(key, counts, tldr)
 
-    def to_bibtex(self, path_to_bibtex="ref.bib"):
+    def to_bibtex(self, path_to_bibtex: str = "ref.bib") -> List[str]:
         """
         Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
         """
-        # todo:
-        #   use embeddings to evaluate; keep top k relevant references in papers
-        #   send (title, .bib file) to evaluate embeddings; recieve truncated papers
         papers = self._get_papers(keyword="_all")
 
-        l = len(papers)
-        print(f"{l} papers will be added to `ref.bib`.")
+        num_papers = len(papers)
+        print(f"{num_papers} papers will be added to `ref.bib`.")
         # clear the bibtex file
         with open(path_to_bibtex, "w", encoding="utf-8") as file:
             file.write("")
@@ -372,7 +524,7 @@ class References:
             papers = self.papers["keyword"]
         return papers
 
-    def to_prompts(self, keyword="_all", max_tokens=2048):
+    def to_prompts(self, keyword: str = "_all", max_tokens: int = 2048):
         # `prompts`:
         #   {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
         # this will be used to instruct GPT model to cite the correct bibtex entry.
@@ -384,21 +536,11 @@ class References:
         papers_json = self.to_json()
        with open(json_path, "w") as f:
            json.dump(papers_json, f)
-
        try:
            # Use external API to obtain the most relevant papers
            title = self.title
            description = self.description
            result = get_top_k(papers_json, title, description)
-            # client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
-            # result = client.predict(
-            #     title,  # str in 'Title' Textbox component
-            #     json_path,  # str (filepath or URL to file) in 'Papers JSON (as string)' File component
-            #     50,  # int | float (numeric value between 1 and 50) in 'Top-k Relevant Papers' Slider component
-            #     api_name="/get_k_relevant_papers"
-            # )
-            # with open(result) as f:
-            #     result = json.load(f)
            result = [item for key, item in result.items()]
        except Exception as e:
            print(f"Error occurs during calling external API: {e}\n")
@@ -417,54 +559,9 @@ class References:
                break
        return prompts
 
-    def to_json(self, keyword="_all"):
+    def to_json(self, keyword: str = "_all"):
        papers = self._get_papers(keyword)
        papers_json = {}
        for paper in papers:
            papers_json[paper["paper_id"]] = paper
        return papers_json
-
-
-if __name__ == "__main__":
-    # testing search results
-    print("================Testing `ss_search`================")
-    r = ss_search("Deep Q-Networks", limit=1)  # a list of raw papers
-    if r['total'] > 0:
-        paper = r['data'][0]
-        # print(paper)
-
-    # resting References
-    print("================Testing `References`================")
-    refs = References(title="Super Deep Q-Networks")
-    keywords_dict = {
-        "Deep Q-Networks": 5,
-        "Actor-Critic Algorithms": 4,
-        "Exploration-Exploitation Trade-off": 3
-    }
-    print("================Testing `References.collect_papers`================")
-    refs.collect_papers(keywords_dict, tldr=True)
-    for k in refs.papers:
-        papers = refs.papers[k]  # for each keyword, there is a list of papers
-        print("keyword: ", k)
-        for paper in papers:
-            print(paper["paper_id"])
-
-    print("================Testing `References.to_bibtex`================")
-    refs.to_bibtex()
-
-    print("================Testing `References.to_json`================")
-    papers_json = refs.to_json()  # this json can be used to find the most relevant papers
-    with open("papers.json", "w", encoding='utf-8') as text_file:
-        text_file.write(f"{papers_json}")
-
-    print("================Testing `References.to_prompts`================")
-    prompts = refs.to_prompts()
-    print(prompts)
-
-    # bib = "test.bib"
-    # refs.load_papers(bib, "variance-reduction rl")
-    # print(refs.papers)
-    #
-    # prompts = refs.to_prompts()
-    # for k in prompts:
-    #     print(f"{k}: {prompts[k]}\n")
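With the reworked `References` class, `load_papers` now takes a comma-separated string of titles; each title is resolved by `search_paper`, which tries Semantic Scholar first, then arXiv, then scholarly scraping, and attaches a SPECTER embedding. A minimal usage sketch (the title is reused from the removed self-test):

    from utils.references import References

    refs = References(
        title="Super Deep Q-Networks",
        load_papers="Playing Atari with Deep Reinforcement Learning",  # one or more titles, comma-separated
    )
    refs.collect_papers({"Deep Q-Networks": 5}, tldr=True)
    print(refs.to_prompts(max_tokens=2048))  # {"paper_id": "summary", ...}
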
worker.py CHANGED
@@ -3,7 +3,7 @@ This script is only used for service-side host.
 '''
 import boto3
 import os, time
-from api_wrapper import generator_wrapper
+from wrapper import generator_wrapper
 from sqlalchemy import create_engine, Table, MetaData, update, select
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy import inspect
wrapper.py ADDED
@@ -0,0 +1,57 @@
+"""
+This script is used to wrap all generation methods together.
+
+todo:
+A worker keeps running on the server. Monitor the Amazon SQS. Once receive a new message, do the following:
+    Download the corresponding configuration files on S3.
+    Change Task status from Pending to Running.
+    Call `generator_wrapper` and wait for the outputs.
+    If `generator_wrapper` returns results:
+        evaluate the results; compile it; upload results to S3 ... Change Task status from Running to Completed.
+        If anything goes wrong, raise Error.
+    If `generator_wrapper` returns nothing or Timeout, or raise any error:
+        Change Task status from Running to Failed.
+"""
+from auto_generators import generate_draft
+from utils.file_operations import make_archive
+import yaml
+import uuid
+
+
+def remove_special_characters(s):
+    return ''.join(c for c in s if c.isalnum() or c.isspace() or c == ',')
+
+
+def generator_wrapper(config):
+    if not isinstance(config, dict):
+        with open(config, "r") as file:
+            config = yaml.safe_load(file)
+    title = config["paper"]["title"]
+    generator = config["generator"]
+    if generator == "auto_draft":
+        folder = generate_draft(title, config["paper"]["description"],
+                                tldr=config["references"]["tldr"],
+                                max_kw_refs=config["references"]["max_kw_refs"],
+                                refs=config["references"]["refs"],
+                                max_tokens_ref=config["references"]["max_tokens_ref"],
+                                knowledge_database=config["domain_knowledge"]["knowledge_database"],
+                                max_tokens_kd=config["domain_knowledge"]["max_tokens_kd"],
+                                query_counts=config["domain_knowledge"]["query_counts"],
+                                sections=config["output"]["selected_sections"],
+                                model=config["output"]["model"],
+                                template=config["output"]["template"],
+                                prompts_mode=config["output"]["prompts_mode"],
+                                )
+    else:
+        raise NotImplementedError(f"The generator {generator} has not been supported yet.")
+    # todo: post processing: translate to Chinese, compile PDF ...
+    filename = remove_special_characters(title).replace(" ", "_") + uuid.uuid1().hex + ".zip"
+    return make_archive(folder, filename)
+
+
+if __name__ == "__main__":
+    pass
+    # with open("configurations/default.yaml", 'r') as file:
+    #     config = yaml.safe_load(file)
+    # print(config)
+    # generator_wrapper(config)
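
As the commented-out `__main__` block suggests, `generator_wrapper` accepts either a loaded config dict or a path to a YAML file:

    from wrapper import generator_wrapper

    zip_path = generator_wrapper("configurations/default.yaml")  # path form
    # or pass a dict edited beforehand, as app.py's wrapped_generator now does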