CCCBora committed on
Commit a6a7f17
2 Parent(s): d1feb02 8ef9348

Merge pull request #1 from CCCBora/semantic-scholar

.idea/.gitignore ADDED
@@ -0,0 +1,10 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ **/__pycache__
+ **/.idea
.idea/auto-draft.iml ADDED
@@ -0,0 +1,14 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$">
+       <excludeFolder url="file://$MODULE_DIR$/venv" />
+     </content>
+     <orderEntry type="inheritedJdk" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+   <component name="PyDocumentationSettings">
+     <option name="format" value="PLAIN" />
+     <option name="myDocStringFormat" value="Plain" />
+   </component>
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,95 @@
+ <component name="InspectionProjectProfileManager">
+   <profile version="1.0">
+     <option name="myName" value="Project Default" />
+     <inspection_tool class="PyChainedComparisonsInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+       <option name="ignoreConstantInTheMiddle" value="true" />
+     </inspection_tool>
+     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+       <option name="ignoredPackages">
+         <value>
+           <list size="69">
+             <item index="0" class="java.lang.String" itemvalue="pprint" />
+             <item index="1" class="java.lang.String" itemvalue="tnt" />
+             <item index="2" class="java.lang.String" itemvalue="pyglet" />
+             <item index="3" class="java.lang.String" itemvalue="pyzmq" />
+             <item index="4" class="java.lang.String" itemvalue="gym" />
+             <item index="5" class="java.lang.String" itemvalue="torch" />
+             <item index="6" class="java.lang.String" itemvalue="numpy" />
+             <item index="7" class="java.lang.String" itemvalue="absl-py" />
+             <item index="8" class="java.lang.String" itemvalue="numba" />
+             <item index="9" class="java.lang.String" itemvalue="protobuf" />
+             <item index="10" class="java.lang.String" itemvalue="torch-scatter" />
+             <item index="11" class="java.lang.String" itemvalue="joblib" />
+             <item index="12" class="java.lang.String" itemvalue="threadpoolctl" />
+             <item index="13" class="java.lang.String" itemvalue="scikit-learn" />
+             <item index="14" class="java.lang.String" itemvalue="PyYAML" />
+             <item index="15" class="java.lang.String" itemvalue="python-dateutil" />
+             <item index="16" class="java.lang.String" itemvalue="cycler" />
+             <item index="17" class="java.lang.String" itemvalue="MarkupSafe" />
+             <item index="18" class="java.lang.String" itemvalue="mpi4py" />
+             <item index="19" class="java.lang.String" itemvalue="torchvision" />
+             <item index="20" class="java.lang.String" itemvalue="line-profiler" />
+             <item index="21" class="java.lang.String" itemvalue="pyasn1-modules" />
+             <item index="22" class="java.lang.String" itemvalue="certifi" />
+             <item index="23" class="java.lang.String" itemvalue="oauthlib" />
+             <item index="24" class="java.lang.String" itemvalue="pyparsing" />
+             <item index="25" class="java.lang.String" itemvalue="Markdown" />
+             <item index="26" class="java.lang.String" itemvalue="Werkzeug" />
+             <item index="27" class="java.lang.String" itemvalue="h5py" />
+             <item index="28" class="java.lang.String" itemvalue="rdflib" />
+             <item index="29" class="java.lang.String" itemvalue="torch-cluster" />
+             <item index="30" class="java.lang.String" itemvalue="kiwisolver" />
+             <item index="31" class="java.lang.String" itemvalue="pytorch-lightning" />
+             <item index="32" class="java.lang.String" itemvalue="tensorboard" />
+             <item index="33" class="java.lang.String" itemvalue="imageio" />
+             <item index="34" class="java.lang.String" itemvalue="matplotlib" />
+             <item index="35" class="java.lang.String" itemvalue="test-tube" />
+             <item index="36" class="java.lang.String" itemvalue="googledrivedownloader" />
+             <item index="37" class="java.lang.String" itemvalue="idna" />
+             <item index="38" class="java.lang.String" itemvalue="rsa" />
+             <item index="39" class="java.lang.String" itemvalue="networkx" />
+             <item index="40" class="java.lang.String" itemvalue="isodate" />
+             <item index="41" class="java.lang.String" itemvalue="torch-sparse" />
+             <item index="42" class="java.lang.String" itemvalue="llvmlite" />
+             <item index="43" class="java.lang.String" itemvalue="pyasn1" />
+             <item index="44" class="java.lang.String" itemvalue="requests" />
+             <item index="45" class="java.lang.String" itemvalue="importlib-metadata" />
+             <item index="46" class="java.lang.String" itemvalue="Jinja2" />
+             <item index="47" class="java.lang.String" itemvalue="requests-oauthlib" />
+             <item index="48" class="java.lang.String" itemvalue="tensorboard-plugin-wit" />
+             <item index="49" class="java.lang.String" itemvalue="zipp" />
+             <item index="50" class="java.lang.String" itemvalue="urllib3" />
+             <item index="51" class="java.lang.String" itemvalue="torch-geometric" />
+             <item index="52" class="java.lang.String" itemvalue="scipy" />
+             <item index="53" class="java.lang.String" itemvalue="six" />
+             <item index="54" class="java.lang.String" itemvalue="google-auth-oauthlib" />
+             <item index="55" class="java.lang.String" itemvalue="chardet" />
+             <item index="56" class="java.lang.String" itemvalue="pandas" />
+             <item index="57" class="java.lang.String" itemvalue="tqdm" />
+             <item index="58" class="java.lang.String" itemvalue="torch-spline-conv" />
+             <item index="59" class="java.lang.String" itemvalue="ase" />
+             <item index="60" class="java.lang.String" itemvalue="future" />
+             <item index="61" class="java.lang.String" itemvalue="cachetools" />
+             <item index="62" class="java.lang.String" itemvalue="grpcio" />
+             <item index="63" class="java.lang.String" itemvalue="pytz" />
+             <item index="64" class="java.lang.String" itemvalue="google-auth" />
+             <item index="65" class="java.lang.String" itemvalue="Pillow" />
+             <item index="66" class="java.lang.String" itemvalue="decorator" />
+             <item index="67" class="java.lang.String" itemvalue="typing-extensions" />
+             <item index="68" class="java.lang.String" itemvalue="ale-py" />
+           </list>
+         </value>
+       </option>
+     </inspection_tool>
+     <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+       <option name="ignoredErrors">
+         <list>
+           <option value="N812" />
+           <option value="N802" />
+           <option value="N803" />
+           <option value="N806" />
+         </list>
+       </option>
+     </inspection_tool>
+   </profile>
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (auto-draft)" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/auto-draft.iml" filepath="$PROJECT_DIR$/.idea/auto-draft.iml" />
+     </modules>
+   </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="$PROJECT_DIR$" vcs="Git" />
+   </component>
+ </project>
__pycache__/auto_backgrounds.cpython-310.pyc ADDED
Binary file (4.06 kB).

__pycache__/auto_draft.cpython-310.pyc ADDED
Binary file (4.56 kB).

__pycache__/section_generator.cpython-310.pyc ADDED
Binary file (2.42 kB).
app.py CHANGED
@@ -4,15 +4,20 @@ import openai
  from auto_backgrounds import generate_backgrounds, fake_generator, generate_draft
  from utils.file_operations import hash_name
 
+ # note: App blank-screen bug: allow third-party cookies
  # todo:
- # 2. update QQ group and Organization cards
- # 4. add auto_polishing function
- # 5. Use some simple method for simple tasks (including: writing abstract, conclusion, generate keywords, generate figures...)
+ # 5. Use some simple method for simple tasks
+ #    (including: writing abstract, conclusion, generate keywords, generate figures...)
  # 5.1 Use GPT 3.5 for abstract, conclusion, ... (or may not)
  # 5.2 Use local LLM to generate keywords, figures, ...
  # 5.3 Use embedding to find most related papers (find a paper dataset)
- # 5.4 Use Semantic Scholar API instead of Arxiv API.
  # 6. get logs when the procedure is not completed.
+ # 7. own file library; more prompts
+ # 11. distinguish citep and citet
+ # future:
+ # 8. Change prompts to langchain
+ # 4. add auto_polishing function
+ # 12. Change link to a more appealing color  # after the website is built
 
  openai_key = os.getenv("OPENAI_API_KEY")
  access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
@@ -35,14 +40,13 @@ else:
      IS_OPENAI_API_KEY_AVAILABLE = False
 
 
-
  def clear_inputs(text1, text2):
      return "", ""
 
 
- def wrapped_generator(title, description, openai_key=None,
-                       template="ICLR2022",
-                       cache_mode=IS_CACHE_AVAILABLE, generator=None):
+ def wrapped_generator(paper_title, paper_description, openai_api_key=None,
+                       template="ICLR2022",
+                       cache_mode=IS_CACHE_AVAILABLE, generator=None):
      # if `cache_mode` is True, then follow the following steps:
      # check if "title"+"description" have been generated before
      # if so, download from the cloud storage, return it
@@ -52,15 +56,16 @@ def wrapped_generator(title, description, openai_key = None,
      # generator = generate_backgrounds
      generator = generate_draft
      # generator = fake_generator
-     if openai_key is not None:
-         openai.api_key = openai_key
+     if openai_api_key is not None:
+         openai.api_key = openai_api_key
      openai.Model.list()
 
      if cache_mode:
          from utils.storage import list_all_files, download_file, upload_file
          # check if "title"+"description" have been generated before
 
-         input_dict = {"title": title, "description": description, "generator": "generate_draft"}  # todo: modify here also
+         input_dict = {"title": paper_title, "description": paper_description,
+                       "generator": "generate_draft"}  # todo: modify here also
          file_name = hash_name(input_dict) + ".zip"
          file_list = list_all_files()
          # print(f"{file_name} will be generated. Check the file list {file_list}")
@@ -70,21 +75,23 @@ def wrapped_generator(title, description, openai_key = None,
              return file_name
          else:
              # generate the result.
-             # output = fake_generate_backgrounds(title, description, openai_key)  # todo: use `generator` to control which function to use.
-             output = generator(title, description, template, "gpt-4")
+             # output = fake_generate_backgrounds(title, description, openai_key)
+             # todo: use `generator` to control which function to use.
+             output = generator(paper_title, paper_description, template, "gpt-4")
              upload_file(output)
              return output
      else:
          # output = fake_generate_backgrounds(title, description, openai_key)
-         output = generator(title, description, template, "gpt-4")
+         output = generator(paper_title, paper_description, template, "gpt-4")
          return output
 
 
- theme = gr.themes.Monochrome(font=gr.themes.GoogleFont("Questrial")).set(
-     background_fill_primary='#E5E4E2',
-     background_fill_secondary='#F6F6F6',
-     button_primary_background_fill="#281A39"
- )
+ theme = gr.themes.Default(font=gr.themes.GoogleFont("Questrial"))
+ # .set(
+ #     background_fill_primary='#E5E4E2',
+ #     background_fill_secondary='#F6F6F6',
+ #     button_primary_background_fill="#281A39"
+ # )
 
  with gr.Blocks(theme=theme) as demo:
      gr.Markdown('''
@@ -102,16 +109,20 @@ with gr.Blocks(theme=theme) as demo:
      ''')
      with gr.Row():
          with gr.Column(scale=2):
-             key = gr.Textbox(value=openai_key, lines=1, max_lines=1, label="OpenAI Key", visible=not IS_OPENAI_API_KEY_AVAILABLE)
-             # generator = gr.Dropdown(choices=["学术论文", "文献总结"], value="文献总结", label="Selection", info="目前支持生成'学术论文'和'文献总结'.", interactive=True)
-             title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1, label="Title", info="论文标题")
+             key = gr.Textbox(value=openai_key, lines=1, max_lines=1, label="OpenAI Key",
+                              visible=not IS_OPENAI_API_KEY_AVAILABLE)
+             # generator = gr.Dropdown(choices=["学术论文", "文献总结"], value="文献总结",
+             #                         label="Selection", info="目前支持生成'学术论文'和'文献总结'.", interactive=True)
+             title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
+                                label="Title", info="论文标题")
              description = gr.Textbox(lines=5, label="Description (Optional)", visible=False)
 
          with gr.Row():
              clear_button = gr.Button("Clear")
-             submit_button = gr.Button("Submit")
+             submit_button = gr.Button("Submit", variant="primary")
      with gr.Column(scale=1):
-         style_mapping = {True: "color:white;background-color:green", False: "color:white;background-color:red"}  # todo: to match website's style
+         style_mapping = {True: "color:white;background-color:green",
+                          False: "color:white;background-color:red"}  # todo: to match website's style
          availability_mapping = {True: "AVAILABLE", False: "NOT AVAILABLE"}
          gr.Markdown(f'''## Huggingface Space Status
          当`OpenAI API`显示AVAILABLE的时候这个Space可以直接使用.
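
Note: the cache branch of `wrapped_generator` reduces to the flow below. This is a minimal sketch, not part of this diff; it assumes `hash_name` returns a stable digest for the input dict and that the `utils.storage` helpers behave as they are called above (`cached_generate` is a hypothetical name).

from utils.file_operations import hash_name
from utils.storage import list_all_files, download_file, upload_file
from auto_backgrounds import generate_draft

def cached_generate(paper_title, paper_description, template="ICLR2022"):
    # Deterministic cache key: the same title + description maps to the same zip name.
    input_dict = {"title": paper_title, "description": paper_description,
                  "generator": "generate_draft"}
    file_name = hash_name(input_dict) + ".zip"
    if file_name in list_all_files():   # cache hit: reuse the stored draft
        download_file(file_name)
        return file_name
    output = generate_draft(paper_title, paper_description, template, "gpt-4")
    upload_file(output)                 # cache miss: generate once, then cache
    return output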
auto_backgrounds.py CHANGED
@@ -30,7 +30,8 @@ def log_usage(usage, generating_target, print_out=True):
      print(message)
      logging.info(message)
 
- def _generation_setup(title, description="", template="ICLR2022", model="gpt-4"):
+ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
+                       search_engine="ss", tldr=False, max_kw_refs=10):
      '''
      todo: use `model` to control which model to use; may use another method to generate keywords or collect references
      '''
@@ -44,12 +45,12 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4")
      # Generate keywords and references
      print("Initialize the paper information ...")
      input_dict = {"title": title, "description": description}
-     keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo")
+     keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
      print(f"keywords: {keywords}")
      log_usage(usage, "keywords")
 
      ref = References(load_papers="")
-     ref.collect_papers(keywords, method="arxiv")
+     ref.collect_papers(keywords, method=search_engine, tldr=tldr)
      all_paper_ids = ref.to_bibtex(bibtex_path)  # todo: this will be used to check if all citations are in this list
 
      print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
@@ -90,8 +91,8 @@ def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
      return make_archive("sample-output.pdf", filename)
 
 
- def generate_draft(title, description="", template="ICLR2022", model="gpt-4"):
-     paper, destination_folder, _ = _generation_setup(title, description, template, model)
+ def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=True, max_kw_refs=14):
+     paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
 
      # todo: `list_of_methods` failed to be generated; find a solution ...
      # print("Generating figures ...")
@@ -125,3 +126,10 @@ def generate_draft(title, description="", template="ICLR2022", model="gpt-4"):
      input_dict = {"title": title, "description": description, "generator": "generate_draft"}
      filename = hash_name(input_dict) + ".zip"
      return make_archive(destination_folder, filename)
+
+
+ if __name__ == "__main__":
+     title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
+     description = ""
+     output = generate_draft(title, description, search_engine="ss", tldr=True, max_kw_refs=10)
+     print(output)
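
Note: the new `search_engine`, `tldr`, and `max_kw_refs` arguments thread through `_generation_setup` into `References.collect_papers`. A minimal sketch of that hand-off, using the `{keyword: count}` shape documented in `collect_papers` (the keywords, counts, and output path below are illustrative assumptions):

from utils.references import References

keywords = {"interpretable boosting": 7, "agricultural data modeling": 3}  # illustrative
ref = References(load_papers="")
ref.collect_papers(keywords, method="ss", tldr=True)  # "ss" selects Semantic Scholar
all_paper_ids = ref.to_bibtex("outputs/ref.bib")      # hypothetical path; writes BibTeX entries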
section_generator.py CHANGED
@@ -76,11 +76,11 @@ def section_generation(paper, section, save_to_path, model):
      print(f"{section} has been generated. Saved to {tex_file}.")
      return usage
 
- def keywords_generation(input_dict, model):
+ def keywords_generation(input_dict, model, max_kw_refs=10):
      title = input_dict.get("title")
      description = input_dict.get("description", "")
      if title is not None:
-         prompts = generate_keywords_prompts(title, description)
+         prompts = generate_keywords_prompts(title, description, max_kw_refs)
          gpt_response, usage = get_responses(prompts, model)
          keywords = extract_keywords(gpt_response)
          return keywords, usage
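
Note: `keywords_generation` is expected to hand back the same `{keyword: count}` dictionary that `References.collect_papers` consumes, together with the token-usage record that `log_usage` reports. Roughly (the literal values are illustrative only):

input_dict = {"title": "Playing Atari with Deep Reinforcement Learning", "description": ""}
keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=10)
# keywords is shaped like {"Deep Q-Networks": 5, "Policy Gradient Methods": 4, ...}
# usage is the API usage record later passed to log_usage(usage, "keywords")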
utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (137 Bytes).

utils/__pycache__/figures.cpython-310.pyc ADDED
Binary file (1.89 kB).

utils/__pycache__/file_operations.cpython-310.pyc ADDED
Binary file (1.41 kB).

utils/__pycache__/gpt_interaction.cpython-310.pyc ADDED
Binary file (2.79 kB).

utils/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (6.66 kB).

utils/__pycache__/references.cpython-310.pyc ADDED
Binary file (6.77 kB).

utils/__pycache__/storage.cpython-310.pyc ADDED
Binary file (1.71 kB).

utils/__pycache__/tex_processing.cpython-310.pyc ADDED
Binary file (609 Bytes).
utils/prompts.py CHANGED
@@ -10,6 +10,11 @@ INSTRUCTIONS = {"introduction": "Please include five paragraph: Establishing the
                  "conclusion": "Please read the paper I have written and write the conclusion section.",
                  "abstract": "Please read the paper I have written and write the abstract."}
 
+ INSTRUCTIONS["related works"] = r"Please discuss three to five main related fields to this paper. For each field, select " \
+                                 r"five to ten key publications from references. For each reference, analyze its strengths and weaknesses in one or two sentences. " \
+                                 r"Do not use \section{...} or \subsection{...}; use \paragraph{...} to list related fields. "
+
+
  BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
                     "related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
                     "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
utils/references.py CHANGED
@@ -8,10 +8,115 @@
  import requests
  import re
 
- def _collect_papers_arxiv(keyword, counts=3):
-     #
-     # The following codes are used to generate the most related papers
-     #
+
+ #########################################################
+ # Some basic tools
+ #########################################################
+ def remove_newlines(serie):
+     serie = serie.replace('\n', ' ')
+     serie = serie.replace('\\n', ' ')
+     serie = serie.replace('  ', ' ')
+     serie = serie.replace('  ', ' ')
+     return serie
+
+
+ #########################################################
+ # Semantic Scholar (SS) API
+ #########################################################
+ def ss_search(keywords, limit=20, fields=None):
+     # spaces in the query are replaced with +
+     if fields is None:
+         fields = ["title", "abstract", "venue", "year", "authors", "tldr", "embedding", "externalIds"]
+     keywords = keywords.lower()
+     keywords = keywords.replace(" ", "+")
+     url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={keywords}&limit={limit}&fields={",".join(fields)}'
+     # headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY}
+     headers = {"Accept": "*/*"}
+
+     response = requests.get(url, headers=headers, timeout=30)
+     return response.json()
+
+
+ def _collect_papers_ss(keyword, counts=3, tldr=False):
+     def externalIds2link(externalIds):
+         # Sample externalIds:
+         # "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
+         if externalIds:
+             # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
+             # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
+             # DBLP
+             dblp_id = externalIds.get('DBLP')
+             if dblp_id is not None:
+                 dblp_link = f"dblp.org/rec/{dblp_id}"
+                 return dblp_link
+             # arXiv
+             arxiv_id = externalIds.get('ArXiv')
+             if arxiv_id is not None:
+                 arxiv_link = f"arxiv.org/abs/{arxiv_id}"
+                 return arxiv_link
+             return ""
+         else:
+             # if this is an empty dictionary, return an empty string
+             return ""
+
+     def extract_paper_id(last_name, year_str, title):
+         pattern = r'^\w+'
+         words = re.findall(pattern, title)
+         # return last_name + year_str + title.split(' ', 1)[0]
+         return last_name + year_str + words[0]
+
+     def extract_author_info(raw_authors):
+         authors = [author['name'] for author in raw_authors]
+
+         authors_str = " and ".join(authors)
+         last_name = authors[0].split()[-1]
+         return authors_str, last_name
+
+     def parse_search_results(search_results_ss):
+         # turn the search results into a list of paper dictionaries.
+         papers = []
+         for raw_paper in search_results_ss:
+             if raw_paper["abstract"] is None:
+                 continue
+
+             authors_str, last_name = extract_author_info(raw_paper['authors'])
+             year_str = str(raw_paper['year'])
+             title = raw_paper['title']
+             # some journals may contain &; replace it. e.g. journal={IEEE Power & Energy Society General Meeting}
+             journal = raw_paper['venue'].replace("&", "\\&")
+             if not journal:
+                 journal = "arXiv preprint"
+             paper_id = extract_paper_id(last_name, year_str, title).lower()
+             link = externalIds2link(raw_paper['externalIds'])
+             if tldr and raw_paper['tldr'] is not None:
+                 abstract = raw_paper['tldr']['text']
+             else:
+                 abstract = remove_newlines(raw_paper['abstract'])
+             result = {
+                 "paper_id": paper_id,
+                 "title": title,
+                 "abstract": abstract,  # todo: compare results with tldr
+                 "link": link,
+                 "authors": authors_str,
+                 "year": year_str,
+                 "journal": journal
+             }
+             papers.append(result)
+         return papers
+
+     raw_results = ss_search(keyword, limit=counts)
+     if raw_results is not None:
+         search_results = raw_results['data']
+     else:
+         search_results = []
+     results = parse_search_results(search_results)
+     return results
+
+
+ #########################################################
+ # ArXiv API
+ #########################################################
+ def _collect_papers_arxiv(keyword, counts=3, tldr=False):
      # Build the arXiv API query URL with the given keyword and other parameters
      def build_query_url(keyword, results_limit=3, sort_by="relevance", sort_order="descending"):
          base_url = "http://export.arxiv.org/api/query?"
@@ -37,6 +142,7 @@ def _collect_papers_arxiv(keyword, counts=3):
      title = entry.find(f"{namespace}title").text
      link = entry.find(f"{namespace}id").text
      summary = entry.find(f"{namespace}summary").text
+     summary = remove_newlines(summary)
 
      # Extract the authors
      authors = entry.findall(f"{namespace}author")
@@ -76,9 +182,14 @@ def _collect_papers_arxiv(keyword, counts=3):
      results = parse_results(content)
      return results
 
+
+ #########################################################
+ # References Class
+ #########################################################
+
  # Each `paper` is a dictionary containing (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal
  class References:
-     def __init__(self, load_papers = ""):
+     def __init__(self, load_papers=""):
          if load_papers:
              # todo: read a json file from the given path
              # this could be used to support pre-defined references
@@ -86,7 +197,7 @@ class References:
          else:
              self.papers = []
 
-     def collect_papers(self, keywords_dict, method="arxiv"):
+     def collect_papers(self, keywords_dict, method="arxiv", tldr=False):
          """
          keywords_dict:
              {"machine learning": 5, "language model": 2};
@@ -94,11 +205,13 @@ class References:
          """
          match method:
              case "arxiv":
-                 process =_collect_papers_arxiv
+                 process = _collect_papers_arxiv
+             case "ss":
+                 process = _collect_papers_ss
              case _:
                  raise NotImplementedError("Other sources have not been supported yet.")
          for key, counts in keywords_dict.items():
-             self.papers = self.papers + process(key, counts)
+             self.papers = self.papers + process(key, counts, tldr)
 
          seen = set()
          papers = []
@@ -146,15 +259,17 @@ class References:
              prompts[paper["paper_id"]] = paper["abstract"]
          return prompts
 
+
  if __name__ == "__main__":
      refs = References()
      keywords_dict = {
-         "Deep Q-Networks": 5,
-         "Policy Gradient Methods": 4,
-         "Actor-Critic Algorithms": 4,
-         "Model-Based Reinforcement Learning": 3,
-         "Exploration-Exploitation Trade-off": 2
-     }
-     refs.collect_papers(keywords_dict)
+         "Deep Q-Networks": 15,
+         "Policy Gradient Methods": 24,
+         "Actor-Critic Algorithms": 4,
+         "Model-Based Reinforcement Learning": 13,
+         "Exploration-Exploitation Trade-off": 7
+     }
+     refs.collect_papers(keywords_dict, method="ss", tldr=True)
      for p in refs.papers:
-         print(p["paper_id"])
+         print(p["paper_id"])
+     print(len(refs.papers))
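
Note: a quick sanity check of the new Semantic Scholar path. `ss_search` returns the Graph API's JSON, whose `data` field holds the raw records that `_collect_papers_ss` converts into `paper` dictionaries (the printed values below are illustrative):

raw = ss_search("policy gradient methods", limit=3)
for record in raw.get("data", []):          # 'data' holds the paper records
    print(record["year"], record["title"])

papers = _collect_papers_ss("policy gradient methods", counts=3, tldr=True)
print([p["paper_id"] for p in papers])      # e.g. ['sutton1999policy', ...]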
utils/tex_processing.py CHANGED
@@ -24,4 +24,6 @@ def replace_title(save_to_path, title):
      # check if citations are in bibtex.
 
 
- # replace citations
+ # replace citations
+
+ # sometimes the output may include \thebibliography and \bibitem; remove all of them.
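
Note: the new todo at the bottom could be handled with a small post-processing pass. A minimal sketch (the helper name and regular expressions are assumptions, not part of this commit):

import re

def strip_inline_bibliography(tex: str) -> str:
    # Drop any \begin{thebibliography} ... \end{thebibliography} block that the
    # model emits, since references come from the generated .bib file instead.
    tex = re.sub(r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}",
                 "", tex, flags=re.DOTALL)
    # Also drop stray \bibitem lines left outside such a block.
    tex = re.sub(r"^\s*\\bibitem.*$", "", tex, flags=re.MULTILINE)
    return tex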