sc_ma committed
Commit 3a7ead9
1 Parent(s): d1feb02

Move from ArXiv API to Semantic Scholar API.

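For context, the endpoint this commit migrates to is the Semantic Scholar Graph API paper search, wrapped below by `ss_search` in utils/references.py. A minimal sketch of the request follows; the URL, parameters, and fields are taken from the diff, while the query string itself is just an example. Unauthenticated requests work within rate limits; an API key could be supplied via the "x-api-key" header, which the new code keeps commented out.

import requests

# Query Semantic Scholar for papers matching an example keyword.
query = "language model".replace(" ", "+")
fields = "title,abstract,venue,year,authors,tldr,externalIds"
url = f"https://api.semanticscholar.org/graph/v1/paper/search?query={query}&limit=3&fields={fields}"
response = requests.get(url, headers={"Accept": "*/*"}, timeout=30)
for paper in response.json().get("data", []):
    print(paper["title"])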
app.py CHANGED
@@ -4,8 +4,8 @@ import openai
 from auto_backgrounds import generate_backgrounds, fake_generator, generate_draft
 from utils.file_operations import hash_name
 
+# note: app blank-screen bug: allow third-party cookies
 # todo:
-# 2. update QQ group and Organization cards
 # 4. add auto_polishing function
 # 5. Use some simple method for simple tasks (including: writing abstract, conclusion, generate keywords, generate figures...)
 # 5.1 Use GPT 3.5 for abstract, conclusion, ... (or may not)
@@ -13,6 +13,8 @@ from utils.file_operations import hash_name
 # 5.3 Use embedding to find most related papers (find a paper dataset)
 # 5.4 Use Semantic Scholar API instead of Arxiv API.
 # 6. get logs when the procedure is not completed.
+# 7. own file library; more prompts
+# 8. Change prompts to langchain
 
 openai_key = os.getenv("OPENAI_API_KEY")
 access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
auto_backgrounds.py CHANGED
@@ -30,7 +30,8 @@ def log_usage(usage, generating_target, print_out=True):
     print(message)
     logging.info(message)
 
-def _generation_setup(title, description="", template="ICLR2022", model="gpt-4"):
+def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
+                      search_engine="ss", tldr=False, max_kw_refs=10):
     '''
     todo: use `model` to control which model to use; may use another method to generate keywords or collect references
     '''
@@ -44,12 +45,12 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4")
     # Generate keywords and references
     print("Initialize the paper information ...")
     input_dict = {"title": title, "description": description}
-    keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo")
+    keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
     print(f"keywords: {keywords}")
     log_usage(usage, "keywords")
 
     ref = References(load_papers="")
-    ref.collect_papers(keywords, method="arxiv")
+    ref.collect_papers(keywords, method=search_engine, tldr=tldr)
     all_paper_ids = ref.to_bibtex(bibtex_path)  # todo: this will be used to check if all citations are in this list
 
     print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
@@ -90,8 +91,8 @@ def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
     return make_archive("sample-output.pdf", filename)
 
 
-def generate_draft(title, description="", template="ICLR2022", model="gpt-4"):
-    paper, destination_folder, _ = _generation_setup(title, description, template, model)
+def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=False, max_kw_refs=5):
+    paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
 
     # todo: `list_of_methods` failed to be generated; find a solution ...
     # print("Generating figures ...")
@@ -125,3 +126,10 @@ def generate_draft(title, description="", template="ICLR2022", model="gpt-4"):
     input_dict = {"title": title, "description": description, "generator": "generate_draft"}
     filename = hash_name(input_dict) + ".zip"
     return make_archive(destination_folder, filename)
+
+
+if __name__ == "__main__":
+    title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
+    description = ""
+    output = generate_draft(title, description, search_engine="ss", tldr=True, max_kw_refs=10)
+    print(output)
section_generator.py CHANGED
@@ -76,11 +76,11 @@ def section_generation(paper, section, save_to_path, model):
     print(f"{section} has been generated. Saved to {tex_file}.")
     return usage
 
-def keywords_generation(input_dict, model):
+def keywords_generation(input_dict, model, max_kw_refs=10):
     title = input_dict.get("title")
     description = input_dict.get("description", "")
     if title is not None:
-        prompts = generate_keywords_prompts(title, description)
+        prompts = generate_keywords_prompts(title, description, max_kw_refs)
         gpt_response, usage = get_responses(prompts, model)
         keywords = extract_keywords(gpt_response)
         return keywords, usage
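The new third positional argument threads `max_kw_refs` into the prompt itself. `generate_keywords_prompts` is not shown in this diff; the following is a purely hypothetical sketch of how such a builder could consume the argument while producing the `{"keyword": count}` dictionary format that `References.collect_papers` expects. The actual wording in utils/prompts.py may differ.

# Hypothetical sketch only; the real implementation lives in utils/prompts.py.
def generate_keywords_prompts(title, description="", max_kw_refs=10):
    return f'I am writing a machine learning paper titled "{title}". {description} ' \
           f'Propose three to five search keywords for related work, and assign each ' \
           f'a number of references to collect, at most {max_kw_refs} per keyword. ' \
           f'Reply in JSON, e.g. {{"keyword one": 5, "keyword two": 3}}.'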
utils/prompts.py CHANGED
@@ -10,6 +10,11 @@ INSTRUCTIONS = {"introduction": "Please include five paragraph: Establishing the
                 "conclusion": "Please read the paper I have written and write the conclusion section.",
                 "abstract": "Please read the paper I have written and write the abstract."}
 
+INSTRUCTIONS["related works"] = r"Please discuss three to five main related fields to this paper. For each field, select " \
+                                r"five to ten key publications from references. For each reference, analyze its strengths and weaknesses in one or two sentences. " \
+                                r"Do not use \section{...} or \subsection{...}; use \paragraph{...} to list related fields. "
+
+
 BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
                    "related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
                    "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
utils/references.py CHANGED
@@ -8,10 +8,107 @@
 import requests
 import re
 
-def _collect_papers_arxiv(keyword, counts=3):
-    #
-    # The following codes are used to generate the most related papers
-    #
+#########################################################
+# Some basic tools
+#########################################################
+def remove_newlines(serie):
+    serie = serie.replace('\n', ' ')
+    serie = serie.replace('\\n', ' ')
+    serie = serie.replace('  ', ' ')
+    serie = serie.replace('  ', ' ')
+    return serie
+
+#########################################################
+# Semantic Scholar (SS) API
+#########################################################
+def ss_search(keywords, limit=20, fields=None):
+    # spaces in the query are replaced with +
+    if fields is None:
+        fields = ["title", "abstract", "venue", "year", "authors", "tldr", "embedding", "externalIds"]
+    keywords = keywords.lower()
+    keywords = keywords.replace(" ", "+")
+    url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={keywords}&limit={limit}&fields={",".join(fields)}'
+    # headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY}
+    headers = {"Accept": "*/*"}
+
+    response = requests.get(url, headers=headers, timeout=30)
+    return response.json()
+
+
+def _collect_papers_ss(keyword, counts=3, tldr=False):
+    def externalIds2link(externalIds):
+        # externalIds looks like "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
+        if externalIds:
+            # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
+            # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
+            # DBLP
+            dblp_id = externalIds.get('DBLP')
+            if dblp_id is not None:
+                dblp_link = f"dblp.org/rec/{dblp_id}"
+                return dblp_link
+            # arXiv
+            arxiv_id = externalIds.get('ArXiv')
+            if arxiv_id is not None:
+                arxiv_link = f"arxiv.org/abs/{arxiv_id}"
+                return arxiv_link
+            return ""
+        else:
+            # if this is an empty dictionary, return an empty string
+            return ""
+
+    def extract_paper_id(last_name, year_str, title):
+        return last_name + year_str + title.split(' ', 1)[0]
+
+    def extract_author_info(raw_authors):
+        authors = [author['name'] for author in raw_authors]
+
+        authors_str = " and ".join(authors)
+        last_name = authors[0].split()[-1]
+        return authors_str, last_name
+
+    def parse_search_results(search_results):
+        # turn the search results into a list of paper dictionaries.
+        papers = []
+        for raw_paper in search_results:
+            if raw_paper["abstract"] is None:
+                continue
+
+            authors_str, last_name = extract_author_info(raw_paper['authors'])
+            year_str = str(raw_paper['year'])
+            title = raw_paper['title']
+            journal = raw_paper['venue']
+            if not journal:
+                journal = "arXiv preprint"
+            paper_id = extract_paper_id(last_name, year_str, title).lower()
+            link = externalIds2link(raw_paper['externalIds'])
+            if tldr and raw_paper['tldr'] is not None:
+                abstract = raw_paper['tldr']['text']
+            else:
+                abstract = remove_newlines(raw_paper['abstract'])
+            result = {
+                "paper_id": paper_id,
+                "title": title,
+                "abstract": abstract,  # todo: compare results with tldr
+                "link": link,
+                "authors": authors_str,
+                "year": year_str,
+                "journal": journal
+            }
+            papers.append(result)
+        return papers
+
+    raw_results = ss_search(keyword, limit=counts)
+    if raw_results is not None:
+        search_results = raw_results['data']
+    else:
+        search_results = []
+    results = parse_search_results(search_results)
+    return results
+
+#########################################################
+# ArXiv API
+#########################################################
+def _collect_papers_arxiv(keyword, counts=3, tldr=False):
     # Build the arXiv API query URL with the given keyword and other parameters
     def build_query_url(keyword, results_limit=3, sort_by="relevance", sort_order="descending"):
         base_url = "http://export.arxiv.org/api/query?"
@@ -37,6 +134,7 @@ def _collect_papers_arxiv(keyword, counts=3):
         title = entry.find(f"{namespace}title").text
         link = entry.find(f"{namespace}id").text
         summary = entry.find(f"{namespace}summary").text
+        summary = remove_newlines(summary)
 
         # Extract the authors
         authors = entry.findall(f"{namespace}author")
@@ -86,7 +184,7 @@ class References:
         else:
             self.papers = []
 
-    def collect_papers(self, keywords_dict, method="arxiv"):
+    def collect_papers(self, keywords_dict, method="arxiv", tldr=False):
         """
         keywords_dict:
             {"machine learning": 5, "language model": 2};
@@ -95,10 +193,12 @@ class References:
         match method:
             case "arxiv":
                 process = _collect_papers_arxiv
+            case "ss":
+                process = _collect_papers_ss
             case _:
                 raise NotImplementedError("Other sources have not been supported yet.")
         for key, counts in keywords_dict.items():
-            self.papers = self.papers + process(key, counts)
+            self.papers = self.papers + process(key, counts, tldr)
 
         seen = set()
         papers = []
@@ -149,12 +249,13 @@
 if __name__ == "__main__":
     refs = References()
     keywords_dict = {
-        "Deep Q-Networks": 5,
-        "Policy Gradient Methods": 4,
+        "Deep Q-Networks": 15,
+        "Policy Gradient Methods": 24,
         "Actor-Critic Algorithms": 4,
-        "Model-Based Reinforcement Learning": 3,
-        "Exploration-Exploitation Trade-off": 2
+        "Model-Based Reinforcement Learning": 13,
+        "Exploration-Exploitation Trade-off": 7
     }
-    refs.collect_papers(keywords_dict)
+    refs.collect_papers(keywords_dict, method="ss", tldr=True)
     for p in refs.papers:
-        print(p["paper_id"])
+        print(p["paper_id"])
+    print(len(refs.papers))
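A quick way to exercise the new Semantic Scholar path in isolation (network access required; the keyword and count here are arbitrary). Note that `collect_papers` uses `match`/`case`, so Python 3.10+ is required.

from utils.references import References

# Collect a few Semantic Scholar results, preferring TLDR summaries
# over full abstracts, then inspect the generated paper ids.
refs = References()
refs.collect_papers({"reinforcement learning": 3}, method="ss", tldr=True)
for paper in refs.papers:
    print(paper["paper_id"], "-", paper["title"])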
utils/tex_processing.py CHANGED
@@ -24,4 +24,6 @@ def replace_title(save_to_path, title):
 # check if citations are in bibtex.
 
 
-# replace citations
+# replace citations
+
+# sometimes the output may include \thebibliography and \bibitem; remove all of them.
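That last todo is not implemented in this commit. One possible sketch, assuming the generated LaTeX is available in memory as a string; the helper name and regexes are illustrative, not part of the repo.

import re

def strip_inline_bibliography(tex_source):
    # Drop any \begin{thebibliography}...\end{thebibliography} environment
    # the model emitted, then remove stray \bibitem lines, so citations
    # resolve only against the generated .bib file.
    tex_source = re.sub(r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}",
                        "", tex_source, flags=re.DOTALL)
    tex_source = re.sub(r"^\s*\\bibitem.*$", "", tex_source, flags=re.MULTILINE)
    return tex_source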