Spaces:

auto-academic
/

auto-draft

Running

App Files Files Community

shaocongma committed on May 12, 2023

Commit

365213e

•

1 Parent(s): c9efba3

Edit UI.

Browse files

Files changed (5) hide show

app.py +46 -36
auto_backgrounds.py +38 -33
latex_templates/pre_refs.bib +19 -16
utils/prompts.py +9 -10
utils/references.py +13 -13

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
 import os
 import openai
-from auto_backgrounds import generate_backgrounds, fake_generator, generate_draft
 from utils.file_operations import hash_name
 # note: App白屏bug：允许第三方cookie
@@ -9,12 +9,10 @@ from utils.file_operations import hash_name
 #   6. get logs when the procedure is not completed. *
 #   7. 自己的文件库； 更多的prompts
 #   8. Decide on how to generate the main part of a paper * (Langchain/AutoGPT
-#   9. Load .bibtex file to generate a pre-defined references list. *
 #   1. 把paper改成纯JSON?
 #   2. 实现别的功能
 #   3. Check API Key GPT-4 Support.
 #   8. Re-build some components using `langchain`
-#           - in `references.py`, use PromptTemplates.format -> str
 #           - in `gpt_interation`, use LLM
 #   5. 从提供的bib文件中 找到cite和citedby的文章, 计算embeddings; 从整个paper list中 根据cos距离进行排序; 选取max_refs的文章
 # future:
@@ -49,17 +47,12 @@ def clear_inputs(text1, text2):
 def wrapped_generator(paper_title, paper_description, openai_api_key=None,
-                      template="ICLR2022",
-                      cache_mode=IS_CACHE_AVAILABLE, generator=None):
     # if `cache_mode` is True, then follow the following steps:
     #        check if "title"+"description" have been generated before
     #        if so, download from the cloud storage, return it
     #        if not, generate the result.
-    if generator is None:
-        # todo: add a Dropdown to select which generator to use.
-        # generator = generate_backgrounds
-        generator = generate_draft
-        # generator = fake_generator
     if openai_api_key is not None:
         openai.api_key = openai_api_key
         openai.Model.list()
@@ -80,13 +73,17 @@ def wrapped_generator(paper_title, paper_description, openai_api_key=None,
         else:
             # generate the result.
             # output = fake_generate_backgrounds(title, description, openai_key)
-            # todo: use `generator` to control which function to use.
-            output = generator(paper_title, paper_description, template, "gpt-4")
             upload_file(output)
             return output
     else:
         # output = fake_generate_backgrounds(title, description, openai_key)
-        output = generator(paper_title, paper_description, template, "gpt-4")
         return output
@@ -97,6 +94,14 @@ theme = gr.themes.Default(font=gr.themes.GoogleFont("Questrial"))
 #     button_primary_background_fill="#281A39"
 # )
 with gr.Blocks(theme=theme) as demo:
     gr.Markdown('''
     # Auto-Draft: 文献整理辅助工具
@@ -107,11 +112,7 @@ with gr.Blocks(theme=theme) as demo:
     在这个Huggingface Organization里也提供一定额度的免费体验： [AUTO-ACADEMIC](https://huggingface.co/auto-academic).
-    如果有更多想法和建议欢迎加入QQ群里交流, 如果我在Space里更新了Key我会第一时间通知大家. 群号: ***249738228***.
-    ## 用法
-    输入想要生成的论文名称（比如Playing Atari with Deep Reinforcement Learning), 点击Submit, 等待大概十分钟, 下载.zip格式的输出，在Overleaf上编译浏览.
     ''')
     with gr.Row():
@@ -124,6 +125,9 @@ with gr.Blocks(theme=theme) as demo:
             # 每个功能做一个tab
             with gr.Tab("学术论文"):
                 title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
                                    label="Title", info="论文标题")
@@ -131,33 +135,38 @@ with gr.Blocks(theme=theme) as demo:
                     description_pp = gr.Textbox(lines=5, label="Description (Optional)", visible=True,
                                                 info="对希望生成的论文的一些描述. 包括这篇论文的创新点, 主要贡献, 等.")
-                    interactive = False
-                    gr.Markdown('''
-                    ## 下面的功能我只做了UI, 还没来得及实现功能.
-                    ''')
                     with gr.Row():
                         with gr.Column():
                             gr.Markdown('''
-                            Upload .bib file (Optional)
-                            通过上传.bib文件来控制GPT-4模型必须参考哪些文献.
                             ''')
                             bibtex_file = gr.File(label="Upload .bib file", file_types=["text"],
-                                                  interactive=interactive)
                         with gr.Column():
                             search_engine = gr.Dropdown(label="Search Engine",
                                                         choices=["ArXiv", "Semantic Scholar", "Google Scholar", "None"],
                                                         value= "Semantic Scholar",
-                                                        interactive=interactive,
-                                                        info="用于决定GPT-4用什么搜索引擎来搜索文献. 选择None的时候仅参考给定文献.")
-                            tldr = gr.Checkbox(value=True, label="TLDR;",
                                                info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
-                                               interactive = interactive),
-                            use_cache = gr.Checkbox(label="总是重新生成",
-                                                    info="选择此筐表示将不会读取已经生成好的文章.",
-                                               interactive = interactive)
-                            slider = gr.Slider(minimum=1, maximum=30, value=20, label="最大参考文献数目",
-                                               info="过多参考文献会超出Token数限制导致报错，这里限制最大参考文献数目.")
                 with gr.Row():
                     clear_button_pp = gr.Button("Clear")
@@ -196,7 +205,8 @@ with gr.Blocks(theme=theme) as demo:
             file_output = gr.File(label="Output")
     clear_button_pp.click(fn=clear_inputs, inputs=[title, description_pp], outputs=[title, description_pp])
-    submit_button_pp.click(fn=wrapped_generator, inputs=[title, description_pp, key], outputs=file_output)
 demo.queue(concurrency_count=1, max_size=5, api_open=False)
 demo.launch()

 import gradio as gr
 import os
 import openai
+from auto_backgrounds import generate_backgrounds, generate_draft
 from utils.file_operations import hash_name
 # note: App白屏bug：允许第三方cookie
 #   6. get logs when the procedure is not completed. *
 #   7. 自己的文件库； 更多的prompts
 #   8. Decide on how to generate the main part of a paper * (Langchain/AutoGPT
 #   1. 把paper改成纯JSON?
 #   2. 实现别的功能
 #   3. Check API Key GPT-4 Support.
 #   8. Re-build some components using `langchain`
 #           - in `gpt_interation`, use LLM
 #   5. 从提供的bib文件中 找到cite和citedby的文章, 计算embeddings; 从整个paper list中 根据cos距离进行排序; 选取max_refs的文章
 # future:
 def wrapped_generator(paper_title, paper_description, openai_api_key=None,
+                      template="ICLR2022", tldr=True, max_num_refs=50, sections=None, bib_refs=None, model="gpt-4",
+                      cache_mode=IS_CACHE_AVAILABLE):
     # if `cache_mode` is True, then follow the following steps:
     #        check if "title"+"description" have been generated before
     #        if so, download from the cloud storage, return it
     #        if not, generate the result.
     if openai_api_key is not None:
         openai.api_key = openai_api_key
         openai.Model.list()
         else:
             # generate the result.
             # output = fake_generate_backgrounds(title, description, openai_key)
+            output =generate_draft(paper_title, paper_description, template=template,
+                                   tldr=tldr, max_num_refs=max_num_refs,
+                                   sections=sections, bib_refs=bib_refs, model=model)
+            # output = generate_draft(paper_title, paper_description, template, "gpt-4")
             upload_file(output)
             return output
     else:
         # output = fake_generate_backgrounds(title, description, openai_key)
+        output =generate_draft(paper_title, paper_description, template=template,
+                               tldr=tldr, max_num_refs=max_num_refs,
+                               sections=sections, bib_refs=bib_refs, model=model)
         return output
 #     button_primary_background_fill="#281A39"
 # )
+ACADEMIC_PAPER = """## 一键生成论文初稿
+1. 在Title文本框中输入想要生成的论文名称（比如Playing Atari with Deep Reinforcement Learning).
+2. 点击Submit. 等待大概十分钟.
+3. 在右侧下载.zip格式的输出，在Overleaf上编译浏览.
+"""
 with gr.Blocks(theme=theme) as demo:
     gr.Markdown('''
     # Auto-Draft: 文献整理辅助工具
     在这个Huggingface Organization里也提供一定额度的免费体验： [AUTO-ACADEMIC](https://huggingface.co/auto-academic).
+    如果有更多想法和建议欢迎加入QQ群里交流, 如果我在Space里更新了Key我会第一时间通知大家. 群号: ***249738228***.
     ''')
     with gr.Row():
             # 每个功能做一个tab
             with gr.Tab("学术论文"):
+                gr.Markdown(ACADEMIC_PAPER)
                 title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
                                    label="Title", info="论文标题")
                     description_pp = gr.Textbox(lines=5, label="Description (Optional)", visible=True,
                                                 info="对希望生成的论文的一些描述. 包括这篇论文的创新点, 主要贡献, 等.")
                     with gr.Row():
                         with gr.Column():
+                            with gr.Row():
+                                template = gr.Dropdown(label="Template", choices=["ICLR2022"], value="ICLR2022",
+                                                            interactive=False,
+                                                            info="生成论文的参考模板. (暂不支持修改)")
+                                model_selection = gr.Dropdown(label="Model", choices=["gpt-4", "gpt-3.5-turbo"], value="gpt-4",
+                                                            interactive=True,
+                                                            info="生成论文用到的语言模型.")
                             gr.Markdown('''
+                            上传.bib文件提供AI需要参考的文献.
                             ''')
                             bibtex_file = gr.File(label="Upload .bib file", file_types=["text"],
+                                                  interactive=True)
+                            gr.Examples(
+                                examples=["latex_templates/pre_refs.bib"],
+                                inputs=bibtex_file
+                            )
                         with gr.Column():
                             search_engine = gr.Dropdown(label="Search Engine",
                                                         choices=["ArXiv", "Semantic Scholar", "Google Scholar", "None"],
                                                         value= "Semantic Scholar",
+                                                        interactive=False,
+                                                        info="用于决定GPT-4用什么搜索引擎来搜索文献. (暂不支持修改)")
+                            tldr_checkbox = gr.Checkbox(value=True, label="TLDR;",
                                                info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
+                                               interactive = True)
+                            sections = gr.CheckboxGroup(choices=["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"],
+                                                        type="value", label="生成章节", interactive = True,
+                                                        value=["introduction", "related works"])
+                            slider = gr.Slider(minimum=1, maximum=100, value=50, step=1,
+                                               interactive = True, label="最大参考文献数目")
                 with gr.Row():
                     clear_button_pp = gr.Button("Clear")
             file_output = gr.File(label="Output")
     clear_button_pp.click(fn=clear_inputs, inputs=[title, description_pp], outputs=[title, description_pp])
+    # submit_button_pp.click(fn=wrapped_generator, inputs=[title, description_pp, key, template, tldr, slider, sections, bibtex_file], outputs=file_output)
+    submit_button_pp.click(fn=wrapped_generator, inputs=[title, description_pp, key, template, tldr_checkbox, slider, sections, bibtex_file, model_selection ], outputs=file_output)
 demo.queue(concurrency_count=1, max_size=5, api_open=False)
 demo.launch()

auto_backgrounds.py CHANGED Viewed

@@ -30,8 +30,29 @@ def log_usage(usage, generating_target, print_out=True):
         print(message)
     logging.info(message)
-def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
-                      tldr=False, max_kw_refs=4, max_num_refs=10):
     print("Generation setup...")
     paper = {}
     paper_body = {}
@@ -44,24 +65,16 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
     print("Initialize the paper information ...")
     input_dict = {"title": title, "description": description}
     # keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
-    keywords, usage = keywords_generation(input_dict) #todo: handle format error here
-    print(f"keywords: {keywords}")
     log_usage(usage, "keywords")
     # generate keywords dictionary
     keywords = {keyword:max_kw_refs for keyword in keywords}
-    # tmp = {}
-    # for keyword in json.loads(keywords):
-    #     tmp[keyword] = max_kw_refs
-    # keywords = tmp
-    print(f"keywords: {keywords}")
-    ref = References()
     ref.collect_papers(keywords, tldr=tldr)
-    # todo: use `all_paper_ids` to check if all citations are in this list
-    #       in tex_processing, remove all duplicated ids
-    #       find most relevant papers; max_num_refs
-    all_paper_ids = ref.to_bibtex(bibtex_path)
     print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
@@ -70,11 +83,12 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
     paper["references"] = ref.to_prompts()
     paper["body"] = paper_body
     paper["bibtex"] = bibtex_path
-    return paper, destination_folder, all_paper_ids
 def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-4"):
     paper, destination_folder, _ = _generation_setup(title, description, template, model)
     for section in ["introduction", "related works", "backgrounds"]:
@@ -92,25 +106,15 @@ def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-
     return make_archive(destination_folder, filename)
-def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
-    """
-    This function is used to test the whole pipeline without calling OpenAI API.
-    """
-    input_dict = {"title": title, "description": description, "generator": "generate_draft"}
-    filename = hash_name(input_dict) + ".zip"
-    return make_archive("sample-output.pdf", filename)
-def generate_draft(title, description="", template="ICLR2022", model="gpt-4", tldr=True, max_kw_refs=4):
-    paper, destination_folder, _ = _generation_setup(title, description, template, model, tldr, max_kw_refs)
-    raise
-    # todo: `list_of_methods` failed to be generated; find a solution ...
-    # print("Generating figures ...")
-    # usage = figures_generation(paper, destination_folder, model="gpt-3.5-turbo")
-    # log_usage(usage, "figures")
-    # for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
-    for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
         max_attempts = 4
         attempts_count = 0
         while attempts_count < max_attempts:
@@ -127,6 +131,7 @@ def generate_draft(title, description="", template="ICLR2022", model="gpt-4", tl
     input_dict = {"title": title, "description": description, "generator": "generate_draft"}
     filename = hash_name(input_dict) + ".zip"
     return make_archive(destination_folder, filename)

         print(message)
     logging.info(message)
+def _generation_setup(title, description="", template="ICLR2022", tldr=False,
+                      max_kw_refs=10, max_num_refs=50, bib_refs=None):
+    """
+    This function handles the setup process for paper generation; it consists of three steps:
+        1. Copy the template to the outputs folder. Create the log file `generation.log`
+        2. Collect references based on the given `title` and `description`
+        3. Generate the basic `paper` object (a dictionary)
+    Parameters:
+        title (str): The title of the paper.
+        description (str, optional): A short description or abstract for the paper. Defaults to an empty string.
+        template (str, optional): The template to be used for paper generation. Defaults to "ICLR2022".
+        tldr (bool, optional): A flag indicating whether the Semantic Scholar TL;DR (Too Long; Didn't Read) summary should be used as the summary of each collected paper. Defaults to False.
+        max_kw_refs (int, optional): The maximum number of references that can be associated with each keyword. Defaults to 10.
+        max_num_refs (int, optional): The maximum number of references that can be included in the paper. Defaults to 50.
+        bib_refs (list, optional): A list of pre-existing references in BibTeX format. Defaults to None.
+    Returns:
+    tuple: A tuple containing the following elements:
+        - paper (dict): A dictionary containing the generated paper information.
+        - destination_folder (str): The path to the destination folder where the generation log is saved.
+        - all_paper_ids (list): A list of all paper IDs collected for the references.
+    """
     print("Generation setup...")
     paper = {}
     paper_body = {}
     print("Initialize the paper information ...")
     input_dict = {"title": title, "description": description}
     # keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
+    keywords, usage = keywords_generation(input_dict)
     log_usage(usage, "keywords")
     # generate keywords dictionary
     keywords = {keyword:max_kw_refs for keyword in keywords}
+    print(f"keywords: {keywords}\n\n")
+    ref = References(title, bib_refs)
     ref.collect_papers(keywords, tldr=tldr)
+    all_paper_ids = ref.to_bibtex(bibtex_path, max_num_refs) #todo: max_num_refs has not been implemented yet
     print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
     paper["references"] = ref.to_prompts()
     paper["body"] = paper_body
     paper["bibtex"] = bibtex_path
+    return paper, destination_folder, all_paper_ids #todo: use `all_paper_ids` to check if all citations are in this list
 def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-4"):
+    # todo: update this call to match the current `_generation_setup` signature (its 4th positional parameter is now `tldr`, not `model`)
     paper, destination_folder, _ = _generation_setup(title, description, template, model)
     for section in ["introduction", "related works", "backgrounds"]:
     return make_archive(destination_folder, filename)
+def generate_draft(title, description="", template="ICLR2022",
+                   model="gpt-4", tldr=True, max_kw_refs=10, max_num_refs=30, sections=None, bib_refs=None):
+    # pre-processing `sections` parameter;
+    if sections is None:
+        sections = ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]
+    # todo: add more parameters; select which section to generate; select maximum refs.
+    paper, destination_folder, _ = _generation_setup(title, description, template, tldr, max_kw_refs, max_num_refs, bib_refs)
+    for section in sections:
         max_attempts = 4
         attempts_count = 0
         while attempts_count < max_attempts:
     input_dict = {"title": title, "description": description, "generator": "generate_draft"}
     filename = hash_name(input_dict) + ".zip"
+    print("\nMission completed.\n")
     return make_archive(destination_folder, filename)

latex_templates/pre_refs.bib CHANGED Viewed

@@ -1,17 +1,20 @@
-@article{1512.07669,
-          title = {Reinforcement Learning: Stochastic Approximation Algorithms for Markov
-  Decision Processes},
-          author = {Vikram Krishnamurthy},
-          journal={arXiv preprint arXiv:1512.07669},
-          year = {2015},
-          url = {http://arxiv.org/abs/1512.07669v1}
-        }
-@article{1511.02377,
-          title = {The Value Functions of Markov Decision Processes},
-          author = {Ehud Lehrer , Eilon Solan , Omri N. Solan},
-          journal={arXiv preprint arXiv:1511.02377},
-          year = {2015},
-          url = {http://arxiv.org/abs/1511.02377v1}
-        }

+@inproceedings{ma2020understanding,
+  title={Understanding the impact of model incoherence on convergence of incremental sgd with random reshuffle},
+  author={Ma, Shaocong and Zhou, Yi},
+  booktitle={International Conference on Machine Learning},
+  pages={6565--6574},
+  year={2020},
+  organization={PMLR}
+}
+@inproceedings{ma2020variance,
+ author = {Ma, Shaocong and Zhou, Yi and Zou, Shaofeng},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin},
+ pages = {14796--14806},
+ publisher = {Curran Associates, Inc.},
+ title = {Variance-Reduced Off-Policy TDC Learning: Non-Asymptotic Convergence Analysis},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2020/file/a992995ef4f0439b258f2360dbb85511-Paper.pdf},
+ volume = {33},
+ year = {2020}
+}

utils/prompts.py CHANGED Viewed

@@ -33,16 +33,15 @@ def generate_experiments_prompts(paper_info):
 ######################################################################################################################
 # two parameters: min_refs_num, max_refs_num
-keywords_system_template = """You are an assistant designed to provide accurate and informative keywords of searching academic papers.
-Instructions
-- Your response should always be a Python list; e.g. ["keyword1", "keyword2", "keyword3"]
-- The length of list should between {min_refs_num} and {max_refs_num}
-- Use specific phrases as keywords and avoid using too general words (e.g. machine learning)"""
-# keywords_system_template = """You are an assistant designed to provide related research fields of academic papers.
-# Instructions:
-# - Your response should follow the following output format: ["field1", "field2", "field3"]\n
-# - The length of this Python list should between {min_refs_num} and {max_refs_num}\n
-# - Use specific phrases instead of using too general words (e.g. machine learning)"""
 # two parameters: min_refs_num, max_refs_num
 exp_methods_system_template = """You are an assistant designed to provide most related algorithms or methods to a given paper title.

 ######################################################################################################################
 # two parameters: min_refs_num, max_refs_num
+# keywords_system_template = """You are an assistant designed to provide accurate and informative keywords of searching academic papers.
+# Instructions
+# - Your response should always be a Python list; e.g. ["keyword1", "keyword2", "keyword3"]
+# - The length of list should between {min_refs_num} and {max_refs_num}
+# - Use specific phrases as keywords and avoid using too general words (e.g. machine learning)"""
+keywords_system_template = """You are an assistant designed to provide accurate and informative keywords of searching academic papers.\n
+Instructions:\n
+- Your response should follow the following output format: ["field1", "field2", "field3", "field4"]\n
+- The length of this Python list should between {min_refs_num} and {max_refs_num}."""
 # two parameters: min_refs_num, max_refs_num
 exp_methods_system_template = """You are an assistant designed to provide most related algorithms or methods to a given paper title.

utils/references.py CHANGED Viewed

@@ -150,7 +150,6 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
         # turn the search result to a list of paper dictionary.
         papers_ss = []
         for raw_paper in search_results_ss:
-            print(raw_paper['title'])
             if raw_paper["abstract"] is None:
                 continue
@@ -170,6 +169,8 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
                 abstract = raw_paper['tldr']['text']
             else:
                 abstract = remove_newlines(raw_paper['abstract'])
             embeddings_dict = raw_paper.get('embedding')
             if embeddings_dict is None:
                 continue
@@ -203,14 +204,13 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
 ######################################################################################################################
 class References:
-    def __init__(self):
-        # if load_papers:
-        #     # todo: (1) too large bibtex may make have issues on token limitations; may truncate to 5 or 10
-        #     #       (2) google scholar didn't give a full abstract for some papers ...
-        #     #       (3) may use langchain to support long input
-        #     self.papers = load_papers_from_bibtex(load_papers)
-        # else:
-        self.papers = {}
     def load_papers(self, bibtex, keyword):
         self.papers[keyword] = load_papers_from_bibtex(bibtex)
@@ -230,14 +230,14 @@ class References:
         for key, counts in keywords_dict.items():
             self.papers[key] = _collect_papers_ss(key, counts, tldr)
-    def find_relevant(self, max_refs=30):
-        # todo: use embeddings to evaluate
-        pass
-    def to_bibtex(self, path_to_bibtex="ref.bib"):
         """
         Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
         """
         papers = self._get_papers(keyword = "_all")
         # clear the bibtex file

         # turn the search result to a list of paper dictionary.
         papers_ss = []
         for raw_paper in search_results_ss:
             if raw_paper["abstract"] is None:
                 continue
                 abstract = raw_paper['tldr']['text']
             else:
                 abstract = remove_newlines(raw_paper['abstract'])
+            # some papers have no embeddings; handle this case
             embeddings_dict = raw_paper.get('embedding')
             if embeddings_dict is None:
                 continue
 ######################################################################################################################
 class References:
+    def __init__(self, title, load_papers):
+        if load_papers is not None:
+            self.papers = {}
+            self.papers["customized_refs"] = load_papers_from_bibtex(load_papers)
+        else:
+            self.papers = {}
+        self.title = title
     def load_papers(self, bibtex, keyword):
         self.papers[keyword] = load_papers_from_bibtex(bibtex)
         for key, counts in keywords_dict.items():
             self.papers[key] = _collect_papers_ss(key, counts, tldr)
+    def to_bibtex(self, path_to_bibtex="ref.bib", max_num_refs=50):
         """
         Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
         """
+        # todo:
+        #   use embeddings to evaluate; keep top k relevant references in papers
+        #   send (title, .bib file) to evaluate embeddings; receive truncated papers
         papers = self._get_papers(keyword = "_all")
         # clear the bibtex file