p-baleine committed
Commit 5a5f604
1 Parent(s): ec36bd2

organize code

app.py CHANGED
@@ -1,11 +1,9 @@
 import logging
 import os
-# from typing import Optional, Tuple
 import gradio as gr
 from langchain.chat_models import ChatOpenAI
 
-from metaanalyser.chains import SRSectionChain, SROverviewChain, SROutlintChain
-from metaanalyser.paper import search_on_google_scholar, create_papers_vectorstor
+from metaanalyser.chains import SRChain
 
 
 logger = logging.getLogger(__name__)
@@ -13,48 +11,13 @@ logging.basicConfig()
 logging.getLogger("metaanalyser").setLevel(level=logging.DEBUG)
 
 
-def run(query: str):
-    llm = ChatOpenAI(temperature=0)
-    papers = search_on_google_scholar(query)
-    db = create_papers_vectorstor(papers)
-    overview_chain = SROverviewChain(llm=llm, verbose=True)
-    outline_chain = SROutlintChain(llm=llm, verbose=True)
-    section_chain = SRSectionChain(
-        llm=llm,
-        paper_store=db,
-        verbose=True
-    )
-
-    overview = overview_chain.run({"query": query, "papers": papers})
-    outline = outline_chain.run({"query": query, "papers": papers, "overview": overview})
-
-    sections_as_md = []
-
-    for section_idx in range(len(outline.sections)):
-        # TODO: support nested sections
-        sections_as_md.append(section_chain.run({
-            "section_idx": section_idx,
-            "section_level": 2,
-            "query": query,
-            "papers": papers,
-            "overview": overview,
-            "outline": outline
-        }))
-
-    sr = f"# {overview.title}\n\n{overview.overview}\n\n## Table of contents\n\n{outline}\n\n"
-    sr += "\n\n".join(sections_as_md)
-    sr += "\n\n## References\n"
-
-    papers_citation_id_map = {p.citation_id: p for p in papers}
-    citations = []
-
-    for citation_id in outline.citations_ids:
-        citation = papers_citation_id_map[int(citation_id)]
-        citations.append(f"[^{citation_id}]: [{citation.mla_citiation.snippet}]({citation.link})")
-
-    sr += "\n\n".join(citations)
-
-    return sr
+def run(query: str):
+    if "OPENAI_API_KEY" not in os.environ or "SERPAPI_API_KEY" not in os.environ:
+        raise gr.Error("Please paste your OpenAI (https://platform.openai.com/) key and SerpAPI (https://serpapi.com/) key to use.")
+
+    llm = ChatOpenAI(temperature=0)
+    chain = SRChain(llm=llm, verbose=True)
+    return chain.run({"query": query})
 
 
 def set_openai_api_key(api_key: str):
@@ -65,7 +28,6 @@ def set_serpapi_api_key(api_key: str):
 os.environ["SERPAPI_API_KEY"] = api_key
 
 
-# block = gr.Blocks(css=".gradio-container {background-color: lightgray}")
 block = gr.Blocks()
 
 with block:
@@ -94,6 +56,7 @@ with block:
         placeholder="the query for Google Scholar",
         lines=1,
     )
+
    submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
 
    gr.Examples(
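
After this refactor the Gradio handler is a thin wrapper around SRChain. A minimal sketch of how `run` might be wired into the Blocks UI; the `query`, `output`, and `submit` component names here are assumptions, since the actual wiring lives outside this hunk:

    # Sketch only: hypothetical component names, not part of this diff.
    with gr.Blocks() as demo:
        query = gr.Textbox(placeholder="the query for Google Scholar", lines=1)
        submit = gr.Button(value="Send", variant="secondary")
        output = gr.Markdown()
        # run(query) returns the systematic review as one markdown string.
        submit.click(fn=run, inputs=[query], outputs=[output])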
metaanalyser/chains/__init__.py CHANGED
@@ -1,9 +1,11 @@
 from .overview import SROverviewChain
 from .outline import SROutlintChain
 from .section import SRSectionChain
+from .sr import SRChain
 
 
 __all__ = [
+    "SRChain",
     "SROutlintChain",
     "SROverviewChain",
     "SRSectionChain",
metaanalyser/chains/outline/__init__.py CHANGED
@@ -1,8 +1,9 @@
 from .outline import SROutlintChain
-from .prompt import Outlint
+from .prompt import Outlint, Section
 
 
 __all__ = [
     "Outlint",
+    "Section",
     "SROutlintChain",
 ]
metaanalyser/chains/outline/outline.py CHANGED
@@ -20,7 +20,7 @@ class SROutlintChain(SRBaseChain):
 
     prompt: BasePromptTemplate = OUTLINE_PROMPT
     nb_categories: int = 3
-    nb_token_limit: int = 2_000
+    nb_token_limit: int = 1_500
 
     @property
     def input_keys(self) -> List[str]:
metaanalyser/chains/outline/prompt.py CHANGED
@@ -23,20 +23,24 @@ class Outlint(BaseModel):
     citations_ids: List[int] = Field(description="citation ids to all paper abstracts cited in this paper")
 
     def __str__(self):
-        def section_string(idx: int, section: Section):
+        def section_string(idx, section: Section, indent_level: int):
             result = [f"{idx}. {section.title}: {section.description}"]
 
             if not section.children:
                 return result[0]
 
             result += [
-                section_string(f"{idx}.{child_idx}", child)
+                section_string(
+                    (" " * (indent_level + 1)) + f"{child_idx}",
+                    child,
+                    indent_level + 1
+                )
                 for child_idx, child in enumerate(section.children, start=1)
             ]
             return "\n".join(result)
 
         return "\n".join([
-            section_string(idx, s)
+            section_string(idx, s, 0)
             for idx, s in enumerate(self.sections, start=1)
         ])
@@ -60,6 +64,7 @@ The following is an overview of this systematic review. Build the outline of the
 
 Devise each section of this outline by citing abstracts from the papers.
 The first element of the sections should be titled "Introduction" and the last element should be titled "Conclusion".
+It is preferred that sections be divided into more child sections. Each section can have up to two child sections.
 
 {format_instructions}"""
 human_prompt = HumanMessagePromptTemplate(
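
The new `section_string` signature replaces dotted child numbering (`1.1`) with whitespace indentation, one space per nesting level. A standalone sketch of that numbering logic, using a simplified stand-in for the real `Section` model:

    from typing import List
    from pydantic import BaseModel

    class Section(BaseModel):  # simplified stand-in for the real model
        title: str
        description: str
        children: List["Section"] = []

    Section.update_forward_refs()

    def section_string(idx, section, indent_level):
        result = [f"{idx}. {section.title}: {section.description}"]
        if not section.children:
            return result[0]
        result += [
            # child numbering restarts at 1, indented by nesting depth
            section_string((" " * (indent_level + 1)) + f"{child_idx}", child, indent_level + 1)
            for child_idx, child in enumerate(section.children, start=1)
        ]
        return "\n".join(result)

    methods = Section(
        title="Methods",
        description="study selection",
        children=[Section(title="Search strategy", description="query design")],
    )
    print(section_string(1, methods, 0))
    # 1. Methods: study selection
    #  1. Search strategy: query design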
metaanalyser/chains/overview/overview.py CHANGED
@@ -19,7 +19,7 @@ class SROverviewChain(SRBaseChain):
 
     prompt: BasePromptTemplate = OVERVIEW_PROMPT
     nb_categories: int = 3
-    nb_token_limit: int = 2_000
+    nb_token_limit: int = 1_500
     nb_max_retry: int = 3
 
     @property
metaanalyser/chains/section/prompt.py CHANGED
@@ -24,7 +24,7 @@ This systematic review should adhere to the following overview:
 
 {overview}
 
-Write the "{section_title}" section with respect to this overview. Write the text in markdown format. The title of this section should be suffixed with a {section_level} level markdown title (`{md_title_suffix}`). The text of the section should be based on a snippet or abstract and should be clearly cited. The citation should be written at the end of the sentence in the form `[^cite_id]`."""
+Write the "{section_title}: {section_description}" section with respect to this overview. Write the text in markdown format. The title of this section should be suffixed with a {section_level} level markdown title (`{md_title_suffix}`). The text of the section should be based on a snippet or abstract and should be clearly cited. The citation should be written at the end of the sentence in the form `[^<ID>]` where `ID` refers to the citation_id."""
 human_prompt = HumanMessagePromptTemplate.from_template(human_template)
 
 SECTION_PROMPT = ChatPromptTemplate.from_messages([
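
The `[^<ID>]` form asks the model to emit markdown footnotes keyed by citation_id, which `create_output` in the new sr.py later resolves into the References list. Illustratively, with placeholder ids and titles:

    Some claim supported by the paper with citation_id 3 [^3].

    ## References

    [^3]: [<mla_citiation.snippet of paper 3>](<paper 3 link>)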
metaanalyser/chains/section/section.py CHANGED
@@ -1,7 +1,9 @@
 from langchain.base_language import BaseLanguageModel
+from langchain.docstore.document import Document
 from langchain.callbacks.manager import CallbackManagerForChainRun
 from langchain.prompts.base import BasePromptTemplate
 from langchain.vectorstores.base import VectorStore
+from pydantic import BaseModel
 from typing import Any, Dict, List, Optional
 
 from ...paper import (
@@ -23,7 +25,7 @@ class SRSectionChain(SRBaseChain):
     paper_store: VectorStore
     prompt: BasePromptTemplate = SECTION_PROMPT
     nb_categories: int = 3
-    nb_token_limit: int = 2_000
+    nb_token_limit: int = 1_500
     nb_max_retry: int = 3
 
     @property
@@ -31,11 +33,11 @@ class SRSectionChain(SRBaseChain):
         # TODO: support nesting
         return [
             "section_idx",
-            "section_level",
             "query",
             "papers",
             "overview",
-            "outline"
+            "outline",
+            "flatten_sections",
         ]
 
     def _call(
@@ -47,11 +49,11 @@ class SRSectionChain(SRBaseChain):
             self.llm,
             self.paper_store,
             inputs["section_idx"],
-            inputs["section_level"],
             inputs["query"],
             inputs["papers"],
             inputs["overview"],
             inputs["outline"],
+            inputs["flatten_sections"],
             self.nb_categories,
             self.nb_token_limit,
         )
@@ -66,69 +68,90 @@ class SRSectionChain(SRBaseChain):
             self.llm,
             self.paper_store,
             inputs["section_idx"],
-            inputs["section_level"],
             inputs["query"],
             inputs["papers"],
             inputs["overview"],
             inputs["outline"],
+            inputs["flatten_sections"],
             self.nb_categories,
             self.nb_token_limit,
         )
         return super()._acall(input_list, run_manager=run_manager)
 
 
+class TextSplit(BaseModel):
+    """Helper class for get_input_list."""
+
+    title: str
+    citation_id: int
+    text: str
+
+    @classmethod
+    def from_paper(cls, paper: Paper) -> "TextSplit":
+        return cls(
+            title=paper.title,
+            citation_id=paper.citation_id,
+            text=paper.summary,
+        )
+
+    @classmethod
+    def from_snippet(cls, snippet: Document) -> "TextSplit":
+        return cls(
+            title=snippet.metadata["title"],
+            citation_id=snippet.metadata["citation_id"],
+            text=snippet.page_content,
+        )
+
+
 def get_input_list(
     llm: BaseLanguageModel,
     paper_store: VectorStore,
     section_idx: int,
-    section_level: int,
     query: str,
     papers: List[Paper],
     overview: Overview,
     outline: Outlint,
+    flatten_sections,
     nb_categories: int,
     nb_token_limit: int,
     max_paper_store_search_size: int = 100,
 ):
-    section = outline.sections[section_idx]
+    section = flatten_sections[section_idx]
     papers_citation_id_map = {p.citation_id: p for p in papers}
-    related_papers = [
-        papers_citation_id_map[int(citation_id)]
-        for citation_id in section.citation_ids
-    ]
 
-    if not related_papers:
+    if section.section.citation_ids:
+        related_splits = [
+            TextSplit.from_paper(papers_citation_id_map[int(citation_id)])
+            for citation_id in section.section.citation_ids
+        ]
+    else:
         # if citation_ids is empty, fall back to all papers
-        # FIXME: won't this overflow the budget if everything is included??
-        related_papers = papers
-
-    related_snippets = paper_store.similarity_search(
-        f"{section.title} {section.description}",
-        k=max_paper_store_search_size,
-    )
+        related_splits = [TextSplit.from_paper(p) for p in papers]
 
-    # include every abstract of the papers cited by the overview in the snippets,
-    # then gather related passages from the vectorstore until nb_token_limit is reached
+    related_splits += [
+        TextSplit.from_snippet(snippet) for snippet in
+        paper_store.similarity_search(
+            f"{section.section.title} {section.section.description}",
+            k=max_paper_store_search_size,
+        )
+    ]
 
-    def get_snippet(title, citation_id, text):
-        text = text.replace("\n", " ")
+    def get_snippet(split: TextSplit):
+        text = split.text.replace("\n", " ")
         return f"""
-Title: {title}
-citation_id: {citation_id}
+Title: {split.title}
+citation_id: {split.citation_id}
 Text: {text}
 """
 
-    snippets = [get_snippet(p.title, p.citation_id, p.summary) for p in related_papers]
-    total_num_tokens = llm.get_num_tokens("\n".join(snippets).strip())
+    snippets = []
+    total_num_tokens = 0
     idx = 0
 
-    while idx < len(related_snippets):
-        snippet = related_snippets[idx]
-        snippet_text = get_snippet(
-            snippet.metadata["title"],
-            snippet.metadata["citation_id"],
-            snippet.page_content,
-        )
+    while idx < len(related_splits):
+        split = related_splits[idx]
+        snippet_text = get_snippet(split)
         num_tokens = llm.get_num_tokens(snippet_text)
 
         if total_num_tokens + num_tokens > nb_token_limit:
@@ -142,9 +165,10 @@ Text: {text}
         "query": query,
         "title": overview.title,
         "overview": overview,
-        "section_title": section.title,
-        "section_level": section_level,
-        "md_title_suffix": "#" * section_level,
+        "section_title": section.section.title,
+        "section_description": section.section.description,
+        "section_level": section.level,
+        "md_title_suffix": "#" * section.level,
         "outline": outline,
         "categories": get_categories_string(papers, nb_categories),
         "snippets": "\n".join(snippets).strip(),
metaanalyser/chains/sr.py ADDED
@@ -0,0 +1,137 @@
+import logging
+from langchain.base_language import BaseLanguageModel
+from langchain.chains.base import Chain
+from langchain.callbacks.manager import CallbackManagerForChainRun
+from pydantic import BaseModel
+from typing import Any, Dict, List, Optional
+
+from ..paper import Paper, search_on_google_scholar, create_papers_vectorstor
+from .outline import SROutlintChain, Outlint, Section
+from .overview import SROverviewChain, Overview
+from .section import SRSectionChain
+
+logger = logging.getLogger(__name__)
+
+
+class SRChain(Chain):
+
+    llm: BaseLanguageModel
+    output_key: str = "text"
+
+    @property
+    def input_keys(self) -> List[str]:
+        return ["query"]
+
+    @property
+    def output_keys(self) -> List[str]:
+        return [self.output_key]
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, str]:
+        query = inputs["query"]
+        logger.info(f"Searching `{query}` on Google Scholar.")
+        papers = search_on_google_scholar(query)
+
+        logger.info("Writing an overview of the paper.")
+        overview_chain = SROverviewChain(llm=self.llm, verbose=self.verbose)
+        overview: Overview = overview_chain.run({"query": query, "papers": papers})
+
+        logger.info("Building the outline of the paper.")
+        outline_chain = SROutlintChain(llm=self.llm, verbose=self.verbose)
+        outline: Outlint = outline_chain.run({
+            "query": query,
+            "papers": papers,
+            "overview": overview
+        })
+
+        logger.info("Creating vector store.")
+        db = create_papers_vectorstor(papers)
+
+        section_chain = SRSectionChain(llm=self.llm, paper_store=db, verbose=self.verbose)
+        flatten_sections = get_flatten_sections(outline)
+        sections_as_md = []
+
+        for section_idx in range(len(flatten_sections)):
+            logger.info(f"Writing sections: [{section_idx + 1} / {len(flatten_sections)}]")
+
+            sections_as_md.append(
+                section_chain.run({
+                    "section_idx": section_idx,
+                    "query": query,
+                    "papers": papers,
+                    "overview": overview,
+                    "outline": outline,
+                    "flatten_sections": flatten_sections,
+                })
+            )
+
+        return {
+            self.output_key: create_output(outline, overview, papers, flatten_sections, sections_as_md)
+        }
+
+
+class FlattenSection(BaseModel):
+    """Helper class representing a flattened section for SRChain."""
+
+    level: int
+    section: Section
+
+
+def get_flatten_sections(
+    outline: Outlint,
+    start_section_level: int = 2,
+) -> List[FlattenSection]:
+    def inner(section_level, section: Section) -> List[FlattenSection]:
+        result = FlattenSection(level=section_level, section=section)
+
+        if not section.children:
+            return [result]
+
+        return (
+            [result] + sum([
+                inner(section_level + 1, child)
+                for child in section.children
+            ], [])
+        )
+
+    return sum([
+        inner(start_section_level, section)
+        for section in outline.sections
+    ], [])
+
+
+def create_output(
+    outline: Outlint,
+    overview: Overview,
+    papers: List[Paper],
+    flatten_sections: List[FlattenSection],
+    sections_as_md: List[str],
+) -> str:
+    papers_citation_id_map = {p.citation_id: p for p in papers}
+    all_citation_ids = list(set(
+        outline.citations_ids + sum([
+            s.section.citation_ids for s in flatten_sections
+        ], [])
+    ))
+
+    citations = []
+
+    for citation_id in all_citation_ids:
+        citation = papers_citation_id_map[int(citation_id)]
+        citations.append(
+            f"[^{citation_id}]: "
+            f"[{citation.mla_citiation.snippet}]({citation.link})"
+        )
+
+    return (
+        f"# {overview.title}\n\n{overview.overview}\n\n"
+        + f"## Table of contents\n\n{outline}\n\n"
+        + "\n\n".join(sections_as_md)
+        + "\n\n## References\n"
+        + "\n\n".join(citations)
+    )
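
SRChain is now the single entry point for the pipeline that the old app.py `run` spelled out by hand: search, overview, outline, flatten, per-section generation, assembly. A minimal usage sketch, assuming OPENAI_API_KEY and SERPAPI_API_KEY are set and the query is illustrative:

    from langchain.chat_models import ChatOpenAI
    from metaanalyser.chains import SRChain

    llm = ChatOpenAI(temperature=0)
    chain = SRChain(llm=llm, verbose=True)
    # Returns the systematic review as a single markdown string.
    markdown = chain.run({"query": "large language model agents"})
    print(markdown)

Note that `get_flatten_sections` is a pre-order traversal (each section precedes its children), so sections are written in table-of-contents order, with `level` driving the markdown heading depth via `md_title_suffix`.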