p-baleine committed on
Commit cfdc527
1 Parent(s): 38b78a6
.gitignore ADDED
@@ -0,0 +1,8 @@
+ __pycache__
+ .ipynb_checkpoints
+
+ .envrc
+
+ !.gitkeep
+
+ .cache/*
README.md ADDED
@@ -0,0 +1 @@
+ # metaanalyser
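
The README added in this commit is only a title. For orientation, here is a minimal, hedged sketch (not part of the commit) of how the modules introduced below might be wired together; it assumes LangChain's `ChatOpenAI` as the model and the standard `Chain.run` interface, and the query string is made up:

```python
# Hypothetical usage sketch, based on the classes and helpers added in this commit.
from langchain.chat_models import ChatOpenAI

from metaanalyser.chains import SROverviewChain, SROutlintChain, SRSectionChain
from metaanalyser.paper import search_on_google_scholar, create_papers_vectorstor

llm = ChatOpenAI(temperature=0)

query = "systematic review of graph neural networks"  # example query (assumption)
papers = search_on_google_scholar(query)               # SerpApi search + arxiv metadata
paper_store = create_papers_vectorstor(papers)         # FAISS index over the paper texts

overview = SROverviewChain(llm=llm).run(query=query, papers=papers)
outline = SROutlintChain(llm=llm).run(query=query, papers=papers, overview=overview)

# Sections are generated one at a time; the keys mirror SRSectionChain.input_keys.
section_chain = SRSectionChain(llm=llm, paper_store=paper_store)
introduction = section_chain.run(
    section_idx=0,
    section_level=2,
    query=query,
    papers=papers,
    overview=overview,
    outline=outline,
)
```
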
metaanalyser/__init__.py ADDED
File without changes
metaanalyser/chains/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from .overview import SROverviewChain
+ from .outline import SROutlintChain
+ from .section import SRSectionChain
+
+
+ __all__ = [
+     "SROutlintChain",
+     "SROverviewChain",
+     "SRSectionChain",
+ ]
metaanalyser/chains/base.py ADDED
@@ -0,0 +1,58 @@
+ import logging
+ from langchain.base_language import BaseLanguageModel
+ from langchain.chains.llm import LLMChain
+ from langchain.callbacks.manager import CallbackManagerForChainRun
+ from langchain.output_parsers import RetryWithErrorOutputParser
+ from langchain.prompts.base import BasePromptTemplate
+ from langchain.schema import BaseOutputParser, OutputParserException
+ from typing import Any, Dict, List, Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ class SRBaseChain(LLMChain):
+
+     def _call(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         response = self.generate(inputs, run_manager=run_manager)
+         # We want to check token usage
+         logger.info(f"LLM utilization: {response.llm_output}")
+         return self.create_outputs(response)[0]
+
+     async def _acall(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         response = await self.agenerate(inputs, run_manager=run_manager)
+         logger.info(f"LLM utilization: {response.llm_output}")
+         return self.create_outputs(response)[0]
+
+
+ def maybe_retry_with_error_output_parser(
+     llm: BaseLanguageModel,
+     input_list: List[Dict[str, str]],
+     output: Dict[str, str],
+     output_parser: BaseOutputParser,
+     output_key: str,
+     prompt: BasePromptTemplate,
+ ):
+     retry_parser = RetryWithErrorOutputParser.from_llm(
+         parser=output_parser,
+         llm=llm,
+     )
+
+     try:
+         output_text = output_parser.parse(output[output_key])
+     except OutputParserException as e:
+         logger.warning(f"An error occurred on parsing output, retrying parse, {e}")
+
+         output_text = retry_parser.parse_with_prompt(
+             output[output_key],
+             prompt.format_prompt(**(input_list[0]))
+         )
+
+     return {output_key: output_text}
metaanalyser/chains/outline/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .outline import SROutlintChain
+ from .prompt import Outlint
+
+
+ __all__ = [
+     "Outlint",
+     "SROutlintChain",
+ ]
metaanalyser/chains/outline/outline.py ADDED
@@ -0,0 +1,89 @@
+ from langchain.base_language import BaseLanguageModel
+ from langchain.prompts.base import BasePromptTemplate
+ from langchain.callbacks.manager import CallbackManagerForChainRun
+ from typing import Any, Dict, List, Optional
+
+ from ...paper import (
+     Paper,
+     get_abstract_with_token_limit,
+     get_categories_string,
+ )
+ from ..base import (
+     SRBaseChain,
+     maybe_retry_with_error_output_parser,
+ )
+ from ..overview import Overview
+ from .prompt import OUTLINE_PROMPT, output_parser
+
+
+ class SROutlintChain(SRBaseChain):
+
+     prompt: BasePromptTemplate = OUTLINE_PROMPT
+     nb_categories: int = 3
+     nb_token_limit: int = 2_000
+
+     @property
+     def input_keys(self) -> List[str]:
+         return ["query", "papers", "overview"]
+
+     def _call(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         input_list = get_input_list(
+             self.llm,
+             inputs["query"],
+             inputs["papers"],
+             inputs["overview"],
+             self.nb_categories,
+             self.nb_token_limit,
+         )
+         output = super()._call(input_list, run_manager=run_manager)
+         return maybe_retry_with_error_output_parser(
+             llm=self.llm,
+             input_list=input_list,
+             output=output,
+             output_parser=output_parser,
+             output_key=self.output_key,
+             prompt=self.prompt,
+         )
+
+     async def _acall(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         input_list = get_input_list(
+             self.llm,
+             inputs["query"],
+             inputs["papers"],
+             inputs["overview"],
+             self.nb_categories,
+             self.nb_token_limit,
+         )
+         output = await super()._acall(input_list, run_manager=run_manager)
+         return maybe_retry_with_error_output_parser(
+             llm=self.llm,
+             input_list=input_list,
+             output=output,
+             output_parser=output_parser,
+             output_key=self.output_key,
+             prompt=self.prompt,
+         )
+
+
+ def get_input_list(
+     llm: BaseLanguageModel,
+     query: str,
+     papers: List[Paper],
+     overview: Overview,
+     nb_categories: int,
+     nb_token_limit: int,
+ ):
+     return [{
+         "query": query,
+         "overview": overview,
+         "categories": get_categories_string(papers, nb_categories),
+         "abstracts": get_abstract_with_token_limit(llm, papers, nb_token_limit)
+     }]
metaanalyser/chains/outline/prompt.py ADDED
@@ -0,0 +1,75 @@
+ from langchain.output_parsers import PydanticOutputParser
+ from langchain.prompts import (
+     ChatPromptTemplate,
+     PromptTemplate,
+     SystemMessagePromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+ from pydantic import BaseModel, Field
+ from typing import List, Optional
+
+
+ class Section(BaseModel):
+
+     title: str = Field(description="title of this section")
+     children: Optional[List["Section"]] = Field(description="subsections of this section")
+     description: str = Field(description="brief description of this section (approximately 30 words maximum)")
+     citation_ids: List[int] = Field(description="citation ids to a paper abstract that this section cites")
+
+
+ class Outlint(BaseModel):
+
+     sections: List[Section] = Field(description="sections that make up this systematic review")
+     citations_ids: List[int] = Field(description="citation ids to all paper abstracts cited in this paper")
+
+     def __str__(self):
+         def section_string(idx: int, section: Section):
+             result = [f"{idx}. {section.title}: {section.description}"]
+
+             if not section.children:
+                 return result[0]
+
+             result += [
+                 section_string(f"{idx}.{child_idx}", child)
+                 for child_idx, child in enumerate(section.children, start=1)
+             ]
+             return "\n".join(result)
+
+         return "\n".join([
+             section_string(idx, s)
+             for idx, s in enumerate(self.sections, start=1)
+         ])
+
+
+ output_parser = PydanticOutputParser(pydantic_object=Outlint)
+
+ system_template = "You are a research scientist and interested in {categories}. You are working on writing a systematic review regarding \"{query}\"."
+ system_prompt = SystemMessagePromptTemplate.from_template(system_template)
+
+ human_template = """Build an outline of the systematic review regarding \"{query}\" based on the following list of paper abstracts.
+
+ -----
+ {abstracts}
+ -----
+
+ The following is an overview of this systematic review. Build the outline of the systematic review according to this overview.
+
+ -----
+ {overview}
+ -----
+
+ Devise each section of this outline by citing abstracts from the papers.
+ The first element of the sections should be titled "Introduction" and the last element should be titled "Conclusion".
+
+ {format_instructions}"""
+ human_prompt = HumanMessagePromptTemplate(
+     prompt=PromptTemplate(
+         template=human_template,
+         input_variables=["query", "abstracts", "overview"],
+         partial_variables={
+             "format_instructions": output_parser.get_format_instructions()
+         }
+     )
+ )
+
+ OUTLINE_PROMPT = ChatPromptTemplate.from_messages([system_prompt, human_prompt])
metaanalyser/chains/overview/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .overview import SROverviewChain
+ from .prompt import Overview
+
+
+ __all__ = [
+     "Overview",
+     "SROverviewChain",
+ ]
metaanalyser/chains/overview/overview.py ADDED
@@ -0,0 +1,85 @@
+ from langchain.base_language import BaseLanguageModel
+ from langchain.callbacks.manager import CallbackManagerForChainRun
+ from langchain.prompts.base import BasePromptTemplate
+ from typing import Any, Dict, List, Optional
+
+ from ...paper import (
+     Paper,
+     get_abstract_with_token_limit,
+     get_categories_string,
+ )
+ from ..base import (
+     SRBaseChain,
+     maybe_retry_with_error_output_parser,
+ )
+ from .prompt import OVERVIEW_PROMPT, output_parser
+
+
+ class SROverviewChain(SRBaseChain):
+
+     prompt: BasePromptTemplate = OVERVIEW_PROMPT
+     nb_categories: int = 3
+     nb_token_limit: int = 2_000
+     nb_max_retry: int = 3
+
+     @property
+     def input_keys(self) -> List[str]:
+         return ["query", "papers"]
+
+     def _call(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         input_list = get_input_list(
+             self.llm,
+             inputs["query"],
+             inputs["papers"],
+             self.nb_categories,
+             self.nb_token_limit,
+         )
+         output = super()._call(input_list, run_manager=run_manager)
+         return maybe_retry_with_error_output_parser(
+             llm=self.llm,
+             input_list=input_list,
+             output=output,
+             output_parser=output_parser,
+             output_key=self.output_key,
+             prompt=self.prompt,
+         )
+
+     async def _acall(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         input_list = get_input_list(
+             self.llm,
+             inputs["query"],
+             inputs["papers"],
+             self.nb_categories,
+             self.nb_token_limit,
+         )
+         output = await super()._acall(input_list, run_manager=run_manager)
+         return maybe_retry_with_error_output_parser(
+             llm=self.llm,
+             input_list=input_list,
+             output=output,
+             output_parser=output_parser,
+             output_key=self.output_key,
+             prompt=self.prompt,
+         )
+
+
+ def get_input_list(
+     llm: BaseLanguageModel,
+     query: str,
+     papers: List[Paper],
+     nb_categories: int,
+     nb_token_limit: int,
+ ):
+     return [{
+         "query": query,
+         "categories": get_categories_string(papers, nb_categories),
+         "abstracts": get_abstract_with_token_limit(llm, papers, nb_token_limit)
+     }]
metaanalyser/chains/overview/prompt.py ADDED
@@ -0,0 +1,79 @@
+ from langchain.output_parsers import PydanticOutputParser
+ from langchain.prompts import (
+     ChatPromptTemplate,
+     PromptTemplate,
+     SystemMessagePromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+ from pydantic import BaseModel, Field
+ from typing import List
+
+
+ class Overview(BaseModel):
+
+     title: str = Field(description="title of the systematic review")
+     main_points: List[str] = Field(description="main points that make up the systematic review")
+     overview: str = Field(description="overview of the systematic review")
+
+     def __str__(self):
+         points = "\n - ".join(self.main_points)
+         return f"""
+ Title: {self.title}
+ Points:
+ - {points}
+ Overview: {self.overview}
+ """.strip()
+
+     def _repr_html_(self):
+         main_points = "".join([f"<li>{p}</li>" for p in self.main_points])
+
+         return (
+             "<div>"
+             f" <div><span style=\"font-weight: bold\">Title:</span>"
+             f" <span style=\"margin-left: 5px\">{self.title}</span>"
+             f" </div>"
+             f" <div><span style=\"font-weight: bold\">Main points:</span>"
+             f" <ul style=\"margin: 0 10px\">{main_points}</ul>"
+             f" </div>"
+             f" <div><span style=\"font-weight: bold\">Overview:</span>"
+             f" <span style=\"margin-left: 5px\">{self.overview}</span>"
+             f" </div>"
+             "</div>"
+         )
+
+
+ output_parser = PydanticOutputParser(pydantic_object=Overview)
+
+ system_template = "You are a research scientist and interested in {categories}. You are working on writing a systematic review regarding \"{query}\"."
+ system_prompt = SystemMessagePromptTemplate.from_template(system_template)
+
+ human_template = """Write an overview of the systematic review based on the summary of the following list of paper abstracts.
+
+ -----
+ {abstracts}
+ -----
+
+ This overview should serve as a compass for you as you construct the outline of the systematic review and write down its details.
+
+ Assume that the readers of this systematic review will not be familiar with the field. To make it easy for such readers to understand, list the main points briefly (approximately 30 words maximum) based on the following points.
+
+ - Motivation for this field and the problem this field is trying to solve
+ - Historical background of this field
+ - Future development of this field
+
+ Based on these main points, provide an overview of the systematic review regarding {query} you will write.
+
+ Finally, write the title of the systematic review you are going to write based on this overview.
+
+ {format_instructions}"""
+ human_prompt = HumanMessagePromptTemplate(
+     prompt=PromptTemplate(
+         template=human_template,
+         input_variables=["abstracts", "query"],
+         partial_variables={
+             "format_instructions": output_parser.get_format_instructions()
+         }
+     )
+ )
+
+ OVERVIEW_PROMPT = ChatPromptTemplate.from_messages([system_prompt, human_prompt])
metaanalyser/chains/section/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .section import SRSectionChain
+
+
+ __all__ = [
+     "SRSectionChain"
+ ]
metaanalyser/chains/section/prompt.py ADDED
@@ -0,0 +1,33 @@
+ from langchain.prompts import (
+     ChatPromptTemplate,
+     SystemMessagePromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+
+
+ system_template = """You are a research scientist and interested in {categories}. You are working on writing a systematic review regarding \"{query}\".
+
+ The outline of the systematic review is as follows:
+
+ -----
+ Title: {title}
+ {outline}"""
+ system_prompt = SystemMessagePromptTemplate.from_template(system_template)
+
+ human_template = """Write the "{section_title}" section of this systematic review based on the following list of snippets or abstracts of related papers.
+
+ -----
+ {snippets}
+ -----
+
+ This systematic review should adhere to the following overview:
+
+ {overview}
+
+ Write the "{section_title}" section with respect to this overview. Write the text in markdown format. The title of this section should be suffixed with {section_level} level markdown title (`{md_title_suffix}`). The text of the section should be based on a snippet or abstract and should be clearly cited. The citation should be written at the end of the sentence in the form `[^cite_id]`."""
+ human_prompt = HumanMessagePromptTemplate.from_template(human_template)
+
+ SECTION_PROMPT = ChatPromptTemplate.from_messages([
+     system_prompt,
+     human_prompt,
+ ])
metaanalyser/chains/section/section.py ADDED
@@ -0,0 +1,151 @@
+ from langchain.base_language import BaseLanguageModel
+ from langchain.callbacks.manager import CallbackManagerForChainRun
+ from langchain.prompts.base import BasePromptTemplate
+ from langchain.vectorstores.base import VectorStore
+ from typing import Any, Dict, List, Optional
+
+ from ...paper import (
+     Paper,
+     get_abstract_with_token_limit,
+     get_categories_string,
+ )
+ from ..base import (
+     SRBaseChain,
+     maybe_retry_with_error_output_parser,
+ )
+ from ..outline import Outlint
+ from ..overview import Overview
+ from .prompt import SECTION_PROMPT
+
+
+ class SRSectionChain(SRBaseChain):
+
+     paper_store: VectorStore
+     prompt: BasePromptTemplate = SECTION_PROMPT
+     nb_categories: int = 3
+     nb_token_limit: int = 2_000
+     nb_max_retry: int = 3
+
+     @property
+     def input_keys(self) -> List[str]:
+         # TODO: support nested sections
+         return [
+             "section_idx",
+             "section_level",
+             "query",
+             "papers",
+             "overview",
+             "outline"
+         ]
+
+     def _call(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         input_list = get_input_list(
+             self.llm,
+             self.paper_store,
+             inputs["section_idx"],
+             inputs["section_level"],
+             inputs["query"],
+             inputs["papers"],
+             inputs["overview"],
+             inputs["outline"],
+             self.nb_categories,
+             self.nb_token_limit,
+         )
+         return super()._call(input_list, run_manager=run_manager)
+
+     async def _acall(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         input_list = get_input_list(
+             self.llm,
+             self.paper_store,
+             inputs["section_idx"],
+             inputs["section_level"],
+             inputs["query"],
+             inputs["papers"],
+             inputs["overview"],
+             inputs["outline"],
+             self.nb_categories,
+             self.nb_token_limit,
+         )
+         return await super()._acall(input_list, run_manager=run_manager)
+
+
+ def get_input_list(
+     llm: BaseLanguageModel,
+     paper_store: VectorStore,
+     section_idx: int,
+     section_level: int,
+     query: str,
+     papers: List[Paper],
+     overview: Overview,
+     outline: Outlint,
+     nb_categories: int,
+     nb_token_limit: int,
+     max_paper_store_search_size: int = 100,
+ ):
+     section = outline.sections[section_idx]
+     papers_citation_id_map = {p.citation_id: p for p in papers}
+     related_papers = [
+         papers_citation_id_map[int(citation_id)]
+         for citation_id in section.citation_ids
+     ]
+
+     if not related_papers:
+         # If citation_ids is empty, fall back to all papers
+         # FIXME: using all of them may overflow the token limit??
+         related_papers = papers
+
+     related_snippets = paper_store.similarity_search(
+         f"{section.title} {section.description}",
+         k=max_paper_store_search_size,
+     )
+
+     # Include the abstracts of all papers cited by the overview in the snippets,
+     # then collect related passages from the vectorstore until nb_token_limit is reached
+
+     def get_snippet(title, citation_id, text):
+         text = text.replace("\n", " ")
+         return f"""
+ Title: {title}
+ citation_id: {citation_id}
+ Text: {text}
+ """
+
+     snippets = [get_snippet(p.title, p.citation_id, p.summary) for p in related_papers]
+     total_num_tokens = llm.get_num_tokens("\n".join(snippets).strip())
+     idx = 0
+
+     while idx < len(related_snippets):
+         snippet = related_snippets[idx]
+         snippet_text = get_snippet(
+             snippet.metadata["title"],
+             snippet.metadata["citation_id"],
+             snippet.page_content,
+         )
+         num_tokens = llm.get_num_tokens(snippet_text)
+
+         if total_num_tokens + num_tokens > nb_token_limit:
+             break
+
+         snippets.append(snippet_text)
+         total_num_tokens += num_tokens
+         idx += 1
+
+     return [{
+         "query": query,
+         "title": overview.title,
+         "overview": overview,
+         "section_title": section.title,
+         "section_level": section_level,
+         "md_title_suffix": "#" * section_level,
+         "outline": outline,
+         "categories": get_categories_string(papers, nb_categories),
+         "snippets": "\n".join(snippets).strip(),
+     }]
metaanalyser/memory.py ADDED
@@ -0,0 +1,10 @@
+ import os
+ from joblib import Memory
+
+
+ CACHE_DIR = os.environ.get(
+     "METAANALYSER_CACHE_DIR",
+     os.path.join(os.path.relpath(os.path.dirname(__file__)), "..", ".cache")
+ )
+
+ memory = Memory(CACHE_DIR, verbose=0)
metaanalyser/paper/__init__.py ADDED
@@ -0,0 +1,16 @@
+ from .paper import (
+     Paper,
+     get_abstract_with_token_limit,
+     get_categories_string,
+     search_on_google_scholar,
+ )
+ from .vectorstore import create_papers_vectorstor
+
+
+ __all__ = [
+     "Paper",
+     "create_papers_vectorstor",
+     "get_abstract_with_token_limit",
+     "get_categories_string",
+     "search_on_google_scholar",
+ ]
metaanalyser/paper/arxiv_categories.py ADDED
@@ -0,0 +1,172 @@
+ # Obtained by scraping https://arxiv.org/category_taxonomy
+ # TODO: handle newly added categories
+ CATEGORY_NAME_ID_MAP = {
+     'cs.AI': 'Artificial Intelligence',
+     'cs.AR': 'Hardware Architecture',
+     'cs.CC': 'Computational Complexity',
+     'cs.CE': 'Computational Engineering, Finance, and Science',
+     'cs.CG': 'Computational Geometry',
+     'cs.CL': 'Computation and Language',
+     'cs.CR': 'Cryptography and Security',
+     'cs.CV': 'Computer Vision and Pattern Recognition',
+     'cs.CY': 'Computers and Society',
+     'cs.DB': 'Databases',
+     'cs.DC': 'Distributed, Parallel, and Cluster Computing',
+     'cs.DL': 'Digital Libraries',
+     'cs.DM': 'Discrete Mathematics',
+     'cs.DS': 'Data Structures and Algorithms',
+     'cs.ET': 'Emerging Technologies',
+     'cs.FL': 'Formal Languages and Automata Theory',
+     'cs.GL': 'General Literature',
+     'cs.GR': 'Graphics',
+     'cs.GT': 'Computer Science and Game Theory',
+     'cs.HC': 'Human-Computer Interaction',
+     'cs.IR': 'Information Retrieval',
+     'cs.IT': 'Information Theory',
+     'cs.LG': 'Machine Learning',
+     'cs.LO': 'Logic in Computer Science',
+     'cs.MA': 'Multiagent Systems',
+     'cs.MM': 'Multimedia',
+     'cs.MS': 'Mathematical Software',
+     'cs.NA': 'Numerical Analysis',
+     'cs.NE': 'Neural and Evolutionary Computing',
+     'cs.NI': 'Networking and Internet Architecture',
+     'cs.OH': 'Other Computer Science',
+     'cs.OS': 'Operating Systems',
+     'cs.PF': 'Performance',
+     'cs.PL': 'Programming Languages',
+     'cs.RO': 'Robotics',
+     'cs.SC': 'Symbolic Computation',
+     'cs.SD': 'Sound',
+     'cs.SE': 'Software Engineering',
+     'cs.SI': 'Social and Information Networks',
+     'cs.SY': 'Systems and Control',
+     'econ.EM': 'Econometrics',
+     'econ.GN': 'General Economics',
+     'econ.TH': 'Theoretical Economics',
+     'eess.AS': 'Audio and Speech Processing',
+     'eess.IV': 'Image and Video Processing',
+     'eess.SP': 'Signal Processing',
+     'eess.SY': 'Systems and Control',
+     'math.AC': 'Commutative Algebra',
+     'math.AG': 'Algebraic Geometry',
+     'math.AP': 'Analysis of PDEs',
+     'math.AT': 'Algebraic Topology',
+     'math.CA': 'Classical Analysis and ODEs',
+     'math.CO': 'Combinatorics',
+     'math.CT': 'Category Theory',
+     'math.CV': 'Complex Variables',
+     'math.DG': 'Differential Geometry',
+     'math.DS': 'Dynamical Systems',
+     'math.FA': 'Functional Analysis',
+     'math.GM': 'General Mathematics',
+     'math.GN': 'General Topology',
+     'math.GR': 'Group Theory',
+     'math.GT': 'Geometric Topology',
+     'math.HO': 'History and Overview',
+     'math.IT': 'Information Theory',
+     'math.KT': 'K-Theory and Homology',
+     'math.LO': 'Logic',
+     'math.MG': 'Metric Geometry',
+     'math.MP': 'Mathematical Physics',
+     'math.NA': 'Numerical Analysis',
+     'math.NT': 'Number Theory',
+     'math.OA': 'Operator Algebras',
+     'math.OC': 'Optimization and Control',
+     'math.PR': 'Probability',
+     'math.QA': 'Quantum Algebra',
+     'math.RA': 'Rings and Algebras',
+     'math.RT': 'Representation Theory',
+     'math.SG': 'Symplectic Geometry',
+     'math.SP': 'Spectral Theory',
+     'math.ST': 'Statistics Theory',
+     'astro-ph': 'Astrophysics',
+     'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
+     'astro-ph.EP': 'Earth and Planetary Astrophysics',
+     'astro-ph.GA': 'Astrophysics of Galaxies',
+     'astro-ph.HE': 'High Energy Astrophysical Phenomena',
+     'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
+     'astro-ph.SR': 'Solar and Stellar Astrophysics',
+     'cond-mat': 'Condensed Matter',
+     'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
+     'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
+     'cond-mat.mtrl-sci': 'Materials Science',
+     'cond-mat.other': 'Other Condensed Matter',
+     'cond-mat.quant-gas': 'Quantum Gases',
+     'cond-mat.soft': 'Soft Condensed Matter',
+     'cond-mat.stat-mech': 'Statistical Mechanics',
+     'cond-mat.str-el': 'Strongly Correlated Electrons',
+     'cond-mat.supr-con': 'Superconductivity',
+     'General Relativity and Quantum Cosmology': 'gr-qc',
+     'gr-qc': 'General Relativity and Quantum Cosmology',
+     'High Energy Physics - Experiment': 'hep-ex',
+     'hep-ex': 'High Energy Physics - Experiment',
+     'High Energy Physics - Lattice': 'hep-lat',
+     'hep-lat': 'High Energy Physics - Lattice',
+     'High Energy Physics - Phenomenology': 'hep-ph',
+     'hep-ph': 'High Energy Physics - Phenomenology',
+     'High Energy Physics - Theory': 'hep-th',
+     'hep-th': 'High Energy Physics - Theory',
+     'Mathematical Physics': 'math-ph',
+     'math-ph': 'Mathematical Physics',
+     'nlin': 'Nonlinear Sciences',
+     'nlin.AO': 'Adaptation and Self-Organizing Systems',
+     'nlin.CD': 'Chaotic Dynamics',
+     'nlin.CG': 'Cellular Automata and Lattice Gases',
+     'nlin.PS': 'Pattern Formation and Solitons',
+     'nlin.SI': 'Exactly Solvable and Integrable Systems',
+     'Nuclear Experiment': 'nucl-ex',
+     'nucl-ex': 'Nuclear Experiment',
+     'Nuclear Theory': 'nucl-th',
+     'nucl-th': 'Nuclear Theory',
+     'physics': 'Physics',
+     'physics.acc-ph': 'Accelerator Physics',
+     'physics.ao-ph': 'Atmospheric and Oceanic Physics',
+     'physics.app-ph': 'Applied Physics',
+     'physics.atm-clus': 'Atomic and Molecular Clusters',
+     'physics.atom-ph': 'Atomic Physics',
+     'physics.bio-ph': 'Biological Physics',
+     'physics.chem-ph': 'Chemical Physics',
+     'physics.class-ph': 'Classical Physics',
+     'physics.comp-ph': 'Computational Physics',
+     'physics.data-an': 'Data Analysis, Statistics and Probability',
+     'physics.ed-ph': 'Physics Education',
+     'physics.flu-dyn': 'Fluid Dynamics',
+     'physics.gen-ph': 'General Physics',
+     'physics.geo-ph': 'Geophysics',
+     'physics.hist-ph': 'History and Philosophy of Physics',
+     'physics.ins-det': 'Instrumentation and Detectors',
+     'physics.med-ph': 'Medical Physics',
+     'physics.optics': 'Optics',
+     'physics.plasm-ph': 'Plasma Physics',
+     'physics.pop-ph': 'Popular Physics',
+     'physics.soc-ph': 'Physics and Society',
+     'physics.space-ph': 'Space Physics',
+     'Quantum Physics': 'quant-ph',
+     'quant-ph': 'Quantum Physics',
+     'q-bio.BM': 'Biomolecules',
+     'q-bio.CB': 'Cell Behavior',
+     'q-bio.GN': 'Genomics',
+     'q-bio.MN': 'Molecular Networks',
+     'q-bio.NC': 'Neurons and Cognition',
+     'q-bio.OT': 'Other Quantitative Biology',
+     'q-bio.PE': 'Populations and Evolution',
+     'q-bio.QM': 'Quantitative Methods',
+     'q-bio.SC': 'Subcellular Processes',
+     'q-bio.TO': 'Tissues and Organs',
+     'q-fin.CP': 'Computational Finance',
+     'q-fin.EC': 'Economics',
+     'q-fin.GN': 'General Finance',
+     'q-fin.MF': 'Mathematical Finance',
+     'q-fin.PM': 'Portfolio Management',
+     'q-fin.PR': 'Pricing of Securities',
+     'q-fin.RM': 'Risk Management',
+     'q-fin.ST': 'Statistical Finance',
+     'q-fin.TR': 'Trading and Market Microstructure',
+     'stat.AP': 'Applications',
+     'stat.CO': 'Computation',
+     'stat.ME': 'Methodology',
+     'stat.ML': 'Machine Learning',
+     'stat.OT': 'Other Statistics',
+     'stat.TH': 'Statistics Theory'
+ }
metaanalyser/paper/paper.py ADDED
@@ -0,0 +1,289 @@
+ import arxiv
+ import datetime
+ import logging
+ import re
+ import tempfile
+ from collections import Counter
+ from langchain.base_language import BaseLanguageModel
+ from langchain.utilities import SerpAPIWrapper
+ from pdfminer.high_level import extract_text
+ from pydantic import BaseModel
+ from tqdm.auto import tqdm
+ from typing import List, Optional
+
+ from ..memory import memory
+ from .arxiv_categories import CATEGORY_NAME_ID_MAP
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Citation(BaseModel):
+
+     title: str
+     snippet: str
+
+
+ class GoogleScholarItem(BaseModel):
+
+     result_id: str
+     title: str
+     link: str
+     nb_cited: int
+     citations: List[Citation]
+
+     @property
+     def mla_citiation(self) -> str:
+         mla = [c for c in self.citations if c.title == 'MLA']
+
+         if mla:
+             return mla[0]
+
+     @classmethod
+     def from_google_scholar_result(cls, result):
+         result_id = result["result_id"]
+         link = result["link"] if "link" in result else ""
+         nb_cited = (
+             result["inline_links"]["cited_by"]["total"]
+             if "cited_by" in result["inline_links"] else 0
+         )
+         citations = [
+             Citation(title=c["title"], snippet=c["snippet"]) for c in
+             fetch_google_scholar_cite(result_id)["citations"]
+         ]
+
+         return cls(
+             result_id=result_id,
+             title=result["title"],
+             link=link,
+             nb_cited=nb_cited,
+             citations=citations,
+         )
+
+
+ class Paper(BaseModel):
+     """Represents a paper; in addition to the information obtained from Google Scholar it holds fields such as the doi and the summary.
+
+     NOTE: sources other than serpapi are conceivable, but for now a Paper always originates from a serpapi search result.
+     """
+
+     citation_id: int
+     google_scholar_item: GoogleScholarItem
+     entry_id: str
+     summary: str
+     published: datetime.datetime
+     primary_category: str
+     categories: List[str]
+     text: str
+     doi: Optional[str]
+
+     @property
+     def google_scholar_result_id(self):
+         return self.google_scholar_item.result_id
+
+     @property
+     def title(self) -> str:
+         return self.google_scholar_item.title
+
+     @property
+     def link(self) -> str:
+         return self.google_scholar_item.link
+
+     @property
+     def nb_cited(self) -> int:
+         return self.google_scholar_item.nb_cited
+
+     @property
+     def citations(self) -> str:
+         return self.google_scholar_item.citations
+
+     @property
+     def mla_citiation(self) -> str:
+         return self.google_scholar_item.mla_citiation
+
+     @classmethod
+     def from_google_scholar_result(cls, citation_id, result):
+         google_scholar_item = GoogleScholarItem.from_google_scholar_result(result)
+         arxiv_result = fetch_arxiv_result(google_scholar_item.link)
+
+         def get_category(c):
+             if c not in CATEGORY_NAME_ID_MAP:
+                 logger.warning(f'Category {c} is not found in CATEGORY_NAME_ID_MAP.')
+                 return None
+             return CATEGORY_NAME_ID_MAP[c]
+
+         primary_category = get_category(arxiv_result.primary_category)
+         categories = [
+             c for c in [get_category(c) for c in arxiv_result.categories]
+             if c
+         ]
+
+         return cls(
+             citation_id=citation_id,
+             google_scholar_item=google_scholar_item,
+             entry_id=arxiv_result.entry_id,
+             summary=arxiv_result.summary,
+             published=arxiv_result.published,
+             primary_category=primary_category,
+             categories=categories,
+             doi=arxiv_result.doi,
+             text=get_text_from_arxiv_search_result(arxiv_result),
+         )
+
+     def _repr_html_(self):
+         def get_category_string():
+             # The first element of categories is basically the primary_category
+             if not self.categories:
+                 return ""
+
+             result = f"<span style=\"font-weight: bold\">{self.categories[0]}</span>"
+
+             if len(self.categories) == 1:
+                 return result
+
+             return f"{result}; " + "; ".join([c for c in self.categories[1:]])
+
+         return (
+             "<div>"
+             f" Title:&nbsp;<a href=\"{self.link}\" target=\"_blank\">{self.title}</a><br/>"
+             f" Citation:&nbsp;[{self.citation_id}] {self.mla_citiation.snippet}<br/>"
+             f" Cited by:&nbsp;{self.nb_cited}<br/>"
+             f" Published:&nbsp;{self.published}<br/>"
+             f" Categories:&nbsp;{get_category_string()}<br/>"
+             f" Summary:&nbsp;{self.summary}<br/>"
+             "</div>"
+         )
+
+
+ def search_on_google_scholar(
+     query: str,
+     approved_domains: List[str] = ["arxiv.org"],
+     n: int = 10,
+ ) -> List[Paper]:
+     """Returns the results of querying SerpApi's Google Scholar API with query.
+     Only papers from the domains listed in approved_domains are considered.
+     At most n results are returned.
+     """
+
+     def fetch(start=0):
+         def valid_item(i):
+             if "link" not in i:
+                 return False
+
+             domain = re.match(r"https?://([^/]+)", i["link"])
+
+             if not domain or domain.group(1) not in approved_domains:
+                 return False
+
+             return True
+
+         search_result = fetch_google_scholar(query, start)
+
+         return [i for i in search_result if valid_item(i)]
+
+     result = []
+     start = 0
+
+     while len(result) < n:
+         # FIXME: as it stands, this loops forever when there are fewer than n results in total
+         logger.info(f"Looking for `{query}` on Google Scholar, offset: {start}...")
+         result += fetch(start)
+         start += 10
+
+     logger.info("Collecting details...")
+
+     return [
+         Paper.from_google_scholar_result(id, i)
+         for id, i in tqdm(enumerate(result[:n], start=1))
+     ]
+
+
+ def get_categories_string(papers: List[Paper], n: int = 3) -> str:
+     categories = Counter(sum([p.categories for p in papers], []))
+     common = categories.most_common(n)
+
+     if not common:
+         return "Artificial Intelligence"
+
+     if len(common) == 1:
+         return common[0][0]
+
+     if len(common) == 2:
+         return " and ".join([c[0] for c in common])
+
+     *lst, last = common
+
+     return ", ".join([c[0] for c in lst]) + f" and {last[0]}"
+
+
+ def get_abstract_with_token_limit(
+     model: BaseLanguageModel,
+     papers: List[Paper],
+     limit: int,
+     separator: str = "\n",
+ ) -> str:
+     def get_summary(paper: Paper):
+         summary = paper.summary.replace("\n", " ")
+         return f"""
+ Title: {paper.title}
+ citation_id: {paper.citation_id}
+ Summary: {summary}
+ """
+
+     summaries = []
+     total_num_tokens = 0
+     idx = 0
+
+     while idx < len(papers):
+         summary = get_summary(papers[idx])
+         num_tokens = model.get_num_tokens(summary)
+
+         if total_num_tokens + num_tokens > limit:
+             break
+
+         summaries.append(summary)
+         total_num_tokens += num_tokens
+         idx += 1
+
+     result = separator.join(summaries).strip()
+
+     logger.info(
+         f'Number of papers: {len(summaries)}, '
+         f'number of tokens: {total_num_tokens}, text: {result[:100]}...'
+     )
+
+     return result
+
+
+ @memory.cache
+ def fetch_google_scholar(query: str, start: int) -> dict:
+     serpapi = SerpAPIWrapper(params={
+         "engine": "google_scholar",
+         "gl": "us",
+         "hl": "en",
+         "start": start,
+     })
+     return serpapi.results(query)["organic_results"]
+
+
+ @memory.cache
+ def fetch_google_scholar_cite(google_scholar_id: str) -> dict:
+     serpapi = SerpAPIWrapper(params={"engine": "google_scholar_cite"})
+     return serpapi.results(google_scholar_id)
+
+
+ @memory.cache
+ def fetch_arxiv_result(arxiv_abs_link: str) -> arxiv.Result:
+     m = re.match(r"https://arxiv\.org/abs/(.+)", arxiv_abs_link)
+     assert m is not None, f"{arxiv_abs_link} should be an arxiv link"
+     arxiv_id = m.group(1)
+     return next(arxiv.Search(id_list=[arxiv_id]).results())
+
+
+ @memory.cache
+ def get_text_from_arxiv_search_result(
+     arxiv_search_result: arxiv.Result
+ ) -> str:
+     with tempfile.TemporaryDirectory() as d:
+         file_path = arxiv_search_result.download_pdf(dirpath=d)
+         return extract_text(file_path)
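
A short usage sketch for the helpers defined in paper.py; this is an assumption-laden example rather than part of the commit (it presumes `SERPAPI_API_KEY` is set for `SerpAPIWrapper`, that the hits are arxiv-hosted, and uses a made-up query):

```python
# Hypothetical sketch of the paper helpers; results are cached on disk via metaanalyser.memory.
from langchain.chat_models import ChatOpenAI

from metaanalyser.paper import (
    search_on_google_scholar,
    get_categories_string,
    get_abstract_with_token_limit,
)

llm = ChatOpenAI(temperature=0)

papers = search_on_google_scholar("chain of thought prompting", n=5)  # needs SERPAPI_API_KEY

# A readable category summary, e.g. "Computation and Language and Machine Learning" (example output).
print(get_categories_string(papers, n=3))

# Concatenated abstracts, truncated so the total stays under the token budget.
print(get_abstract_with_token_limit(llm, papers, limit=1_000))
```
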
metaanalyser/paper/vectorstore.py ADDED
@@ -0,0 +1,58 @@
+ import logging
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.text_splitter import SpacyTextSplitter
+ from langchain.vectorstores import FAISS
+ from tqdm.auto import tqdm
+ from typing import List
+
+ from .paper import Paper
+
+ logger = logging.getLogger(__name__)
+
+
+ def create_papers_vectorstor(
+     papers: List[Paper],
+     tiktoken_encoder_model_name: str = "gpt-3.5-turbo",
+     chunk_size: int = 150,
+     chunk_overlap: int = 10,
+ ) -> FAISS:
+     splitter = SpacyTextSplitter.from_tiktoken_encoder(
+         model_name=tiktoken_encoder_model_name,
+         chunk_size=chunk_size,
+         chunk_overlap=chunk_overlap,
+     )
+
+     logger.info(
+         f"Creating vector store,"
+         f" {tiktoken_encoder_model_name=}"
+         f", {chunk_size=}, {chunk_overlap=}"
+     )
+
+     docs = splitter.create_documents(
+         [p.text.replace("\n", " ") for p in tqdm(papers)],
+         metadatas=[
+             {
+                 'google_scholar_result_id': p.google_scholar_result_id,
+                 'title': p.title,
+                 'link': p.link,
+                 'nb_cited': p.nb_cited,
+                 'citation_id': p.citation_id,
+                 'entry_id': p.entry_id,
+                 'published': str(p.published),
+                 'primary_category': p.primary_category,
+                 'categories': ", ".join(p.categories),
+                 'doi': p.doi,
+                 'citiation': p.mla_citiation.snippet,
+             } for p in papers
+         ]
+     )
+
+     embeddings = OpenAIEmbeddings()
+     db = FAISS.from_documents(docs, embeddings)
+
+     logger.info(
+         f"Vector store is created from {len(papers)} papers,"
+         f" document size={len(docs)}"
+     )
+
+     return db
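
The FAISS store returned by create_papers_vectorstor is what SRSectionChain receives as paper_store. A minimal sketch of querying it directly (assuming `OPENAI_API_KEY` is set for `OpenAIEmbeddings`; the query strings are made up):

```python
# Hypothetical sketch: build the vector store and search it the way SRSectionChain does.
from metaanalyser.paper import search_on_google_scholar, create_papers_vectorstor

papers = search_on_google_scholar("retrieval augmented generation")
db = create_papers_vectorstor(papers)

# SRSectionChain.get_input_list queries with "<section title> <section description>".
docs = db.similarity_search("Introduction motivation for retrieval augmented generation", k=5)

for doc in docs:
    print(doc.metadata["citation_id"], doc.metadata["title"])
    print(doc.page_content[:200])
```
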