from typing import List, Optional, Tuple

from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    SummaryIndex,
    VectorStoreIndex,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core.tools import FunctionTool, QueryEngineTool
from llama_index.core.vector_stores import FilterCondition, MetadataFilters
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI


def get_doc_tools(
    file_path: str,
    name: str,
) -> Tuple[FunctionTool, QueryEngineTool]:
    """Build a vector-search tool and a summarization tool for one document.

    The document at ``file_path`` is loaded, split into 1024-token chunks,
    and indexed twice: once in a ``VectorStoreIndex`` (for targeted,
    optionally page-filtered retrieval) and once in a ``SummaryIndex``
    (for whole-document summarization).

    Args:
        file_path (str): Path to the document to load.
        name (str): Identifier appended to the tool names
            (``vector_tool_{name}``, ``summary_tool_{name}``); must be a
            valid tool-name suffix.

    Returns:
        Tuple[FunctionTool, QueryEngineTool]: ``(vector_query_tool,
        summary_tool)`` ready to hand to an agent.
    """
    # Load the document and chunk it; both indexes share the same nodes.
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
    splitter = SentenceSplitter(chunk_size=1024)
    nodes = splitter.get_nodes_from_documents(documents)
    vector_index = VectorStoreIndex(nodes)

    def vector_query(
        query: str,
        page_numbers: Optional[List[str]] = None,
    ) -> str:
        """Use to answer questions over a given paper.

        Useful if you have specific questions over the paper.
        Always leave page_numbers as None UNLESS there is a specific page
        you want to search for.

        Args:
            query (str): the string query to be embedded.
            page_numbers (Optional[List[str]]): Filter by set of pages.
                Leave as NONE if we want to perform a vector search over
                all pages. Otherwise, filter by the set of specified pages.
        """
        page_numbers = page_numbers or []
        # OR-combine the page filters so any listed page may match;
        # an empty filter list means "search all pages".
        metadata_dicts = [
            {"key": "page_label", "value": p} for p in page_numbers
        ]

        query_engine = vector_index.as_query_engine(
            similarity_top_k=2,
            filters=MetadataFilters.from_dicts(
                metadata_dicts,
                condition=FilterCondition.OR,
            ),
        )
        response = query_engine.query(query)
        # Coerce the Response object to text so the tool output matches
        # the declared -> str contract.
        return str(response)

    vector_query_tool = FunctionTool.from_defaults(
        name=f"vector_tool_{name}",
        fn=vector_query,
    )

    # tree_summarize with async enabled speeds up summarization over the
    # full node list.
    summary_index = SummaryIndex(nodes)
    summary_query_engine = summary_index.as_query_engine(
        response_mode="tree_summarize",
        use_async=True,
    )
    summary_tool = QueryEngineTool.from_defaults(
        name=f"summary_tool_{name}",
        query_engine=summary_query_engine,
        description=(
            f"Useful for summarization questions related to {name}"
        ),
    )

    return vector_query_tool, summary_tool