Spaces:

wwwillchen
/

mesop-docs-bot

Running

File size: 5,393 Bytes

06b2d9e

import os
import sys

import nest_asyncio
import Stemmer
from llama_index.core import (
  PromptTemplate,
  Settings,
  SimpleDirectoryReader,
  StorageContext,
  VectorStoreIndex,
  load_index_from_storage,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.query_engine import CitationQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.schema import NodeWithScore as NodeWithScore
from llama_index.embeddings.google import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.retrievers.bm25 import BM25Retriever

import mesop as me

# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the fusion retriever below is
# created with use_async=True while the host app already runs a loop — confirm.
nest_asyncio.apply()

# Prompt used by the citation query engines. The adjacent string literals are
# concatenated into a single prompt, so every sentence must carry its own
# trailing space or newline. `{context_str}` receives the numbered sources and
# `{query_str}` the user's question.
CITATION_QA_TEMPLATE = PromptTemplate(
  "Please provide an answer based solely on the provided sources. "
  "When referencing information from a source, "
  "cite the appropriate source(s) using their corresponding numbers. "
  "Every answer should include at least one source citation. "
  "Only cite a source when you are explicitly referencing it. "
  "If you are sure NONE of the sources are helpful, then say: 'Sorry, I didn't find any docs about this.' "
  "If you are not sure if any of the sources are helpful, then say: 'You might find this helpful', where 'this' is the source's title. "
  "DO NOT say Source 1, Source 2, etc. Only reference sources like this: [1], [2], etc. "
  "I want you to pick just ONE source to answer the question. "
  "For example:\n"
  "Source 1:\n"
  "The sky is red in the evening and blue in the morning.\n"
  "Source 2:\n"
  "Water is wet when the sky is red.\n"
  "Query: When is water wet?\n"
  "Answer: Water will be wet when the sky is red [2], "
  "which occurs in the evening [1].\n"
  "Now it's your turn. Below are several numbered sources of information:"
  "\n------\n"
  "{context_str}"
  "\n------\n"
  "Query: {query_str}\n"
  "Answer: "
)

# Expose the Gemini key under GOOGLE_API_KEY, the variable name read further
# down when the embedding model is created. A missing GEMINI_API_KEY raises
# KeyError here, at import time.
_gemini_api_key = os.environ["GEMINI_API_KEY"]
os.environ["GOOGLE_API_KEY"] = _gemini_api_key


def get_meta(file_path: str) -> dict[str, str]:
  """Build the metadata (`url`, `title`) for one Markdown docs file.

  The title is the file's first line when it is a level-1 heading (`# ...`);
  otherwise it is derived from the file name (dashes become spaces, first
  letter capitalized). The URL is the published docs URL for the page, i.e.
  the path below `../../docs/` without the `.md` extension.

  Args:
    file_path: Path to a `.md` file; it must contain the `../../docs/`
      segment.

  Returns:
    A dict with the keys `"url"` and `"title"`.

  Raises:
    OSError: If the file cannot be opened.
    ValueError: If `file_path` does not contain `../../docs/`.
  """
  # The docs are UTF-8 Markdown; do not depend on the platform's locale.
  with open(file_path, encoding="utf-8") as f:
    first_line = f.readline().strip()

  if first_line.startswith("# "):
    title = first_line[2:]
  else:
    # Strip only the trailing extension: a ".md" elsewhere in the name is
    # part of the name, not an extension.
    title = (
      file_path.split("/")[-1]
      .removesuffix(".md")
      .replace("-", " ")
      .capitalize()
    )

  docs_root = "../../docs/"
  page_path = file_path.removesuffix(".md")
  docs_path = page_path[page_path.index(docs_root) + len(docs_root) :]

  url = "https://mesop-dev.github.io/mesop/" + docs_path

  print(f"URL: {url}")
  return {
    "url": url,
    "title": title,
  }


# Gemini embedding model shared by index building and the query engines.
# It is also registered on llama_index's global `Settings` object.
embed_model = GeminiEmbedding(
  api_key=os.environ["GOOGLE_API_KEY"],
  model_name="models/text-embedding-004",
)
Settings.embed_model = embed_model

# Directory where the vector index and the BM25 retriever are persisted;
# its existence decides whether `build_or_load_index` builds or loads.
PERSIST_DIR = "./gen"


def build_or_load_index():
  """Return `(index, bm25_retriever)`, built fresh or restored from disk.

  A fresh build happens when `PERSIST_DIR` does not exist yet or when
  `--build-index` is present in `sys.argv`; both objects are then persisted
  under `PERSIST_DIR`. Otherwise both are loaded from `PERSIST_DIR`.

  Returns:
    A tuple of the vector index and the BM25 retriever.
  """
  bm25_dir = PERSIST_DIR + "/bm25_retriever"
  rebuild_requested = "--build-index" in sys.argv

  if os.path.exists(PERSIST_DIR) and not rebuild_requested:
    print("Loading index")
    keyword_retriever = BM25Retriever.from_persist_dir(bm25_dir)
    vector_index = load_index_from_storage(
      StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    )
    return vector_index, keyword_retriever

  print("Building index")

  # Read every Markdown page under the docs tree, skipping the listed
  # files/directories; `get_meta` supplies each document's url and title.
  reader = SimpleDirectoryReader(
    "../../docs/",
    required_exts=[".md"],
    exclude=["showcase.md", "demo.md", "blog", "internal"],
    file_metadata=get_meta,
    recursive=True,
  )
  docs = reader.load_data()
  for doc in docs:
    # Keep "url" as metadata but list it in llama_index's
    # `excluded_llm_metadata_keys` for this document.
    doc.excluded_llm_metadata_keys = ["url"]

  chunks = SentenceSplitter(chunk_size=512).get_nodes_from_documents(docs)
  keyword_retriever = BM25Retriever.from_defaults(
    nodes=chunks,
    similarity_top_k=5,
    # Optional: We can pass in the stemmer and set the language for stopwords
    # This is important for removing stopwords and stemming the query + text
    # The default is english for both
    stemmer=Stemmer.Stemmer("english"),
    language="english",
  )
  keyword_retriever.persist(bm25_dir)

  # NOTE(review): the vector index is built from the whole documents rather
  # than from `chunks`, so it may be chunked differently from the BM25
  # retriever — confirm this is intended.
  vector_index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
  vector_index.storage_context.persist(persist_dir=PERSIST_DIR)
  return vector_index, keyword_retriever


# Attribute names under which the expensive retrieval objects are stashed on
# the `mesop` module so that a hot reload of this file can reuse them.
# TODO: replace with proper mechanism for persisting objects
# across hot reloads
_STASHED_ATTRS = ("_query_engine", "_blocking_query_engine", "_bm25_retriever")

if me.runtime().is_hot_reload_in_progress and all(
  hasattr(me, attr) for attr in _STASHED_ATTRS
):
  print("Hot reload - skip building index!")
  query_engine = me._query_engine
  # Restore the blocking engine too, so this module exposes the same names
  # after a hot reload as after a cold start.
  blocking_query_engine = me._blocking_query_engine
  bm25_retriever = me._bm25_retriever
else:
  # Cold start, or a hot reload without a complete stash: build everything.
  index, bm25_retriever = build_or_load_index()
  llm = Gemini(model="models/gemini-1.5-flash-latest")
  # Combine vector-similarity and BM25 results into one ranked list.
  retriever = QueryFusionRetriever(
    [
      index.as_retriever(similarity_top_k=5),
      bm25_retriever,
    ],
    llm=llm,
    num_queries=1,
    use_async=True,
    similarity_top_k=5,
  )

  # Both engines are configured identically except for `streaming`.
  _engine_kwargs = dict(
    retriever=retriever,
    llm=llm,
    citation_qa_template=CITATION_QA_TEMPLATE,
    similarity_top_k=5,
    embedding_model=embed_model,
  )
  query_engine = CitationQueryEngine.from_args(
    index, streaming=True, **_engine_kwargs
  )
  blocking_query_engine = CitationQueryEngine.from_args(
    index, streaming=False, **_engine_kwargs
  )

  me._query_engine = query_engine
  me._blocking_query_engine = blocking_query_engine
  me._bm25_retriever = bm25_retriever


# Newline as a named constant; not referenced in the visible part of this file.
# NOTE(review): presumably kept for use by other code (e.g. inside f-string
# expressions) — confirm before removing.
NEWLINE = "\n"


def ask(query: str):
  """Answer `query` using the module-level `query_engine`.

  Args:
    query: The user's question.

  Returns:
    Whatever `query_engine.query` returns for the question.
  """
  response = query_engine.query(query)
  return response


def retrieve(query: str):
  """Look up `query` with the module-level BM25 retriever only.

  Args:
    query: The search text.

  Returns:
    Whatever `bm25_retriever.retrieve` returns for the query.
  """
  matches = bm25_retriever.retrieve(query)
  return matches