# Spaces: Runtime error  (HuggingFace Spaces page-header artifact from the scrape,
# commented out so the module parses)
import sqlite3

from fastapi import FastAPI

# from transformers import pipeline
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# NOTE - we configure docs_url to serve the interactive Docs at the root path
# of the app. This way, we can use the docs as a landing page for the app on Spaces.
app = FastAPI(docs_url="/")

# Shared embeddings index; content=True stores the raw text alongside vectors.
embeddings = Embeddings(
    {
        "path": "sentence-transformers/all-MiniLM-L6-v2",
        "content": True,
    }
)

# Create extractor instance
# extractor = Extractor(embeddings, "google/flan-t5-base")
def _stream(dataset, limit, index: int = 0): | |
for row in dataset: | |
yield (index, row.page_content, None) | |
index += 1 | |
if index >= limit: | |
break | |
def _max_index_id(path): | |
db = sqlite3.connect(path) | |
table = "sections" | |
df = pd.read_sql_query(f"select * from {table}", db) | |
return {"max_index": df["indexid"].max()} | |
def _prompt(question): | |
return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered. | |
Question: {question} | |
Context: """ | |
async def _search(query, extractor, question=None):
    """Run *extractor* for *query* and return the first answer string.

    *question* defaults to the query text when empty/None.
    """
    effective_question = question if question else query
    # Extractor takes (name, query, prompt, snippet) tuples and returns
    # a list of (name, answer) pairs; we want the answer of the first one.
    results = extractor([("answer", query, _prompt(effective_question), False)])
    return results[0][1]
def _text_splitter(doc):
    """Split loaded documents into ~500-character chunks with 50-char overlap."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    return splitter.transform_documents(doc)
def _load_docs(path: str):
    """Fetch the web page at *path* and return it split into chunks."""
    raw_docs = WebBaseLoader(path).load()
    return _text_splitter(raw_docs)
async def _upsert_docs(doc):
    """Upsert *doc* chunks into the shared embeddings index, persist it to
    ``index/`` and return the index object.

    New rows are ided starting after the current max indexid so existing
    entries are not overwritten.
    """
    start_id = _max_index_id("index/documents")["max_index"]
    embeddings.upsert(_stream(doc, 500, start_id))
    embeddings.save("index")
    return embeddings
async def get_doc_path(path: str):
    """Async passthrough returning *path* unchanged (hook for future lookup logic)."""
    return path
async def rag(question: str, path: str = ""):
    """Answer *question* with retrieval-augmented generation over the persisted
    txtai index, optionally ingesting the document at *path* first.

    Fixes in this version:
    - the old code read ``path`` before it was ever assigned
      (guaranteed UnboundLocalError); it is now an optional parameter and
      ingestion is skipped when it is empty;
    - assigning to a local ``embeddings`` made the module-level index local for
      the whole function, so ``embeddings.load`` also raised
      UnboundLocalError; the global index is used throughout now;
    - ``_upsert_docs`` is a coroutine and was never awaited (the Extractor was
      handed a coroutine object, not an index);
    - ``return {answer}`` built a one-element set; a JSON-friendly dict is
      returned instead.
    """
    embeddings.load("index")
    if path:
        doc_path = await get_doc_path(path)
        doc = _load_docs(doc_path)
        await _upsert_docs(doc)
    # Create extractor instance over the (possibly updated) index
    extractor = Extractor(embeddings, "google/flan-t5-base")
    answer = await _search(question, extractor)
    return {"answer": answer}