"""FastAPI app exposing a text-generation endpoint backed by a Llama 2 GGML model."""

from fastapi import FastAPI
from transformers import pipeline
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# NOTE - we configure docs_url to serve the interactive Docs at the root path
# of the app. This way, we can use the docs as a landing page for the app on Spaces.
app = FastAPI(docs_url="/")

# Create embeddings model with content support
# embeddings = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2", "content": True})
# embeddings.load('index')

# Create extractor instance
# extractor = Extractor(embeddings, "google/flan-t5-base")

# pipe = pipeline(model="TheBloke/Llama-2-7B-GGML/llama-2-7b.ggmlv3.q4_0.bin")

# Hugging Face Hub repository and file name of the quantized model weights.
model_name_or_path = "TheBloke/Llama-2-7B-GGML"
model_basename = "llama-2-7b.ggmlv3.q4_0.bin"

# Fetch the weights file from the Hub and load it once at import time so that
# every request reuses the same model instance.
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
llm = Llama(model_path=model_path)


@app.get("/generate")
def generate(text: str):
    """Generate a completion for ``text`` with the llama2 q4 backend.

    Args:
        text: Prompt passed unchanged to the model.

    Returns:
        A dict of the form ``{"output": <generated text>}``.
    """
    output = llm(text)
    # llama-cpp-python returns an OpenAI-style completion dict; the generated
    # text is under choices[0]["text"]. The former output[0]["generated_text"]
    # lookup is the transformers pipeline format and raised KeyError here.
    return {"output": output["choices"][0]["text"]}


def prompt(question: str) -> str:
    """Build the extractor prompt that restricts the answer to the given context.

    Args:
        question: Question text interpolated into the template.

    Returns:
        The prompt string, ending with the ``Context: `` marker.
    """
    # NOTE(review): the separators before "Question:" and "Context:" are kept
    # as single spaces; they may originally have been line breaks - confirm.
    return (
        "Answer the following question using only the context below. "
        "Say 'no answer' when the question can't be answered. "
        f"Question: {question} Context: "
    )


def search(query, question=None):
    """Run ``query`` through the extractor and return the answer.

    NOTE: ``extractor`` is only defined in the commented-out setup above, so
    calling this function raises ``NameError`` until that setup is restored.

    Args:
        query: Query passed to the extractor.
        question: Question used to build the prompt; defaults to ``query``
            when empty or omitted.

    Returns:
        The second element of the first result returned by the extractor.
    """
    # Default question to query if empty
    if not question:
        question = query

    return extractor([("answer", query, prompt(question), False)])[0][1]


# @app.get("/rag")
# def rag(question: str):
#     # question = "what is the document about?"
#     answer = search(question)
#     # print(question, answer)
#     return {answer}