# Spaces:
# Runtime error
# Runtime error
import gradio as gr | |
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration | |
import fitz # PyMuPDF | |
from datasets import load_dataset | |
from llama_index.core import Document, VectorStoreIndex, StorageContext, load_index_from_storage, Settings | |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
from llama_index.llms.ollama import Ollama | |
# Load the pretrained RAG components (tokenizer, retriever, generation model)
# from the "facebook/rag-sequence-nq" checkpoint.
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
# NOTE(review): with index_name="custom" the retriever presumably expects a
# saved passages dataset in `passages_path` plus a separate `index_path` for
# the FAISS index; here a ".faiss" file is given as `passages_path` -- confirm.
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    index_name="custom",
    passages_path="my_knowledge_base.faiss",
)
model = RagSequenceForGeneration.from_pretrained(
    "facebook/rag-sequence-nq",
    retriever=retriever,
)

# Embedding model handed to LlamaIndex through the global settings below.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# LLM object backed by an Ollama instance serving the "llama3:instruct" model.
llm = Ollama(model="llama3:instruct", request_timeout=60.0)

# Register the embedding model, the LLM, and the chunk size as the global
# LlamaIndex settings.
Settings.embed_model = embed_model
Settings.llm = llm
Settings.chunk_size = 512
def extract_text_from_pdf(pdf_files):
    """Extract the plain text of every PDF in *pdf_files*.

    Args:
        pdf_files: Iterable of PDFs, each either a path string or an object
            whose ``name`` attribute holds the file path. ``None`` or an
            empty iterable is treated as "no files".

    Returns:
        A list with one string per input PDF, in input order. Each string is
        the text of all pages of that PDF concatenated together.
    """
    texts = []
    for pdf in pdf_files or []:
        # Plain strings are used as the path directly; anything else is
        # expected to carry its path in ``.name``.
        path = pdf if isinstance(pdf, str) else pdf.name
        # The context manager closes the document even when reading a page
        # raises, so file handles are not leaked across requests.
        with fitz.open(path) as doc:
            texts.append("".join(page.get_text() for page in doc))
    return texts
def rag_answer(question, pdf_files):
    """Generate an answer to *question* using the text of *pdf_files*.

    Args:
        question: The question, passed to the module-level RAG ``tokenizer``.
        pdf_files: PDFs forwarded unchanged to ``extract_text_from_pdf``.

    Returns:
        The first decoded sequence produced by the module-level RAG ``model``.
    """
    # Merge the text of all PDFs into one space-separated context string.
    context = " ".join(extract_text_from_pdf(pdf_files))
    encoded_question = tokenizer(question, return_tensors="pt")
    # NOTE(review): `context_input` does not appear among the documented
    # keyword arguments of `RagSequenceForGeneration.generate` (which lists
    # `context_input_ids`, `context_attention_mask`, `doc_scores`) -- confirm
    # this call actually consumes the PDF context.
    generated = model.generate(**encoded_question, context_input=context)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]
def create_vector_store_index(documents):
    """Build a vector store index from *documents* and persist it.

    Args:
        documents: Documents passed to ``VectorStoreIndex.from_documents``.

    Returns:
        The newly built index, after its storage context has been persisted
        to the "pdf_docs" directory.
    """
    built_index = VectorStoreIndex.from_documents(documents)
    storage = built_index.storage_context
    storage.persist(persist_dir="pdf_docs")
    return built_index
# Load the dataset and convert every row into a LlamaIndex Document.
pdf_docs = load_dataset('your-dataset-name', split='train')  # Replace with your actual dataset name
# A `datasets.Dataset` has no pandas-style `iterrows()`; iterating it directly
# yields one dict per row, which is all the conversion needs.
documents = [
    Document(text=row['text'], metadata={'title': row['title']})
    for row in pdf_docs
]

# Reuse the index persisted in "pdf_docs" when it can be loaded; otherwise
# build it from the documents (which also persists it).
try:
    storage_context = StorageContext.from_defaults(persist_dir="pdf_docs")
    vector_index = load_index_from_storage(storage_context)
except Exception:
    # Best-effort load: any failure to restore the persisted index falls back
    # to rebuilding it. `Exception` (rather than a bare `except`) lets
    # KeyboardInterrupt / SystemExit propagate.
    vector_index = create_vector_store_index(documents)

# Query engine over the vector index, retrieving the 10 most similar nodes.
query_engine = vector_index.as_query_engine(similarity_top_k=10)
# Functions for Gradio UI
def query(text):
    """Run *text* through the module-level ``query_engine`` and return the result."""
    return query_engine.query(text)
def interface(text):
    """Return the ``response`` attribute of the query result for *text*.

    Used as the Gradio submit callback: takes the textbox content and returns
    the value shown in the output textbox.
    """
    return query(text).response
# Gradio interface: Glass theme recoloured to white text on a black
# background, with the answer box placed above the query box.
with gr.Blocks(
    theme=gr.themes.Glass().set(
        block_title_text_color="black",
        body_background_fill="black",
        input_background_fill="black",
        body_text_color="white",
    )
) as demo:
    # The heading carries its styling inline; a bare CSS rule inside the
    # Markdown string would be rendered as literal text in the page.
    gr.Markdown(
        '<h1 style="text-align: center; display: block;">'
        'Information Custodian Chat Agent</h1>'
    )
    with gr.Row():
        output_text = gr.Textbox(lines=20)
    with gr.Row():
        input_text = gr.Textbox(label='Enter your query here')
    # Submitting the query box sends its content through `interface` and
    # writes the returned answer into the output box.
    input_text.submit(fn=interface, inputs=input_text, outputs=output_text)

demo.launch(share=True)