Spaces:
Sleeping
Sleeping
import logging
import os

import fitz  # PyMuPDF
import streamlit as st
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.prompts import ChatPromptTemplate  # Use correct import
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
# Hugging Face API token, read from the HF_TOKEN environment variable.
# ``os.environ.get`` already yields None when the variable is unset, so no
# explicit default is needed.
api_token = os.environ.get("HF_TOKEN")
# Simple document class
class Document:
    """Lightweight container pairing a block of text with a metadata dict.

    Args:
        page_content: The text held by this document.
        metadata: Optional dict of extra information about the text.
            When omitted, each instance gets its own fresh empty dict.
    """

    # Longest page_content preview shown by __repr__ before truncation.
    _REPR_PREVIEW_CHARS = 60

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Build the dict per instance so documents never share one mapping.
        self.metadata = {} if metadata is None else metadata

    def __repr__(self):
        """Return a short, readable description (long text is truncated)."""
        preview = self.page_content
        limit = self._REPR_PREVIEW_CHARS
        if isinstance(preview, str) and len(preview) > limit:
            preview = preview[: limit - 3] + "..."
        return (
            f"{type(self).__name__}"
            f"(page_content={preview!r}, metadata={self.metadata!r})"
        )
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Binary file-like object whose ``read()`` returns the raw
            PDF bytes (the app passes the file chosen in the uploader).

    Returns:
        str: The text of all pages joined together with no separator.
    """
    # Open via a context manager so the PyMuPDF document is closed even if
    # reading a page raises; the original never closed it.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # join() avoids building the result through repeated concatenation.
        return "".join(page.get_text() for page in doc)
# Function to embed PDF text in the vector store
def pdf_to_vector_store(
    pdf_text,
    chunk_size=1000,
    chunk_overlap=200,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
):
    """Split PDF text into chunks and index them in a FAISS vector store.

    Args:
        pdf_text: Full text extracted from the PDF.
        chunk_size: Chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the
            text splitter.
        embedding_model: Name of the HuggingFace embedding model to use.

    Returns:
        The FAISS store built from the chunks, or ``None`` when there is
        nothing to index (empty/blank text, or splitting yields no chunks).
    """
    # Nothing to index: bail out before constructing the splitter or
    # loading the embedding model.
    if not pdf_text or not pdf_text.strip():
        return None

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    split_docs = text_splitter.split_documents(
        [Document(page_content=pdf_text)]
    )
    # Log only the chunk count; dumping the documents themselves would write
    # the entire PDF text to the process output.
    logging.getLogger(__name__).debug(
        "Split PDF text into %d chunk(s)", len(split_docs)
    )
    if not split_docs:
        return None

    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    return FAISS.from_documents(split_docs, embeddings)
# Streamlit app

# Prompt sent to the model; {context} and {input} are filled in per question.
_PROMPT_TEMPLATE = """
Answer the following question based only on the context from vector store I have provided. Think step by step before providing a detailed answer.
<context>
{context}
</context>
Question: {input}
"""

# Key under which the processed upload is kept in st.session_state.
_PDF_CACHE_KEY = "pdf_cache"


# Function to answer questions using LLAMA model and vector store
def answer_question(
    query,
    db,
    *,
    model_id="google/flan-t5-large",  # Use a smaller model
    temperature=0.7,
    max_tokens=300,
    top_k=450,
    num_chunks=5,
):
    """Answer a question from the most relevant chunks in the vector store.

    Args:
        query: The user's question.
        db: Vector store to search; must provide
            ``search(query, search_type=..., k=...)`` returning objects
            with a ``page_content`` attribute.
        model_id: Hugging Face Hub repository id of the model to query.
        temperature: Sampling temperature forwarded to the model.
        max_tokens: Generation length limit forwarded to the model.
        top_k: Top-k sampling value forwarded to the model.
        num_chunks: Number of document chunks to retrieve as context.

    Returns:
        The model's response to the formatted prompt.
    """
    # Retrieve the most relevant document chunks
    docs = db.search(query, search_type="similarity", k=num_chunks)
    # Extract text from the documents
    context = " ".join(doc.page_content for doc in docs)

    # NOTE(review): ChatPromptTemplate renders chat-style text; a plain
    # prompt template may suit a completion model better - confirm.
    prompt = ChatPromptTemplate.from_template(_PROMPT_TEMPLATE).format(
        context=context, input=query
    )

    # The generation settings were previously assigned but never passed on;
    # forward them so they actually take effect.
    # NOTE(review): "max_length" is assumed to be the length parameter this
    # hosted model honors - confirm against the inference API.
    llm = HuggingFaceHub(
        repo_id=model_id,
        huggingfacehub_api_token=api_token,
        model_kwargs={
            "temperature": temperature,
            "max_length": max_tokens,
            "top_k": top_k,
        },
    )
    return llm(prompt)


# NOTE(review): the UI strings say "LLAMA" but the configured model is
# google/flan-t5-large - confirm which is intended.
st.title("Chat with PDF using LLAMA Model")

# File uploader
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Streamlit re-executes this script on every widget interaction, so keep
    # the extracted text and vector store in session state and rebuild them
    # only when a different file is uploaded.
    upload_key = (uploaded_file.name, uploaded_file.size)
    cache = (
        st.session_state[_PDF_CACHE_KEY]
        if _PDF_CACHE_KEY in st.session_state
        else None
    )
    is_cached = cache is not None and cache["key"] == upload_key

    # Extract text from the uploaded PDF
    pdf_text = cache["text"] if is_cached else extract_text_from_pdf(uploaded_file)

    # Display extracted text (or handle it as needed)
    st.write("Extracted Text from PDF:")
    st.write(pdf_text[:100])  # Display first 100 characters for brevity

    if is_cached:
        db = cache["db"]
    else:
        # Embed PDF text in the vector store
        st.write("Embedding PDF text into the vector store...")
        db = pdf_to_vector_store(pdf_text)
        st.session_state[_PDF_CACHE_KEY] = {
            "key": upload_key,
            "text": pdf_text,
            "db": db,
        }

    if db is not None:
        st.write("FAISS and embeddings setup completed.")
        # Embedding was successful, proceed to Q&A
        st.write("You can now ask questions about the PDF.")

        # Text input for user question
        user_question = st.text_input("Enter your question:")
        if user_question:
            # Get the answer
            answer = answer_question(user_question, db)
            st.write("Answer from LLAMA Model:")
            st.write(answer)
    else:
        st.write("Failed to setup FAISS and embeddings.")

# Note: Ensure you handle large PDFs appropriately to avoid performance issues