test / app.py
gopalnoutiyal's picture
Update app.py
e0ffeff verified
raw
history blame contribute delete
No virus
4.16 kB
import streamlit as st
import fitz # PyMuPDF
import os
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFaceHub
from langchain.prompts import ChatPromptTemplate # Use correct import
# Hugging Face access token taken from the HF_TOKEN environment variable
# (None when the variable is not set).
api_token = os.getenv("HF_TOKEN")
# Simple document class
class Document:
    """Lightweight document holder: a block of text plus a metadata dict.

    Instances expose ``page_content`` and ``metadata`` attributes.
    """

    # Longest page_content preview shown by __repr__ before truncation.
    _REPR_PREVIEW_CHARS = 60

    def __init__(self, page_content, metadata=None):
        """Store the text and an optional metadata mapping.

        Args:
            page_content: Text held by this document.
            metadata: Optional dict of extra information about the text.
                When omitted, each instance gets its own new empty dict.
        """
        self.page_content = page_content
        # The default is created here rather than in the signature so that
        # instances never share one mutable dict.
        self.metadata = {} if metadata is None else metadata

    def __repr__(self):
        """Return a short, readable representation with truncated content."""
        preview = self.page_content
        limit = self._REPR_PREVIEW_CHARS
        if isinstance(preview, str) and len(preview) > limit:
            preview = preview[: limit - 3] + "..."
        return f"Document(page_content={preview!r}, metadata={self.metadata!r})"
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object with a ``read()`` method whose
            bytes form a PDF (the app passes the Streamlit upload object).

    Returns:
        str: The text of all pages joined in page order; an empty string
        when no page yields any text.
    """
    # Rewind seekable streams so a stream that was already read once is
    # parsed from its first byte instead of yielding empty data.
    is_seekable = getattr(pdf_file, "seekable", None)
    if callable(is_seekable) and is_seekable():
        pdf_file.seek(0)

    # The context manager closes the PyMuPDF document even if a page fails
    # to parse; the join is fully consumed before the document is closed.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
# Function to embed PDF text in the vector store
def pdf_to_vector_store(pdf_text):
    """Split text into overlapping chunks and index them in a FAISS store.

    Args:
        pdf_text: Full text to index.

    Returns:
        The FAISS vector store built from the chunks, or ``None`` when the
        text is empty/blank or splitting produces no chunks.
    """
    # Nothing meaningful to embed: return before constructing the splitter
    # or loading the embedding model.
    if not pdf_text or not pdf_text.strip():
        return None

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = text_splitter.split_documents([Document(page_content=pdf_text)])
    # Report only the chunk count; printing the chunks themselves would dump
    # the uploaded document's content to the server output.
    print(f"Split PDF text into {len(split_docs)} chunk(s)")
    if not split_docs:
        return None

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_documents(split_docs, embeddings)
# Streamlit app

# Generation settings for the hosted model.
MODEL_ID = "google/flan-t5-large"  # Use a smaller model
TEMPERATURE = 0.7
MAX_TOKENS = 300
TOP_K = 450

# Prompt sent to the model; {context} and {input} are filled per question.
PROMPT_TEMPLATE = """
Answer the following question based only on the context from vector store I have provided. Think step by step before providing a detailed answer.
<context>
{context}
</context>
Question: {input}
"""


def answer_question(query, db):
    """Answer a question using chunks retrieved from the vector store.

    Args:
        query: The user's question.
        db: Vector store exposing ``search(query, search_type=..., k=...)``.

    Returns:
        The response returned by the hosted model for the built prompt.
    """
    # Retrieve top 5 relevant document chunks by similarity.
    docs = db.search(query, search_type="similarity", k=5)
    context = " ".join(doc.page_content for doc in docs)

    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context, input=query
    )

    # Forward the generation settings; they were previously defined but never
    # passed on, so the model ran with its defaults.
    llm = HuggingFaceHub(
        repo_id=MODEL_ID,
        huggingfacehub_api_token=api_token,
        model_kwargs={
            "temperature": TEMPERATURE,
            "max_length": MAX_TOKENS,
            "top_k": TOP_K,
        },
    )
    return llm(prompt)


# NOTE(review): the UI labels say "LLAMA" but MODEL_ID is a FLAN-T5 model;
# labels are left unchanged here - confirm the intended wording.
st.title("Chat with PDF using LLAMA Model")

# File uploader
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Streamlit re-executes this script on every interaction (for example each
    # typed question), so extraction and embedding results are kept in
    # session state and recomputed only when a different file is uploaded.
    # Files are told apart by name and size, which is assumed to be enough.
    file_key = (uploaded_file.name, uploaded_file.size)
    is_new_file = st.session_state.get("pdf_key") != file_key

    if is_new_file:
        st.session_state["pdf_text"] = extract_text_from_pdf(uploaded_file)
    pdf_text = st.session_state["pdf_text"]

    # Display extracted text (or handle it as needed)
    st.write("Extracted Text from PDF:")
    st.write(pdf_text[:100])  # Display first 100 characters for brevity

    if is_new_file:
        # Embed PDF text in the vector store
        st.write("Embedding PDF text into the vector store...")
        st.session_state["pdf_db"] = pdf_to_vector_store(pdf_text)
        # Record the key last so a failed run is retried on the next rerun.
        st.session_state["pdf_key"] = file_key
    db = st.session_state["pdf_db"]

    if db is not None:
        st.write("FAISS and embeddings setup completed.")
        # Embedding was successful, proceed to Q&A
        st.write("You can now ask questions about the PDF.")

        # Text input for user question
        user_question = st.text_input("Enter your question:")
        if user_question:
            answer = answer_question(user_question, db)
            st.write("Answer from LLAMA Model:")
            st.write(answer)
    else:
        st.write("Failed to setup FAISS and embeddings.")

# Note: Ensure you handle large PDFs appropriately to avoid performance issues