import streamlit as st
import fitz  # PyMuPDF
import os
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFaceHub
from langchain.prompts import ChatPromptTemplate  # Use correct import

api_token = os.environ.get("HF_TOKEN", None)


# Simple document class compatible with LangChain's text splitter
class Document:
    def __init__(self, page_content):
        self.page_content = page_content
        self.metadata = {}  # The splitter expects a metadata attribute


# Function to extract text from a PDF uploaded through Streamlit
def extract_text_from_pdf(pdf_file):
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text


# Function to split the PDF text into chunks and embed them in a FAISS vector store
def pdf_to_vector_store(pdf_text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    documents = [Document(page_content=pdf_text)]
    print("Documents before splitting:", documents)
    split_docs = text_splitter.split_documents(documents)
    print("Documents after splitting:", split_docs)
    if len(split_docs) > 0:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        db = FAISS.from_documents(split_docs, embeddings)
        return db
    return None


# Streamlit app
st.title("Chat with PDF using LLAMA Model")

# File uploader
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    pdf_text = extract_text_from_pdf(uploaded_file)

    # Display extracted text (or handle it as needed)
    st.write("Extracted Text from PDF:")
    st.write(pdf_text[:100])  # Display first 100 characters for brevity

    # Embed the PDF text in the vector store
    st.write("Embedding PDF text into the vector store...")
    db = pdf_to_vector_store(pdf_text)
    if db:
        st.write("FAISS and embeddings setup completed.")
    else:
        st.write("Failed to set up FAISS and embeddings.")

    # If embedding was successful, proceed to Q&A
    if db:
        st.write("You can now ask questions about the PDF.")

        # Text input for the user's question
        user_question = st.text_input("Enter your question:")

        if user_question:
            # Function to answer questions using the Hugging Face model and the vector store
            def answer_question(query, db):
                # Retrieve the top 5 most relevant document chunks by similarity search
                docs = db.similarity_search(query, k=5)

                # Extract text from the retrieved documents
                context = " ".join([doc.page_content for doc in docs])

                # Construct the prompt
                prompt_template = ChatPromptTemplate.from_template(
                    """
                    Answer the following question based only on the context from the
                    vector store provided below. Think step by step before providing
                    a detailed answer.
                    {context}
                    Question: {input}
                    """
                )
                prompt = prompt_template.format(context=context, input=query)

                # Define model parameters
                model_id = "google/flan-t5-large"  # Use a smaller model
                temperature = 0.7
                max_new_tokens = 300
                top_k = 450

                # Initialize the HuggingFaceHub model and pass the generation parameters
                llm = HuggingFaceHub(
                    repo_id=model_id,
                    huggingfacehub_api_token=api_token,
                    model_kwargs={
                        "temperature": temperature,
                        "max_new_tokens": max_new_tokens,
                        "top_k": top_k,
                    },
                )

                # Get the response
                response = llm(prompt)
                return response

            # Get the answer
            answer = answer_question(user_question, db)
            st.write("Answer from the model:")
            st.write(answer)

# Note: Ensure you handle large PDFs appropriately to avoid performance issues
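
# --- Optional: cache the vector store across Streamlit reruns ---
# Streamlit re-runs this whole script on every interaction, so the code above
# re-extracts and re-embeds the PDF each time the user types a question. Below
# is a minimal sketch of one way to avoid that by keeping the FAISS index in
# st.session_state. The helper name and the session keys "pdf_name" and
# "pdf_db" are hypothetical choices for this example, not part of any API.

def get_or_build_vector_store(uploaded_file):
    """Re-extract and re-embed the PDF only when a different file is uploaded."""
    if st.session_state.get("pdf_name") != uploaded_file.name:
        text = extract_text_from_pdf(uploaded_file)
        st.session_state["pdf_db"] = pdf_to_vector_store(text)
        st.session_state["pdf_name"] = uploaded_file.name
    return st.session_state["pdf_db"]

# Usage sketch: inside the `if uploaded_file is not None:` block, replace the
# direct calls to extract_text_from_pdf and pdf_to_vector_store with
#     db = get_or_build_vector_store(uploaded_file)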