File size: 2,386 Bytes
2c73555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ab03db
f99a711
 
2c73555
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import streamlit as st
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import pickle
from groq import Groq
import os

# --- Page header ---------------------------------------------------------
APP_TITLE = "RAG-based PDF Query App"
APP_SUBTITLE = "Upload a PDF, extract its content, and query it using Groq API."

st.title(APP_TITLE)
st.write(APP_SUBTITLE)

# --- PDF upload widget ---------------------------------------------------
# Restrict the picker to .pdf files; returns None until a file is chosen.
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

if uploaded_file is not None:

    def extract_text_from_pdf(uploaded_file):
        """Return the concatenated text of every page of the uploaded PDF.

        The document handle is closed via the context manager — the original
        code leaked the MuPDF handle by never calling ``close()``.
        """
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)

    pdf_text = extract_text_from_pdf(uploaded_file)
    st.success("PDF uploaded and extracted successfully!")

    def chunk_text(text, chunk_size=500, overlap=50):
        """Split *text* into overlapping chunks suitable for embedding."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=overlap
        )
        return splitter.split_text(text)

    chunks = chunk_text(pdf_text)

    if not chunks:
        # A scanned / image-only PDF yields no extractable text;
        # FAISS.from_texts would fail on an empty list.
        st.warning("No extractable text found in this PDF.")
    else:
        # Create embeddings and index the chunks in FAISS.
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        vector_store = FAISS.from_texts(chunks, embedding_model)

        # Persist the index to disk (kept for backward compatibility with
        # anything that reads this file).  NOTE(security): only ever unpickle
        # files this app wrote itself — pickle.load on untrusted data can
        # execute arbitrary code.
        with open("faiss_index.pkl", "wb") as f:
            pickle.dump(vector_store, f)

        st.success("Document processed and stored in vector database!")

        # Query section
        query = st.text_input("Enter your query:")

        if st.button("Search"):
            if query:
                # Streamlit reruns this script top-to-bottom on every click,
                # so `vector_store` is already in memory here.  The original
                # pickle round-trip (dump then immediately load) was redundant
                # and an unnecessary deserialization risk.
                docs = vector_store.similarity_search(query, k=3)
                context = "\n".join(doc.page_content for doc in docs)

                # Fail with a friendly message instead of an unhandled
                # KeyError when the API key is not configured.
                GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
                if not GROQ_API_KEY:
                    st.error("GROQ_API_KEY environment variable is not set.")
                    st.stop()
                client = Groq(api_key=GROQ_API_KEY)

                # Simple RAG prompt: retrieved context followed by the query.
                response = client.chat.completions.create(
                    messages=[{"role": "user", "content": context + "\n\n" + query}],
                    model="llama-3.3-70b-versatile",
                )

                st.subheader("Response:")
                st.write(response.choices[0].message.content)
            else:
                st.warning("Please enter a query to search.")