Spaces:

shamilcoded
/

DocuQuery_AI

Sleeping

App Files Files Community

shamilcoded committed on Apr 18, 2025

Commit

8e456d3

verified ·

1 Parent(s): e8294f8

Create app.py

Browse files

Files changed (1) hide show

app.py +87 -0

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import streamlit as st
+import os
+import tempfile
+import faiss
+import fitz  # PyMuPDF for PDFs
+import docx
+import openpyxl
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.docstore.document import Document
+from langchain_community.llms import Groq
+from langchain.chains import RetrievalQA
+from langchain.schema import Document as LCDocument
+# Initialize LLM
+# The API key is taken from the GROQ_API_KEY environment variable; os.getenv
+# yields None when that variable is not set.
+# NOTE(review): confirm that `Groq` is really exported by
+# langchain_community.llms in the installed version — the Groq integration is
+# usually provided as `ChatGroq` by the separate `langchain_groq` package.
+llm = Groq(
+    model="llama3-8b-8192",
+    api_key=os.getenv("GROQ_API_KEY")  # Put this in Hugging Face secrets
+)
+# Embeddings model
+# Module-level instance that is handed to FAISS.from_documents further down
+# when the uploaded document is indexed.
+embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+# File processors
+def read_pdf(file_path):
+    """Return the text of every page of the PDF at ``file_path``, in page order."""
+    # Open through a context manager so the document handle is released even
+    # if extraction fails part-way, and join once instead of repeated ``+=``.
+    with fitz.open(file_path) as doc:
+        return "".join(page.get_text() for page in doc)
+def read_docx(file_path):
+    """Return the text of the paragraphs in the .docx at ``file_path``, one per line."""
+    document = docx.Document(file_path)
+    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
+    return "\n".join(paragraph_texts)
+def read_excel(file_path):
+    """Flatten every sheet of the workbook at ``file_path`` into plain text.
+
+    Each row becomes one newline-terminated line made of its non-``None``
+    cells, converted with ``str`` and separated by single spaces.
+    """
+    workbook = openpyxl.load_workbook(file_path, data_only=True)
+    lines = []
+    for sheet_name in workbook.sheetnames:
+        for values in workbook[sheet_name].iter_rows(values_only=True):
+            cells = [str(value) for value in values if value is not None]
+            lines.append(" ".join(cells) + "\n")
+    return "".join(lines)
+def process_file(uploaded_file):
+    """Extract plain text from an uploaded PDF, Word (.docx) or Excel (.xlsx) file.
+
+    The upload is copied to a temporary file because the readers take a path;
+    that file is removed again once the text has been extracted.
+
+    Args:
+        uploaded_file: Object exposing ``name`` (the original file name) and
+            ``read()`` (the raw bytes), as used below.
+
+    Returns:
+        The extracted text, or the string "Unsupported file type." when the
+        extension is not pdf, docx or xlsx (compared case-insensitively).
+    """
+    suffix = uploaded_file.name.split(".")[-1]
+    readers = {"pdf": read_pdf, "docx": read_docx, "xlsx": read_excel}
+    reader = readers.get(suffix.lower())
+    if reader is None:
+        # Decide before touching the disk so no temp file is created for nothing.
+        return "Unsupported file type."
+    # delete=False: the file has to outlive this ``with`` so the reader can
+    # reopen it by path; it is removed explicitly in the ``finally`` below.
+    with tempfile.NamedTemporaryFile(delete=False, suffix="." + suffix) as tmp_file:
+        tmp_file.write(uploaded_file.read())
+        tmp_path = tmp_file.name
+    try:
+        return reader(tmp_path)
+    finally:
+        # Best-effort cleanup: a failure to delete the temp file must not
+        # mask the extracted text or the reader's own exception.
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            pass
+# Streamlit UI
+# Flow: upload a file -> extract its text -> split into chunks -> embed the
+# chunks into a FAISS index -> answer questions with a RetrievalQA chain.
+# NOTE(review): the index is rebuilt each time this script body runs; if the
+# script is re-executed per interaction, consider caching the index per file.
+st.title("📄 RAG Document QA with Faiss + LLaMA3")
+uploaded_file = st.file_uploader("Upload a PDF, Word or Excel file", type=["pdf", "docx", "xlsx"])
+if uploaded_file:
+    st.success("✅ File uploaded successfully.")
+    raw_text = process_file(uploaded_file)
+    # Split text into chunks
+    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    texts = splitter.split_text(raw_text)
+    docs = [Document(page_content=t) for t in texts]
+    if not docs:
+        # Nothing to embed (e.g. a scanned PDF or an empty workbook); building
+        # a FAISS index from zero documents would fail, so stop here instead.
+        st.warning("⚠️ No readable text was found in this file, so there is nothing to index.")
+    else:
+        # Embed and create vector store
+        with st.spinner("Indexing document..."):
+            db = FAISS.from_documents(docs, embedding_model)
+            # Retrieve the 4 most similar chunks for each question
+            retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
+            qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
+        st.success("✅ Document indexed! Ask your questions below:")
+        user_query = st.text_input("❓ Ask a question about your document")
+        if user_query:
+            with st.spinner("Generating answer..."):
+                answer = qa.run(user_query)
+                st.markdown(f"**💬 Answer:** {answer}")