Deploy FastAPI RAG backend
- Dockerfile +20 -0
- backend/__pycache__/api.cpython-310.pyc +0 -0
- backend/api.py +152 -0
- docu-backend +1 -0
- rag/__pycache__/chain.cpython-310.pyc +0 -0
- rag/__pycache__/combine.cpython-310.pyc +0 -0
- rag/__pycache__/lang_doc.cpython-310.pyc +0 -0
- rag/__pycache__/lc.cpython-310.pyc +0 -0
- rag/__pycache__/rag.cpython-310.pyc +0 -0
- rag/__pycache__/smark_chunking.cpython-310.pyc +0 -0
- rag/__pycache__/smart_chunking.cpython-310.pyc +0 -0
- rag/chain.py +88 -0
- rag/combine.py +74 -0
- rag/lang_doc.py +14 -0
- rag/smart_chunking.py +62 -0
- rag/t.py +124 -0
- requirements.txt +0 -0
Dockerfile
ADDED
@@ -0,0 +1,20 @@
+FROM python:3.10-slim
+
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+EXPOSE 7860
+
+CMD ["uvicorn", "backend.api:app", "--host", "0.0.0.0", "--port", "7860"]
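
One caveat worth flagging: the apt-get layer installs poppler-utils but not tesseract-ocr, while rag/combine.py and rag/t.py call pytesseract, which shells out to the tesseract binary, so the OCR path would likely fail at runtime in this image unless tesseract-ocr is added. A minimal startup check, sketched below as a hypothetical sanity_check.py (not part of this commit), would surface that early:

import shutil

def check_system_deps():
    """Return the required binaries that are missing from PATH."""
    # poppler-utils provides pdftoppm; pytesseract needs the tesseract binary
    required = ["pdftoppm", "tesseract"]
    return [name for name in required if shutil.which(name) is None]

if __name__ == "__main__":
    missing = check_system_deps()
    if missing:
        raise SystemExit(f"Missing system dependencies: {missing}")
    print("All system dependencies present.")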
backend/__pycache__/api.cpython-310.pyc
ADDED
Binary file (3.67 kB).
backend/api.py
ADDED
@@ -0,0 +1,152 @@
+from fastapi import FastAPI, UploadFile, File, status
+import os
+from fastapi.exceptions import HTTPException
+import shutil
+from rag.smart_chunking import get_chunked_docs
+from rag.chain import store_documents, load_documents, get_rag_chain
+from langchain_huggingface import HuggingFaceEmbeddings
+from datetime import datetime
+from fastapi.middleware.cors import CORSMiddleware
+from functools import lru_cache
+from pathlib import Path
+
+@lru_cache
+def get_embeddings():
+    return HuggingFaceEmbeddings(
+        model_name="sentence-transformers/all-MiniLM-L6-v2"
+    )
+
+@lru_cache  # caches the store for the process lifetime; see the note below this listing
+def get_vectorstore():
+    return load_documents(embedding_model=get_embeddings())
+
+
+BASE_DIR = Path("/app")
+upload_dir = BASE_DIR / "uploads"
+upload_dir.mkdir(parents=True, exist_ok=True)
+
+
+app = FastAPI(
+    title="Multi_Rag_System_API",
+    description="API for the Multi-Modal RAG System",
+    version="V1"
+)
+
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Track system stats
+system_stats = {
+    "total_uploads": 0,
+    "total_queries": 0,
+    "start_time": datetime.now().isoformat()
+}
+
+# Info about the API
+@app.get("/")
+async def root():
+    """Root endpoint with API information"""
+    return {
+        "message": "Multi-Modal RAG System API",
+        "version": "v1.0.0",
+        "endpoints": {
+            "health": "/health",
+            "upload": "/upload",
+            "query": "/query",
+            "stats": "/stats",
+            "docs": "/docs"
+        }
+    }
+
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint for monitoring"""
+    try:
+        # Check whether the upload directory exists
+        upload_dir_exists = upload_dir.exists()
+
+        # Count uploaded files
+        uploaded_files = len(list(upload_dir.glob("*.pdf"))) if upload_dir_exists else 0
+
+        return {
+            "status": "healthy",
+            "timestamp": datetime.now().isoformat(),
+            "upload_directory": upload_dir_exists,
+            "uploaded_documents": uploaded_files,
+            "embeddings_model": "sentence-transformers/all-MiniLM-L6-v2"
+        }
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail=f"Health check failed: {str(e)}"
+        )
+
+
+# Report system stats
+@app.get("/stats")
+async def get_stats():
+    """Get system statistics"""
+    return {
+        "stats": system_stats,
+        "uploaded_documents": len(list(upload_dir.glob("*.pdf"))),
+        "current_time": datetime.now().isoformat()
+    }
+
+
+# Upload a PDF and index it into the vector database
+@app.post("/upload")
+async def upload_file(file: UploadFile = File(...)):
+    if not file.filename.endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF files are supported")
+
+    file_path = upload_dir / file.filename
+
+    with open(file_path, "wb") as f:
+        shutil.copyfileobj(file.file, f)
+
+    chunked_docs = get_chunked_docs(file_path)
+
+    if not chunked_docs:
+        raise HTTPException(status_code=500, detail="No content extracted from PDF")
+
+    store_documents(chunked_docs, get_embeddings())
+
+    # Increment the upload counter
+    system_stats["total_uploads"] += 1
+
+    return {
+        "message": "PDF uploaded and indexed successfully",
+        "chunks_created": len(chunked_docs)
+    }
+
+from pydantic import BaseModel
+
+class QueryRequest(BaseModel):
+    input: str
+
+
+# Load the vector database and answer the user's question
+@app.post("/query")
+async def get_response(req: QueryRequest):
+    vectorstore = get_vectorstore()
+    retriever = vectorstore.as_retriever(
+        search_type="mmr",
+        search_kwargs={"k": 3}
+    )
+    chain = get_rag_chain(retriever)
+    response = chain.invoke(req.input)
+
+    system_stats["total_queries"] += 1
+
+    return {
+        "question": req.input,
+        "response": response.content
+    }
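
For reference, a minimal client for this API, assuming the server is reachable at http://localhost:7860; the file name and question are hypothetical, and this script is not part of the commit:

import requests

BASE_URL = "http://localhost:7860"

# Upload and index a PDF
with open("Report.pdf", "rb") as f:
    r = requests.post(f"{BASE_URL}/upload",
                      files={"file": ("Report.pdf", f, "application/pdf")})
r.raise_for_status()
print(r.json())  # {"message": "...", "chunks_created": N}

# Ask a question over the indexed document
r = requests.post(f"{BASE_URL}/query", json={"input": "What are the key findings?"})
r.raise_for_status()
print(r.json()["response"])

One behavioral note: because get_vectorstore is wrapped in lru_cache, the FAISS index is loaded at most once per process, so a later /upload that rewrites the index is not visible to /query until the process restarts. One possible fix would be to call get_vectorstore.cache_clear() in the upload handler after store_documents.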
docu-backend
ADDED
@@ -0,0 +1 @@
+Subproject commit 0fd8e7dcb74ae565454da1f0a5918464d8e729d3
rag/__pycache__/chain.cpython-310.pyc
ADDED
Binary file (2.75 kB).

rag/__pycache__/combine.cpython-310.pyc
ADDED
Binary file (1.3 kB).

rag/__pycache__/lang_doc.cpython-310.pyc
ADDED
Binary file (453 Bytes).

rag/__pycache__/lc.cpython-310.pyc
ADDED
Binary file (447 Bytes).

rag/__pycache__/rag.cpython-310.pyc
ADDED
Binary file (2.75 kB).

rag/__pycache__/smark_chunking.cpython-310.pyc
ADDED
Binary file (1.06 kB).

rag/__pycache__/smart_chunking.cpython-310.pyc
ADDED
Binary file (1.07 kB).
rag/chain.py
ADDED
@@ -0,0 +1,88 @@
+from langchain_groq import ChatGroq
+from .smart_chunking import get_chunked_docs
+from langchain_core.documents import Document
+from typing import List
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+import os
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.prompts import ChatPromptTemplate
+
+VECTOR_PATH = "vectorstore/faiss_index"
+
+llm = ChatGroq(model="llama-3.3-70b-versatile")
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+
+# Prepend page/table/image citations to page_content for better-grounded retrieval
+def format_docs_with_metadata(docs):
+    formatted = []
+    for d in docs:
+        meta = d.metadata
+        citation = f"(Page {meta.get('page')}"
+        if meta.get("ref"):
+            citation += f", {meta.get('ref')}"
+        citation += ")"
+
+        formatted.append(
+            f"{citation}\n{d.page_content}"
+        )
+    return "\n\n".join(formatted)
+
+
+# Store documents in the FAISS vector database
+def store_documents(docs: List[Document], embedding_model: HuggingFaceEmbeddings):
+    vectorstore = FAISS.from_documents(docs, embedding=embedding_model)
+    vectorstore.save_local(VECTOR_PATH)
+    return vectorstore
+
+# Load the vector database for the retrieval process
+def load_documents(embedding_model: HuggingFaceEmbeddings):
+    if not os.path.exists(VECTOR_PATH):
+        raise ValueError("Vectorstore not found. Upload your document first.")
+    return FAISS.load_local(VECTOR_PATH, embeddings=embedding_model, allow_dangerous_deserialization=True)
+
+
+# Prompt that keeps the LLM grounded in the retrieved excerpts
+prompt = ChatPromptTemplate.from_template(
+    """You are a professional research analyst.
+
+Answer the question strictly using the information contained in the document excerpts below.
+Do not mention the phrases "provided context", "given context", or similar meta-references.
+Do not include conversational language or assumptions.
+
+Writing guidelines:
+- Use a formal, neutral, and analytical tone.
+- Present information directly and concisely.
+- If information is missing, clearly state that it is not available in the document.
+- Do not speculate or add external knowledge.
+
+Citation rules:
+- List citations in a separate section highlighted with blue.
+- Each citation must include page number and table/figure/image reference if available.
+- Use this format exactly:
+  • Page X, Table/Figure/Image Y (if applicable)
+
+<Document Excerpts>
+{context}
+</Document Excerpts>
+
+Question:
+{input}
+"""
+)
+
+# Build the retrieval chain
+def get_rag_chain(retriever):
+    chain = (
+        {
+            "context": retriever | format_docs_with_metadata,
+            "input": RunnablePassthrough()
+        }
+        | prompt
+        | llm
+    )
+    return chain
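
The dict at the head of get_rag_chain fans the incoming question out: the retriever branch fetches documents and pipes them through format_docs_with_metadata into {context}, while RunnablePassthrough forwards the raw question into {input}. A minimal end-to-end sketch, assuming GROQ_API_KEY is set in the environment and using two toy documents (not part of this commit):

from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from rag.chain import store_documents, load_documents, get_rag_chain

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

docs = [
    Document(page_content="Revenue grew 12% in 2023.",
             metadata={"page": 1, "type": "text"}),
    Document(page_content="Headcount: 40, 52, 61",
             metadata={"page": 2, "type": "table", "ref": "Table 1"}),
]

store_documents(docs, embeddings)                    # writes vectorstore/faiss_index
vectorstore = load_documents(embedding_model=embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

chain = get_rag_chain(retriever)
answer = chain.invoke("How much did revenue grow in 2023?")
print(answer.content)  # the chain ends at the LLM, so it returns an AIMessage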
rag/combine.py
ADDED
@@ -0,0 +1,74 @@
+import pdfplumber
+import fitz  # PyMuPDF
+import camelot
+import pytesseract
+from PIL import Image
+import io
+
+
+# Extract raw page text, tables, and OCR'd image content from a PDF
+def raw_document_text(pdf_path: str):
+    documents = []
+
+    # Open PDF
+    with pdfplumber.open(pdf_path) as pdf:
+        doc_fitz = fitz.open(pdf_path)
+
+        for page_index, page in enumerate(pdf.pages, start=1):
+
+            # TEXT
+            text = page.extract_text()
+            if text:
+                documents.append({
+                    "content": text,
+                    "metadata": {
+                        "page": page_index,
+                        "type": "text"
+                    }
+                })
+
+            # TABLES (may raise on pages without detectable text; rag/t.py wraps this in try/except)
+            tables = camelot.read_pdf(
+                pdf_path,
+                pages=str(page_index),
+                flavor="stream"
+            )
+
+            for t_idx, table in enumerate(tables):
+                table_text = table.df.to_string(index=False)
+                documents.append({
+                    "content": table_text,
+                    "metadata": {
+                        "page": page_index,
+                        "type": "table",
+                        "ref": f"Table {t_idx + 1}"
+                    }
+                })
+
+            # IMAGES + OCR (requires the tesseract binary at runtime)
+            page_fitz = doc_fitz[page_index - 1]
+            images = page_fitz.get_images(full=True)
+
+            for img_idx, img in enumerate(images):
+                xref = img[0]
+                base_image = doc_fitz.extract_image(xref)
+                image_bytes = base_image["image"]
+
+                image = Image.open(io.BytesIO(image_bytes))
+                ocr_text = pytesseract.image_to_string(image)
+
+                if ocr_text.strip():
+                    documents.append({
+                        "content": ocr_text,
+                        "metadata": {
+                            "page": page_index,
+                            "type": "image",
+                            "ref": f"Image {img_idx + 1}"
+                        }
+                    })
+
+    return documents
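
A quick way to see what this extraction pass produces, sketched with a hypothetical PDF path (not part of this commit); each record is a plain dict carrying the page number, a content type, and an optional table/image reference:

from collections import Counter
from rag.combine import raw_document_text

docs = raw_document_text("Report.pdf")
print(Counter(d["metadata"]["type"] for d in docs))  # e.g. Counter({'text': 12, 'table': 3, 'image': 2})

for d in docs[:3]:
    meta = d["metadata"]
    print(meta["page"], meta["type"], meta.get("ref", ""),
          "->", d["content"][:60].replace("\n", " "))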
rag/lang_doc.py
ADDED
@@ -0,0 +1,14 @@
+from langchain_core.documents import Document
+
+
+# Convert raw document dicts into LangChain Documents
+def get_langchain_docs(docs: list):
+    lc_docs = []
+    for doc in docs:
+        document = Document(
+            page_content=doc['content'],
+            metadata=doc['metadata']
+        )
+        lc_docs.append(document)
+    return lc_docs
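
A one-record sketch of the conversion (not part of this commit): each raw dict becomes a LangChain Document, with the metadata preserved for later citations:

from rag.lang_doc import get_langchain_docs

raw = [{"content": "Revenue grew 12%.", "metadata": {"page": 1, "type": "text"}}]
lc = get_langchain_docs(raw)
print(lc[0].page_content, lc[0].metadata)  # Revenue grew 12%. {'page': 1, 'type': 'text'}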
rag/smart_chunking.py
ADDED
@@ -0,0 +1,62 @@
+from langchain_core.documents import Document
+from .combine import raw_document_text
+from .lang_doc import get_langchain_docs
+
+# Paragraph-aware chunker that keeps chunks under max_chars
+def smart_text_chunker(doc, max_chars=500):
+    chunks = []
+    buffer = ""
+
+    paragraphs = doc.page_content.split("\n\n")
+
+    for para in paragraphs:
+        if len(buffer) + len(para) <= max_chars:
+            buffer += para + "\n\n"
+        else:
+            chunks.append(
+                Document(
+                    page_content=buffer.strip(),
+                    metadata=doc.metadata
+                )
+            )
+            buffer = para + "\n\n"  # a paragraph longer than max_chars becomes its own oversized chunk
+
+    if buffer.strip():
+        chunks.append(
+            Document(
+                page_content=buffer.strip(),
+                metadata=doc.metadata
+            )
+        )
+
+    return chunks
+
+
+# Pipeline: raw documents -> LangChain Documents -> smart-chunked documents
+def get_chunked_docs(pdf: str):
+    chunked_docs = []
+    docs = raw_document_text(pdf)
+    documents = get_langchain_docs(docs)
+    for doc in documents:
+        doc_type = doc.metadata["type"]
+        if doc_type == "text":
+            chunked_docs.extend(smart_text_chunker(doc))
+        elif doc_type == "table":
+            chunked_docs.append(doc)
+        elif doc_type == "image":
+            chunked_docs.append(doc)
+    return chunked_docs
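
A small demonstration of the chunker (not part of this commit): paragraphs are packed whole into chunks rather than cut at a fixed character offset, and every chunk inherits the page metadata so citations stay correct:

from langchain_core.documents import Document
from rag.smart_chunking import smart_text_chunker

page = Document(
    page_content=("First paragraph. " * 10).strip() + "\n\n" + ("Second paragraph. " * 10).strip(),
    metadata={"page": 1, "type": "text"},
)

for i, chunk in enumerate(smart_text_chunker(page, max_chars=200), start=1):
    print(i, len(chunk.page_content), chunk.metadata)
# two chunks, one per paragraph, both tagged {'page': 1, 'type': 'text'}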
rag/t.py
ADDED
@@ -0,0 +1,124 @@
+from langchain_core.documents import Document
+from collections import defaultdict
+import re
+import pdfplumber
+import fitz  # PyMuPDF
+import camelot
+import pytesseract
+from PIL import Image
+import io
+
+
+# -------------------------------
+# STEP 1: EXTRACT RAW CONTENT
+# -------------------------------
+def raw_document_text(pdf_path: str):
+    documents = []
+
+    with pdfplumber.open(pdf_path) as pdf:
+        doc_fitz = fitz.open(pdf_path)
+
+        for page_index, page in enumerate(pdf.pages, start=1):
+
+            # -------- TEXT --------
+            text = page.extract_text()
+            if text:
+                documents.append({
+                    "content": text,
+                    "metadata": {
+                        "page": page_index,
+                        "type": "text"
+                    }
+                })
+
+            # -------- TABLES --------
+            try:
+                tables = camelot.read_pdf(
+                    pdf_path,
+                    pages=str(page_index),
+                    flavor="stream"
+                )
+
+                for t_idx, table in enumerate(tables):
+                    table_text = table.df.to_string(index=False)
+                    documents.append({
+                        "content": table_text,
+                        "metadata": {
+                            "page": page_index,
+                            "type": "table",
+                            "ref": f"Table {t_idx + 1}"
+                        }
+                    })
+            except Exception:
+                pass
+
+            # -------- IMAGES + OCR --------
+            page_fitz = doc_fitz[page_index - 1]
+            images = page_fitz.get_images(full=True)
+
+            for img_idx, img in enumerate(images):
+                xref = img[0]
+                base_image = doc_fitz.extract_image(xref)
+                image_bytes = base_image["image"]
+
+                image = Image.open(io.BytesIO(image_bytes))
+                ocr_text = pytesseract.image_to_string(image)
+
+                if ocr_text.strip():
+                    documents.append({
+                        "content": ocr_text,
+                        "metadata": {
+                            "page": page_index,
+                            "type": "image",
+                            "ref": f"Image {img_idx + 1}"
+                        }
+                    })
+
+    return documents
+
+
+# -------------------------------
+# STEP 2: RAW → LANGCHAIN DOCS
+# -------------------------------
+def to_langchain_documents(raw_docs):
+    lc_docs = []
+    for doc in raw_docs:
+        lc_docs.append(
+            Document(
+                page_content=doc["content"],
+                metadata=doc["metadata"]
+            )
+        )
+    return lc_docs
+
+
+# -------------------------------
+# STEP 3: BUILD INVERTED INDEX
+# -------------------------------
+def build_inverted_index(lc_docs):
+    index = defaultdict(set)
+
+    for doc_id, doc in enumerate(lc_docs):
+        words = re.findall(r"\b\w+\b", doc.page_content.lower())
+
+        for word in words:
+            index[word].add(doc_id)
+
+    return index
+
+
+# -------------------------------
+# STEP 4: RUN PIPELINE
+# -------------------------------
+if __name__ == "__main__":
+    pdf_path = "Report.pdf"  # <-- change path
+
+    raw_docs = raw_document_text(pdf_path)
+    lc_docs = to_langchain_documents(raw_docs)
+    index = build_inverted_index(lc_docs)
+
+    print(f"Total LangChain Documents: {len(lc_docs)}")
+    print(f"Total Indexed Words: {len(index)}")
+
+    # Preview index
+    print(dict(list(index.items())[:20]))
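
The inverted index maps each lowercased word to the set of document ids containing it, so a conjunctive search is just a set intersection. A small query helper over the structures built in t.py (a sketch, not part of this commit):

import re

def search(index, lc_docs, query):
    """Return documents containing every term of the query (AND semantics)."""
    terms = re.findall(r"\b\w+\b", query.lower())
    if not terms:
        return []
    # Intersect the posting sets: doc ids that contain all query terms
    ids = set.intersection(*(index.get(t, set()) for t in terms))
    return [lc_docs[i] for i in sorted(ids)]

# after running the pipeline in t.py:
# hits = search(index, lc_docs, "revenue growth")
# print(len(hits), "matching documents")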
requirements.txt
ADDED
Binary file (648 Bytes).