tech5 committed on
Commit e27c97c · 1 Parent(s): 0fd8e7d

Deploy FastAPI RAG backend
Dockerfile ADDED
@@ -0,0 +1,20 @@
FROM python:3.10-slim

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# poppler-utils supports PDF processing; tesseract-ocr is required at runtime by pytesseract
RUN apt-get update && apt-get install -y \
    build-essential \
    poppler-utils \
    tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 7860

CMD ["uvicorn", "backend.api:app", "--host", "0.0.0.0", "--port", "7860"]
backend/__pycache__/api.cpython-310.pyc ADDED
Binary file (3.67 kB)
 
backend/api.py ADDED
@@ -0,0 +1,152 @@
from fastapi import FastAPI, UploadFile, File, status
import os
from fastapi.exceptions import HTTPException
import shutil
from rag.smart_chunking import get_chunked_docs
from rag.chain import store_documents, load_documents, get_rag_chain
from langchain_huggingface import HuggingFaceEmbeddings
from datetime import datetime
from fastapi.middleware.cors import CORSMiddleware
from functools import lru_cache
from pathlib import Path
from pydantic import BaseModel

@lru_cache
def get_embeddings():
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

@lru_cache
def get_vectorstore():
    return load_documents(embedding_model=get_embeddings())


BASE_DIR = Path("/app")
upload_dir = BASE_DIR / "uploads"
upload_dir.mkdir(parents=True, exist_ok=True)


app = FastAPI(
    title="Multi_Rag_System_API",
    description="API for the Multi-Modal RAG System",
    version="V1"
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Track system stats
system_stats = {
    "total_uploads": 0,
    "total_queries": 0,
    "start_time": datetime.now().isoformat()
}

# API information
@app.get("/")
async def root():
    """Root endpoint with API information"""
    return {
        "message": "Multi-Modal RAG System API",
        "version": "v1.0.0",
        "endpoints": {
            "health": "/health",
            "upload": "/upload",
            "query": "/query",
            "stats": "/stats",
            "docs": "/docs"
        }
    }


@app.get("/health")
async def health_check():
    """Health check endpoint for monitoring"""
    try:
        # Check whether the upload directory exists
        upload_dir_exists = upload_dir.exists()

        # Count uploaded files
        uploaded_files = len(list(upload_dir.glob("*.pdf"))) if upload_dir_exists else 0

        return {
            "status": "healthy",
            "timestamp": datetime.now().isoformat(),
            "upload_directory": upload_dir_exists,
            "uploaded_documents": uploaded_files,
            "embeddings_model": "sentence-transformers/all-MiniLM-L6-v2"
        }
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=f"Health check failed: {str(e)}"
        )


# Report the system stats
@app.get("/stats")
async def get_stats():
    """Get system statistics"""
    return {
        "stats": system_stats,
        "uploaded_documents": len(list(upload_dir.glob("*.pdf"))),
        "current_time": datetime.now().isoformat()
    }


# Upload a PDF and store its chunks in the vector database
@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    if not file.filename.endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    file_path = upload_dir / file.filename

    with open(file_path, "wb") as f:
        shutil.copyfileobj(file.file, f)

    chunked_docs = get_chunked_docs(file_path)

    if not chunked_docs:
        raise HTTPException(status_code=500, detail="No content extracted from PDF")

    store_documents(chunked_docs, get_embeddings())

    # Drop the cached vector store so /query sees the newly indexed document
    get_vectorstore.cache_clear()

    # Increment the upload counter
    system_stats["total_uploads"] += 1

    return {
        "message": "PDF uploaded and indexed successfully",
        "chunks_created": len(chunked_docs)
    }


class QueryRequest(BaseModel):
    input: str


# Load the vector database and answer the user's question
@app.post("/query")
async def get_response(req: QueryRequest):
    vectorstore = get_vectorstore()
    retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 3}
    )
    chain = get_rag_chain(retriever)
    response = chain.invoke(req.input)

    system_stats["total_queries"] += 1

    return {
        "question": req.input,
        "response": response.content
    }
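For reference, a minimal client sketch against the endpoints above, assuming the service is reachable on localhost:7860; the file name "report.pdf" and the question text are illustrative placeholders, not part of the repository.

# Minimal client sketch for the /upload and /query endpoints.
# Assumes the API is running locally on port 7860; "report.pdf" and the
# question string are placeholders.
import requests

BASE_URL = "http://localhost:7860"

with open("report.pdf", "rb") as f:
    r = requests.post(
        f"{BASE_URL}/upload",
        files={"file": ("report.pdf", f, "application/pdf")},
    )
    print(r.json())  # {"message": ..., "chunks_created": ...}

r = requests.post(f"{BASE_URL}/query", json={"input": "Summarize the key findings."})
print(r.json())  # {"question": ..., "response": ...}
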
docu-backend ADDED
@@ -0,0 +1 @@
Subproject commit 0fd8e7dcb74ae565454da1f0a5918464d8e729d3
rag/__pycache__/chain.cpython-310.pyc ADDED
Binary file (2.75 kB)

rag/__pycache__/combine.cpython-310.pyc ADDED
Binary file (1.3 kB)

rag/__pycache__/lang_doc.cpython-310.pyc ADDED
Binary file (453 Bytes)

rag/__pycache__/lc.cpython-310.pyc ADDED
Binary file (447 Bytes)

rag/__pycache__/rag.cpython-310.pyc ADDED
Binary file (2.75 kB)

rag/__pycache__/smark_chunking.cpython-310.pyc ADDED
Binary file (1.06 kB)

rag/__pycache__/smart_chunking.cpython-310.pyc ADDED
Binary file (1.07 kB)

rag/chain.py ADDED
@@ -0,0 +1,88 @@
from langchain_groq import ChatGroq
from .smart_chunking import get_chunked_docs
from langchain_core.documents import Document
from typing import List
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

VECTOR_PATH = "vectorstore/faiss_index"

llm = ChatGroq(model="llama-3.3-70b-versatile")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Prepend page/table/figure citations to page_content for better retrieval context
def format_docs_with_metadata(docs):
    formatted = []
    for d in docs:
        meta = d.metadata
        citation = f"(Page {meta.get('page')}"
        if meta.get("ref"):
            citation += f", {meta.get('ref')}"
        citation += ")"

        formatted.append(
            f"{citation}\n{d.page_content}"
        )
    return "\n\n".join(formatted)


# Store documents in the FAISS vector database
def store_documents(docs: List[Document], embedding_model: HuggingFaceEmbeddings):
    vectorstore = FAISS.from_documents(docs, embedding=embedding_model)
    vectorstore.save_local(VECTOR_PATH)
    return vectorstore

# Load the vector database for the retrieval process
def load_documents(embedding_model: HuggingFaceEmbeddings):
    if not os.path.exists(VECTOR_PATH):
        raise ValueError("Vector store not found. Upload a document first.")
    return FAISS.load_local(VECTOR_PATH, embeddings=embedding_model, allow_dangerous_deserialization=True)


# Prompt that constrains the LLM to the retrieved excerpts
prompt = ChatPromptTemplate.from_template(
    """You are a professional research analyst.

Answer the question strictly using the information contained in the document excerpts below.
Do not mention the phrases "provided context", "given context", or similar meta-references.
Do not include conversational language or assumptions.

Writing guidelines:
- Use a formal, neutral, and analytical tone.
- Present information directly and concisely.
- If information is missing, clearly state that it is not available in the document.
- Do not speculate or add external knowledge.

Citation rules:
- List citations in a separate section highlighted with blue.
- Each citation must include page number and table/figure/image reference if available.
- Use this format exactly:
  • Page X, Table/Figure/Image Y (if applicable)

<Document Excerpts>
{context}
</Document Excerpts>

Question:
{input}
"""
)

# Build the retrieval chain
def get_rag_chain(retriever):
    chain = (
        {
            "context": retriever | format_docs_with_metadata,
            "input": RunnablePassthrough()
        }
        | prompt
        | llm
    )
    return chain
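A minimal sketch of how these pieces compose outside the API layer, assuming GROQ_API_KEY is set in the environment and a document has already been indexed at VECTOR_PATH; the example question is only a placeholder.

# Usage sketch: load the saved FAISS index and query it directly.
# Assumes GROQ_API_KEY is set and /upload (or store_documents) has run before.
from rag.chain import load_documents, get_rag_chain, embeddings

vectorstore = load_documents(embedding_model=embeddings)
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
chain = get_rag_chain(retriever)

# Placeholder question for illustration only.
answer = chain.invoke("What does Table 1 on page 2 report?")
print(answer.content)
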
rag/combine.py ADDED
@@ -0,0 +1,74 @@
import pdfplumber
import fitz  # PyMuPDF
import camelot
import pytesseract
from PIL import Image
import io


# Extract raw text, tables, and OCR'd image content from a PDF
def raw_document_text(pdf_path: str):
    documents = []

    # Open PDF
    with pdfplumber.open(pdf_path) as pdf:
        doc_fitz = fitz.open(pdf_path)

        for page_index, page in enumerate(pdf.pages, start=1):

            # TEXT
            text = page.extract_text()
            if text:
                documents.append({
                    "content": text,
                    "metadata": {
                        "page": page_index,
                        "type": "text"
                    }
                })

            # TABLES (camelot can fail on pages without detectable tables)
            try:
                tables = camelot.read_pdf(
                    pdf_path,
                    pages=str(page_index),
                    flavor="stream"
                )

                for t_idx, table in enumerate(tables):
                    table_text = table.df.to_string(index=False)
                    documents.append({
                        "content": table_text,
                        "metadata": {
                            "page": page_index,
                            "type": "table",
                            "ref": f"Table {t_idx + 1}"
                        }
                    })
            except Exception:
                pass

            # IMAGES + OCR
            page_fitz = doc_fitz[page_index - 1]
            images = page_fitz.get_images(full=True)

            for img_idx, img in enumerate(images):
                xref = img[0]
                base_image = doc_fitz.extract_image(xref)
                image_bytes = base_image["image"]

                image = Image.open(io.BytesIO(image_bytes))
                ocr_text = pytesseract.image_to_string(image)

                if ocr_text.strip():
                    documents.append({
                        "content": ocr_text,
                        "metadata": {
                            "page": page_index,
                            "type": "image",
                            "ref": f"Image {img_idx + 1}"
                        }
                    })

        doc_fitz.close()

    return documents
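A small inspection sketch for the extractor above, assuming a PDF exists at the placeholder path "sample.pdf"; it only summarizes how many items of each type were extracted.

# Inspection sketch: run the extractor and count items per content type.
# "sample.pdf" is a placeholder path, not a file in this repository.
from collections import Counter
from rag.combine import raw_document_text

docs = raw_document_text("sample.pdf")
print(Counter(d["metadata"]["type"] for d in docs))  # counts of text / table / image items
print(docs[0]["metadata"] if docs else "no content extracted")
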
rag/lang_doc.py ADDED
@@ -0,0 +1,14 @@
from langchain_core.documents import Document


# Convert raw extraction dicts into LangChain Documents
def get_langchain_docs(docs: list):
    lc_docs = []
    for doc in docs:
        document = Document(
            page_content=doc['content'],
            metadata=doc['metadata']
        )
        lc_docs.append(document)
    return lc_docs
rag/smart_chunking.py ADDED
@@ -0,0 +1,62 @@
from langchain_core.documents import Document
from .combine import raw_document_text
from .lang_doc import get_langchain_docs

# Split a text Document into paragraph-aligned chunks of at most max_chars characters
def smart_text_chunker(doc, max_chars=500):
    chunks = []
    buffer = ""

    paragraphs = doc.page_content.split("\n\n")

    for para in paragraphs:
        if len(buffer) + len(para) <= max_chars:
            buffer += para + "\n\n"
        else:
            chunks.append(
                Document(
                    page_content=buffer.strip(),
                    metadata=doc.metadata
                )
            )
            buffer = para + "\n\n"

    if buffer.strip():
        chunks.append(
            Document(
                page_content=buffer.strip(),
                metadata=doc.metadata
            )
        )

    return chunks


# Pipeline: raw documents -> LangChain Documents -> smart-chunked Documents
def get_chunked_docs(pdf: str):
    chunked_docs = []
    docs = raw_document_text(pdf)
    documents = get_langchain_docs(docs)
    for doc in documents:
        doc_type = doc.metadata["type"]
        if doc_type == "text":
            chunked_docs.extend(smart_text_chunker(doc))
        elif doc_type == "table":
            chunked_docs.append(doc)
        elif doc_type == "image":
            chunked_docs.append(doc)
    return chunked_docs
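A quick, hedged sanity check of smart_text_chunker, using a made-up three-paragraph document and an artificially small max_chars so the paragraph split is visible; none of this text comes from a real PDF.

# Illustrative check of the chunker with fabricated placeholder content.
from langchain_core.documents import Document
from rag.smart_chunking import smart_text_chunker

doc = Document(
    page_content="First paragraph.\n\nSecond paragraph.\n\nThird paragraph.",
    metadata={"page": 1, "type": "text"},
)

# With max_chars=40 the first two paragraphs fit in one chunk and the third
# spills into a second chunk; both chunks keep the original page metadata.
for chunk in smart_text_chunker(doc, max_chars=40):
    print(repr(chunk.page_content), chunk.metadata)
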
rag/t.py ADDED
@@ -0,0 +1,124 @@
from langchain_core.documents import Document
from collections import defaultdict
import re
import pdfplumber
import fitz  # PyMuPDF
import camelot
import pytesseract
from PIL import Image
import io


# -------------------------------
# STEP 1: EXTRACT RAW CONTENT
# -------------------------------
def raw_document_text(pdf_path: str):
    documents = []

    with pdfplumber.open(pdf_path) as pdf:
        doc_fitz = fitz.open(pdf_path)

        for page_index, page in enumerate(pdf.pages, start=1):

            # -------- TEXT --------
            text = page.extract_text()
            if text:
                documents.append({
                    "content": text,
                    "metadata": {
                        "page": page_index,
                        "type": "text"
                    }
                })

            # -------- TABLES --------
            try:
                tables = camelot.read_pdf(
                    pdf_path,
                    pages=str(page_index),
                    flavor="stream"
                )

                for t_idx, table in enumerate(tables):
                    table_text = table.df.to_string(index=False)
                    documents.append({
                        "content": table_text,
                        "metadata": {
                            "page": page_index,
                            "type": "table",
                            "ref": f"Table {t_idx + 1}"
                        }
                    })
            except Exception:
                pass

            # -------- IMAGES + OCR --------
            page_fitz = doc_fitz[page_index - 1]
            images = page_fitz.get_images(full=True)

            for img_idx, img in enumerate(images):
                xref = img[0]
                base_image = doc_fitz.extract_image(xref)
                image_bytes = base_image["image"]

                image = Image.open(io.BytesIO(image_bytes))
                ocr_text = pytesseract.image_to_string(image)

                if ocr_text.strip():
                    documents.append({
                        "content": ocr_text,
                        "metadata": {
                            "page": page_index,
                            "type": "image",
                            "ref": f"Image {img_idx + 1}"
                        }
                    })

    return documents


# -------------------------------
# STEP 2: RAW → LANGCHAIN DOCS
# -------------------------------
def to_langchain_documents(raw_docs):
    lc_docs = []
    for doc in raw_docs:
        lc_docs.append(
            Document(
                page_content=doc["content"],
                metadata=doc["metadata"]
            )
        )
    return lc_docs


# -------------------------------
# STEP 3: BUILD INVERTED INDEX
# -------------------------------
def build_inverted_index(lc_docs):
    index = defaultdict(set)

    for doc_id, doc in enumerate(lc_docs):
        words = re.findall(r"\b\w+\b", doc.page_content.lower())

        for word in words:
            index[word].add(doc_id)

    return index


# -------------------------------
# STEP 4: RUN PIPELINE
# -------------------------------
if __name__ == "__main__":
    pdf_path = "Report.pdf"  # <-- change path

    raw_docs = raw_document_text(pdf_path)
    lc_docs = to_langchain_documents(raw_docs)
    index = build_inverted_index(lc_docs)

    print(f"Total LangChain Documents: {len(lc_docs)}")
    print(f"Total Indexed Words: {len(index)}")

    # Preview index
    print(dict(list(index.items())[:20]))
requirements.txt ADDED
Binary file (648 Bytes)