HarshKalia-24 committed
Commit 8ef9756 · 0 Parent(s)

Initial deployment-ready version
.env ADDED
@@ -0,0 +1 @@
+ GOOGLE_API_KEY=AIzaSyBOIxwYU-v9UBt87oXASKVU-zw_hsWfFW8
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.11.9-slim
+
+ RUN apt-get update && \
+     apt-get install -y tesseract-ocr libtesseract-dev poppler-utils && \
+     rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ ENV PORT=8000
+
+ # Shell form so ${PORT} (overridden by the hosting platform) is expanded at runtime
+ CMD uvicorn main:app --host 0.0.0.0 --port ${PORT}
__pycache__/ingestion.cpython-311.pyc ADDED
Binary file (1.44 kB)

__pycache__/ingestion.cpython-312.pyc ADDED
Binary file (1.09 kB)

__pycache__/main.cpython-311.pyc ADDED
Binary file (2.98 kB)

__pycache__/main.cpython-312.pyc ADDED
Binary file (1.96 kB)

__pycache__/ocr.cpython-311.pyc ADDED
Binary file (3.53 kB)

__pycache__/ocr.cpython-312.pyc ADDED
Binary file (2.04 kB)

__pycache__/pipelines.cpython-311.pyc ADDED
Binary file (5.98 kB)

__pycache__/pipelines.cpython-312.pyc ADDED
Binary file (3.81 kB)
ingestion.py ADDED
@@ -0,0 +1,39 @@
+ import logging
+ from typing import List, Tuple
+
+ from ocr import guess_and_extract
+ from pipelines import add_documents
+
+ logger = logging.getLogger(__name__)
+
+ def ingest_files(session_id: str, files: List[Tuple[str, bytes]]) -> int:
+     """
+     Extract text from uploaded files and ingest per-page content.
+     files: list of (filename, file_bytes) tuples
+     """
+     all_texts = []
+     all_metas = []
+
+     for filename, file_bytes in files:
+         if not file_bytes:
+             continue
+
+         # guess_and_extract returns a list of per-page text strings
+         pages = guess_and_extract(filename, file_bytes)
+
+         for page_num, page_text in enumerate(pages, 1):
+             if not page_text.strip():
+                 continue
+
+             all_texts.append(page_text)
+             all_metas.append({
+                 "session_id": session_id,
+                 "filename": filename,
+                 "page": page_num,  # track the page for source citations
+             })
+
+     if not all_texts:
+         logger.warning(f"No valid content to ingest for session {session_id}")
+         return 0
+
+     return add_documents(all_texts, all_metas)
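For illustration, a minimal sketch of calling ingest_files directly with an in-memory plain-text "file" — the session ID and filename are invented; plain text takes the encoding-fallback path in ocr.py and is ingested as a single page:

from ingestion import ingest_files

# (filename, raw bytes) pairs, exactly what the /upload endpoint assembles
files = [("notes.txt", b"Gemini is queried with the retrieved context.")]
added = ingest_files("demo-session", files)
print(f"Chunks stored: {added}")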
main.py ADDED
@@ -0,0 +1,65 @@
+ import logging
+ from typing import List
+
+ from dotenv import load_dotenv
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+
+ from ingestion import ingest_files
+ from pipelines import query_rag
+
+ load_dotenv()
+
+ app = FastAPI(
+     title="Haystack RAG API",
+     description="PDF Summarization and Question Answering System",
+     version="1.0.0"
+ )
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # CORS configuration
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @app.post("/upload")
+ async def upload_files(
+     session_id: str = Form(..., description="Unique session ID"),
+     files: List[UploadFile] = File(..., description="Files to upload")
+ ):
+     """Upload files and ingest their contents for this session"""
+     if not files:
+         raise HTTPException(400, detail="No files uploaded")
+
+     pairs = []
+     for f in files:
+         content = await f.read()
+         # Pass raw bytes through; text extraction happens in the ingestion layer
+         pairs.append((f.filename or "unnamed", content))
+
+     added = ingest_files(session_id, pairs)
+     return {"status": "success", "documents_added": added}
+
+ @app.post("/query")
+ async def query(
+     session_id: str = Form(..., description="Session ID to query"),
+     question: str = Form(..., description="User question")
+ ):
+     """Query the RAG system"""
+     if not session_id.strip():
+         raise HTTPException(400, detail="Session ID cannot be empty")
+
+     result = query_rag(question, session_id)
+     return result
+
+ @app.get("/healthz")
+ async def healthz():
+     """Health check endpoint"""
+     return {"status": "ok", "version": app.version}
ocr.py ADDED
@@ -0,0 +1,56 @@
+ import io
+ import logging
+ from typing import List
+
+ import pdfplumber
+ import pytesseract
+ from PIL import Image
+
+ logger = logging.getLogger(__name__)
+
+ IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"}
+ PDF_EXTS = {".pdf"}
+
+ def extract_text_from_pdf(file_bytes: bytes) -> List[str]:
+     """Extract text from each page of a PDF"""
+     text_parts = []
+     try:
+         with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+             for page in pdf.pages:
+                 page_text = page.extract_text() or ""
+                 text_parts.append(page_text)
+         return text_parts
+     except Exception as e:
+         logger.error(f"PDF extraction failed: {e}")
+         return []
+
+ def extract_text_from_image(file_bytes: bytes) -> str:
+     """Extract text from an image using pytesseract"""
+     try:
+         image = Image.open(io.BytesIO(file_bytes))
+         return pytesseract.image_to_string(image).strip()
+     except Exception as e:
+         logger.error(f"Image OCR failed: {e}")
+         return ""
+
+ def guess_and_extract(filename: str, file_bytes: bytes) -> List[str]:
+     """Extract text based on file extension; always returns a list of page texts"""
+     ext = ("." + filename.lower().rsplit(".", 1)[-1]) if "." in filename else ""
+
+     try:
+         if ext in PDF_EXTS:
+             return extract_text_from_pdf(file_bytes)
+         elif ext in IMAGE_EXTS:
+             # Images have no pages, so wrap the result in a one-element list
+             return [extract_text_from_image(file_bytes)]
+         else:
+             # Fall back to plain-text decoding; latin-1 accepts any byte sequence
+             for encoding in ["utf-8", "latin-1"]:
+                 try:
+                     return [file_bytes.decode(encoding).strip()]
+                 except UnicodeDecodeError:
+                     continue
+             return []
+     except Exception as e:
+         logger.error(f"Text extraction failed for {filename}: {e}")
+         return []
pipelines.py ADDED
@@ -0,0 +1,111 @@
+ import logging
+
+ from haystack.utils import Secret
+ from haystack.dataclasses import Document
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
+ from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
+ from haystack.components.rankers import SentenceTransformersSimilarityRanker
+ from haystack.components.preprocessors import DocumentSplitter
+ from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Document store and components
+ document_store = InMemoryDocumentStore()
+ doc_embedder = SentenceTransformersDocumentEmbedder(model="BAAI/bge-large-en-v1.5")
+ text_embedder = SentenceTransformersTextEmbedder(model="BAAI/bge-large-en-v1.5")
+ retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=5)
+ reranker = SentenceTransformersSimilarityRanker(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
+
+ # Initialize generator
+ generator = GoogleAIGeminiGenerator(
+     api_key=Secret.from_env_var("GOOGLE_API_KEY"),
+     model="gemini-2.0-flash"
+ )
+
+ splitter = DocumentSplitter(
+     split_by="word",
+     split_length=400,
+     split_overlap=50
+ )
+
+ # Warm up at import time so model downloads happen at startup, not on first request
+ doc_embedder.warm_up()
+ text_embedder.warm_up()
+ reranker.warm_up()
+
+ def add_documents(texts: list[str], meta_list: list[dict]) -> int:
+     """Chunk, embed, and store documents"""
+     docs = [
+         Document(content=text, meta=meta)
+         for text, meta in zip(texts, meta_list)
+         if text.strip()
+     ]
+
+     if not docs:
+         return 0
+
+     # Split into overlapping word-based chunks
+     split_docs = splitter.run(docs)["documents"]
+
+     # Embed in batches to bound memory use on large uploads
+     embedded_docs = []
+     batch_size = 32
+     for i in range(0, len(split_docs), batch_size):
+         batch = split_docs[i:i + batch_size]
+         embedded_docs.extend(doc_embedder.run(batch)["documents"])
+
+     document_store.write_documents(embedded_docs)
+     return len(embedded_docs)
+
+ def query_rag(question: str, session_id: str):
+     """Query the RAG system with session filtering"""
+     try:
+         if not question.strip():
+             return {"answer": "Please provide a non-empty question.", "sources": []}
+
+         # Embed the question
+         query_emb = text_embedder.run(question)["embedding"]
+
+         # Retrieve documents belonging to this session only
+         filters = {"field": "meta.session_id", "operator": "==", "value": session_id}
+         retrieved_docs = retriever.run(query_embedding=query_emb, filters=filters)["documents"]
+
+         if not retrieved_docs:
+             return {
+                 "answer": "No documents found for this session. Please upload a file first.",
+                 "sources": []
+             }
+
+         # Rerank with the cross-encoder
+         reranked_docs = reranker.run(query=question, documents=retrieved_docs)["documents"]
+
+         # Generate an answer grounded in the retrieved context
+         context = "\n\n".join(doc.content for doc in reranked_docs)
+         prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
+
+         # Guard against a missing or empty replies list
+         response = generator.run(parts=[prompt])
+         replies = response.get("replies") if response else None
+         answer = replies[0] if replies else "No response generated"
+
+         # Format sources with filename, page number, and a short snippet
+         sources = [
+             {
+                 "filename": d.meta.get("filename", "Unknown"),
+                 "page": d.meta.get("page", 1),
+                 "snippet": d.content[:400] + "..." if len(d.content) > 400 else d.content
+             }
+             for d in reranked_docs
+         ]
+
+         return {"answer": answer, "sources": sources}
+
+     except Exception:
+         logger.exception("Query failed")
+         return {
+             "answer": "Sorry, I encountered an error processing your request.",
+             "sources": []
+         }
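A minimal sketch exercising the pipeline without the HTTP layer — it assumes GOOGLE_API_KEY is set in the environment and that the embedding models have been (or can be) downloaded; the texts and session ID are invented:

from pipelines import add_documents, query_rag

texts = ["Haystack retrieves documents before Gemini generates the answer."]
metas = [{"session_id": "local-test", "filename": "inline.txt", "page": 1}]
print(add_documents(texts, metas))  # number of chunks written to the store

result = query_rag("How is the answer generated?", "local-test")
print(result["answer"])
print(result["sources"])  # [{'filename': 'inline.txt', 'page': 1, 'snippet': ...}]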
render.yaml ADDED
@@ -0,0 +1,8 @@
+ services:
+   - type: web
+     name: rag-api
+     runtime: docker
+     plan: free
+     envVars:
+       - key: GOOGLE_API_KEY
+         sync: false  # set the value in the Render dashboard rather than committing it
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ sentence-transformers
+ pdfplumber
+ pytesseract
+ pillow
+ python-multipart
+ python-dotenv
+ fastapi
+ uvicorn
+ haystack-ai
+ google-ai-haystack
+ google-generativeai
+ grpcio==1.74.0
+ grpcio-tools==1.74.0
+ grpcio-status==1.74.0
+ protobuf==5.29.5
+ rpds-py==0.27.0