Commit 8ef9756 · Initial deployment-ready version
Files changed:
- .env +1 -0
- Dockerfile +16 -0
- __pycache__/ingestion.cpython-311.pyc +0 -0
- __pycache__/ingestion.cpython-312.pyc +0 -0
- __pycache__/main.cpython-311.pyc +0 -0
- __pycache__/main.cpython-312.pyc +0 -0
- __pycache__/ocr.cpython-311.pyc +0 -0
- __pycache__/ocr.cpython-312.pyc +0 -0
- __pycache__/pipelines.cpython-311.pyc +0 -0
- __pycache__/pipelines.cpython-312.pyc +0 -0
- ingestion.py +68 -0
- main.py +82 -0
- ocr.py +67 -0
- pipelines.py +136 -0
- render.yaml +11 -0
- requirements.txt +13 -0
.env
ADDED

GOOGLE_API_KEY=AIzaSyBOIxwYU-v9UBt87oXASKVU-zw_hsWfFW8
Dockerfile
ADDED

FROM python:3.11.9-slim

# Install Tesseract OCR and Poppler system packages used for OCR/PDF handling
RUN apt-get update && \
    apt-get install -y tesseract-ocr libtesseract-dev poppler-utils && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies first so this layer is cached between builds
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

ENV PORT=8000

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
__pycache__/ingestion.cpython-311.pyc
ADDED
Binary file (1.44 kB)

__pycache__/ingestion.cpython-312.pyc
ADDED
Binary file (1.09 kB)

__pycache__/main.cpython-311.pyc
ADDED
Binary file (2.98 kB)

__pycache__/main.cpython-312.pyc
ADDED
Binary file (1.96 kB)

__pycache__/ocr.cpython-311.pyc
ADDED
Binary file (3.53 kB)

__pycache__/ocr.cpython-312.pyc
ADDED
Binary file (2.04 kB)

__pycache__/pipelines.cpython-311.pyc
ADDED
Binary file (5.98 kB)

__pycache__/pipelines.cpython-312.pyc
ADDED
Binary file (3.81 kB)
ingestion.py
ADDED

from typing import Dict, List, Tuple
from pipelines import add_documents
import logging
from ocr import guess_and_extract

logger = logging.getLogger(__name__)

def ingest_files(session_id: str, files: list[Tuple[str, bytes]]):  # Change input type
    """
    Process files and ingest per-page content
    files: list of tuples (filename, file_bytes)
    """
    all_texts = []
    all_metas = []

    for filename, file_bytes in files:
        if not file_bytes:
            continue

        # Extract text (returns list of pages)
        pages = guess_and_extract(filename, file_bytes)

        for page_num, page_text in enumerate(pages, 1):
            if not page_text.strip():
                continue

            all_texts.append(page_text)
            all_metas.append({
                "session_id": session_id,
                "filename": filename,
                "page": page_num  # Track page number
            })

    if not all_texts:
        return 0

    return add_documents(all_texts, all_metas)

# def ingest_files(session_id: str, files: list[tuple[str, str]]) -> int:
#     """
#     Ingest files into document store with session metadata
#     files: list of tuples (text, filename)
#     """
#     texts = []
#     metas = []
#
#     for text, filename in files:
#         cleaned_text = text.strip()
#         if not cleaned_text:
#             logger.warning(f"Skipping empty content for {filename}")
#             continue
#         texts.append(cleaned_text)
#         metas.append({
#             "session_id": session_id,
#             "filename": filename
#         })
#
#     if not texts:
#         logger.warning(f"No valid content to ingest for session {session_id}")
#         return 0
#
#     try:
#         added = add_documents(texts, metas)
#         logger.info(f"Added {added} documents for session {session_id}")
#         return added
#     except Exception as e:
#         logger.exception(f"Ingestion failed for session {session_id}: {e}")
#         return 0
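For reference, a minimal sketch of driving ingestion directly from Python, outside the FastAPI layer. The session id, file name, and bytes below are made up for illustration; importing ingestion also imports pipelines.py, so GOOGLE_API_KEY must be set and the embedding models downloadable.

from ingestion import ingest_files

# ingest_files expects (filename, raw_bytes) pairs and returns the number
# of chunks written to the document store (hypothetical sample input).
added = ingest_files(
    "demo-session",
    [("notes.txt", b"Plain text content treated as a single page.")],
)
print("documents added:", added)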
main.py
ADDED

from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Optional
from ocr import guess_and_extract
from ingestion import ingest_files
from pipelines import query_rag
from dotenv import load_dotenv
import logging

load_dotenv()

app = FastAPI(
    title="Haystack RAG API",
    description="PDF Summarization and Question Answering System",
    version="1.0.0"
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# CORS configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.post("/upload")
async def upload_files(
    session_id: str = Form(...),
    files: List[UploadFile] = File(...)
):
    pairs = []
    for f in files:
        content = await f.read()
        # Store (filename, content) instead of text
        pairs.append((f.filename or "unnamed", content))

    added = ingest_files(session_id, pairs)
    return {"status": "success", "documents_added": added}

# @app.post("/upload")
# async def upload_files(
#     session_id: str = Form(..., description="Unique session ID"),
#     files: List[UploadFile] = File(..., description="Files to upload")
# ):
#     """Upload and process files"""
#     if not files:
#         raise HTTPException(400, detail="No files uploaded")
#
#     pairs = []
#     for f in files:
#         try:
#             content = await f.read()
#             text = guess_and_extract(f.filename or "unnamed", content)
#             pairs.append((text, f.filename or "unnamed"))
#         except Exception as e:
#             logger.error(f"Failed to process {f.filename}: {e}")
#             continue  # Continue processing other files
#
#     added = ingest_files(session_id, pairs)
#     return {"status": "success", "documents_added": added}

@app.post("/query")
async def query(
    session_id: str = Form(..., description="Session ID to query"),
    question: str = Form(..., description="User question")
):
    """Query the RAG system"""
    if not session_id.strip():
        raise HTTPException(400, detail="Session ID cannot be empty")

    result = query_rag(question, session_id)
    return result

@app.get("/healthz")
async def healthz():
    """Health check endpoint"""
    return {"status": "ok", "version": app.version}
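A minimal client sketch for these two endpoints, not part of the commit: it assumes the API is running locally on port 8000 and that the requests package is installed (requirements.txt does not include it); the file name and session id are illustrative.

import requests  # assumed installed separately

BASE = "http://localhost:8000"

# Upload a file under a session id (multipart form, field name "files")
with open("report.pdf", "rb") as f:  # hypothetical local file
    upload = requests.post(
        f"{BASE}/upload",
        data={"session_id": "demo-session"},
        files=[("files", ("report.pdf", f, "application/pdf"))],
    )
print(upload.json())  # {"status": "success", "documents_added": ...}

# Ask a question against the same session
answer = requests.post(
    f"{BASE}/query",
    data={"session_id": "demo-session", "question": "Summarize the document."},
)
print(answer.json())  # {"answer": "...", "sources": [...]}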
ocr.py
ADDED

import io
import pdfplumber
import pytesseract
from PIL import Image
import logging
from typing import List

logger = logging.getLogger(__name__)

IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"}
PDF_EXTS = {".pdf"}

def extract_text_from_pdf(file_bytes: bytes) -> List[str]:
    """Extract text per page from PDF"""
    text_parts = []
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text_parts.append(page_text)
        return text_parts
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        return []

# def extract_text_from_pdf(file_bytes: bytes) -> str:
#     """Extract text from PDF using pdfplumber"""
#     text_parts = []
#     try:
#         with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
#             for page in pdf.pages:
#                 page_text = page.extract_text() or ""
#                 text_parts.append(page_text)
#         return "\n".join(text_parts).strip()
#     except Exception as e:
#         logger.error(f"PDF extraction failed: {e}")
#         return ""

def extract_text_from_image(file_bytes: bytes) -> str:
    """Extract text from image using pytesseract"""
    try:
        image = Image.open(io.BytesIO(file_bytes))
        return pytesseract.image_to_string(image).strip()
    except Exception as e:
        logger.error(f"Image OCR failed: {e}")
        return ""

def guess_and_extract(filename: str, file_bytes: bytes) -> List[str]:
    """Extract text based on file extension with fallback.

    Always returns a list of page texts so ingest_files can enumerate pages;
    images and plain text yield a single-element list.
    """
    ext = ("." + filename.lower().rsplit(".", 1)[-1]) if "." in filename else ""

    try:
        if ext in PDF_EXTS:
            return extract_text_from_pdf(file_bytes)
        elif ext in IMAGE_EXTS:
            return [extract_text_from_image(file_bytes)]
        else:
            # Try text decoding with common encodings
            for encoding in ["utf-8", "latin-1", "iso-8859-1"]:
                try:
                    return [file_bytes.decode(encoding).strip()]
                except UnicodeDecodeError:
                    continue
            return []  # Fallback if all decodings fail
    except Exception as e:
        logger.error(f"Text extraction failed for {filename}: {e}")
        return []
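A quick sketch of how guess_and_extract behaves for different input types; the file names and bytes are illustrative only. It returns a list of page texts: one element per PDF page, and a single element for images or plain text.

from ocr import guess_and_extract

# Plain text falls through to the encoding fallback and comes back as one "page"
pages = guess_and_extract("notes.txt", b"hello from a plain text file")
print(pages)  # ['hello from a plain text file']

# A PDF is split per page (hypothetical local file)
with open("scan.pdf", "rb") as f:
    pdf_pages = guess_and_extract("scan.pdf", f.read())
print(len(pdf_pages), "page(s) extracted")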
pipelines.py
ADDED

import os
import logging
from haystack.utils import Secret
from haystack.dataclasses import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.rankers import SentenceTransformersSimilarityRanker
from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
from haystack.components.preprocessors import DocumentSplitter

# Set up logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Document store and components
document_store = InMemoryDocumentStore()
doc_embedder = SentenceTransformersDocumentEmbedder(model="BAAI/bge-large-en-v1.5")
text_embedder = SentenceTransformersTextEmbedder(model="BAAI/bge-large-en-v1.5")
retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=5)
reranker = SentenceTransformersSimilarityRanker(model="cross-encoder/ms-marco-MiniLM-L-6-v2")

# Initialize generator
generator = GoogleAIGeminiGenerator(
    api_key=Secret.from_env_var("GOOGLE_API_KEY"),
    model="gemini-2.0-flash"
)

splitter = DocumentSplitter(
    split_by="word",
    split_length=400,
    split_overlap=50
)

# Warm up components
doc_embedder.warm_up()
text_embedder.warm_up()
reranker.warm_up()

def add_documents(texts: list[str], meta_list: list[dict]):
    """Process and store documents with chunking"""
    # Create base documents
    docs = [
        Document(content=text, meta=meta)
        for text, meta in zip(texts, meta_list)
        if text.strip()
    ]

    if not docs:
        return 0

    # Split into chunks
    split_docs = splitter.run(docs)["documents"]

    # Embed in batches to keep peak memory bounded on large uploads
    embedded_docs = []
    batch_size = 32

    for i in range(0, len(split_docs), batch_size):
        batch = split_docs[i:i+batch_size]
        embedded_batch = doc_embedder.run(batch)["documents"]
        embedded_docs.extend(embedded_batch)

    document_store.write_documents(embedded_docs)
    return len(embedded_docs)

# def add_documents(texts: list[str], meta_list: list[dict] | None = None):
#     """Add documents to the document store with embeddings"""
#     docs = []
#     meta_list = meta_list or [{}] * len(texts)
#     for text, meta in zip(texts, meta_list):
#         if not text.strip():  # Check for empty/whitespace-only text
#             continue
#         docs.append(Document(content=text, meta=meta))
#
#     # Handle case where all texts are empty
#     if not docs:
#         return 0
#
#     # Embed and store documents
#     embedded_docs = doc_embedder.run(docs)["documents"]
#     document_store.write_documents(embedded_docs)
#     return len(embedded_docs)

def query_rag(question: str, session_id: str):
    """Query the RAG system with session filtering"""
    try:
        # Validate input
        if not question.strip():
            return {
                "answer": "Please provide a non-empty question.",
                "sources": []
            }

        # Embed question
        query_emb = text_embedder.run(question)["embedding"]

        # Retrieve documents with session filter
        filters = {"field": "meta.session_id", "operator": "==", "value": session_id}
        retrieved_docs = retriever.run(query_embedding=query_emb, filters=filters)["documents"]

        if not retrieved_docs:
            return {
                "answer": "No documents found for this session. Please upload a file first.",
                "sources": []
            }

        # Rerank documents
        reranked_docs = reranker.run(query=question, documents=retrieved_docs)["documents"]

        # Generate answer with context
        context = "\n\n".join([doc.content for doc in reranked_docs])
        prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"

        # Handle generator response safely (replies may be missing or empty)
        response = generator.run(parts=[prompt])
        replies = response.get("replies") if response else None
        answer = replies[0] if replies else "No response generated"

        # Format sources
        sources = [
            {
                "filename": d.meta.get("filename", "Unknown"),
                "page": d.meta.get("page", 1),  # Add page number
                "snippet": d.content[:400] + "..." if len(d.content) > 400 else d.content
            }
            for d in reranked_docs
        ]

        return {"answer": answer, "sources": sources}

    except Exception as e:
        logger.exception(f"Query failed: {e}")  # Log full exception
        return {
            "answer": "Sorry, I encountered an error processing your request.",
            "sources": []
        }
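An end-to-end sketch of using the pipeline module on its own, assuming GOOGLE_API_KEY is set and the BGE embedder and cross-encoder models can be downloaded on first use; the text and metadata are made up.

from pipelines import add_documents, query_rag

# Store one short document under a session id; add_documents returns the
# number of chunks written to the in-memory store.
added = add_documents(
    ["Haystack is a framework for building search and RAG pipelines."],
    [{"session_id": "demo-session", "filename": "notes.txt", "page": 1}],
)
print("chunks stored:", added)

# Query with the same session id; retrieval is filtered on meta.session_id.
result = query_rag("What is Haystack?", "demo-session")
print(result["answer"])
for src in result["sources"]:
    print(src["filename"], "page", src["page"])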
render.yaml
ADDED

services:
  - type: web
    name: rag-api
    runtime: docker
    buildCommand: docker build -t rag-api .
    startCommand: uvicorn main:app --host 0.0.0.0 --port 8000
    envVars:
      - key: GOOGLE_API_KEY
        value: AIzaSyBOIxwYU-v9UBt87oXASKVU-zw_hsWfFW8
    plan: free
requirements.txt
ADDED

# Haystack 2.x, its Google AI integration, and dotenv (imported by pipelines.py / main.py)
haystack-ai
google-ai-haystack
python-dotenv
sentence-transformers
pdfplumber
pytesseract
pillow
python-multipart
fastapi
uvicorn
google-generativeai
grpcio==1.74.0
grpcio-tools==1.74.0
grpcio-status==1.74.0
protobuf==5.29.5
rpds-py==0.27.0