HarshKalia-24 committed
Commit 8ef9756 · 0 Parent(s)

Initial deployment-ready version
.env ADDED
@@ -0,0 +1 @@
+ GOOGLE_API_KEY=AIzaSyBOIxwYU-v9UBt87oXASKVU-zw_hsWfFW8
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.11.9-slim
+
+ RUN apt-get update && \
+     apt-get install -y tesseract-ocr libtesseract-dev poppler-utils && \
+     rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ ENV PORT=8000
+
+ # Shell form so ${PORT} (overridden by the hosting platform) is expanded at runtime
+ CMD uvicorn main:app --host 0.0.0.0 --port ${PORT}
__pycache__/ingestion.cpython-311.pyc ADDED
Binary file (1.44 kB)

__pycache__/ingestion.cpython-312.pyc ADDED
Binary file (1.09 kB)

__pycache__/main.cpython-311.pyc ADDED
Binary file (2.98 kB)

__pycache__/main.cpython-312.pyc ADDED
Binary file (1.96 kB)

__pycache__/ocr.cpython-311.pyc ADDED
Binary file (3.53 kB)

__pycache__/ocr.cpython-312.pyc ADDED
Binary file (2.04 kB)

__pycache__/pipelines.cpython-311.pyc ADDED
Binary file (5.98 kB)

__pycache__/pipelines.cpython-312.pyc ADDED
Binary file (3.81 kB)
ingestion.py ADDED
@@ -0,0 +1,39 @@
+ import logging
+ from typing import List, Tuple
+
+ from ocr import guess_and_extract
+ from pipelines import add_documents
+
+ logger = logging.getLogger(__name__)
+
+ def ingest_files(session_id: str, files: List[Tuple[str, bytes]]) -> int:
+     """
+     Extract text from uploaded files and ingest per-page content.
+     files: list of (filename, file_bytes) tuples
+     """
+     all_texts = []
+     all_metas = []
+
+     for filename, file_bytes in files:
+         if not file_bytes:
+             continue
+
+         # guess_and_extract returns a list of per-page text strings
+         pages = guess_and_extract(filename, file_bytes)
+
+         for page_num, page_text in enumerate(pages, 1):
+             if not page_text.strip():
+                 continue
+
+             all_texts.append(page_text)
+             all_metas.append({
+                 "session_id": session_id,
+                 "filename": filename,
+                 "page": page_num,  # track the page for source citations
+             })
+
+     if not all_texts:
+         logger.warning(f"No valid content to ingest for session {session_id}")
+         return 0
+
+     return add_documents(all_texts, all_metas)
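For illustration, a minimal sketch of calling ingest_files directly with an in-memory plain-text "file" — the session ID and filename are invented; plain text takes the encoding-fallback path in ocr.py and is ingested as a single page:

from ingestion import ingest_files

# (filename, raw bytes) pairs, exactly what the /upload endpoint assembles
files = [("notes.txt", b"Gemini is queried with the retrieved context.")]
added = ingest_files("demo-session", files)
print(f"Chunks stored: {added}")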
main.py ADDED
@@ -0,0 +1,65 @@
+ import logging
+ from typing import List
+
+ from dotenv import load_dotenv
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+
+ from ingestion import ingest_files
+ from pipelines import query_rag
+
+ load_dotenv()
+
+ app = FastAPI(
+     title="Haystack RAG API",
+     description="PDF Summarization and Question Answering System",
+     version="1.0.0"
+ )
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # CORS configuration
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @app.post("/upload")
+ async def upload_files(
+     session_id: str = Form(..., description="Unique session ID"),
+     files: List[UploadFile] = File(..., description="Files to upload")
+ ):
+     """Upload files and ingest their contents for this session"""
+     if not files:
+         raise HTTPException(400, detail="No files uploaded")
+
+     pairs = []
+     for f in files:
+         content = await f.read()
+         # Pass raw bytes through; text extraction happens in the ingestion layer
+         pairs.append((f.filename or "unnamed", content))
+
+     added = ingest_files(session_id, pairs)
+     return {"status": "success", "documents_added": added}
+
+ @app.post("/query")
+ async def query(
+     session_id: str = Form(..., description="Session ID to query"),
+     question: str = Form(..., description="User question")
+ ):
+     """Query the RAG system"""
+     if not session_id.strip():
+         raise HTTPException(400, detail="Session ID cannot be empty")
+
+     result = query_rag(question, session_id)
+     return result
+
+ @app.get("/healthz")
+ async def healthz():
+     """Health check endpoint"""
+     return {"status": "ok", "version": app.version}
ocr.py ADDED
@@ -0,0 +1,56 @@
+ import io
+ import logging
+ from typing import List
+
+ import pdfplumber
+ import pytesseract
+ from PIL import Image
+
+ logger = logging.getLogger(__name__)
+
+ IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"}
+ PDF_EXTS = {".pdf"}
+
+ def extract_text_from_pdf(file_bytes: bytes) -> List[str]:
+     """Extract text from each page of a PDF"""
+     text_parts = []
+     try:
+         with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+             for page in pdf.pages:
+                 page_text = page.extract_text() or ""
+                 text_parts.append(page_text)
+         return text_parts
+     except Exception as e:
+         logger.error(f"PDF extraction failed: {e}")
+         return []
+
+ def extract_text_from_image(file_bytes: bytes) -> str:
+     """Extract text from an image using pytesseract"""
+     try:
+         image = Image.open(io.BytesIO(file_bytes))
+         return pytesseract.image_to_string(image).strip()
+     except Exception as e:
+         logger.error(f"Image OCR failed: {e}")
+         return ""
+
+ def guess_and_extract(filename: str, file_bytes: bytes) -> List[str]:
+     """Extract text based on file extension; always returns a list of page texts"""
+     ext = ("." + filename.lower().rsplit(".", 1)[-1]) if "." in filename else ""
+
+     try:
+         if ext in PDF_EXTS:
+             return extract_text_from_pdf(file_bytes)
+         elif ext in IMAGE_EXTS:
+             # Images have no pages, so wrap the result in a one-element list
+             return [extract_text_from_image(file_bytes)]
+         else:
+             # Fall back to plain-text decoding; latin-1 accepts any byte sequence
+             for encoding in ["utf-8", "latin-1"]:
+                 try:
+                     return [file_bytes.decode(encoding).strip()]
+                 except UnicodeDecodeError:
+                     continue
+             return []
+     except Exception as e:
+         logger.error(f"Text extraction failed for {filename}: {e}")
+         return []
pipelines.py ADDED
@@ -0,0 +1,111 @@
+ import logging
+
+ from haystack.utils import Secret
+ from haystack.dataclasses import Document
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
+ from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
+ from haystack.components.rankers import SentenceTransformersSimilarityRanker
+ from haystack.components.preprocessors import DocumentSplitter
+ from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Document store and components
+ document_store = InMemoryDocumentStore()
+ doc_embedder = SentenceTransformersDocumentEmbedder(model="BAAI/bge-large-en-v1.5")
+ text_embedder = SentenceTransformersTextEmbedder(model="BAAI/bge-large-en-v1.5")
+ retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=5)
+ reranker = SentenceTransformersSimilarityRanker(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
+
+ # Initialize generator
+ generator = GoogleAIGeminiGenerator(
+     api_key=Secret.from_env_var("GOOGLE_API_KEY"),
+     model="gemini-2.0-flash"
+ )
+
+ splitter = DocumentSplitter(
+     split_by="word",
+     split_length=400,
+     split_overlap=50
+ )
+
+ # Warm up at import time so model downloads happen at startup, not on first request
+ doc_embedder.warm_up()
+ text_embedder.warm_up()
+ reranker.warm_up()
+
+ def add_documents(texts: list[str], meta_list: list[dict]) -> int:
+     """Chunk, embed, and store documents"""
+     docs = [
+         Document(content=text, meta=meta)
+         for text, meta in zip(texts, meta_list)
+         if text.strip()
+     ]
+
+     if not docs:
+         return 0
+
+     # Split into overlapping word-based chunks
+     split_docs = splitter.run(docs)["documents"]
+
+     # Embed in batches to bound memory use on large uploads
+     embedded_docs = []
+     batch_size = 32
+     for i in range(0, len(split_docs), batch_size):
+         batch = split_docs[i:i + batch_size]
+         embedded_docs.extend(doc_embedder.run(batch)["documents"])
+
+     document_store.write_documents(embedded_docs)
+     return len(embedded_docs)
+
+ def query_rag(question: str, session_id: str):
+     """Query the RAG system with session filtering"""
+     try:
+         if not question.strip():
+             return {"answer": "Please provide a non-empty question.", "sources": []}
+
+         # Embed the question
+         query_emb = text_embedder.run(question)["embedding"]
+
+         # Retrieve documents belonging to this session only
+         filters = {"field": "meta.session_id", "operator": "==", "value": session_id}
+         retrieved_docs = retriever.run(query_embedding=query_emb, filters=filters)["documents"]
+
+         if not retrieved_docs:
+             return {
+                 "answer": "No documents found for this session. Please upload a file first.",
+                 "sources": []
+             }
+
+         # Rerank with the cross-encoder
+         reranked_docs = reranker.run(query=question, documents=retrieved_docs)["documents"]
+
+         # Generate an answer grounded in the retrieved context
+         context = "\n\n".join(doc.content for doc in reranked_docs)
+         prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
+
+         # Guard against a missing or empty replies list
+         response = generator.run(parts=[prompt])
+         replies = response.get("replies") if response else None
+         answer = replies[0] if replies else "No response generated"
+
+         # Format sources with filename, page number, and a short snippet
+         sources = [
+             {
+                 "filename": d.meta.get("filename", "Unknown"),
+                 "page": d.meta.get("page", 1),
+                 "snippet": d.content[:400] + "..." if len(d.content) > 400 else d.content
+             }
+             for d in reranked_docs
+         ]
+
+         return {"answer": answer, "sources": sources}
+
+     except Exception:
+         logger.exception("Query failed")
+         return {
+             "answer": "Sorry, I encountered an error processing your request.",
+             "sources": []
+         }
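A minimal sketch exercising the pipeline without the HTTP layer — it assumes GOOGLE_API_KEY is set in the environment and that the embedding models have been (or can be) downloaded; the texts and session ID are invented:

from pipelines import add_documents, query_rag

texts = ["Haystack retrieves documents before Gemini generates the answer."]
metas = [{"session_id": "local-test", "filename": "inline.txt", "page": 1}]
print(add_documents(texts, metas))  # number of chunks written to the store

result = query_rag("How is the answer generated?", "local-test")
print(result["answer"])
print(result["sources"])  # [{'filename': 'inline.txt', 'page': 1, 'snippet': ...}]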
render.yaml ADDED
@@ -0,0 +1,8 @@
+ services:
+   - type: web
+     name: rag-api
+     runtime: docker
+     plan: free
+     envVars:
+       - key: GOOGLE_API_KEY
+         sync: false  # set the value in the Render dashboard rather than committing it
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ sentence-transformers
+ pdfplumber
+ pytesseract
+ pillow
+ python-multipart
+ python-dotenv
+ fastapi
+ uvicorn
+ haystack-ai
+ google-ai-haystack
+ google-generativeai
+ grpcio==1.74.0
+ grpcio-tools==1.74.0
+ grpcio-status==1.74.0
+ protobuf==5.29.5
+ rpds-py==0.27.0