refactor for orchestrator
Files changed:
- app/main.py +43 -243
- requirements.txt +1 -4
app/main.py
CHANGED
@@ -1,14 +1,8 @@
 import gradio as gr
-from fastapi import FastAPI, UploadFile, File, HTTPException
-from pydantic import BaseModel
-from typing import Optional, Dict, Any, List
-import uvicorn
 import os
 import hashlib
 import logging
 from datetime import datetime
-from contextlib import asynccontextmanager
-import json
 import re
 from pathlib import Path
 
@@ -27,29 +21,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
 # Models
-class IngestRequest(BaseModel):
-    doc_id: str
-    file_content: bytes
-    filename: str
-    content_type: str
 
-class IngestResponse(BaseModel):
-    doc_id: str
-    chunks_indexed: int
-    status: str
-    metadata: Dict[str, Any]
-
-class DocumentChunk(BaseModel):
-    doc_id: str
-    chunk_id: str
-    content: str
-    metadata: Dict[str, Any]
-
-# Global storage for processed documents
-DOCUMENT_STORE: Dict[str, List[DocumentChunk]] = {}
-DOCUMENT_METADATA: Dict[str, Dict[str, Any]] = {}
-
-def extract_text_from_pdf_bytes(file_content: bytes) -> tuple[str, Dict[str, Any]]:
+def extract_text_from_pdf_bytes(file_content: bytes) -> tuple[str, dict]:
     """Extract text from PDF bytes (in memory)"""
     try:
         from io import BytesIO
@@ -66,7 +39,7 @@ def extract_text_from_pdf_bytes(file_content: bytes) -> tuple[str, Dict[str, Any]]:
         logger.error(f"PDF extraction error: {str(e)}")
         raise Exception(f"Failed to extract text from PDF: {str(e)}")
 
-def extract_text_from_docx_bytes(file_content: bytes) -> tuple[str, Dict[str, Any]]:
+def extract_text_from_docx_bytes(file_content: bytes) -> tuple[str, dict]:
     """Extract text from DOCX bytes (in memory)"""
     try:
         from io import BytesIO
@@ -84,8 +57,8 @@ def extract_text_from_docx_bytes(file_content: bytes) -> tuple[str, Dict[str, Any]]:
         logger.error(f"DOCX extraction error: {str(e)}")
         raise Exception(f"Failed to extract text from DOCX: {str(e)}")
 
-def clean_and_chunk_text(text: str, doc_id: str) -> List[DocumentChunk]:
-    """Clean text and split into chunks"""
+def clean_and_chunk_text(text: str) -> str:
+    """Clean text and split into chunks, returning formatted context"""
     # Basic text cleaning
     text = re.sub(r'\n+', '\n', text)
     text = re.sub(r'\s+', ' ', text)
@@ -109,37 +82,22 @@ def clean_and_chunk_text(text: str, doc_id: str) -> List[DocumentChunk]:
     chunks = text_splitter.split_text(text)
 
     # Create DocumentChunk objects
-    document_chunks = []
+    context_parts = []
     for i, chunk_text in enumerate(chunks):
-        chunk = DocumentChunk(
-            doc_id=doc_id,
-            chunk_id=f"{doc_id}_chunk_{i}",
-            content=chunk_text,
-            metadata={
-                "chunk_index": i,
-                "chunk_length": len(chunk_text),
-                "created_at": datetime.now().isoformat()
-            }
-        )
-        document_chunks.append(chunk)
+        context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
 
-    return document_chunks
-
-def generate_doc_id(filename: str, content: bytes) -> str:
-    """Generate unique document ID"""
-    content_hash = hashlib.md5(content).hexdigest()[:8]
-    clean_name = re.sub(r'[^a-zA-Z0-9._-]', '_', filename)
-    name_without_ext = os.path.splitext(clean_name)[0]
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    return f"{timestamp}_{name_without_ext}_{content_hash}"
+    return "\n\n".join(context_parts)
 
-def process_document(file_content: bytes, filename: str) -> IngestResponse:
-    """Main document processing function"""
-    start_time = datetime.now()
+def ingest(file):
+    """Main ingestion function - processes file and returns context directly"""
+    if file is None:
+        return "No file uploaded", ""
 
     try:
-        doc_id = generate_doc_id(filename, file_content)
-
+        with open(file.name, 'rb') as f:
+            file_content = f.read()
+
+        filename = os.path.basename(file.name)
 
         # Extract text based on file type (in memory)
         file_extension = os.path.splitext(filename)[1].lower()
@@ -152,196 +110,38 @@ def process_document(file_content: bytes, filename: str) -> IngestResponse:
         raise ValueError(f"Unsupported file type: {file_extension}")
 
         # Clean and chunk text
-        chunks = clean_and_chunk_text(text, doc_id)
+        context = clean_and_chunk_text(text)
 
-
-        DOCUMENT_STORE[doc_id] = chunks
-        processing_time = (datetime.now() - start_time).total_seconds()
-        DOCUMENT_METADATA[doc_id] = {
-            "filename": filename,
-            "doc_id": doc_id,
-            "file_type": file_extension,
-            "processing_time": processing_time,
-            "total_text_length": len(text),
-            "chunks_count": len(chunks),
-            "processed_at": datetime.now().isoformat(),
-            "status": "ready"
-        }
+        logger.info(f"Successfully processed document {filename}: {len(text)} characters")
 
-
-
-        return IngestResponse(
-            doc_id=doc_id,
-            chunks_indexed=len(chunks),
-            status="ready",
-            metadata=DOCUMENT_METADATA[doc_id]
-        )
+        return context
 
     except Exception as e:
         logger.error(f"Document processing failed: {str(e)}")
-        raise
-
-def get_document_context(doc_id: str, max_chunks: int = 10) -> str:
-    """Retrieve document context for a given doc_id"""
-    if doc_id not in DOCUMENT_STORE:
-        return f"Document {doc_id} not found."
-
-    chunks = DOCUMENT_STORE[doc_id][:max_chunks]
-    context_parts = []
-
-    for chunk in chunks:
-        context_parts.append(f"[Chunk {chunk.metadata['chunk_index']}]: {chunk.content}")
-
-    return "\n\n".join(context_parts)
-
-# Gradio functions
-def gradio_upload_and_process(file):
-    """Process uploaded file through Gradio"""
-    if file is None:
-        return "No file uploaded", ""
-
-    try:
-        with open(file.name, 'rb') as f:
-            file_content = f.read()
-
-        filename = os.path.basename(file.name)
-        result = process_document(file_content, filename)
-
-        response_text = f"""
-Document ID: {result.doc_id}
-Chunks created: {result.chunks_indexed}
-Processing time: {result.metadata['processing_time']:.2f}s
-Total text length: {result.metadata['total_text_length']} characters
-File type: {result.metadata['file_type']}"""
-
-        # Get chunks for display
-        chunks = DOCUMENT_STORE.get(result.doc_id, [])
-        chunks_display = ""
-        if chunks:
-            for i, chunk in enumerate(chunks):  # Show first 5 chunks
-                chunks_display += f"chunk: {i+1}\n"
-                chunks_display += f"length: {len(chunk.content)}\n"
-                chunks_display += f"content: {chunk.content}\n\n"
-
-        return response_text, chunks_display
-
-    except Exception as e:
-        error_msg = f"Error processing document: {str(e)}"
-        logger.error(error_msg)
-        return error_msg, ""
-
-# Create simplified Gradio interface
-def create_gradio_interface():
-    with gr.Blocks(title="ChatFed Ingestion Module") as demo:
-        gr.Markdown("# ChatFed Ingestion Module")
-        gr.Markdown("Chunks PDF or DOCX files using LangChain RecursiveCharacterTextSplitter. Intended for use in RAG pipelines as an MCP server with other ChatFed modules.")
-
-        with gr.Row():
-            with gr.Column():
-                file_input = gr.File(label="Upload PDF or DOCX file for testing", file_types=[".pdf", ".docx"])
-                process_btn = gr.Button("Process Document")
-
-            with gr.Column():
-                result_output = gr.Textbox(label="Processing Result", lines=4)
-
-
-        with gr.Row():
-            chunks_output = gr.Textbox(label="Processed Chunks", lines=15)
-
-        process_btn.click(
-            fn=gradio_upload_and_process,
-            inputs=[file_input],
-            outputs=[result_output, chunks_output]
-        )
-
-    return demo
-
-# FastAPI setup
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    logger.info("Document Ingestion Module starting up...")
-    yield
-    logger.info("Document Ingestion Module shutting down...")
-
-app = FastAPI(title="ChatFed Document Ingestion", version="1.0.0", lifespan=lifespan)
-
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy", "documents_processed": len(DOCUMENT_METADATA)}
-
-@app.get("/")
-async def root():
-    return {
-        "message": "ChatFed Document Ingestion API",
-        "endpoints": {
-            "health": "/health",
-            "ingest": "/ingest",
-            "context": "/context/{doc_id}",
-            "documents": "/documents"
-        }
-    }
-
-@app.post("/ingest")
-async def ingest_endpoint(file: UploadFile = File(...)):
-    """Ingest a document file"""
-    try:
-        file_content = await file.read()
-        result = process_document(file_content, file.filename)
-        return result
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-@app.get("/context/{doc_id}")
-async def get_context_endpoint(doc_id: str, max_chunks: int = 10):
-    """Get context for a specific document"""
-    try:
-        context = get_document_context(doc_id, max_chunks)
-        return {
-            "doc_id": doc_id,
-            "context": context,
-            "metadata": DOCUMENT_METADATA.get(doc_id, {})
-        }
-    except Exception as e:
-        raise HTTPException(status_code=404, detail=str(e))
-
-@app.get("/documents")
-async def list_documents_endpoint():
-    """List all processed documents"""
-    return {
-        "documents": list(DOCUMENT_METADATA.keys()),
-        "metadata": DOCUMENT_METADATA
-    }
-
-@app.post("/context")
-async def get_context_simple(doc_id: str, max_chunks: int = 10):
-    """Simple context endpoint for orchestrator integration"""
-    try:
-        context = get_document_context(doc_id, max_chunks)
-        return {"context": context}
-    except Exception as e:
-        raise HTTPException(status_code=404, detail=str(e))
+        raise Exception(f"Processing failed: {str(e)}")
 
 if __name__ == "__main__":
-    [... about 23 removed lines of the old launch block, content not recoverable from this view ...]
+    ui = gr.Interface(
+        fn=ingest,
+        inputs=gr.File(
+            label="Document Upload",
+            file_types=[".pdf", ".docx"],
+            info="Upload a PDF or DOCX file to extract and chunk text for use as context"
+        ),
+        outputs=gr.Textbox(
+            label="Processed Context",
+            lines=15,
+            show_copy_button=True,
+            info="Chunked document content ready for use as context in RAG pipelines"
+        ),
+        title="ChatFed Ingestion Module",
+        description="Processes PDF or DOCX files and returns chunked text context. Intended for use in RAG pipelines as an MCP server with other ChatFed modules (i.e. context supplied to generation service).",
+        api_name="ingest"
+    )
+
+    ui.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        mcp_server=True,
+        show_error=True
    )
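The extraction helpers show only their first and last lines in this diff; the parsing bodies fall between hunks. For orientation, a minimal sketch of the in-memory pattern the visible context implies, assuming pypdf as the PDF backend (the actual library and metadata fields are in the elided lines):

from io import BytesIO

from pypdf import PdfReader  # assumed backend, not confirmed by this diff

def extract_text_from_pdf_bytes_sketch(file_content: bytes) -> tuple[str, dict]:
    """Parse a PDF entirely in memory, mirroring the shape of the Space's helper."""
    reader = PdfReader(BytesIO(file_content))
    # Join per-page text; extract_text() can return None for image-only pages.
    text = "\n".join(page.extract_text() or "" for page in reader.pages)
    metadata = {"num_pages": len(reader.pages)}  # illustrative metadata only
    return text, metadata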
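The DOCX path is better grounded, since requirements.txt pins python-docx. A comparable sketch of what the elided body plausibly does (the metadata keys are assumptions):

from io import BytesIO

from docx import Document  # python-docx, pinned in requirements.txt

def extract_text_from_docx_bytes_sketch(file_content: bytes) -> tuple[str, dict]:
    """Parse a DOCX entirely in memory, mirroring the shape of the Space's helper."""
    document = Document(BytesIO(file_content))
    # Paragraph text only; tables and headers would need extra handling.
    text = "\n".join(paragraph.text for paragraph in document.paragraphs)
    metadata = {"num_paragraphs": len(document.paragraphs)}  # illustrative only
    return text, metadata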
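Likewise, the splitter construction inside clean_and_chunk_text is elided; only text_splitter.split_text(text) survives as context. A minimal sketch using the pinned langchain-text-splitters package, with illustrative parameters (the Space's real chunk_size and chunk_overlap are not visible in this diff):

from langchain_text_splitters import RecursiveCharacterTextSplitter

cleaned_text = "Example sentence for splitting. " * 200  # stand-in for extracted text

# Parameter values are assumptions for illustration only.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
)
chunks = text_splitter.split_text(cleaned_text)  # returns a list of strings
print(f"{len(chunks)} chunks, first has {len(chunks[0])} characters")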
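The practical effect of the refactor is that an orchestrator now reaches this module through a single named Gradio endpoint rather than the removed FastAPI routes. A hedged client-side sketch, assuming a gradio_client release compatible with the pinned Gradio and a placeholder Space URL:

from gradio_client import Client, handle_file

# Placeholder address; substitute the real Space URL or "user/space-name".
client = Client("https://example-chatfed-ingestion.hf.space")

# Calls ingest() on the server and returns the chunked context string.
context = client.predict(
    handle_file("report.pdf"),  # any local PDF or DOCX file
    api_name="/ingest",
)
print(context[:300])

One caveat: mcp_server=True in launch() depends on Gradio's MCP support, which shipped later than the gradio==4.44.0 pin kept in requirements.txt, so the MCP path likely needs a newer Gradio release.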
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-fastapi==0.104.1
+# fastapi==0.104.1
 uvicorn[standard]==0.24.0
 gradio==4.44.0
 pydantic==2.5.2
@@ -11,7 +11,4 @@ python-docx==1.1.0
 # LangChain text splitters (standalone package)
 langchain-text-splitters==0.0.1
 
-# Utilities
-python-dotenv==1.0.0
-