Spaces:

mtyrrell
/

chatfed_ingestor

Sleeping

App Files Files Community

mtyrrell committed on Aug 27

Commit

537051a

1 Parent(s): 4cd2e8f

basic chunking

Browse files

Files changed (6) hide show

.DS_Store +0 -0
Dockerfile +39 -0
app/main.py +546 -0
app/utils.py +0 -0
params.cfg +0 -0
requirements.txt +14 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

Dockerfile ADDED Viewed

	@@ -0,0 +1,39 @@

+# -------- base image --------
+FROM python:3.10-slim
+ENV PYTHONUNBUFFERED=1 \
+    OMP_NUM_THREADS=1 \
+    TOKENIZERS_PARALLELISM=false
+# ---------- Create Non-Root User ----------
+# UID 1000 owns the application files and is the account the service runs as
+RUN useradd -m -u 1000 user
+# -------- install deps --------
+WORKDIR /app
+# ---------- Install Python Dependencies ----------
+# Copy requirements on their own so this layer stays cached until they change.
+# pip runs as root here, so the packages are installed system-wide.
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Install system dependencies for document processing
+# RUN apt-get update && apt-get install -y \
+#    build-essential \
+#    && rm -rf /var/lib/apt/lists/*
+# ---------- Copy Project Files ----------
+# Set appropriate ownership and permissions
+COPY --link --chown=1000 . .
+# Create directories for document storage and make them (and /app itself)
+# writable by the runtime user; WORKDIR created /app as root.
+RUN mkdir -p uploaded_docs processed_docs \
+    && chown user:user /app uploaded_docs processed_docs
+# ---------- Drop Privileges ----------
+# Without this instruction the container would run the app as root
+USER user
+# Gradio UI listens on 7860, the FastAPI service on 7863
+EXPOSE 7860 7863
+# Output is unbuffered via PYTHONUNBUFFERED above
+CMD ["python", "-m", "app.main"]

app/main.py ADDED Viewed

	@@ -0,0 +1,546 @@

+import gradio as gr
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from pydantic import BaseModel
+from typing import Optional, Dict, Any, List
+import uvicorn
+import os
+import hashlib
+import logging
+from datetime import datetime
+from contextlib import asynccontextmanager
+import json
+import re
+from pathlib import Path
+# Document processing imports
+import PyPDF2
+from docx import Document as DocxDocument
+# Configure logging
+# Root logger at INFO with a timestamp/level/message format; `logger` is this module's logger.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# Create directories for document storage
+# Both paths are relative, so they resolve against the process's working directory.
+# UPLOAD_DIR receives the raw upload while it is parsed; PROCESSED_DIR receives the
+# per-document JSON written by process_document.
+UPLOAD_DIR = Path("uploaded_docs")
+PROCESSED_DIR = Path("processed_docs")
+# Runs at import time; exist_ok=True makes it a no-op when the directories already exist.
+UPLOAD_DIR.mkdir(exist_ok=True)
+PROCESSED_DIR.mkdir(exist_ok=True)
+# Models
+# Payload describing a document to ingest as raw bytes plus identifying fields.
+# NOTE(review): not referenced by any endpoint in the visible code (/ingest takes an UploadFile).
+class IngestRequest(BaseModel):
+    doc_id: str  # document identifier
+    file_content: bytes  # raw bytes of the file
+    filename: str  # name of the file
+    content_type: str  # presumably the MIME type of the upload; confirm with callers
+# Result returned by process_document after a document has been chunked and stored.
+class IngestResponse(BaseModel):
+    doc_id: str  # identifier generated for the document
+    chunks_indexed: int  # number of chunks produced for the document
+    status: str  # process_document sets this to "ready"
+    metadata: Dict[str, Any]  # the DOCUMENT_METADATA entry for the document
+# One piece of a document's text, as produced by clean_and_chunk_text.
+class DocumentChunk(BaseModel):
+    doc_id: str  # identifier of the document the chunk belongs to
+    chunk_id: str  # built as "<doc_id>_chunk_<index>"
+    content: str  # text of the chunk
+    metadata: Dict[str, Any]  # holds chunk_index, chunk_length and created_at
+# Global storage for processed documents (in production, use proper vector store)
+# Both dicts are keyed by doc_id and live only in this process's memory.
+# DOCUMENT_STORE maps a doc_id to its ordered list of chunks.
+DOCUMENT_STORE: Dict[str, List[DocumentChunk]] = {}
+# DOCUMENT_METADATA maps a doc_id to the metadata dict built by process_document.
+DOCUMENT_METADATA: Dict[str, Dict[str, Any]] = {}
+def extract_text_from_pdf(file_path: str) -> tuple[str, Dict[str, Any]]:
+    """Extract the text of every page of a PDF file.
+
+    Args:
+        file_path: Path of the PDF file to read.
+
+    Returns:
+        A ``(text, metadata)`` tuple. ``text`` is the concatenation of all
+        pages, each preceded by a ``--- Page N ---`` marker line. ``metadata``
+        holds ``total_pages`` and ``page_texts`` (one dict per page with
+        ``page``, ``text`` and ``char_count``).
+
+    Raises:
+        Exception: If the file cannot be opened or parsed. The underlying
+            error is attached as ``__cause__``.
+    """
+    try:
+        with open(file_path, 'rb') as file:
+            pdf_reader = PyPDF2.PdfReader(file)
+            text_parts = []
+            page_entries = []
+            for page_num, page in enumerate(pdf_reader.pages, start=1):
+                # Defensive: treat a falsy extraction result as empty text
+                page_text = page.extract_text() or ""
+                text_parts.append(f"\n--- Page {page_num} ---\n{page_text}")
+                page_entries.append({
+                    "page": page_num,
+                    "text": page_text,
+                    "char_count": len(page_text)
+                })
+            metadata = {
+                "total_pages": len(pdf_reader.pages),
+                "page_texts": page_entries
+            }
+        # Join once instead of growing a string page by page
+        return "".join(text_parts), metadata
+    except Exception as e:
+        logger.error(f"PDF extraction error: {str(e)}")
+        # Chain the cause so the original traceback is not lost
+        raise Exception(f"Failed to extract text from PDF: {str(e)}") from e
+def extract_text_from_docx(file_path: str) -> tuple[str, Dict[str, Any]]:
+    """Extract the text of every non-blank paragraph of a DOCX file.
+
+    Args:
+        file_path: Path of the DOCX file to read.
+
+    Returns:
+        A ``(text, metadata)`` tuple. ``text`` contains each non-blank
+        paragraph followed by a newline. ``metadata`` holds
+        ``total_paragraphs`` (count of non-blank paragraphs) and
+        ``paragraph_texts`` (one dict per kept paragraph with ``paragraph``,
+        ``text`` and ``char_count``). ``paragraph`` is the 1-based position
+        among all paragraphs, blank ones included.
+
+    Raises:
+        Exception: If the file cannot be opened or parsed. The underlying
+            error is attached as ``__cause__``.
+    """
+    try:
+        doc = DocxDocument(file_path)
+        paragraph_entries = []
+        for position, paragraph in enumerate(doc.paragraphs, start=1):
+            paragraph_text = paragraph.text
+            # Whitespace-only paragraphs carry no content; skip them
+            if not paragraph_text.strip():
+                continue
+            paragraph_entries.append({
+                "paragraph": position,
+                "text": paragraph_text,
+                "char_count": len(paragraph_text)
+            })
+        # Join once instead of growing a string paragraph by paragraph
+        text = "".join(f"{entry['text']}\n" for entry in paragraph_entries)
+        metadata = {
+            "total_paragraphs": len(paragraph_entries),
+            "paragraph_texts": paragraph_entries
+        }
+        return text, metadata
+    except Exception as e:
+        logger.error(f"DOCX extraction error: {str(e)}")
+        # Chain the cause so the original traceback is not lost
+        raise Exception(f"Failed to extract text from DOCX: {str(e)}") from e
+def simple_text_splitter(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
+    """Split text into chunks without external dependencies.
+
+    The text is split on progressively finer separators (blank line, newline,
+    sentence end, space). Adjacent pieces are packed together until adding
+    the next one would exceed ``chunk_size``. A piece that is still too long
+    is re-split with the next separator, and finally by fixed-size character
+    windows. Afterwards every chunk except the first is prefixed with the
+    last ``chunk_overlap`` characters of the preceding chunk plus a space, so
+    a returned chunk can be up to ``chunk_size + chunk_overlap + 1`` long.
+
+    Args:
+        text: Text to split.
+        chunk_size: Target maximum chunk length before overlap is added.
+        chunk_overlap: Number of trailing characters of the previous chunk
+            to prepend to each chunk.
+
+    Returns:
+        The non-blank chunks in document order; ``[]`` for empty text.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap``
+            is negative or not smaller than ``chunk_size``.
+    """
+    # An overlap >= chunk_size makes the character-window step zero or
+    # negative, which would either crash inside range() or silently drop text.
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    if not 0 <= chunk_overlap < chunk_size:
+        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")
+    if not text:
+        return []
+    # Split by common separators in order of preference
+    separators = ["\n\n", "\n", ". ", "! ", "? ", " "]
+    def emit(piece: str, finer_separators: List[str], out: List[str]) -> None:
+        """Append piece to out, re-splitting it first when it is oversized."""
+        if len(piece) > chunk_size:
+            out.extend(split_text_recursive(piece, finer_separators))
+        else:
+            out.append(piece.strip())
+    def split_text_recursive(text: str, separators: List[str]) -> List[str]:
+        if not separators:
+            # No separators left: fall back to overlapping character windows
+            step = chunk_size - chunk_overlap
+            windows = (text[i:i + chunk_size].strip() for i in range(0, len(text), step))
+            return [window for window in windows if window]
+        separator = separators[0]
+        remaining_separators = separators[1:]
+        chunks: List[str] = []
+        current_chunk = ""
+        for split in text.split(separator):
+            # If adding this split would exceed chunk_size, flush what we have
+            if len(current_chunk) + len(split) + len(separator) > chunk_size:
+                if current_chunk:
+                    emit(current_chunk, remaining_separators, chunks)
+                current_chunk = split
+            elif current_chunk:
+                current_chunk += separator + split
+            else:
+                current_chunk = split
+        # Add the last chunk
+        if current_chunk:
+            emit(current_chunk, remaining_separators, chunks)
+        return chunks
+    initial_chunks = split_text_recursive(text, separators)
+    # Add overlap between chunks
+    final_chunks = []
+    for i, chunk in enumerate(initial_chunks):
+        if i > 0 and chunk_overlap > 0:
+            # Slicing past the start simply yields the whole previous chunk
+            chunk = initial_chunks[i - 1][-chunk_overlap:] + " " + chunk
+        final_chunks.append(chunk)
+    return [chunk for chunk in final_chunks if chunk.strip()]
+def clean_and_chunk_text(
+    text: str,
+    doc_id: str,
+    chunk_size: int = 500,
+    chunk_overlap: int = 50,
+) -> List[DocumentChunk]:
+    """Normalise whitespace in text and split it into DocumentChunk objects.
+
+    Args:
+        text: Raw extracted document text.
+        doc_id: Identifier of the document; used for ``chunk_id`` values.
+        chunk_size: Passed through to ``simple_text_splitter``.
+        chunk_overlap: Passed through to ``simple_text_splitter``.
+
+    Returns:
+        One ``DocumentChunk`` per chunk, in order. Each has ``chunk_id``
+        ``"<doc_id>_chunk_<index>"`` and metadata ``chunk_index``,
+        ``chunk_length`` and ``created_at``.
+    """
+    # Collapse runs of spaces/tabs (any whitespace except newline) to one
+    # space. Using \s here would also swallow newlines and leave the splitter
+    # with no line boundaries to split on.
+    text = re.sub(r'[^\S\n]+', ' ', text)
+    # Collapse runs of newlines (and spaces around them) to a single newline
+    text = re.sub(r' ?\n[ \n]*', '\n', text)
+    text = text.strip()
+    # Split text into chunks using simple splitter
+    chunks = simple_text_splitter(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    # All chunks of one document share a single creation timestamp
+    created_at = datetime.now().isoformat()
+    return [
+        DocumentChunk(
+            doc_id=doc_id,
+            chunk_id=f"{doc_id}_chunk_{i}",
+            content=chunk_text,
+            metadata={
+                "chunk_index": i,
+                "chunk_length": len(chunk_text),
+                "created_at": created_at
+            }
+        )
+        for i, chunk_text in enumerate(chunks)
+    ]
+def generate_doc_id(filename: str, content: bytes) -> str:
+    """Build a document ID of the form ``<timestamp>_<name>_<hash>``.
+
+    ``timestamp`` is the current local time as ``YYYYMMDD_HHMMSS``, ``name``
+    is the file name without its extension and with every character outside
+    ``[a-zA-Z0-9._-]`` replaced by ``_``, and ``hash`` is the first eight hex
+    digits of the MD5 digest of ``content``.
+    """
+    # Sanitise first, then drop the extension
+    sanitized = re.sub(r'[^a-zA-Z0-9._-]', '_', filename)
+    stem, _extension = os.path.splitext(sanitized)
+    # Short content fingerprint so different files with the same name differ
+    digest_prefix = hashlib.md5(content).hexdigest()[:8]
+    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    return "_".join((stamp, stem, digest_prefix))
+def process_document(file_content: bytes, filename: str) -> IngestResponse:
+    """Parse, chunk and store one uploaded document.
+
+    The upload is written to ``UPLOAD_DIR`` while its text is extracted, the
+    text is chunked, the result is written to
+    ``PROCESSED_DIR/<doc_id>.json`` and then published in ``DOCUMENT_STORE``
+    and ``DOCUMENT_METADATA``. The temporary upload is removed whether or not
+    processing succeeds.
+
+    Args:
+        file_content: Raw bytes of the uploaded file.
+        filename: Original file name; its extension selects the parser
+            (``.pdf`` or ``.docx``).
+
+    Returns:
+        An ``IngestResponse`` with the generated ``doc_id``, the chunk count,
+        status ``"ready"`` and the stored metadata.
+
+    Raises:
+        HTTPException: Status 500 with detail ``"Processing failed: ..."``
+            for any failure, including an unsupported file type.
+    """
+    start_time = datetime.now()
+    temp_file_path = None
+    try:
+        # An upload may arrive without a file name; treat it as extension-less
+        filename = filename or ""
+        file_extension = os.path.splitext(filename)[1].lower()
+        # Reject unsupported types before anything is written to disk
+        if file_extension not in ('.pdf', '.docx'):
+            raise ValueError(f"Unsupported file type: {file_extension}")
+        # Generate document ID
+        doc_id = generate_doc_id(filename, file_content)
+        # Save uploaded file temporarily
+        temp_file_path = UPLOAD_DIR / f"{doc_id}{file_extension}"
+        temp_file_path.write_bytes(file_content)
+        # Extract text based on file type
+        if file_extension == '.pdf':
+            text, extraction_metadata = extract_text_from_pdf(str(temp_file_path))
+        else:
+            text, extraction_metadata = extract_text_from_docx(str(temp_file_path))
+        # Clean and chunk text
+        chunks = clean_and_chunk_text(text, doc_id)
+        processing_time = (datetime.now() - start_time).total_seconds()
+        metadata = {
+            "filename": filename,
+            "doc_id": doc_id,
+            "file_type": file_extension,
+            "processing_time": processing_time,
+            "total_text_length": len(text),
+            "chunks_count": len(chunks),
+            "extraction_metadata": extraction_metadata,
+            "processed_at": datetime.now().isoformat(),
+            "status": "ready"
+        }
+        # Persist before publishing, so a failed write never leaves a
+        # half-registered document in the in-memory stores.
+        processed_file_path = PROCESSED_DIR / f"{doc_id}.json"
+        with open(processed_file_path, 'w', encoding='utf-8') as f:
+            json.dump({
+                "metadata": metadata,
+                # model_dump() is the pydantic v2 replacement for the deprecated .dict()
+                "chunks": [chunk.model_dump() for chunk in chunks]
+            }, f, indent=2)
+        # Store chunks and metadata (in production, this would go to vector store)
+        DOCUMENT_STORE[doc_id] = chunks
+        DOCUMENT_METADATA[doc_id] = metadata
+        logger.info(f"Successfully processed document {doc_id}: {len(chunks)} chunks")
+        return IngestResponse(
+            doc_id=doc_id,
+            chunks_indexed=len(chunks),
+            status="ready",
+            metadata=metadata
+        )
+    except Exception as e:
+        logger.error(f"Document processing failed: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}") from e
+    finally:
+        # Clean up the temporary upload on success and on failure alike
+        if temp_file_path is not None:
+            temp_file_path.unlink(missing_ok=True)
+def get_document_context(doc_id: str, max_chunks: int = 10) -> str:
+    """Return the first chunks of a stored document as one formatted string.
+
+    Args:
+        doc_id: Identifier of a document in ``DOCUMENT_STORE``.
+        max_chunks: Maximum number of chunks to include. Values below zero
+            are treated as zero; non-integer numbers are truncated.
+
+    Returns:
+        The selected chunks rendered as ``"[Chunk <index>]: <content>"`` and
+        joined by blank lines, or ``"Document <doc_id> not found."`` when the
+        ID is unknown.
+    """
+    stored_chunks = DOCUMENT_STORE.get(doc_id)
+    if stored_chunks is None:
+        return f"Document {doc_id} not found."
+    # A negative slice bound would drop chunks from the END of the list and a
+    # float bound (e.g. from a UI slider) would raise, so normalise first.
+    limit = max(0, int(max_chunks))
+    return "\n\n".join(
+        f"[Chunk {chunk.metadata['chunk_index']}]: {chunk.content}"
+        for chunk in stored_chunks[:limit]
+    )
+# Gradio functions
+def gradio_upload_and_process(file):
+    """Run an uploaded file through process_document for the Gradio UI.
+
+    Returns a 3-tuple of strings: a status summary, the new document ID and
+    a preview of up to ten chunks (first 200 characters each). When no file
+    is given or processing fails, the ID and preview are empty strings and
+    the first element carries the message.
+    """
+    if file is None:
+        return "No file uploaded", "", ""
+    try:
+        upload_path = file.name
+        with open(upload_path, 'rb') as handle:
+            raw_bytes = handle.read()
+        result = process_document(raw_bytes, os.path.basename(upload_path))
+        # Format response for Gradio
+        response_text = f"""
+✅ Document processed successfully!
+📄 Document ID: {result.doc_id}
+📊 Chunks created: {result.chunks_indexed}
+⏱️ Processing time: {result.metadata['processing_time']:.2f}s
+📝 Total text length: {result.metadata['total_text_length']} characters
+📑 File type: {result.metadata['file_type']}
+Status: {result.status}
+"""
+        stored = DOCUMENT_STORE.get(result.doc_id, [])
+        if not stored:
+            # Nothing to preview
+            return response_text, result.doc_id, ""
+        preview_limit = 10
+        pieces = ["📄 Processed Chunks:\n\n"]
+        for number, chunk in enumerate(stored[:preview_limit], start=1):
+            body = chunk.content
+            suffix = '...' if len(body) > 200 else ''
+            pieces.append(f"--- Chunk {number} ---\n")
+            pieces.append(f"Length: {len(body)} characters\n")
+            pieces.append(f"Content: {body[:200]}{suffix}\n\n")
+        hidden = len(stored) - preview_limit
+        if hidden > 0:
+            pieces.append(f"... and {hidden} more chunks\n")
+        return response_text, result.doc_id, "".join(pieces)
+    except Exception as e:
+        error_msg = f"❌ Error processing document: {str(e)}"
+        logger.error(error_msg)
+        return error_msg, "", ""
+def gradio_get_context(doc_id: str, max_chunks: int = 5):
+    """Look up a document's context for the Gradio UI.
+
+    Returns a prompt to enter an ID when doc_id is blank, otherwise the
+    context text (looked up with the stripped ID) under a header line, or an
+    error message if the lookup raises.
+    """
+    wanted_id = doc_id.strip()
+    if not wanted_id:
+        return "Please enter a document ID"
+    try:
+        context = get_document_context(wanted_id, max_chunks)
+    except Exception as e:
+        return f"❌ Error retrieving context: {str(e)}"
+    # The header echoes the ID exactly as the user typed it
+    return f"📄 Context for document '{doc_id}':\n\n{context}"
+def list_documents():
+    """Return a bulleted, human-readable listing of DOCUMENT_METADATA.
+
+    Each line shows the document ID, its original file name and its chunk
+    count; a fixed message is returned when nothing has been processed.
+    """
+    if len(DOCUMENT_METADATA) == 0:
+        return "No documents processed yet."
+    bullet_lines = "\n".join(
+        f"• {doc_id} ({metadata['filename']}) - {metadata['chunks_count']} chunks"
+        for doc_id, metadata in DOCUMENT_METADATA.items()
+    )
+    return "📚 Processed Documents:\n\n" + bullet_lines
+# Create Gradio interface
+def create_gradio_interface():
+    """Build and return the Gradio Blocks UI (not yet launched).
+
+    The UI has three tabs: one to upload and process a file, one to view a
+    document's chunks by ID, and one listing all processed documents.
+    """
+    with gr.Blocks(title="ChatFed Document Ingestion", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 📚 ChatFed Document Ingestion Module")
+        gr.Markdown("Upload PDF or DOCX files to make them available for retrieval.")
+        # Tab 1: file picker and button on the left, result and ID on the right
+        with gr.Tab("📤 Upload Document"):
+            with gr.Row():
+                with gr.Column():
+                    file_input = gr.File(
+                        label="Upload PDF or DOCX file",
+                        file_types=[".pdf", ".docx"]
+                    )
+                    process_btn = gr.Button("🔄 Process Document", variant="primary")
+                with gr.Column():
+                    result_output = gr.Textbox(
+                        label="Processing Result",
+                        lines=8,
+                        interactive=False
+                    )
+                    doc_id_output = gr.Textbox(
+                        label="Document ID",
+                        interactive=False
+                    )
+            # Add a new section for displaying chunks
+            with gr.Row():
+                chunks_output = gr.Textbox(
+                    label="Processed Chunks Preview",
+                    lines=15,
+                    interactive=False
+                )
+            # The handler's 3-tuple maps onto these three outputs in order
+            process_btn.click(
+                fn=gradio_upload_and_process,
+                inputs=[file_input],
+                outputs=[result_output, doc_id_output, chunks_output]
+            )
+        # Tab 2: look up stored chunks by document ID
+        with gr.Tab("🔍 View Document"):
+            with gr.Row():
+                with gr.Column():
+                    doc_id_input = gr.Textbox(
+                        label="Document ID",
+                        placeholder="Enter document ID to view context..."
+                    )
+                    max_chunks_input = gr.Slider(
+                        label="Max Chunks to Display",
+                        minimum=1,
+                        maximum=20,
+                        value=5,
+                        step=1
+                    )
+                    view_btn = gr.Button("👀 View Context", variant="secondary")
+                with gr.Column():
+                    context_output = gr.Textbox(
+                        label="Document Context",
+                        lines=15,
+                        interactive=False
+                    )
+            view_btn.click(
+                fn=gradio_get_context,
+                inputs=[doc_id_input, max_chunks_input],
+                outputs=[context_output]
+            )
+        # Tab 3: listing of everything in DOCUMENT_METADATA
+        with gr.Tab("📋 Document List"):
+            with gr.Column():
+                refresh_btn = gr.Button("🔄 Refresh List")
+                doc_list_output = gr.Textbox(
+                    label="All Documents",
+                    lines=10,
+                    interactive=False
+                )
+            refresh_btn.click(
+                fn=list_documents,
+                inputs=[],
+                outputs=[doc_list_output]
+            )
+            # Load initial list
+            demo.load(fn=list_documents, inputs=[], outputs=[doc_list_output])
+    return demo
+# FastAPI setup
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """FastAPI lifespan hook: log one message at startup and one at shutdown."""
+    logger.info("Document Ingestion Module starting up...")
+    # Control returns here when the application is shutting down
+    yield
+    logger.info("Document Ingestion Module shutting down...")
+# The FastAPI application; `lifespan` supplies the startup/shutdown logging.
+app = FastAPI(
+    title="ChatFed Document Ingestion",
+    version="1.0.0",
+    lifespan=lifespan
+)
+@app.get("/health")
+async def health_check():
+    """Report liveness plus the number of documents held in DOCUMENT_METADATA."""
+    return {"status": "healthy", "documents_processed": len(DOCUMENT_METADATA)}
+@app.get("/")
+async def root():
+    """Return a short description of the API and a map of its endpoint paths."""
+    return {
+        "message": "ChatFed Document Ingestion API",
+        "endpoints": {
+            "health": "/health",
+            "ingest": "/ingest",
+            "context": "/context/{doc_id}",
+            "documents": "/documents"
+        }
+    }
+@app.post("/ingest")
+async def ingest_endpoint(file: UploadFile = File(...)):
+    """Ingest an uploaded document file.
+
+    Reads the multipart upload and hands its bytes and file name to
+    ``process_document``, returning that function's ``IngestResponse``.
+
+    Raises:
+        HTTPException: The one raised by ``process_document`` is passed
+            through unchanged; any other failure (e.g. reading the upload)
+            becomes a 500 whose detail is the error message.
+    """
+    try:
+        file_content = await file.read()
+        return process_document(file_content, file.filename)
+    except HTTPException:
+        # Already a well-formed HTTP error; re-wrapping it would replace its
+        # detail with the exception's string form.
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+@app.get("/context/{doc_id}")
+async def get_context_endpoint(doc_id: str, max_chunks: int = 10):
+    """Get context for a specific document.
+
+    Args:
+        doc_id: Document identifier taken from the URL path.
+        max_chunks: Query parameter limiting how many chunks are returned.
+
+    Returns:
+        A dict with ``doc_id``, the formatted ``context`` string and the
+        document's ``metadata``.
+
+    Raises:
+        HTTPException: 404 when ``doc_id`` is not in ``DOCUMENT_STORE``.
+    """
+    # get_document_context reports an unknown ID as ordinary text rather than
+    # raising, so the lookup has to be checked here to produce a real 404.
+    if doc_id not in DOCUMENT_STORE:
+        raise HTTPException(status_code=404, detail=f"Document {doc_id} not found.")
+    return {
+        "doc_id": doc_id,
+        "context": get_document_context(doc_id, max_chunks),
+        "metadata": DOCUMENT_METADATA.get(doc_id, {})
+    }
+@app.get("/documents")
+async def list_documents_endpoint():
+    """List all processed documents.
+
+    Returns the document IDs under ``documents`` and the full
+    DOCUMENT_METADATA mapping under ``metadata``.
+    """
+    return {
+        "documents": list(DOCUMENT_METADATA.keys()),
+        "metadata": DOCUMENT_METADATA
+    }
+# Add a simple API endpoint for the orchestrator to call
+@app.post("/context")
+async def get_context_simple(doc_id: str, max_chunks: int = 10):
+    """Return only the context string for a document, for the orchestrator.
+
+    The response is ``{"context": <text>}``; any exception raised by the
+    lookup is reported as a 404 carrying the error message.
+    """
+    try:
+        context_text = get_document_context(doc_id, max_chunks)
+    except Exception as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    return {"context": context_text}
+if __name__ == "__main__":
+    # Create the Gradio interface
+    demo = create_gradio_interface()
+    # Start the Gradio UI without blocking: prevent_thread_lock=True makes
+    # launch() return once its server is running in the background.
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        share=False,
+        quiet=True,
+        prevent_thread_lock=True
+    )
+    try:
+        # Run FastAPI in the main thread so uvicorn can install its signal
+        # handlers and the lifespan shutdown hook runs on SIGINT/SIGTERM.
+        uvicorn.run(app, host="0.0.0.0", port=7863, log_level="info")
+    finally:
+        logger.info("Shutting down...")
+        # Stop the background Gradio server as well
+        demo.close()

app/utils.py ADDED Viewed

File without changes

params.cfg ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+gradio==4.44.0
+pydantic==2.5.2
+python-multipart>=0.0.9
+# Document processing
+PyPDF2==3.0.1
+python-docx==1.1.0
+# Utilities
+python-dotenv==1.0.0