import gradio as gr
import os
import logging
import re
from io import BytesIO

# Document processing imports
import PyPDF2
from docx import Document as DocxDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Local imports
from .utils import getconfig

config = getconfig("params.cfg")

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


# Text extraction helpers
def extract_text_from_pdf_bytes(file_content: bytes) -> tuple[str, dict]:
    """Extract text from PDF bytes (in memory)."""
    try:
        pdf_reader = PyPDF2.PdfReader(BytesIO(file_content))
        text = ""
        metadata = {"total_pages": len(pdf_reader.pages)}
        for page_num, page in enumerate(pdf_reader.pages):
            page_text = page.extract_text()
            text += f"\n--- Page {page_num + 1} ---\n{page_text}"
        return text, metadata
    except Exception as e:
        logger.error(f"PDF extraction error: {str(e)}")
        raise Exception(f"Failed to extract text from PDF: {str(e)}")


def extract_text_from_docx_bytes(file_content: bytes) -> tuple[str, dict]:
    """Extract text from DOCX bytes (in memory)."""
    try:
        doc = DocxDocument(BytesIO(file_content))
        text = ""
        metadata = {"total_paragraphs": 0}
        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                text += f"{paragraph.text}\n"
                metadata["total_paragraphs"] += 1
        return text, metadata
    except Exception as e:
        logger.error(f"DOCX extraction error: {str(e)}")
        raise Exception(f"Failed to extract text from DOCX: {str(e)}")


def clean_and_chunk_text(text: str) -> str:
    """Clean text and split it into chunks, returning the formatted context string."""
    # Basic text cleaning
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    # Get chunking parameters from config
    chunk_size = config.getint('chunking', 'chunk_size', fallback=700)
    chunk_overlap = config.getint('chunking', 'chunk_overlap', fallback=50)
    separators_str = config.get('chunking', 'separators', fallback='\n\n,\n,. ,! ,? , ,')
    separators = [s.strip() for s in separators_str.split(',')]

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=separators,
        is_separator_regex=False
    )
    chunks = text_splitter.split_text(text)

    # Format each chunk and join them into a single context string
    context_parts = []
    for i, chunk_text in enumerate(chunks):
        context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
    return "\n\n".join(context_parts)


def ingest(file):
    """Main ingestion function - processes the uploaded file and returns the context directly."""
    if file is None:
        return "No file uploaded"

    try:
        with open(file.name, 'rb') as f:
            file_content = f.read()
        filename = os.path.basename(file.name)

        # Extract text based on file type (in memory)
        file_extension = os.path.splitext(filename)[1].lower()
        if file_extension == '.pdf':
            text, extraction_metadata = extract_text_from_pdf_bytes(file_content)
        elif file_extension == '.docx':
            text, extraction_metadata = extract_text_from_docx_bytes(file_content)
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")

        # Clean and chunk text
        context = clean_and_chunk_text(text)

        logger.info(f"Successfully processed document {filename}: {len(text)} characters")
        return context
    except Exception as e:
        logger.error(f"Document processing failed: {str(e)}")
        raise Exception(f"Processing failed: {str(e)}")


if __name__ == "__main__":
    ui = gr.Interface(
        fn=ingest,
        inputs=gr.File(
            label="Document Upload",
            file_types=[".pdf", ".docx"]
        ),
        outputs=gr.Textbox(
            label="Processed Context",
            lines=15,
            show_copy_button=True
        ),
        title="ChatFed Ingestion Module",
        description="Processes PDF or DOCX files and returns chunked text context. Intended for use in RAG pipelines as an MCP server with other ChatFed modules (i.e. context supplied to generation service).",
        api_name="ingest"
    )

    ui.launch(
        server_name="0.0.0.0",
        server_port=7860,
        # mcp_server=True,
        show_error=True
    )
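
# Illustrative sketch of the params.cfg "chunking" section read by clean_and_chunk_text.
# The section and key names come from the config.getint/config.get calls above; the
# values shown simply mirror their fallbacks, so an actual deployment may differ.
#
#   [chunking]
#   chunk_size = 700
#   chunk_overlap = 50
#   separators = \n\n,\n,. ,! ,? , ,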