eudr_chabo_orchestrator

Running on CPU Upgrade

App Files Files Community

mtyrrell committed on Sep 17

Commit

ecc8726

1 Parent(s): 5952c14

refactor and ts cleanup

Browse files

Files changed (8) hide show

app/__pycache__/main.cpython-311.pyc +0 -0
app/__pycache__/models.cpython-311.pyc +0 -0
app/__pycache__/nodes.cpython-311.pyc +0 -0
app/__pycache__/utils.cpython-311.pyc +0 -0
app/main.py +13 -285
app/models.py +38 -0
app/nodes.py +167 -0
app/utils.py +72 -0

app/__pycache__/main.cpython-311.pyc ADDED Viewed

Binary file (32.6 kB). View file

app/__pycache__/models.cpython-311.pyc ADDED Viewed

Binary file (2.49 kB). View file

app/__pycache__/nodes.cpython-311.pyc ADDED Viewed

Binary file (7.84 kB). View file

app/__pycache__/utils.cpython-311.pyc CHANGED Viewed

Binary files a/app/__pycache__/utils.cpython-311.pyc and b/app/__pycache__/utils.cpython-311.pyc differ

app/main.py CHANGED Viewed

@@ -14,17 +14,19 @@ import os
 from datetime import datetime
 import logging
 from contextlib import asynccontextmanager
-import threading
 from langchain_core.runnables import RunnableLambda
-import tempfile
-import mimetypes
 import asyncio
 from typing import Generator
 import json
 import httpx
-import ast
-from utils import getconfig
 config = getconfig("params.cfg")
 RETRIEVER = config.get("retriever", "RETRIEVER", fallback="https://giz-chatfed-retriever.hf.space")
@@ -36,272 +38,9 @@ MAX_CONTEXT_CHARS = config.get("general", "MAX_CONTEXT_CHARS")
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# CORE FUNCTIONALITY - KEEP THESE
-# File type detection
-def detect_file_type(filename: str, file_content: bytes = None) -> str:
-    """Detect file type based on extension and content"""
-    if not filename:
-        return "unknown"
-    # Get file extension
-    _, ext = os.path.splitext(filename.lower())
-    # Define file type mappings
-    file_type_mappings = {
-        '.geojson': 'geojson',
-        '.json': 'json',  # Could be geojson, will check content
-        '.pdf': 'text',
-        '.docx': 'text',
-        '.doc': 'text',
-        '.txt': 'text',
-        '.md': 'text',
-        '.csv': 'text',
-        '.xlsx': 'text',
-        '.xls': 'text'
-    }
-    detected_type = file_type_mappings.get(ext, 'unknown')
-    # For JSON files, check if it's actually GeoJSON
-    if detected_type == 'json' and file_content:
-        try:
-            import json
-            content_str = file_content.decode('utf-8')
-            data = json.loads(content_str)
-            # Check if it has GeoJSON structure
-            if isinstance(data, dict) and ('type' in data and data.get('type') == 'FeatureCollection'):
-                detected_type = 'geojson'
-            elif isinstance(data, dict) and ('type' in data and data.get('type') in ['Feature', 'Point', 'LineString', 'Polygon', 'MultiPoint', 'MultiLineString', 'MultiPolygon', 'GeometryCollection']):
-                detected_type = 'geojson'
-        except:
-            pass  # Keep as json if parsing fails
-    logger.info(f"Detected file type: {detected_type} for file: {filename}")
-    return detected_type
-# Models - KEEP THESE
-class GraphState(TypedDict):
-    query: str
-    context: str
-    ingestor_context: str
-    result: str
-    sources: Optional[List[Dict[str, str]]]  # Added for ChatUI sources
-    reports_filter: str
-    sources_filter: str
-    subtype_filter: str
-    year_filter: str
-    file_content: Optional[bytes]
-    filename: Optional[str]
-    metadata: Optional[Dict[str, Any]]
-    file_type: Optional[str]
-    workflow_type: Optional[str]  # 'standard' or 'geojson_direct'
-class ChatFedInput(TypedDict):
-    query: str
-    reports_filter: Optional[str]
-    sources_filter: Optional[str]
-    subtype_filter: Optional[str]
-    year_filter: Optional[str]
-    session_id: Optional[str]
-    user_id: Optional[str]
-    file_content: Optional[bytes]
-    filename: Optional[str]
-class ChatFedOutput(TypedDict):
-    result: str
-    metadata: Dict[str, Any]
-class ChatUIInput(BaseModel):
-    text: str
-# CORE PROCESSING NODES - KEEP THESE
-# File type detection node
-def detect_file_type_node(state: GraphState) -> GraphState:
-    """Detect file type and determine workflow"""
-    file_type = "unknown"
-    workflow_type = "standard"
-    if state.get("file_content") and state.get("filename"):
-        file_type = detect_file_type(state["filename"], state["file_content"])
-        # Determine workflow based on file type
-        if file_type == "geojson":
-            workflow_type = "geojson_direct"
-        else:
-            workflow_type = "standard"
-    metadata = state.get("metadata", {})
-    metadata.update({
-        "file_type": file_type,
-        "workflow_type": workflow_type
-    })
-    return {
-        "file_type": file_type,
-        "workflow_type": workflow_type,
-        "metadata": metadata
-    }
-# Module functions
-def ingest_node(state: GraphState) -> GraphState:
-    """Process file through appropriate ingestor based on file type"""
-    start_time = datetime.now()
-    # If no file provided, skip this step
-    if not state.get("file_content") or not state.get("filename"):
-        logger.info("No file provided, skipping ingestion")
-        return {"ingestor_context": "", "metadata": state.get("metadata", {})}
-    file_type = state.get("file_type", "unknown")
-    logger.info(f"Ingesting {file_type} file: {state['filename']}")
-    try:
-        # Choose ingestor based on file type
-        if file_type == "geojson":
-            ingestor_url = GEOJSON_INGESTOR
-            logger.info(f"Using GeoJSON ingestor: {ingestor_url}")
-        else:
-            ingestor_url = INGESTOR
-            logger.info(f"Using standard ingestor: {ingestor_url}")
-        client = Client(ingestor_url)
-        # Create a temporary file to upload
-        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(state["filename"])[1]) as tmp_file:
-            tmp_file.write(state["file_content"])
-            tmp_file_path = tmp_file.name
-        try:
-            # Call the ingestor's ingest endpoint
-            ingestor_context = client.predict(
-                file(tmp_file_path),
-                api_name="/ingest"
-            )
-            logger.info(f"Ingest result length: {len(ingestor_context) if ingestor_context else 0}")
-            # Handle error cases
-            if isinstance(ingestor_context, str) and ingestor_context.startswith("Error:"):
-                raise Exception(ingestor_context)
-        finally:
-            # Clean up temporary file
-            os.unlink(tmp_file_path)
-        duration = (datetime.now() - start_time).total_seconds()
-        metadata = state.get("metadata", {})
-        metadata.update({
-            "ingestion_duration": duration,
-            "ingestor_context_length": len(ingestor_context) if ingestor_context else 0,
-            "ingestion_success": True,
-            "ingestor_used": ingestor_url
-        })
-        return {
-            "ingestor_context": ingestor_context,
-            "metadata": metadata
-        }
-    except Exception as e:
-        duration = (datetime.now() - start_time).total_seconds()
-        logger.error(f"Ingestion failed: {str(e)}")
-        metadata = state.get("metadata", {})
-        metadata.update({
-            "ingestion_duration": duration,
-            "ingestion_success": False,
-            "ingestion_error": str(e)
-        })
-        return {"ingestor_context": "", "metadata": metadata}
-def geojson_direct_result_node(state: GraphState) -> GraphState:
-    """For GeoJSON files, return ingestor results directly without retrieval/generation"""
-    logger.info("Processing GeoJSON file - returning direct results")
-    ingestor_context = state.get("ingestor_context", "")
-    # For GeoJSON files, the ingestor result is the final result
-    result = ingestor_context if ingestor_context else "No results from GeoJSON processing."
-    metadata = state.get("metadata", {})
-    metadata.update({
-        "processing_type": "geojson_direct",
-        "result_length": len(result)
-    })
-    return {
-        "result": result,
-        "metadata": metadata
-    }
-def retrieve_node(state: GraphState) -> GraphState:
-    start_time = datetime.now()
-    logger.info(f"Retrieval: {state['query'][:50]}...")
-    try:
-        client = Client(RETRIEVER)
-        context = client.predict(
-            query=state["query"],
-            reports_filter=state.get("reports_filter", ""),
-            sources_filter=state.get("sources_filter", ""),
-            subtype_filter=state.get("subtype_filter", ""),
-            year_filter=state.get("year_filter", ""),
-            api_name="/retrieve"
-        )
-        duration = (datetime.now() - start_time).total_seconds()
-        metadata = state.get("metadata", {})
-        metadata.update({
-            "retrieval_duration": duration,
-            "context_length": len(context) if context else 0,
-            "retrieval_success": True
-        })
-        return {"context": context, "metadata": metadata}
-    except Exception as e:
-        duration = (datetime.now() - start_time).total_seconds()
-        logger.error(f"Retrieval failed: {str(e)}")
-        metadata = state.get("metadata", {})
-        metadata.update({
-            "retrieval_duration": duration,
-            "retrieval_success": False,
-            "retrieval_error": str(e)
-        })
-        return {"context": "", "metadata": metadata}
-# Helper function to convert retrieval context to expected format
-def convert_context_to_list(context: str) -> List[Dict[str, Any]]:
-    """Convert string context to list format expected by generator"""
-    try:
-        # Try to parse as list first
-        if context.startswith('['):
-            return ast.literal_eval(context)
-        else:
-            # If it's a string, wrap it in a simple format
-            return [{
-                "answer": context,
-                "answer_metadata": {
-                    "filename": "Retrieved Context",
-                    "page": "Unknown",
-                    "year": "Unknown",
-                    "source": "Retriever"
-                }
-            }]
-    except:
-        # Fallback: simple string wrapping
-        return [{
-            "answer": context,
-            "answer_metadata": {
-                "filename": "Retrieved Context",
-                "page": "Unknown",
-                "year": "Unknown",
-                "source": "Retriever"
-            }
-        }]
-# MAIN STREAMING GENERATOR - KEEP THIS (but consider simplifying the fallback logic)
 async def generate_node_streaming(state: GraphState) -> Generator[GraphState, None, None]:
     """Streaming version that calls generator's FastAPI endpoint"""
     start_time = datetime.now()
@@ -456,7 +195,7 @@ async def generate_node_streaming(state: GraphState) -> Generator[GraphState, No
                                 except json.JSONDecodeError:
                                     raise Exception(data_content)
-        # TODO: CONSIDER REMOVING THIS GRADIO FALLBACK IF FASTAPI IS RELIABLE
         except Exception as fastapi_error:
             logger.warning(f"FastAPI endpoint failed: {fastapi_error}")
             logger.info("Falling back to Gradio client")
@@ -538,7 +277,7 @@ async def generate_node_streaming(state: GraphState) -> Generator[GraphState, No
         })
         yield {"result": f"Error: {str(e)}", "metadata": metadata}
-# Conditional routing function - KEEP THIS
 def route_workflow(state: GraphState) -> str:
     """Route to appropriate workflow based on file type"""
     workflow_type = state.get("workflow_type", "standard")
@@ -864,7 +603,6 @@ async def root():
         "message": "ChatFed Orchestrator API",
         "endpoints": {
             "health": "/health",
-            # "chatfed": "/chatfed",  # Commented out - test if ChatUI needs this
             "chatfed-ui-stream": "/chatfed-ui-stream",
             "chatfed-with-file": "/chatfed-with-file",
             # "chatfed-with-file-stream": "/chatfed-with-file/stream",
@@ -873,7 +611,7 @@ async def root():
-# # FILE UPLOAD ADAPTER - KEEP THIS
 async def chatfed_with_file_adapter(
     query: str,
     file_content: Optional[bytes] = None,
@@ -942,7 +680,7 @@ async def chatfed_with_file_adapter(
         logger.error(f"File upload streaming failed: {str(e)}")
         yield f"Error: {str(e)}"
-# TODO: PROBABLY REMOVE - NON-STREAMING FILE UPLOAD
 # @app.post("/chatfed-with-file")
 # async def chatfed_with_file(
 #     query: str = Form(...),
@@ -984,7 +722,7 @@ async def chatfed_with_file_adapter(
 #         media_type="text/plain"
 #     )
-# MAIN FILE UPLOAD STREAMING ENDPOINT - KEEP THIS
 @app.post("/chatfed-with-file/stream")
 async def chatfed_with_file_stream(
     query: str = Form(...),
@@ -1050,16 +788,6 @@ async def chatfed_with_file_stream(
         }
     )
-# TODO: TEST IF CHATUI NEEDS THESE LANGSERVE ENDPOINTS
-# If ChatUI works without these, they can be removed
-# add_routes(
-#     app,
-#     RunnableLambda(process_query_langserve),
-#     path="/chatfed",
-#     input_type=ChatFedInput,
-#     output_type=ChatFedOutput
-# )
 add_routes(
     app,
     RunnableLambda(chatui_adapter),

 from datetime import datetime
 import logging
 from contextlib import asynccontextmanager
+# import threading
 from langchain_core.runnables import RunnableLambda
+# import tempfile
+# import mimetypes
 import asyncio
 from typing import Generator
 import json
 import httpx
+# import ast
+from utils import getconfig, convert_context_to_list
+from nodes import detect_file_type_node, ingest_node, geojson_direct_result_node, retrieve_node
+from models import GraphState, ChatFedInput, ChatFedOutput, ChatUIInput
 config = getconfig("params.cfg")
 RETRIEVER = config.get("retriever", "RETRIEVER", fallback="https://giz-chatfed-retriever.hf.space")
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# MAIN STREAMING GENERATOR
 async def generate_node_streaming(state: GraphState) -> Generator[GraphState, None, None]:
     """Streaming version that calls generator's FastAPI endpoint"""
     start_time = datetime.now()
                                 except json.JSONDecodeError:
                                     raise Exception(data_content)
+        # GRADIO FALLBACK
         except Exception as fastapi_error:
             logger.warning(f"FastAPI endpoint failed: {fastapi_error}")
             logger.info("Falling back to Gradio client")
         })
         yield {"result": f"Error: {str(e)}", "metadata": metadata}
+# Conditional routing function
 def route_workflow(state: GraphState) -> str:
     """Route to appropriate workflow based on file type"""
     workflow_type = state.get("workflow_type", "standard")
         "message": "ChatFed Orchestrator API",
         "endpoints": {
             "health": "/health",
             "chatfed-ui-stream": "/chatfed-ui-stream",
             "chatfed-with-file": "/chatfed-with-file",
             # "chatfed-with-file-stream": "/chatfed-with-file/stream",
+# FILE UPLOAD ADAPTER
 async def chatfed_with_file_adapter(
     query: str,
     file_content: Optional[bytes] = None,
         logger.error(f"File upload streaming failed: {str(e)}")
         yield f"Error: {str(e)}"
+# NON-STREAMING FILE UPLOAD
 # @app.post("/chatfed-with-file")
 # async def chatfed_with_file(
 #     query: str = Form(...),
 #         media_type="text/plain"
 #     )
+# MAIN FILE UPLOAD STREAMING ENDPOINT
 @app.post("/chatfed-with-file/stream")
 async def chatfed_with_file_stream(
     query: str = Form(...),
         }
     )
 add_routes(
     app,
     RunnableLambda(chatui_adapter),

app/models.py ADDED Viewed

	@@ -0,0 +1,38 @@

+# Models
+from typing import Optional, Dict, Any, List
+from typing_extensions import TypedDict
+from pydantic import BaseModel
+class GraphState(TypedDict):
+    """State dictionary passed to, and partially returned by, the workflow nodes."""
+    query: str  # user query text; sent to the retriever by retrieve_node
+    context: str  # retriever output ("" when retrieval fails)
+    ingestor_context: str  # ingestor output for an uploaded file ("" when skipped or failed)
+    result: str  # final answer text
+    sources: Optional[List[Dict[str, str]]]  # Added for ChatUI sources
+    reports_filter: str  # forwarded to the retriever's /retrieve call
+    sources_filter: str  # forwarded to the retriever's /retrieve call
+    subtype_filter: str  # forwarded to the retriever's /retrieve call
+    year_filter: str  # forwarded to the retriever's /retrieve call
+    file_content: Optional[bytes]  # raw bytes of the uploaded file, if any
+    filename: Optional[str]  # name of the uploaded file; its extension drives type detection
+    metadata: Optional[Dict[str, Any]]  # per-step diagnostics (durations, success flags, errors)
+    file_type: Optional[str]  # value produced by detect_file_type_node
+    workflow_type: Optional[str]  # 'standard' or 'geojson_direct'
+class ChatFedInput(TypedDict):
+    """Request schema: a query plus optional filters, identifiers and an uploaded file."""
+    query: str  # the only field that is not Optional
+    reports_filter: Optional[str]
+    sources_filter: Optional[str]
+    subtype_filter: Optional[str]
+    year_filter: Optional[str]
+    session_id: Optional[str]  # presumably identifies the chat session - TODO confirm with callers
+    user_id: Optional[str]  # presumably identifies the requesting user - TODO confirm with callers
+    file_content: Optional[bytes]
+    filename: Optional[str]
+class ChatFedOutput(TypedDict):
+    """Response schema: the result text together with a metadata dictionary."""
+    result: str
+    metadata: Dict[str, Any]
+class ChatUIInput(BaseModel):
+    """Pydantic model with a single ``text`` field."""
+    text: str  # NOTE(review): presumably the message sent by the chat UI - confirm against the adapter

app/nodes.py ADDED Viewed

	@@ -0,0 +1,167 @@

+from utils import detect_file_type
+from models import GraphState
+from datetime import datetime
+import tempfile
+import os
+from gradio_client import Client, file
+import logging
+logger = logging.getLogger(__name__)
+# CORE PROCESSING NODES
+#----------------------------------------
+# File type detection node
+def detect_file_type_node(state: GraphState) -> GraphState:
+    """Detect the uploaded file's type and choose the workflow to run.
+    Reads ``file_content``, ``filename`` and ``metadata`` from ``state``. Unless
+    both a filename and content are present the type stays ``"unknown"``. A
+    ``"geojson"`` file selects the ``"geojson_direct"`` workflow; every other
+    type keeps ``"standard"``.
+    Returns a partial state holding ``file_type``, ``workflow_type`` and the
+    updated ``metadata``.
+    """
+    file_type = "unknown"
+    if state.get("file_content") and state.get("filename"):
+        file_type = detect_file_type(state["filename"], state["file_content"])
+    workflow_type = "geojson_direct" if file_type == "geojson" else "standard"
+    # ``metadata`` is Optional in GraphState, so the key may hold None; build a
+    # copy so the incoming state is not mutated in place.
+    metadata = dict(state.get("metadata") or {})
+    metadata.update({
+        "file_type": file_type,
+        "workflow_type": workflow_type
+    })
+    return {
+        "file_type": file_type,
+        "workflow_type": workflow_type,
+        "metadata": metadata
+    }
+# Module functions
+def ingest_node(state: GraphState) -> GraphState:
+    """Upload the file held in the state to an ingestor and return its output.
+    When ``file_content`` or ``filename`` is missing the step is skipped and an
+    empty ``ingestor_context`` is returned. Otherwise the content is written to
+    a temporary file (keeping the original extension) and passed to the
+    ingestor's ``/ingest`` endpoint; a string result starting with ``"Error:"``
+    is treated as a failure.
+    Returns a partial state with ``ingestor_context`` and ``metadata``. Failures
+    are logged and reported through ``ingestion_success`` / ``ingestion_error``
+    in the metadata instead of being raised.
+    """
+    start_time = datetime.now()
+    # ``metadata`` is Optional in GraphState, so the key may hold None; copy it
+    # rather than mutating the caller's dict in place.
+    metadata = dict(state.get("metadata") or {})
+    # If no file provided, skip this step
+    if not state.get("file_content") or not state.get("filename"):
+        logger.info("No file provided, skipping ingestion")
+        return {"ingestor_context": "", "metadata": metadata}
+    file_type = state.get("file_type", "unknown")
+    logger.info(f"Ingesting {file_type} file: {state['filename']}")
+    tmp_file_path = None
+    try:
+        # NOTE(review): GEOJSON_INGESTOR and INGESTOR are not defined or imported
+        # in the visible part of this module; if they are missing, the resulting
+        # NameError is caught below and every ingestion reports failure - confirm
+        # where these URLs should come from.
+        if file_type == "geojson":
+            ingestor_url = GEOJSON_INGESTOR
+            logger.info(f"Using GeoJSON ingestor: {ingestor_url}")
+        else:
+            ingestor_url = INGESTOR
+            logger.info(f"Using standard ingestor: {ingestor_url}")
+        client = Client(ingestor_url)
+        try:
+            # delete=False keeps the file on disk after the handle is closed so
+            # it can be handed to the client by path.
+            suffix = os.path.splitext(state["filename"])[1]
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
+                tmp_file_path = tmp_file.name
+                tmp_file.write(state["file_content"])
+            # Call the ingestor's ingest endpoint
+            ingestor_context = client.predict(
+                file(tmp_file_path),
+                api_name="/ingest"
+            )
+            logger.info(f"Ingest result length: {len(ingestor_context) if ingestor_context else 0}")
+            # Handle error cases
+            if isinstance(ingestor_context, str) and ingestor_context.startswith("Error:"):
+                raise Exception(ingestor_context)
+        finally:
+            # Remove the temporary file even when writing it or the request failed.
+            if tmp_file_path is not None:
+                os.unlink(tmp_file_path)
+        duration = (datetime.now() - start_time).total_seconds()
+        metadata.update({
+            "ingestion_duration": duration,
+            "ingestor_context_length": len(ingestor_context) if ingestor_context else 0,
+            "ingestion_success": True,
+            "ingestor_used": ingestor_url
+        })
+        return {
+            "ingestor_context": ingestor_context,
+            "metadata": metadata
+        }
+    except Exception as e:
+        duration = (datetime.now() - start_time).total_seconds()
+        logger.error(f"Ingestion failed: {str(e)}")
+        metadata.update({
+            "ingestion_duration": duration,
+            "ingestion_success": False,
+            "ingestion_error": str(e)
+        })
+        return {"ingestor_context": "", "metadata": metadata}
+def geojson_direct_result_node(state: GraphState) -> GraphState:
+    """Return the ingestor output as the final result, skipping retrieval/generation.
+    Reads ``ingestor_context`` from ``state``; when it is missing or empty a
+    fixed placeholder message is used instead.
+    Returns a partial state with ``result`` and ``metadata`` (extended with
+    ``processing_type`` and ``result_length``).
+    """
+    logger.info("Processing GeoJSON file - returning direct results")
+    # For GeoJSON files, the ingestor result is the final result
+    result = state.get("ingestor_context", "") or "No results from GeoJSON processing."
+    # ``metadata`` is Optional in GraphState, so the key may hold None; copy it
+    # rather than mutating the caller's dict in place.
+    metadata = dict(state.get("metadata") or {})
+    metadata.update({
+        "processing_type": "geojson_direct",
+        "result_length": len(result)
+    })
+    return {
+        "result": result,
+        "metadata": metadata
+    }
+def retrieve_node(state: GraphState) -> GraphState:
+    """Query the retriever service for context matching the user's query.
+    Sends ``query`` and the four filter fields from ``state`` to the
+    retriever's ``/retrieve`` endpoint.
+    Returns a partial state with ``context`` and ``metadata``. Failures are
+    logged and reported through ``retrieval_success`` / ``retrieval_error`` in
+    the metadata, with ``context`` set to ``""``, instead of being raised.
+    """
+    start_time = datetime.now()
+    logger.info(f"Retrieval: {state['query'][:50]}...")
+    # ``metadata`` is Optional in GraphState, so the key may hold None; copy it
+    # rather than mutating the caller's dict in place.
+    metadata = dict(state.get("metadata") or {})
+    try:
+        # RETRIEVER is not defined or imported among this module's visible
+        # imports, so a bare reference raised NameError and every retrieval
+        # failed. Use a module-level value when one exists; otherwise read the
+        # same setting main.py reads from params.cfg.
+        retriever_url = globals().get("RETRIEVER")
+        if retriever_url is None:
+            from utils import getconfig
+            retriever_url = getconfig("params.cfg").get(
+                "retriever", "RETRIEVER", fallback="https://giz-chatfed-retriever.hf.space"
+            )
+        client = Client(retriever_url)
+        context = client.predict(
+            query=state["query"],
+            reports_filter=state.get("reports_filter", ""),
+            sources_filter=state.get("sources_filter", ""),
+            subtype_filter=state.get("subtype_filter", ""),
+            year_filter=state.get("year_filter", ""),
+            api_name="/retrieve"
+        )
+        duration = (datetime.now() - start_time).total_seconds()
+        metadata.update({
+            "retrieval_duration": duration,
+            "context_length": len(context) if context else 0,
+            "retrieval_success": True
+        })
+        return {"context": context, "metadata": metadata}
+    except Exception as e:
+        duration = (datetime.now() - start_time).total_seconds()
+        logger.error(f"Retrieval failed: {str(e)}")
+        metadata.update({
+            "retrieval_duration": duration,
+            "retrieval_success": False,
+            "retrieval_error": str(e)
+        })
+        return {"context": "", "metadata": metadata}

app/utils.py CHANGED Viewed

@@ -4,6 +4,7 @@ import os
 import ast
 import re
 from dotenv import load_dotenv
 # Local .env file
 load_dotenv()
@@ -44,3 +45,74 @@ def get_auth(provider: str) -> dict:
     return auth_config

 import ast
 import re
 from dotenv import load_dotenv
+from typing import Optional, Dict, Any, List
 # Local .env file
 load_dotenv()
     return auth_config
+# File type detection
+def detect_file_type(filename: str, file_content: Optional[bytes] = None) -> str:
+    """Classify a file as ``"geojson"``, ``"json"``, ``"text"`` or ``"unknown"``.
+    The lower-cased extension decides the type. A ``.json`` file with content is
+    additionally parsed and promoted to ``"geojson"`` when its top-level object
+    has a GeoJSON ``type`` value; content that cannot be decoded or parsed
+    leaves the type as ``"json"``.
+    Args:
+        filename: Name of the file; an empty name yields ``"unknown"``.
+        file_content: Raw file bytes, only inspected for ``.json`` files.
+    Returns:
+        The detected type string.
+    """
+    import json
+    import logging
+    # Resolve the logger locally so the function does not depend on a
+    # module-level ``logger`` (none is visible in this part of utils.py).
+    log = logging.getLogger(__name__)
+    if not filename:
+        return "unknown"
+    # Get file extension
+    _, ext = os.path.splitext(filename.lower())
+    # Define file type mappings
+    file_type_mappings = {
+        '.geojson': 'geojson',
+        '.json': 'json',  # Could be geojson, will check content
+        '.pdf': 'text',
+        '.docx': 'text',
+        '.doc': 'text',
+        '.txt': 'text',
+        '.md': 'text',
+        '.csv': 'text',
+        '.xlsx': 'text',
+        '.xls': 'text'
+    }
+    geojson_types = (
+        'FeatureCollection', 'Feature', 'Point', 'LineString', 'Polygon',
+        'MultiPoint', 'MultiLineString', 'MultiPolygon', 'GeometryCollection',
+    )
+    detected_type = file_type_mappings.get(ext, 'unknown')
+    # For JSON files, check if it's actually GeoJSON
+    if detected_type == 'json' and file_content:
+        try:
+            data = json.loads(file_content.decode('utf-8'))
+        except Exception as exc:
+            # Best effort: keep 'json' when the content cannot be decoded or
+            # parsed, without swallowing KeyboardInterrupt/SystemExit.
+            log.debug(f"Could not parse {filename} as JSON: {exc}")
+        else:
+            if isinstance(data, dict) and data.get('type') in geojson_types:
+                detected_type = 'geojson'
+    log.info(f"Detected file type: {detected_type} for file: {filename}")
+    return detected_type
+# Helper function to convert retrieval context to expected format
+def convert_context_to_list(context: str) -> List[Dict[str, Any]]:
+    """Convert retrieval context into the list form passed to the generator.
+    A string starting with ``[`` is parsed as a Python literal and the parsed
+    value is returned. Anything else, including a string that fails to parse,
+    is wrapped in a single-item list with placeholder ``answer_metadata``.
+    Args:
+        context: Context returned by the retriever.
+    Returns:
+        The parsed list, or a one-element list wrapping ``context``.
+    """
+    # Only strings can look like a list literal; other inputs are wrapped as-is.
+    if isinstance(context, str) and context.startswith('['):
+        try:
+            # literal_eval only evaluates Python literals, never arbitrary code.
+            return ast.literal_eval(context)
+        except Exception:
+            # Not a valid literal: fall through and wrap the raw text.
+            pass
+    return [{
+        "answer": context,
+        "answer_metadata": {
+            "filename": "Retrieved Context",
+            "page": "Unknown",
+            "year": "Unknown",
+            "source": "Retriever"
+        }
+    }]