Spaces:
Sleeping
Sleeping
Commit
·
38c5fec
1
Parent(s):
1165e73
Restructure project to mirror organized directory layout
Browse files- Organize src/ into modular directories: core/, data/, search/, ui/
- Move core logic files to src/core/
- Move data management to src/data/
- Move search functionality to src/search/
- Move UI components to src/ui/
- Update all import paths to reflect new structure
- Enhance .gitignore to exclude non-essential files
- Maintain Hugging Face Spaces compatibility
- .gitignore +49 -25
- app.py +1 -1
- src/{markdown_generator.py → core/markdown_generator.py} +0 -0
- src/{ml_guidance_generator.py → core/ml_guidance_generator.py} +1 -1
- src/{section_validator.py → core/section_validator.py} +1 -1
- src/{threat_intel_tool.py → core/threat_intel_tool.py} +3 -3
- src/{trace_exporter.py → core/trace_exporter.py} +0 -0
- src/{validation_criteria.py → core/validation_criteria.py} +0 -0
- src/data/ml_knowledge_base_builder.py +701 -0
- src/search/bm25_retriever.py +449 -0
- src/search/ml_agentic_retriever.py +975 -0
- src/{ml_workers_retriever.py → search/ml_workers_retriever.py} +0 -0
- src/{app.py → ui/app.py} +2 -2
- wrangler.toml +26 -0
.gitignore
CHANGED
@@ -1,40 +1,64 @@
|
|
1 |
-
#
|
2 |
-
|
3 |
-
*.
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
#
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
*.log
|
11 |
|
12 |
-
# Environment
|
13 |
.env
|
14 |
-
|
15 |
-
|
16 |
-
*.pyc
|
17 |
|
18 |
-
#
|
19 |
.vscode/
|
20 |
.idea/
|
21 |
*.swp
|
22 |
*.swo
|
23 |
|
24 |
-
# OS
|
25 |
.DS_Store
|
26 |
Thumbs.db
|
27 |
|
28 |
-
#
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
-
#
|
34 |
-
|
|
|
35 |
|
36 |
-
#
|
37 |
-
|
38 |
|
39 |
-
#
|
40 |
-
|
|
|
1 |
+
# Python
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
*.so
|
6 |
+
.Python
|
7 |
+
build/
|
8 |
+
develop-eggs/
|
9 |
+
dist/
|
10 |
+
downloads/
|
11 |
+
eggs/
|
12 |
+
.eggs/
|
13 |
+
lib/
|
14 |
+
lib64/
|
15 |
+
parts/
|
16 |
+
sdist/
|
17 |
+
var/
|
18 |
+
wheels/
|
19 |
+
*.egg-info/
|
20 |
+
.installed.cfg
|
21 |
+
*.egg
|
22 |
+
MANIFEST
|
23 |
|
24 |
+
# Virtual environments
|
25 |
+
venv/
|
26 |
+
env/
|
27 |
+
ENV/
|
|
|
28 |
|
29 |
+
# Environment variables
|
30 |
.env
|
31 |
+
.env.local
|
32 |
+
.env.production
|
|
|
33 |
|
34 |
+
# IDEs
|
35 |
.vscode/
|
36 |
.idea/
|
37 |
*.swp
|
38 |
*.swo
|
39 |
|
40 |
+
# OS
|
41 |
.DS_Store
|
42 |
Thumbs.db
|
43 |
|
44 |
+
# Application specific
|
45 |
+
/data/
|
46 |
+
docs/
|
47 |
+
tests/
|
48 |
+
scripts/
|
49 |
+
backup_*/
|
50 |
+
PROJECT_STRUCTURE.md
|
51 |
+
main.py
|
52 |
+
|
53 |
+
# Logs
|
54 |
+
*.log
|
55 |
|
56 |
+
# Temporary files
|
57 |
+
*.tmp
|
58 |
+
*.temp
|
59 |
|
60 |
+
# Node modules (if any)
|
61 |
+
node_modules/
|
62 |
|
63 |
+
# Cloudflare
|
64 |
+
.wrangler/
|
app.py
CHANGED
@@ -9,7 +9,7 @@ import os
|
|
9 |
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|
10 |
|
11 |
# Import and run the main application
|
12 |
-
from app import create_ui
|
13 |
|
14 |
if __name__ == "__main__":
|
15 |
create_ui()
|
|
|
9 |
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|
10 |
|
11 |
# Import and run the main application
|
12 |
+
from src.ui.app import create_ui
|
13 |
|
14 |
if __name__ == "__main__":
|
15 |
create_ui()
|
src/{markdown_generator.py → core/markdown_generator.py}
RENAMED
File without changes
|
src/{ml_guidance_generator.py → core/ml_guidance_generator.py}
RENAMED
@@ -22,7 +22,7 @@ from datetime import datetime
|
|
22 |
|
23 |
from pydantic import BaseModel, Field, validator
|
24 |
# Import Workers-based retriever (production system)
|
25 |
-
from ml_workers_retriever import MLWorkersRetriever, ThreatCharacteristics
|
26 |
from anthropic import Anthropic
|
27 |
import anthropic
|
28 |
|
|
|
22 |
|
23 |
from pydantic import BaseModel, Field, validator
|
24 |
# Import Workers-based retriever (production system)
|
25 |
+
from src.search.ml_workers_retriever import MLWorkersRetriever, ThreatCharacteristics
|
26 |
from anthropic import Anthropic
|
27 |
import anthropic
|
28 |
|
src/{section_validator.py → core/section_validator.py}
RENAMED
@@ -8,7 +8,7 @@ import anthropic
|
|
8 |
from typing import Optional, List
|
9 |
from datetime import datetime
|
10 |
import re
|
11 |
-
from validation_criteria import SECTION_CRITERIA, VALIDATION_PROMPTS
|
12 |
|
13 |
|
14 |
class SectionValidator:
|
|
|
8 |
from typing import Optional, List
|
9 |
from datetime import datetime
|
10 |
import re
|
11 |
+
from src.core.validation_criteria import SECTION_CRITERIA, VALIDATION_PROMPTS
|
12 |
|
13 |
|
14 |
class SectionValidator:
|
src/{threat_intel_tool.py → core/threat_intel_tool.py}
RENAMED
@@ -11,9 +11,9 @@ import re
|
|
11 |
from pydantic import ValidationError
|
12 |
import time
|
13 |
import random
|
14 |
-
from section_validator import SectionValidator, SectionImprover
|
15 |
-
from ml_guidance_generator import MLGuidanceGenerator, ThreatCharacteristics
|
16 |
-
from trace_exporter import get_trace_exporter
|
17 |
|
18 |
|
19 |
class ThreatIntelTool:
|
|
|
11 |
from pydantic import ValidationError
|
12 |
import time
|
13 |
import random
|
14 |
+
from src.core.section_validator import SectionValidator, SectionImprover
|
15 |
+
from src.core.ml_guidance_generator import MLGuidanceGenerator, ThreatCharacteristics
|
16 |
+
from src.core.trace_exporter import get_trace_exporter
|
17 |
|
18 |
|
19 |
class ThreatIntelTool:
|
src/{trace_exporter.py → core/trace_exporter.py}
RENAMED
File without changes
|
src/{validation_criteria.py → core/validation_criteria.py}
RENAMED
File without changes
|
src/data/ml_knowledge_base_builder.py
ADDED
@@ -0,0 +1,701 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
ML Knowledge Base Builder for SentrySearch
|
3 |
+
|
4 |
+
Builds a production-ready knowledge base from curated ML anomaly detection papers
|
5 |
+
and blog posts. Implements Agentic RAG approach with intelligent content processing.
|
6 |
+
|
7 |
+
Features:
|
8 |
+
- Real content ingestion from URLs
|
9 |
+
- LLM-powered content enrichment
|
10 |
+
- Persistent ChromaDB storage
|
11 |
+
- Question-like chunk processing
|
12 |
+
- Production-ready error handling
|
13 |
+
"""
|
14 |
+
|
15 |
+
import os
|
16 |
+
import json
|
17 |
+
import time
|
18 |
+
import random
|
19 |
+
import hashlib
|
20 |
+
from typing import List, Dict, Optional, Tuple
|
21 |
+
from dataclasses import dataclass, asdict
|
22 |
+
from urllib.parse import urlparse
|
23 |
+
import logging
|
24 |
+
from pathlib import Path
|
25 |
+
|
26 |
+
import requests
|
27 |
+
from bs4 import BeautifulSoup
|
28 |
+
import chromadb
|
29 |
+
from chromadb.config import Settings
|
30 |
+
from anthropic import Anthropic
|
31 |
+
import anthropic
|
32 |
+
import numpy as np
|
33 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
34 |
+
|
35 |
+
# Set up logging
|
36 |
+
logging.basicConfig(level=logging.INFO)
|
37 |
+
logger = logging.getLogger(__name__)
|
38 |
+
|
39 |
+
|
40 |
+
@dataclass
class MLPaperSource:
    """Represents a source ML paper or blog post to ingest into the knowledge base."""
    title: str                # human-readable title of the paper/post
    url: str                  # public URL the content is fetched from
    company: str              # organization that published the work
    year: str                 # publication year, kept as a string for metadata storage
    description: str          # one-line summary of the source
    ml_techniques: List[str]  # technique tags, e.g. ["isolation_forest", "abuse_detection"]
|
49 |
+
|
50 |
+
|
51 |
+
@dataclass
class EnrichedChunk:
    """Represents a processed and enriched document chunk ready for storage."""
    chunk_id: str              # unique id: "{company}_{year}_{index}_{hash prefix}"
    source_title: str          # title of the originating paper/post
    source_url: str            # URL the chunk was extracted from
    company: str               # publishing organization (copied from the source)
    year: str                  # publication year (copied from the source)
    original_content: str      # raw chunk text before LLM enrichment
    enriched_content: str      # LLM rewrite of the chunk in question-like format
    ml_techniques: List[str]   # technique tags inherited from the source
    chunk_summary: str         # 2-line LLM-generated summary of the chunk
    keywords: List[str]        # LLM-generated technical keywords
    chunk_index: int           # position of this chunk within the source document
    content_hash: str          # md5 of original_content, used for deduplication
    bm25_terms: Optional[List[str]] = None     # Additional search terms for BM25
    faq_questions: Optional[List[str]] = None  # FAQ-style questions
|
68 |
+
|
69 |
+
|
70 |
+
class ContentExtractor:
    """Fetches web pages and reduces them to clean, whitespace-normalized text."""

    def __init__(self, timeout: int = 30):
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        })

    def extract_from_url(self, url: str) -> Optional[str]:
        """Fetch *url* and return its cleaned main-content text, or None on failure."""
        try:
            logger.info(f"Extracting content from: {url}")

            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Strip non-content scaffolding before pulling out text.
            for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                tag.decompose()

            cleaned = self._clean_text(self._extract_main_content(soup))

            logger.info(f"Extracted {len(cleaned)} characters from {url}")
            return cleaned

        except requests.RequestException as e:
            logger.error(f"Request failed for {url}: {e}")
            return None
        except Exception as e:
            logger.error(f"Content extraction failed for {url}: {e}")
            return None

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Return text of the first matching article container, else the body/page."""
        # Common article selectors, tried in order of specificity.
        selectors = (
            'article',
            '[role="main"]',
            '.post-content',
            '.article-content',
            '.entry-content',
            '.content',
            'main',
        )

        for selector in selectors:
            node = soup.select_one(selector)
            if node is not None:
                return node.get_text()

        # Fall back to the whole body (or the full document if no <body>).
        body = soup.find('body')
        return body.get_text() if body else soup.get_text()

    def _clean_text(self, text: str) -> str:
        """Collapse the extracted text into a single space-normalized line."""
        stripped = (line.strip() for line in text.split('\n'))
        joined = ' '.join(line for line in stripped if line)
        # Normalize any remaining runs of whitespace to single spaces.
        return ' '.join(joined.split())
|
145 |
+
|
146 |
+
|
147 |
+
class ContentEnricher:
    """Enriches content chunks using LLM-powered processing.

    Wraps an Anthropic client and turns raw text chunks into a
    question-style rewrite plus search metadata (summary, keywords,
    BM25 terms, FAQ questions). On any API failure a deterministic
    fallback built from the source metadata is returned instead.
    """

    def __init__(self, anthropic_client):
        # Anthropic client instance; only `.messages.create` is used.
        self.client = anthropic_client

    def _api_call_with_retry(self, **kwargs):
        """Make API call with intelligent retry logic using retry-after header.

        Retries up to 3 times on RateLimitError, waiting either the
        server-provided retry-after (plus jitter) or exponential backoff
        capped at 120 seconds. Non-rate-limit errors are re-raised immediately.
        """
        max_retries = 3
        base_delay = 5

        for attempt in range(max_retries):
            try:
                print(f"DEBUG: Content Enricher API call attempt {attempt + 1}/{max_retries}")
                return self.client.messages.create(**kwargs)

            except anthropic.RateLimitError as e:
                if attempt == max_retries - 1:
                    print(f"DEBUG: Content Enricher rate limit exceeded after {max_retries} attempts")
                    raise e

                # Check if the error response has retry-after information
                retry_after = None
                if hasattr(e, 'response') and e.response:
                    retry_after_header = e.response.headers.get('retry-after')
                    if retry_after_header:
                        try:
                            retry_after = float(retry_after_header)
                            print(f"DEBUG: Content Enricher API provided retry-after: {retry_after} seconds")
                        except (ValueError, TypeError):
                            pass

                # Use retry-after if available, otherwise exponential backoff
                if retry_after:
                    delay = retry_after + random.uniform(1, 3)
                else:
                    delay = base_delay * (2 ** attempt) + random.uniform(1, 5)
                    delay = min(delay, 120)

                print(f"DEBUG: Content Enricher rate limit hit. Waiting {delay:.1f} seconds before retry {attempt + 2}")
                time.sleep(delay)

            except Exception as e:
                print(f"DEBUG: Content Enricher non-rate-limit error: {e}")
                raise e

    def enrich_chunk(self, chunk: str, source: MLPaperSource) -> Dict[str, str]:
        """Enrich a chunk with summary, keywords, question-like format, and BM25-optimized metadata.

        Returns a dict with keys: question_format, summary, keywords,
        bm25_terms, faq_questions. Falls back to metadata-derived values
        if the API call or parsing fails.
        """

        prompt = f"""
Analyze this text chunk from a machine learning anomaly detection paper/blog and provide:

1. QUESTION_FORMAT: Rewrite the chunk content as if it's answering questions about the ML approach
2. SUMMARY: A 2-line summary of what this chunk covers
3. KEYWORDS: 5-8 relevant technical keywords (comma-separated)
4. BM25_TERMS: Additional search terms for BM25 retrieval (comma-separated, include variations, synonyms, acronyms)
5. FAQ_QUESTIONS: 2-3 potential questions this chunk could answer (pipe-separated)

Source Context:
- Company: {source.company}
- ML Techniques: {', '.join(source.ml_techniques)}
- Year: {source.year}

Text Chunk:
{chunk[:1500]}

Format your response as:
QUESTION_FORMAT: [rewritten content]
SUMMARY: [summary]
KEYWORDS: [keywords]
BM25_TERMS: [search terms with variations]
FAQ_QUESTIONS: [question1|question2|question3]
"""

        try:
            response = self._api_call_with_retry(
                model="claude-sonnet-4-20250514",
                max_tokens=800,
                messages=[{"role": "user", "content": prompt}]
            )

            # Safe access to response content
            if not response.content or len(response.content) == 0:
                raise ValueError("Empty response from content enrichment API")

            if not hasattr(response.content[0], 'text'):
                raise ValueError("Response content missing text attribute")

            content = response.content[0].text.strip()
            return self._parse_enrichment_response(content)

        except Exception as e:
            logger.error(f"Content enrichment failed: {e}")
            # Fallback enrichment built only from source metadata.
            # Guard against an empty ml_techniques list, which would
            # otherwise raise IndexError here.
            primary_technique = source.ml_techniques[0] if source.ml_techniques else 'anomaly detection'
            return {
                'question_format': chunk,
                'summary': f"Content about {primary_technique} implementation at {source.company}",
                'keywords': ', '.join(source.ml_techniques + [source.company.lower(), 'anomaly detection']),
                'bm25_terms': ', '.join(source.ml_techniques + [source.company.lower(), 'ml', 'detection', 'analysis']),
                'faq_questions': f"How does {source.company} implement {primary_technique}?|What is {primary_technique} used for?"
            }

    def _parse_enrichment_response(self, response: str) -> Dict[str, str]:
        """Parse LLM response into structured enrichment data.

        A line starting with a known label switches the current field;
        unlabeled non-empty lines are appended to the most recent field.
        Labels are stripped only when they appear as a prefix (the previous
        ``str.replace`` removed the label text anywhere in the line).
        """
        field_labels = {
            'QUESTION_FORMAT:': 'question_format',
            'SUMMARY:': 'summary',
            'KEYWORDS:': 'keywords',
            'BM25_TERMS:': 'bm25_terms',
            'FAQ_QUESTIONS:': 'faq_questions',
        }
        result = {name: '' for name in field_labels.values()}
        current_field = None

        for line in response.split('\n'):
            line = line.strip()
            for label, name in field_labels.items():
                if line.startswith(label):
                    current_field = name
                    result[name] = line[len(label):].strip()
                    break
            else:
                # Continuation line for the field currently being filled.
                if current_field and line:
                    result[current_field] += ' ' + line

        return result
|
283 |
+
|
284 |
+
|
285 |
+
class DocumentProcessor:
    """Turns raw document text into LLM-enriched, overlapping chunks."""

    def __init__(self, content_enricher: ContentEnricher, chunk_size: int = 800):
        self.enricher = content_enricher
        self.chunk_size = chunk_size

    def process_document(self, source: MLPaperSource, content: str) -> List[EnrichedChunk]:
        """Split *content* into chunks, enrich each via the LLM, and return them."""
        if not content or len(content) < 100:
            logger.warning(f"Content too short for {source.title}")
            return []

        raw_chunks = self._create_chunks(content, source)

        enriched: List[EnrichedChunk] = []
        for index, text in enumerate(raw_chunks):
            # Hash of the raw chunk text, used for deduplication and id suffix.
            digest = hashlib.md5(text.encode()).hexdigest()

            # LLM-powered enrichment (question rewrite, summary, search terms).
            meta = self.enricher.enrich_chunk(text, source)

            item = EnrichedChunk(
                chunk_id=f"{source.company}_{source.year}_{index}_{digest[:8]}",
                source_title=source.title,
                source_url=source.url,
                company=source.company,
                year=source.year,
                original_content=text,
                enriched_content=meta['question_format'],
                ml_techniques=source.ml_techniques,
                chunk_summary=meta['summary'],
                keywords=meta['keywords'].split(', ') if meta['keywords'] else [],
                chunk_index=index,
                content_hash=digest
            )

            # BM25-specific metadata is attached after construction.
            item.bm25_terms = meta.get('bm25_terms', '').split(', ') if meta.get('bm25_terms') else []
            item.faq_questions = meta.get('faq_questions', '').split('|') if meta.get('faq_questions') else []

            enriched.append(item)

            # Throttle successive LLM calls.
            time.sleep(0.5)

        logger.info(f"Processed {len(enriched)} chunks for {source.title}")
        return enriched

    def _create_chunks(self, content: str, source: MLPaperSource) -> List[str]:
        """Create overlapping chunks from *content* (25% overlap between windows)."""
        step = self.chunk_size - self.chunk_size // 4
        pieces: List[str] = []
        start = 0
        total = len(content)

        while start < total:
            piece = content[start:start + self.chunk_size]

            # Very short tail fragments are dropped.
            if len(piece) >= 200:
                # For non-final windows, prefer ending at a sentence boundary
                # when a period falls in the last 30% of the window.
                if start + self.chunk_size < total:
                    cut = piece.rfind('.')
                    if cut > len(piece) * 0.7:
                        piece = piece[:cut + 1]
                pieces.append(piece.strip())

            start += step

        return pieces
|
360 |
+
|
361 |
+
|
362 |
+
class KnowledgeBaseStorage:
    """Manages persistent storage of the knowledge base.

    Documents live in a single ChromaDB collection under *storage_path*;
    every added chunk is also appended to a JSONL backup file so the
    knowledge base can be rebuilt without re-running enrichment.
    """

    def __init__(self, storage_path: str = "./ml_knowledge_base"):
        self.storage_path = Path(storage_path)
        self.storage_path.mkdir(exist_ok=True)

        # Initialize ChromaDB with persistent (on-disk) storage
        self.chroma_client = chromadb.PersistentClient(
            path=str(self.storage_path / "chroma_db")
        )

        self.collection_name = "ml_anomaly_detection"
        self.collection = None
        self._initialize_collection()

    def _initialize_collection(self):
        """Load the existing collection, or create it on first run."""
        try:
            # Try to get existing collection
            self.collection = self.chroma_client.get_collection(self.collection_name)
            logger.info(f"Loaded existing collection with {self.collection.count()} documents")
        except Exception:
            # get_collection raises when the collection does not exist yet.
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate instead of being swallowed here.
            self.collection = self.chroma_client.create_collection(
                name=self.collection_name,
                metadata={"description": "ML Anomaly Detection Knowledge Base"}
            )
            logger.info("Created new collection")

    def add_chunks(self, chunks: List[EnrichedChunk]) -> bool:
        """Add enriched chunks to the knowledge base.

        Returns True on success (including an empty input), False if the
        ChromaDB write fails. List-valued metadata is flattened to
        '|'-joined strings because ChromaDB metadata values must be scalars.
        """
        try:
            if not chunks:
                return True

            # Prepare data for ChromaDB
            documents = []
            metadatas = []
            ids = []

            for chunk in chunks:
                # The embedded document text includes metadata so semantic
                # search can match on company/technique context, not just body.
                document_text = f"""
Title: {chunk.source_title}
Company: {chunk.company}
Year: {chunk.year}
ML Techniques: {', '.join(chunk.ml_techniques)}
Keywords: {', '.join(chunk.keywords)}
Summary: {chunk.chunk_summary}

Content: {chunk.enriched_content}
""".strip()

                documents.append(document_text)
                metadatas.append({
                    'source_title': chunk.source_title,
                    'source_url': chunk.source_url,
                    'company': chunk.company,
                    'year': chunk.year,
                    'ml_techniques': '|'.join(chunk.ml_techniques),
                    'keywords': '|'.join(chunk.keywords),
                    'chunk_summary': chunk.chunk_summary,
                    'chunk_index': chunk.chunk_index,
                    'content_hash': chunk.content_hash,
                    'bm25_terms': '|'.join(chunk.bm25_terms) if chunk.bm25_terms else '',
                    'faq_questions': '|'.join(chunk.faq_questions) if chunk.faq_questions else ''
                })
                ids.append(chunk.chunk_id)

            # Add to ChromaDB
            self.collection.add(
                documents=documents,
                metadatas=metadatas,
                ids=ids
            )

            # Save chunk details as JSON backup
            self._save_chunks_backup(chunks)

            logger.info(f"Added {len(chunks)} chunks to knowledge base")
            return True

        except Exception as e:
            logger.error(f"Failed to add chunks to knowledge base: {e}")
            return False

    def _save_chunks_backup(self, chunks: List[EnrichedChunk]):
        """Append chunk details to the JSONL backup file (one object per line)."""
        backup_file = self.storage_path / "chunks_backup.jsonl"

        with open(backup_file, 'a', encoding='utf-8') as f:
            for chunk in chunks:
                f.write(json.dumps(asdict(chunk), ensure_ascii=False) + '\n')

    def get_stats(self) -> Dict:
        """Return knowledge base statistics (counts and distinct metadata values).

        On failure returns ``{'error': <message>}`` instead of raising.
        """
        try:
            count = self.collection.count()

            # Aggregate unique companies, years, and techniques across all chunks.
            if count > 0:
                results = self.collection.get(include=['metadatas'])
                companies = set()
                years = set()
                ml_techniques = set()

                for metadata in results['metadatas']:
                    companies.add(metadata.get('company', ''))
                    years.add(metadata.get('year', ''))
                    techniques = metadata.get('ml_techniques', '').split('|')
                    ml_techniques.update([t for t in techniques if t])

                return {
                    'total_chunks': count,
                    'companies': sorted(list(companies)),
                    'years': sorted(list(years)),
                    'ml_techniques': sorted(list(ml_techniques)),
                    'storage_path': str(self.storage_path)
                }
            else:
                return {
                    'total_chunks': 0,
                    'companies': [],
                    'years': [],
                    'ml_techniques': [],
                    'storage_path': str(self.storage_path)
                }

        except Exception as e:
            logger.error(f"Failed to get stats: {e}")
            return {'error': str(e)}

    def search(self, query: str, n_results: int = 10) -> List[Dict]:
        """Semantic-search the knowledge base; returns [] on failure.

        Each result dict carries the document text, its metadata, the raw
        embedding distance, and a similarity score in (0, 1].
        """
        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=n_results,
                include=['documents', 'metadatas', 'distances']
            )

            search_results = []
            for i, doc in enumerate(results['documents'][0]):
                search_results.append({
                    'document': doc,
                    'metadata': results['metadatas'][0][i],
                    'distance': results['distances'][0][i],
                    'score': 1 / (1 + results['distances'][0][i])  # Convert distance to similarity
                })

            return search_results

        except Exception as e:
            logger.error(f"Search failed: {e}")
            return []
|
518 |
+
|
519 |
+
|
520 |
+
def get_curated_ml_sources() -> List[MLPaperSource]:
    """Get the curated list of ML anomaly detection sources.

    The catalog is kept as plain (title, url, company, year, description,
    techniques) tuples and converted into MLPaperSource records on return.
    """
    catalog = [
        ("Detecting Performance Anomalies in External Firmware Deployments",
         "https://netflixtechblog.com/detecting-performance-anomalies-in-external-firmware-deployments-ed41b1bfcf46",
         "Netflix", "2019",
         "Netflix's approach to detecting anomalies in firmware performance using ML",
         ["statistical_analysis", "anomaly_detection", "performance_monitoring"]),
        ("Detecting and Preventing Abuse on LinkedIn using Isolation Forests",
         "https://engineering.linkedin.com/blog/2019/isolation-forest",
         "LinkedIn", "2019",
         "LinkedIn's implementation of isolation forests for abuse detection",
         ["isolation_forest", "unsupervised_learning", "abuse_detection"]),
        ("How Does Spam Protection Work on Stack Exchange?",
         "https://stackoverflow.blog/2020/06/25/how-does-spam-protection-work-on-stack-exchange/",
         "Stack Exchange", "2020",
         "Stack Exchange's ML-based spam detection system",
         ["text_classification", "nlp", "spam_detection"]),
        ("Blocking Slack Invite Spam With Machine Learning",
         "https://slack.engineering/blocking-slack-invite-spam-with-machine-learning/",
         "Slack", "2020",
         "Slack's ML approach to preventing invite spam",
         ["classification", "feature_engineering", "spam_detection"]),
        ("Cloudflare Bot Management: Machine Learning and More",
         "https://blog.cloudflare.com/cloudflare-bot-management-machine-learning-and-more/",
         "Cloudflare", "2020",
         "Cloudflare's ML-powered bot detection and management",
         ["behavioral_analysis", "traffic_analysis", "bot_detection"]),
        ("Graph for Fraud Detection",
         "https://engineering.grab.com/graph-for-fraud-detection",
         "Grab", "2022",
         "Grab's graph-based approach to fraud detection",
         ["graph_ml", "fraud_detection", "network_analysis"]),
        ("Machine Learning for Fraud Detection in Streaming Services",
         "https://netflixtechblog.com/machine-learning-for-fraud-detection-in-streaming-services-b0b4ef3be3f6",
         "Netflix", "2023",
         "Netflix's ML approach to detecting fraud in streaming services",
         ["fraud_detection", "streaming_analytics", "behavioral_analysis"]),
        ("Data Generation and Sampling Strategies",
         "https://blog.cloudflare.com/data-generation-and-sampling-strategies/",
         "Cloudflare", "2023",
         "Cloudflare's data generation and sampling strategies for ML training",
         ["data_generation", "sampling", "training_data"]),
        ("Machine Learning Mobile Traffic Bots",
         "https://blog.cloudflare.com/machine-learning-mobile-traffic-bots/",
         "Cloudflare", "2023",
         "Cloudflare's ML approach to detecting mobile traffic bots",
         ["bot_detection", "mobile_traffic", "behavioral_analysis"]),
        ("Project Radar: Intelligent Early Fraud Detection",
         "https://www.uber.com/blog/project-radar-intelligent-early-fraud-detection/",
         "Uber", "2023",
         "Uber's Project Radar for intelligent early fraud detection",
         ["fraud_detection", "early_detection", "real_time_ml"]),
    ]

    return [
        MLPaperSource(title=title, url=url, company=company, year=year,
                      description=description, ml_techniques=techniques)
        for title, url, company, year, description, techniques in catalog
    ]
|
607 |
+
|
608 |
+
|
609 |
+
def main():
    """Main function to build the ML knowledge base.

    Pipeline: for each curated source URL, extract the page content,
    enrich and chunk it, and store the chunks in the knowledge base;
    then print summary stats and run a few smoke-test searches.
    Requires the ANTHROPIC_API_KEY environment variable.
    """

    # Initialize components
    api_key = os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        logger.error("ANTHROPIC_API_KEY environment variable not set")
        return

    print("π¨ Building ML Anomaly Detection Knowledge Base")
    print("=" * 50)

    # Initialize components
    anthropic_client = Anthropic(api_key=api_key)
    content_extractor = ContentExtractor()
    content_enricher = ContentEnricher(anthropic_client)
    document_processor = DocumentProcessor(content_enricher)
    knowledge_base = KnowledgeBaseStorage()

    # Get current stats (chunks already present from previous runs)
    current_stats = knowledge_base.get_stats()
    print(f"π Current knowledge base: {current_stats['total_chunks']} chunks")

    # Get sources to process
    sources = get_curated_ml_sources()
    print(f"π Processing {len(sources)} ML sources...")

    # Process each source. The build is best-effort: per-source failures
    # are reported and skipped rather than aborting the whole run.
    total_chunks_added = 0
    successful_sources = 0

    for i, source in enumerate(sources, 1):
        print(f"\nπ [{i}/{len(sources)}] Processing: {source.title}")
        print(f"   Company: {source.company} | Year: {source.year}")

        # Extract content (network fetch; may fail or return empty)
        content = content_extractor.extract_from_url(source.url)

        if not content:
            print(f"   β Failed to extract content")
            continue

        print(f"   π Extracted {len(content):,} characters")

        # Process into chunks (includes LLM-based enrichment)
        chunks = document_processor.process_document(source, content)

        if not chunks:
            print(f"   β No chunks generated")
            continue

        print(f"   π§© Generated {len(chunks)} chunks")

        # Add to knowledge base; counts only count fully-stored sources
        if knowledge_base.add_chunks(chunks):
            total_chunks_added += len(chunks)
            successful_sources += 1
            print(f"   β Added to knowledge base")
        else:
            print(f"   β Failed to add to knowledge base")

    # Final stats
    print(f"\nπ Knowledge Base Build Complete!")
    print("=" * 50)
    print(f"Sources processed: {successful_sources}/{len(sources)}")
    print(f"Total chunks added: {total_chunks_added}")

    final_stats = knowledge_base.get_stats()
    print(f"Final knowledge base size: {final_stats['total_chunks']} chunks")
    print(f"Companies: {', '.join(final_stats['companies'])}")
    print(f"Years: {', '.join(final_stats['years'])}")
    print(f"Storage location: {final_stats['storage_path']}")

    # Test search — quick end-to-end sanity check of the stored index
    print(f"\nπ Testing search functionality...")
    test_queries = [
        "How does Netflix detect performance anomalies?",
        "What ML techniques work for fraud detection?",
        "Isolation forest implementation details"
    ]

    for query in test_queries:
        results = knowledge_base.search(query, n_results=3)
        print(f"\nQuery: '{query}'")
        print(f"Results: {len(results)} found")
        if results:
            top_result = results[0]
            print(f"Top match: {top_result['metadata']['company']} - {top_result['metadata']['source_title'][:60]}...")
            print(f"Score: {top_result['score']:.3f}")


if __name__ == "__main__":
    main()
|
src/search/bm25_retriever.py
ADDED
@@ -0,0 +1,449 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
BM25 Retriever for SentrySearch
|
3 |
+
|
4 |
+
Implements BM25-based retrieval with enriched metadata support for the agentic RAG system.
|
5 |
+
Provides complementary keyword-based retrieval alongside vector search for enhanced precision.
|
6 |
+
|
7 |
+
Features:
|
8 |
+
- BM25 algorithm for exact keyword matching
|
9 |
+
- Enriched metadata indexing (summaries, FAQs, keywords)
|
10 |
+
- Integration with existing knowledge base
|
11 |
+
- Result scoring and ranking
|
12 |
+
- Efficient document preprocessing
|
13 |
+
"""
|
14 |
+
|
15 |
+
import os
|
16 |
+
import json
|
17 |
+
import logging
|
18 |
+
import time
|
19 |
+
from typing import List, Dict, Optional, Tuple, Set
|
20 |
+
from dataclasses import dataclass
|
21 |
+
from pathlib import Path
|
22 |
+
import re
|
23 |
+
import pickle
|
24 |
+
|
25 |
+
from rank_bm25 import BM25Okapi
|
26 |
+
import numpy as np
|
27 |
+
from src.data.ml_knowledge_base_builder import KnowledgeBaseStorage
|
28 |
+
|
29 |
+
logger = logging.getLogger(__name__)
|
30 |
+
|
31 |
+
|
32 |
+
@dataclass
class BM25Document:
    """Represents a document optimized for BM25 retrieval.

    Holds both the raw chunk text and a metadata-enriched variant used for
    keyword indexing, plus the pre-tokenized form so the index can be
    cached and reloaded without re-running preprocessing.
    """
    doc_id: str  # ChromaDB document id (unique key into the knowledge base)
    content: str  # original chunk text as stored in the knowledge base
    enriched_content: str  # content augmented with weighted metadata terms
    metadata: Dict  # raw ChromaDB metadata for the chunk
    keywords: List[str]  # keywords parsed from the '|'-delimited metadata field
    summary: str  # short chunk summary from metadata (may be empty)
    preprocessed_tokens: List[str]  # lowercased, stopword-filtered tokens for BM25
|
42 |
+
|
43 |
+
|
44 |
+
@dataclass
class BM25SearchResult:
    """Represents a BM25 search result.

    Carries both the raw BM25 score (unbounded, corpus-dependent) and a
    normalized relevance score so callers can combine it with other
    retrieval signals.
    """
    doc_id: str  # id of the matched document
    content: str  # original chunk text of the match
    metadata: Dict  # knowledge-base metadata for the matched chunk
    bm25_score: float  # raw BM25Okapi score for the query
    matched_terms: List[str]  # query tokens that appear in the document
    relevance_score: float  # Normalized score (heuristic, clamped to 0-1)
|
53 |
+
|
54 |
+
|
55 |
+
class BM25Preprocessor:
    """Preprocesses documents for BM25 indexing.

    Responsibilities:
    - Tokenize and filter text (stopwords, noise tokens) for BM25 scoring.
    - Build an "enriched" text representation that repeats high-signal
      metadata terms so they carry more weight in BM25 term frequencies.
    """

    def __init__(self):
        # Common stopwords for technical content. Kept deliberately small:
        # aggressive stopword removal hurts recall on technical phrases.
        self.stopwords = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
            'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
            'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
            'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'
        }

    def preprocess_text(self, text: str) -> List[str]:
        """Tokenize and filter text for BM25 indexing.

        Tokens are lowercased and stopword-filtered. Very short tokens
        (< 2 chars) are kept only when they were fully uppercase in the
        original text (single-letter acronyms such as "R"); very long
        tokens (> 50 chars) are dropped as noise.

        Args:
            text: Raw text to tokenize; empty/None yields [].

        Returns:
            List of lowercased tokens in original order.
        """
        if not text:
            return []

        # Tokenize BEFORE lowercasing so the uppercase-acronym check below
        # can see the original casing. (Bug fix: previously the whole text
        # was lowercased first, so token.isupper() was always False and
        # short uppercase acronyms were unconditionally dropped, defeating
        # the stated intent of the filter.)
        raw_tokens = re.findall(r'\b[a-zA-Z0-9_-]+\b', text)

        filtered_tokens = []
        for raw_token in raw_tokens:
            token = raw_token.lower()

            # Skip stopwords
            if token in self.stopwords:
                continue

            # Skip very short tokens unless they were uppercase acronyms
            if len(token) < 2 and not raw_token.isupper():
                continue

            # Skip very long tokens (likely noise)
            if len(token) > 50:
                continue

            filtered_tokens.append(token)

        return filtered_tokens

    def create_enriched_content(self, chunk_data: Dict) -> str:
        """Build a search-optimized text blob from a chunk and its metadata.

        High-signal fields are repeated to boost their BM25 term frequency:
        company name x3, each ML technique x2; keywords, summary and source
        title are appended once.

        Args:
            chunk_data: Dict with optional 'enriched_content' (str) and
                'metadata' (dict) keys.

        Returns:
            Single space-joined string combining content and weighted
            metadata terms.
        """
        content_parts = []

        # Original content
        if chunk_data.get('enriched_content'):
            content_parts.append(chunk_data['enriched_content'])

        metadata = chunk_data.get('metadata', {})

        # Company name (high weight — tripled)
        if company := metadata.get('company'):
            content_parts.append(f"{company} {company} {company}")

        # ML techniques (high weight — doubled). Accept both the
        # '|'-delimited string form stored in ChromaDB and a plain list.
        # (Bug fix: list-valued metadata was previously silently discarded.)
        if ml_techniques := metadata.get('ml_techniques'):
            if isinstance(ml_techniques, str):
                techniques = ml_techniques.split('|')
            else:
                techniques = list(ml_techniques)
            for technique in techniques:
                if technique.strip():
                    content_parts.append(f"{technique} {technique}")

        # Keywords (medium weight) — same dual-format handling
        if keywords := metadata.get('keywords'):
            if isinstance(keywords, str):
                keyword_list = keywords.split('|')
            else:
                keyword_list = list(keywords)
            content_parts.extend(keyword_list)

        # Summary (medium weight)
        if summary := metadata.get('chunk_summary'):
            content_parts.append(summary)

        # Source title (medium weight)
        if title := metadata.get('source_title'):
            content_parts.append(title)

        return ' '.join(content_parts)
|
136 |
+
|
137 |
+
|
138 |
+
class BM25Retriever:
    """BM25-based retriever with enriched metadata support.

    Wraps a rank_bm25 BM25Okapi index built from the ChromaDB-backed
    knowledge base. The index and its documents are cached on disk
    (pickle + JSON) so startup does not require a full rebuild.

    NOTE(review): the cache is never invalidated automatically when the
    underlying knowledge base changes — call rebuild_index() after
    modifying the knowledge base.
    """

    def __init__(self, knowledge_base: KnowledgeBaseStorage,
                 storage_path: str = "./ml_knowledge_base"):
        self.knowledge_base = knowledge_base
        self.storage_path = Path(storage_path)
        self.preprocessor = BM25Preprocessor()

        # BM25 components
        self.bm25_index = None  # rank_bm25.BM25Okapi instance (None until built)
        self.documents = []  # list of BM25Document, index-aligned with the BM25 corpus
        self.doc_lookup = {}  # doc_id -> document mapping

        # Storage files
        self.bm25_cache_file = self.storage_path / "bm25_index.pkl"
        self.docs_cache_file = self.storage_path / "bm25_documents.json"

        # Initialize (load cache or build from the knowledge base)
        self._initialize_bm25_index()

    def _initialize_bm25_index(self):
        """Initialize or load existing BM25 index"""
        try:
            # Try to load cached index
            if self._load_cached_index():
                logger.info(f"Loaded cached BM25 index with {len(self.documents)} documents")
                return

            # Build new index
            logger.info("Building new BM25 index...")
            self._build_bm25_index()

        except Exception as e:
            # Degrade gracefully: search() returns [] when the index is
            # unavailable, so initialization errors are logged, not raised.
            logger.error(f"Failed to initialize BM25 index: {e}")
            self.bm25_index = None
            self.documents = []

    def _load_cached_index(self) -> bool:
        """Load cached BM25 index if available.

        Returns:
            True when both cache files exist and deserialize cleanly;
            False otherwise (caller falls back to a full rebuild).
        """
        if not (self.bm25_cache_file.exists() and self.docs_cache_file.exists()):
            return False

        try:
            # Load BM25 index.
            # SECURITY NOTE: pickle.load executes arbitrary code embedded in
            # the file — the cache path must remain local and trusted.
            with open(self.bm25_cache_file, 'rb') as f:
                self.bm25_index = pickle.load(f)

            # Load documents
            with open(self.docs_cache_file, 'r', encoding='utf-8') as f:
                docs_data = json.load(f)

            # Reconstruct documents and the id lookup together
            self.documents = []
            self.doc_lookup = {}

            for doc_data in docs_data:
                doc = BM25Document(**doc_data)
                self.documents.append(doc)
                self.doc_lookup[doc.doc_id] = doc

            return True

        except Exception as e:
            logger.warning(f"Failed to load cached BM25 index: {e}")
            return False

    def _build_bm25_index(self):
        """Build BM25 index from knowledge base.

        Pulls every document + metadata pair from the ChromaDB collection,
        enriches and tokenizes each one, then builds a BM25Okapi index and
        caches it to disk. Per-document failures are skipped.
        """
        try:
            # Get all documents from ChromaDB
            results = self.knowledge_base.collection.get(
                include=['documents', 'metadatas']
            )

            if not results['ids']:
                logger.warning("No documents found in knowledge base")
                return

            logger.info(f"Processing {len(results['ids'])} documents for BM25 indexing...")

            # Process each document; the two lists below must stay
            # index-aligned (BM25 scores are returned by position).
            bm25_documents = []
            tokenized_docs = []

            for i, doc_id in enumerate(results['ids']):
                try:
                    # Create enriched content for BM25
                    chunk_data = {
                        'enriched_content': results['documents'][i],
                        'metadata': results['metadatas'][i]
                    }

                    enriched_content = self.preprocessor.create_enriched_content(chunk_data)

                    # Preprocess for BM25
                    tokens = self.preprocessor.preprocess_text(enriched_content)

                    if not tokens:  # Skip empty documents
                        continue

                    # Create BM25 document
                    bm25_doc = BM25Document(
                        doc_id=doc_id,
                        content=results['documents'][i],
                        enriched_content=enriched_content,
                        metadata=results['metadatas'][i],
                        keywords=results['metadatas'][i].get('keywords', '').split('|'),
                        summary=results['metadatas'][i].get('chunk_summary', ''),
                        preprocessed_tokens=tokens
                    )

                    bm25_documents.append(bm25_doc)
                    tokenized_docs.append(tokens)

                except Exception as e:
                    logger.warning(f"Failed to process document {doc_id}: {e}")
                    continue

            if not bm25_documents:
                logger.error("No valid documents processed for BM25")
                return

            # Build BM25 index
            logger.info(f"Building BM25 index with {len(bm25_documents)} documents...")
            self.bm25_index = BM25Okapi(tokenized_docs)
            self.documents = bm25_documents

            # Create lookup dictionary
            self.doc_lookup = {doc.doc_id: doc for doc in self.documents}

            # Cache the index for fast startup next time
            self._cache_bm25_index()

            logger.info(f"BM25 index built successfully with {len(self.documents)} documents")

        except Exception as e:
            logger.error(f"Failed to build BM25 index: {e}")
            self.bm25_index = None
            self.documents = []

    def _cache_bm25_index(self):
        """Cache BM25 index to disk.

        Writes the BM25Okapi object via pickle and the documents as JSON.
        Failures are logged but non-fatal (the in-memory index still works).
        """
        try:
            # Cache BM25 index
            with open(self.bm25_cache_file, 'wb') as f:
                pickle.dump(self.bm25_index, f)

            # Cache documents (convert to JSON-serializable format)
            docs_data = []
            for doc in self.documents:
                doc_dict = {
                    'doc_id': doc.doc_id,
                    'content': doc.content,
                    'enriched_content': doc.enriched_content,
                    'metadata': doc.metadata,
                    'keywords': doc.keywords,
                    'summary': doc.summary,
                    'preprocessed_tokens': doc.preprocessed_tokens
                }
                docs_data.append(doc_dict)

            with open(self.docs_cache_file, 'w', encoding='utf-8') as f:
                json.dump(docs_data, f, ensure_ascii=False, indent=2)

            logger.info("BM25 index cached successfully")

        except Exception as e:
            logger.warning(f"Failed to cache BM25 index: {e}")

    def search(self, query: str, n_results: int = 10,
               min_score: float = 0.0) -> List[BM25SearchResult]:
        """Search using BM25 algorithm.

        Args:
            query: Free-text query; tokenized with the same preprocessor
                used at index time.
            n_results: Maximum number of results to return.
            min_score: Exclusive lower bound on the raw BM25 score.
                NOTE: the comparison is `score <= min_score`, so with the
                default 0.0 documents scoring exactly zero are excluded.

        Returns:
            Top-N results sorted by descending BM25 score; [] on any
            failure or when the index is unavailable.
        """
        if not self.bm25_index or not self.documents:
            logger.warning("BM25 index not available")
            return []

        try:
            # Preprocess query
            query_tokens = self.preprocessor.preprocess_text(query)

            if not query_tokens:
                logger.warning("No valid tokens in query")
                return []

            # Get BM25 scores (one per corpus document, position-aligned)
            scores = self.bm25_index.get_scores(query_tokens)

            # Create results with scores
            results = []
            for i, score in enumerate(scores):
                if score <= min_score:
                    continue

                doc = self.documents[i]

                # Find matched terms
                matched_terms = self._find_matched_terms(query_tokens, doc.preprocessed_tokens)

                # Calculate relevance score (normalized). The /10 divisor is
                # a heuristic; raw BM25 scores are corpus-dependent.
                relevance_score = min(score / 10.0, 1.0)  # Normalize to 0-1 range

                result = BM25SearchResult(
                    doc_id=doc.doc_id,
                    content=doc.content,
                    metadata=doc.metadata,
                    bm25_score=score,
                    matched_terms=matched_terms,
                    relevance_score=relevance_score
                )

                results.append(result)

            # Sort by BM25 score (descending)
            results.sort(key=lambda x: x.bm25_score, reverse=True)

            # Return top N results
            return results[:n_results]

        except Exception as e:
            logger.error(f"BM25 search failed: {e}")
            return []

    def _find_matched_terms(self, query_tokens: List[str], doc_tokens: List[str]) -> List[str]:
        """Find which query terms matched in the document.

        Preserves query-token order; duplicates in the query are kept.
        """
        doc_token_set = set(doc_tokens)
        matched = [token for token in query_tokens if token in doc_token_set]
        return matched

    def get_stats(self) -> Dict:
        """Get BM25 retriever statistics (document count, index/cache state)."""
        return {
            'total_documents': len(self.documents),
            'index_available': self.bm25_index is not None,
            'cache_files_exist': {
                'index': self.bm25_cache_file.exists(),
                'documents': self.docs_cache_file.exists()
            },
            'storage_path': str(self.storage_path)
        }

    def rebuild_index(self):
        """Force rebuild of BM25 index.

        Clears in-memory state, deletes cache files, then rebuilds from
        the current contents of the knowledge base.
        """
        logger.info("Rebuilding BM25 index...")

        # Clear existing index
        self.bm25_index = None
        self.documents = []
        self.doc_lookup = {}

        # Remove cache files so a stale cache cannot be reloaded later
        try:
            if self.bm25_cache_file.exists():
                self.bm25_cache_file.unlink()
            if self.docs_cache_file.exists():
                self.docs_cache_file.unlink()
        except Exception as e:
            logger.warning(f"Failed to remove cache files: {e}")

        # Rebuild
        self._build_bm25_index()
|
399 |
+
|
400 |
+
|
401 |
+
def main():
    """Test the BM25 retriever.

    Smoke test: builds/loads the BM25 index from the local knowledge base,
    prints index stats, then runs a handful of sample queries and prints
    the top matches. Intended to be run manually as a script.
    """

    print("π Testing BM25 Retriever")
    print("=" * 40)

    # Initialize knowledge base and BM25 retriever
    knowledge_base = KnowledgeBaseStorage()
    bm25_retriever = BM25Retriever(knowledge_base)

    # Get stats
    stats = bm25_retriever.get_stats()
    print(f"π BM25 Index Stats:")
    print(f"   Documents: {stats['total_documents']}")
    print(f"   Index Available: {stats['index_available']}")

    if not stats['index_available']:
        print("β BM25 index not available")
        return

    # Test queries — chosen to cover companies/techniques in the corpus
    test_queries = [
        "Netflix anomaly detection",
        "isolation forest LinkedIn",
        "fraud detection machine learning",
        "bot detection Cloudflare",
        "spam classification",
        "graph neural networks"
    ]

    print(f"\nπ Testing BM25 Search:")
    for query in test_queries:
        print(f"\nQuery: '{query}'")

        results = bm25_retriever.search(query, n_results=3)
        print(f"Results: {len(results)} found")

        for i, result in enumerate(results, 1):
            company = result.metadata.get('company', 'Unknown')
            title = result.metadata.get('source_title', 'No title')[:50]
            print(f"  {i}. {company} - {title}...")
            print(f"     BM25 Score: {result.bm25_score:.3f}")
            print(f"     Matched Terms: {', '.join(result.matched_terms[:5])}")

    print(f"\nβ BM25 retriever test complete!")


if __name__ == "__main__":
    main()
|
src/search/ml_agentic_retriever.py
ADDED
@@ -0,0 +1,975 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
ML Agentic Retriever for SentrySearch
|
3 |
+
|
4 |
+
Implements Agentic RAG approach with intelligent query optimization.
|
5 |
+
Provides intelligent ML-focused retrieval for threat intelligence with:
|
6 |
+
- Query optimization for threat-to-ML translation
|
7 |
+
- Source identification for relevant paper filtering
|
8 |
+
- Enhanced hybrid retrieval with post-processing
|
9 |
+
- Context-aware result ranking and structuring
|
10 |
+
|
11 |
+
Usage:
|
12 |
+
retriever = MLAgenticRetriever(anthropic_client, knowledge_base)
|
13 |
+
ml_guidance = retriever.get_ml_guidance(threat_characteristics)
|
14 |
+
"""
|
15 |
+
|
16 |
+
import os
|
17 |
+
import json
|
18 |
+
import logging
|
19 |
+
import time
|
20 |
+
import random
|
21 |
+
import hashlib
|
22 |
+
from typing import List, Dict, Optional, Tuple, Set
|
23 |
+
from dataclasses import dataclass
|
24 |
+
import re
|
25 |
+
|
26 |
+
from anthropic import Anthropic
|
27 |
+
import anthropic
|
28 |
+
from src.data.ml_knowledge_base_builder import KnowledgeBaseStorage
|
29 |
+
from src.search.bm25_retriever import BM25Retriever, BM25SearchResult
|
30 |
+
|
31 |
+
logger = logging.getLogger(__name__)
|
32 |
+
|
33 |
+
|
34 |
+
@dataclass
class ThreatCharacteristics:
    """Represents threat characteristics for ML guidance generation.

    Plain input record: fields are free-form strings/lists supplied by the
    caller and interpolated directly into LLM prompts downstream.
    """
    threat_name: str  # human-readable threat name (e.g. "Emotet")
    threat_type: str  # e.g., "malware", "apt", "insider_threat"
    attack_vectors: List[str]  # e.g., ["network", "email", "web"]
    target_assets: List[str]  # e.g., ["user_accounts", "financial_data"]
    behavior_patterns: List[str]  # e.g., ["lateral_movement", "data_exfiltration"]
    time_characteristics: str  # e.g., "persistent", "burst", "periodic"
|
43 |
+
|
44 |
+
|
45 |
+
@dataclass
class OptimizedQuery:
    """Represents an optimized query for ML retrieval.

    Produced by QueryOptimizer: one original threat query expanded into
    several ML-focused retrieval queries plus the rationale.
    """
    original_query: str  # the threat name/query the optimization started from
    optimized_queries: List[str]  # ML-focused queries (always at least one)
    ml_focus_areas: List[str]  # e.g. ["anomaly_detection", "behavioral_analysis"]
    reasoning: str  # short LLM-provided rationale (may be empty on fallback)
|
52 |
+
|
53 |
+
|
54 |
+
@dataclass
class SourceSelection:
    """Represents filtered source selection.

    Output of source identification: which slices of the knowledge base
    (companies, years, techniques) are most relevant to a threat.
    """
    relevant_companies: List[str]  # companies whose papers to prioritize
    relevant_years: List[str]  # publication years to prioritize
    relevant_techniques: List[str]  # ML technique tags to prioritize
    reasoning: str  # short rationale for the selection
|
61 |
+
|
62 |
+
|
63 |
+
@dataclass
class MLRetrievalResult:
    """Represents a structured ML retrieval result.

    Unified result record for the hybrid retriever: carries the vector
    relevance score plus optional BM25/hybrid scores depending on which
    retrieval path produced it.
    """
    content: str  # chunk text returned to the caller
    metadata: Dict  # raw knowledge-base metadata for the chunk
    relevance_score: float  # vector-search relevance (normalized)
    source_paper: str  # title of the originating paper/blog post
    ml_techniques: List[str]  # techniques tagged on the source
    implementation_details: str  # extracted implementation notes
    applicability_score: float  # how applicable to the specific threat
    retrieval_method: str = 'unknown'  # 'vector', 'bm25', or 'hybrid'
    bm25_score: float = 0.0  # raw BM25 score (0.0 when keyword search unused)
    hybrid_score: float = 0.0  # combined score (0.0 when not hybrid)
    matched_terms: Optional[List[str]] = None  # BM25-matched query terms; None when keyword search unused
|
77 |
+
|
78 |
+
|
79 |
+
class QueryOptimizer:
|
80 |
+
"""Optimizes queries to focus on ML anomaly detection approaches"""
|
81 |
+
|
82 |
+
def __init__(self, anthropic_client):
|
83 |
+
self.client = anthropic_client
|
84 |
+
|
85 |
+
def _api_call_with_retry(self, **kwargs):
|
86 |
+
"""Make API call with intelligent retry logic using retry-after header"""
|
87 |
+
max_retries = 3
|
88 |
+
base_delay = 5
|
89 |
+
|
90 |
+
for attempt in range(max_retries):
|
91 |
+
try:
|
92 |
+
print(f"DEBUG: Query Optimizer API call attempt {attempt + 1}/{max_retries}")
|
93 |
+
return self.client.messages.create(**kwargs)
|
94 |
+
|
95 |
+
except anthropic.RateLimitError as e:
|
96 |
+
if attempt == max_retries - 1:
|
97 |
+
print(f"DEBUG: Query Optimizer rate limit exceeded after {max_retries} attempts")
|
98 |
+
raise e
|
99 |
+
|
100 |
+
# Check if the error response has retry-after information
|
101 |
+
retry_after = None
|
102 |
+
if hasattr(e, 'response') and e.response:
|
103 |
+
retry_after_header = e.response.headers.get('retry-after')
|
104 |
+
if retry_after_header:
|
105 |
+
try:
|
106 |
+
retry_after = float(retry_after_header)
|
107 |
+
print(f"DEBUG: Query Optimizer API provided retry-after: {retry_after} seconds")
|
108 |
+
except (ValueError, TypeError):
|
109 |
+
pass
|
110 |
+
|
111 |
+
# Use retry-after if available, otherwise exponential backoff
|
112 |
+
if retry_after:
|
113 |
+
delay = retry_after + random.uniform(1, 3)
|
114 |
+
else:
|
115 |
+
delay = base_delay * (2 ** attempt) + random.uniform(1, 5)
|
116 |
+
delay = min(delay, 120)
|
117 |
+
|
118 |
+
print(f"DEBUG: Query Optimizer rate limit hit. Waiting {delay:.1f} seconds before retry {attempt + 2}")
|
119 |
+
time.sleep(delay)
|
120 |
+
|
121 |
+
except Exception as e:
|
122 |
+
print(f"DEBUG: Query Optimizer non-rate-limit error: {e}")
|
123 |
+
raise e
|
124 |
+
|
125 |
+
def optimize_query(self, threat_characteristics: ThreatCharacteristics) -> OptimizedQuery:
|
126 |
+
"""Convert threat characteristics into ML-focused queries"""
|
127 |
+
|
128 |
+
prompt = f"""
|
129 |
+
You are an expert in both cybersecurity threats and machine learning anomaly detection.
|
130 |
+
Convert this threat information into 3-5 specific queries about ML approaches for detection.
|
131 |
+
|
132 |
+
Threat Information:
|
133 |
+
- Name: {threat_characteristics.threat_name}
|
134 |
+
- Type: {threat_characteristics.threat_type}
|
135 |
+
- Attack Vectors: {', '.join(threat_characteristics.attack_vectors)}
|
136 |
+
- Target Assets: {', '.join(threat_characteristics.target_assets)}
|
137 |
+
- Behavior Patterns: {', '.join(threat_characteristics.behavior_patterns)}
|
138 |
+
- Time Characteristics: {threat_characteristics.time_characteristics}
|
139 |
+
|
140 |
+
Generate queries that focus on:
|
141 |
+
1. Specific ML techniques for detecting this threat type
|
142 |
+
2. Feature engineering approaches for the attack vectors
|
143 |
+
3. Behavioral analysis methods for the patterns observed
|
144 |
+
4. Implementation considerations for the target environment
|
145 |
+
|
146 |
+
Format your response as:
|
147 |
+
QUERIES:
|
148 |
+
1. [Query 1]
|
149 |
+
2. [Query 2]
|
150 |
+
3. [Query 3]
|
151 |
+
etc.
|
152 |
+
|
153 |
+
ML_FOCUS_AREAS: [comma-separated focus areas]
|
154 |
+
|
155 |
+
REASONING: [1-2 sentences explaining the ML approach rationale]
|
156 |
+
"""
|
157 |
+
|
158 |
+
try:
|
159 |
+
response = self._api_call_with_retry(
|
160 |
+
model="claude-sonnet-4-20250514",
|
161 |
+
max_tokens=600,
|
162 |
+
messages=[{"role": "user", "content": prompt}]
|
163 |
+
)
|
164 |
+
|
165 |
+
# Safe access to response content
|
166 |
+
if not response.content or len(response.content) == 0:
|
167 |
+
raise ValueError("Empty response from API")
|
168 |
+
|
169 |
+
content = response.content[0].text.strip()
|
170 |
+
return self._parse_optimization_response(content, threat_characteristics)
|
171 |
+
|
172 |
+
except Exception as e:
|
173 |
+
logger.error(f"Query optimization failed: {e}")
|
174 |
+
# Fallback to simple query
|
175 |
+
fallback_query = f"Machine learning approaches for detecting {threat_characteristics.threat_name}"
|
176 |
+
return OptimizedQuery(
|
177 |
+
original_query=threat_characteristics.threat_name,
|
178 |
+
optimized_queries=[fallback_query],
|
179 |
+
ml_focus_areas=["anomaly_detection"],
|
180 |
+
reasoning="Fallback query due to optimization failure"
|
181 |
+
)
|
182 |
+
|
183 |
+
def _parse_optimization_response(self, response: str, threat_characteristics: ThreatCharacteristics) -> OptimizedQuery:
    """Parse the LLM response into a structured OptimizedQuery.

    Expects the QUERIES / ML_FOCUS_AREAS / REASONING sections produced by the
    optimization prompt. Malformed or missing sections degrade gracefully to
    sensible defaults instead of raising.

    Args:
        response: Raw text returned by the LLM.
        threat_characteristics: Threat profile; used for the original query
            and for the fallback query when no queries can be parsed.

    Returns:
        OptimizedQuery with at least one query and at least one focus area.
    """

    queries = []
    ml_focus_areas = []
    reasoning = ""

    lines = response.split('\n')
    current_section = None

    for line in lines:
        line = line.strip()

        if line.startswith('QUERIES:'):
            current_section = 'queries'
            continue
        elif line.startswith('ML_FOCUS_AREAS:'):
            current_section = 'focus'
            # Fix: drop empty entries produced by trailing commas or a blank
            # section (e.g. "ML_FOCUS_AREAS: "). Previously [''] was truthy
            # and defeated the ["anomaly_detection"] fallback below.
            ml_focus_areas = [
                area.strip()
                for area in line.replace('ML_FOCUS_AREAS:', '').split(',')
                if area.strip()
            ]
            continue
        elif line.startswith('REASONING:'):
            current_section = 'reasoning'
            reasoning = line.replace('REASONING:', '').strip()
            continue

        if current_section == 'queries' and line:
            # Extract query text from a numbered list entry ("1. ...").
            query_match = re.match(r'\d+\.\s*(.+)', line)
            if query_match:
                queries.append(query_match.group(1).strip())
        elif current_section == 'reasoning' and line:
            # REASONING may span multiple lines; keep accumulating.
            reasoning += ' ' + line

    # Ensure we always return at least one usable query.
    if not queries:
        queries = [f"Machine learning detection approaches for {threat_characteristics.threat_name}"]

    return OptimizedQuery(
        original_query=threat_characteristics.threat_name,
        optimized_queries=queries,
        ml_focus_areas=ml_focus_areas if ml_focus_areas else ["anomaly_detection"],
        reasoning=reasoning.strip()
    )
|
226 |
+
|
227 |
+
|
228 |
+
class SourceIdentifier:
    """Identifies most relevant papers/sources for a given threat"""

    def __init__(self, knowledge_base: KnowledgeBaseStorage):
        self.knowledge_base = knowledge_base
        self._load_source_mappings()

    def _load_source_mappings(self):
        """Load mappings between threat types and relevant sources"""

        # Company expertise mappings
        self.company_expertise = {
            'Netflix': ['performance_monitoring', 'infrastructure_anomalies', 'streaming_security'],
            'LinkedIn': ['user_behavior', 'abuse_detection', 'social_platform_security'],
            'Slack': ['communication_security', 'invite_spam', 'workspace_security'],
            'Cloudflare': ['network_security', 'bot_detection', 'traffic_analysis'],
            'Uber': ['fraud_detection', 'real_time_systems', 'human_in_the_loop'],
            'Grab': ['financial_fraud', 'graph_analysis', 'transaction_security'],
            'OLX Group': ['marketplace_fraud', 'deep_learning', 'user_verification'],
            'Stack Exchange': ['content_moderation', 'spam_detection', 'community_security'],
            'Mercari': ['e_commerce_security', 'content_moderation', 'automated_review']
        }

        # Attack vector to technique mappings
        self.attack_vector_techniques = {
            'network': ['traffic_analysis', 'graph_ml', 'behavioral_analysis'],
            'email': ['text_classification', 'nlp', 'spam_detection'],
            'web': ['bot_detection', 'traffic_analysis', 'behavioral_analysis'],
            'insider': ['user_behavior', 'behavioral_analysis', 'anomaly_detection'],
            'financial': ['fraud_detection', 'transaction_analysis', 'graph_ml']
        }

        # Threat type to ML approach mappings
        self.threat_ml_mappings = {
            'malware': ['behavioral_analysis', 'static_analysis', 'dynamic_analysis'],
            'apt': ['behavioral_analysis', 'network_analysis', 'long_term_patterns'],
            'fraud': ['fraud_detection', 'transaction_analysis', 'user_behavior'],
            'spam': ['text_classification', 'content_moderation', 'nlp'],
            'abuse': ['user_behavior', 'abuse_detection', 'behavioral_analysis']
        }

    def identify_relevant_sources(self, optimized_query: OptimizedQuery,
                                  threat_characteristics: ThreatCharacteristics) -> SourceSelection:
        """Identify most relevant papers/sources for the optimized queries"""

        companies = set()
        techniques = set()

        raw_threat_type = threat_characteristics.threat_type

        # Companies whose expertise areas mention an attack vector or the raw
        # threat type; also collect the techniques mapped to each vector.
        for vector in threat_characteristics.attack_vectors:
            companies.update(
                name
                for name, areas in self.company_expertise.items()
                if any(vector in area or raw_threat_type in area for area in areas)
            )
            techniques.update(self.attack_vector_techniques.get(vector, []))

        # Companies matched on the lower-cased threat type (substring either way).
        threat_type = raw_threat_type.lower()
        companies.update(
            name
            for name, areas in self.company_expertise.items()
            if any(threat_type in area or area in threat_type for area in areas)
        )
        techniques.update(self.threat_ml_mappings.get(threat_type, []))

        # Techniques suggested by the query optimizer are always considered.
        techniques.update(optimized_query.ml_focus_areas)

        # Always include available companies from knowledge base
        available_companies = set(self.knowledge_base.get_stats()['companies'])

        # For now, include all available companies to ensure we get results
        # In production, you can make this more selective
        companies.update(available_companies)

        # If still empty somehow, use all available
        if not companies:
            companies = available_companies

        # Generate reasoning
        reasoning = f"Selected companies based on expertise in {threat_characteristics.threat_type} and {', '.join(threat_characteristics.attack_vectors)}. Focus on {', '.join(list(techniques)[:3])} techniques."

        return SourceSelection(
            relevant_companies=list(companies),
            relevant_years=['2019', '2020', '2021', '2022'],  # All available years
            relevant_techniques=list(techniques),
            reasoning=reasoning
        )
|
321 |
+
|
322 |
+
|
323 |
+
class EnhancedRetriever:
    """Enhanced hybrid retriever with vector search + BM25 and post-processing.

    Runs every optimized query through both the vector knowledge base and a
    BM25 index, filters hits to the selected source companies, fuses the two
    result streams, deduplicates, and ranks by a combined hybrid score.
    """

    def __init__(self, knowledge_base: KnowledgeBaseStorage):
        self.knowledge_base = knowledge_base
        self.bm25_retriever = BM25Retriever(knowledge_base)

    def retrieve_with_context(self, optimized_query: OptimizedQuery,
                              source_selection: SourceSelection,
                              max_results: int = 10,
                              trace_exporter=None) -> List[MLRetrievalResult]:
        """Hybrid retrieval using both vector search and BM25, with context-aware processing.

        Args:
            optimized_query: LLM-optimized queries and focus areas.
            source_selection: Companies/techniques used to filter hits.
            max_results: Cap per individual search and on the final list.
            trace_exporter: Optional tracer; every use is None-guarded.

        Returns:
            Up to ``max_results`` results sorted by descending hybrid_score.
        """

        vector_results = []
        bm25_results = []

        # Log query optimization to trace
        if trace_exporter:
            trace_exporter.log_query_optimization(
                optimized_query.original_query,
                optimized_query.optimized_queries,
                optimized_query.reasoning,
                optimized_query.ml_focus_areas
            )

        # 1. Vector Search - Search with each optimized query
        if trace_exporter:
            trace_exporter.log_stage_start("vector_retrieval")

        for query in optimized_query.optimized_queries:
            results = self.knowledge_base.search(query, n_results=max_results)

            for result in results:
                # Filter by relevant sources
                metadata = result['metadata']
                company = metadata.get('company', '')

                if company in source_selection.relevant_companies:
                    ml_result = self._create_ml_result(result, optimized_query, source_selection)
                    ml_result.retrieval_method = 'vector'
                    vector_results.append(ml_result)

        if trace_exporter:
            trace_exporter.log_stage_end("vector_retrieval", result_count=len(vector_results))
            # Convert to format expected by trace exporter
            vector_trace_results = [self._convert_to_trace_format(r) for r in vector_results]
            trace_exporter.log_retrieval_results(vector_trace_results, "vector")

        # 2. BM25 Search - Search with each optimized query
        if trace_exporter:
            trace_exporter.log_stage_start("bm25_retrieval")

        for query in optimized_query.optimized_queries:
            bm25_search_results = self.bm25_retriever.search(query, n_results=max_results)

            for bm25_result in bm25_search_results:
                # Filter by relevant sources
                company = bm25_result.metadata.get('company', '')

                if company in source_selection.relevant_companies:
                    # Convert BM25 result to ML result format
                    ml_result = self._create_ml_result_from_bm25(bm25_result, optimized_query, source_selection)
                    ml_result.retrieval_method = 'bm25'
                    bm25_results.append(ml_result)

        if trace_exporter:
            trace_exporter.log_stage_end("bm25_retrieval", result_count=len(bm25_results))
            # Convert to format expected by trace exporter
            bm25_trace_results = [self._convert_to_trace_format(r) for r in bm25_results]
            trace_exporter.log_retrieval_results(bm25_trace_results, "bm25")

        # 3. Combine and deduplicate results
        if trace_exporter:
            trace_exporter.log_stage_start("hybrid_fusion")

        all_results = self._fuse_hybrid_results(vector_results, bm25_results)

        # 4. Post-process results
        processed_results = self._post_process_results(all_results)

        # 5. Rank by hybrid score combining relevance and applicability
        ranked_results = sorted(processed_results,
                                key=lambda x: x.hybrid_score,
                                reverse=True)

        final_results = ranked_results[:max_results]

        # Log hybrid fusion results
        if trace_exporter:
            trace_exporter.log_stage_end("hybrid_fusion", final_result_count=len(final_results))
            # Convert to format expected by trace exporter
            hybrid_trace_results = [self._convert_to_trace_format(r) for r in final_results]
            trace_exporter.log_retrieval_results(hybrid_trace_results, "hybrid")

            # Log hybrid scoring if available
            if final_results:
                avg_vector_score = sum(r.relevance_score for r in final_results) / len(final_results)
                avg_bm25_score = sum(getattr(r, 'bm25_score', 0) for r in final_results) / len(final_results)
                avg_hybrid_score = sum(r.hybrid_score for r in final_results) / len(final_results)
                avg_applicability_score = sum(r.applicability_score for r in final_results) / len(final_results)

                trace_exporter.log_hybrid_scoring(
                    avg_vector_score, avg_bm25_score, avg_hybrid_score, avg_applicability_score
                )

        return final_results

    def _create_ml_result(self, search_result: Dict,
                          optimized_query: OptimizedQuery,
                          source_selection: SourceSelection) -> MLRetrievalResult:
        """Convert a vector search result dict to a structured ML result."""

        metadata = search_result['metadata']

        # Calculate applicability score based on technique overlap
        ml_techniques_raw = metadata.get('ml_techniques', '')
        # Ensure we have a string before splitting (metadata values may not be str)
        ml_techniques_str = str(ml_techniques_raw) if ml_techniques_raw else ''
        paper_techniques = set(str(t).strip() for t in ml_techniques_str.split('|') if str(t).strip())
        relevant_techniques = set(source_selection.relevant_techniques)

        technique_overlap = len(paper_techniques.intersection(relevant_techniques))
        # max(..., 1) guards against division by zero when no techniques selected.
        applicability_score = min(technique_overlap / max(len(relevant_techniques), 1), 1.0)

        # Extract implementation details from content
        content = search_result['document']
        implementation_details = self._extract_implementation_details(content)

        result = MLRetrievalResult(
            content=content,
            metadata=metadata,
            relevance_score=search_result['score'],
            source_paper=metadata.get('source_title', ''),
            ml_techniques=list(paper_techniques),
            implementation_details=implementation_details,
            applicability_score=applicability_score,
            retrieval_method='vector'
        )

        # Hybrid score for a vector-only result: weighted relevance + applicability.
        result.hybrid_score = (result.relevance_score * 0.6 + result.applicability_score * 0.4)

        return result

    def _extract_implementation_details(self, content: str) -> str:
        """Extract up to three implementation-relevant sentences from content."""

        # Look for implementation-specific keywords
        impl_keywords = [
            'architecture', 'framework', 'algorithm', 'model',
            'feature', 'training', 'deployment', 'performance',
            'accuracy', 'precision', 'recall', 'latency'
        ]

        sentences = content.split('.')
        impl_sentences = []

        for sentence in sentences:
            if any(keyword in sentence.lower() for keyword in impl_keywords):
                impl_sentences.append(sentence.strip())

        return '. '.join(impl_sentences[:3])  # Top 3 relevant sentences

    def _post_process_results(self, results: List[MLRetrievalResult]) -> List[MLRetrievalResult]:
        """Post-process results for deduplication (keeps first occurrence)."""

        # Simple deduplication by content hash
        seen_hashes = set()
        deduplicated = []

        for result in results:
            content_hash = hash(result.content[:200])  # Hash first 200 chars

            if content_hash not in seen_hashes:
                seen_hashes.add(content_hash)
                deduplicated.append(result)

        return deduplicated

    def _convert_to_trace_format(self, ml_result: MLRetrievalResult) -> Dict:
        """Convert MLRetrievalResult to format expected by trace exporter."""
        return {
            "content": ml_result.content,
            "metadata": ml_result.metadata,
            "score": ml_result.relevance_score,
            "method": ml_result.retrieval_method,
            "matched_terms": getattr(ml_result, 'matched_terms', None),
            "source_company": ml_result.metadata.get('company'),
            "ml_techniques": ml_result.ml_techniques
        }

    def _create_ml_result_from_bm25(self, bm25_result: BM25SearchResult,
                                    optimized_query: OptimizedQuery,
                                    source_selection: SourceSelection) -> MLRetrievalResult:
        """Convert BM25 search result to ML result format."""

        metadata = bm25_result.metadata

        # Calculate applicability score based on technique overlap
        ml_techniques_raw = metadata.get('ml_techniques', '')
        ml_techniques_str = str(ml_techniques_raw) if ml_techniques_raw else ''
        paper_techniques = set(str(t).strip() for t in ml_techniques_str.split('|') if str(t).strip())
        relevant_techniques = set(source_selection.relevant_techniques)

        technique_overlap = len(paper_techniques.intersection(relevant_techniques))
        applicability_score = min(technique_overlap / max(len(relevant_techniques), 1), 1.0)

        # Extract implementation details from content
        implementation_details = self._extract_implementation_details(bm25_result.content)

        result = MLRetrievalResult(
            content=bm25_result.content,
            metadata=metadata,
            relevance_score=bm25_result.relevance_score,
            source_paper=metadata.get('source_title', ''),
            ml_techniques=list(paper_techniques),
            implementation_details=implementation_details,
            applicability_score=applicability_score,
            retrieval_method='bm25',
            bm25_score=bm25_result.bm25_score,
            matched_terms=bm25_result.matched_terms
        )

        # Hybrid score for BM25 results: weight exact-match strength plus a
        # small bonus per matched query term.
        bm25_weight = min(bm25_result.bm25_score / 5.0, 1.0)  # Normalize BM25 score
        term_match_bonus = len(bm25_result.matched_terms) * 0.1  # Bonus for matched terms
        result.hybrid_score = (bm25_weight * 0.5 + result.applicability_score * 0.4 + term_match_bonus)

        return result

    def _fuse_hybrid_results(self, vector_results: List[MLRetrievalResult],
                             bm25_results: List[MLRetrievalResult]) -> List[MLRetrievalResult]:
        """Fuse vector and BM25 results, merging hits found by both methods."""

        # Create dictionaries for fast lookup
        vector_lookup = {self._get_result_key(r): r for r in vector_results}
        bm25_lookup = {self._get_result_key(r): r for r in bm25_results}

        # Get all unique result keys
        all_keys = set(vector_lookup.keys()) | set(bm25_lookup.keys())

        fused_results = []

        for key in all_keys:
            vector_result = vector_lookup.get(key)
            bm25_result = bm25_lookup.get(key)

            if vector_result and bm25_result:
                # Both methods found this result - create hybrid
                fused_results.append(self._create_hybrid_result(vector_result, bm25_result))
            elif vector_result:
                # Only vector search found this
                fused_results.append(vector_result)
            elif bm25_result:
                # Only BM25 found this
                fused_results.append(bm25_result)

        return fused_results

    def _get_result_key(self, result: MLRetrievalResult) -> str:
        """Generate a unique key for a result based on content hash."""
        # Use first 100 characters of content as key to detect near-duplicates
        content_key = result.content[:100] if result.content else ""
        return hashlib.md5(content_key.encode()).hexdigest()[:16]

    def _create_hybrid_result(self, vector_result: MLRetrievalResult,
                              bm25_result: MLRetrievalResult) -> MLRetrievalResult:
        """Create a hybrid result by combining vector and BM25 results."""

        # Use vector result as base and enhance with BM25 data
        hybrid_result = MLRetrievalResult(
            content=vector_result.content,
            metadata=vector_result.metadata,
            relevance_score=max(vector_result.relevance_score, bm25_result.relevance_score),
            source_paper=vector_result.source_paper,
            ml_techniques=vector_result.ml_techniques,
            implementation_details=vector_result.implementation_details,
            applicability_score=max(vector_result.applicability_score, bm25_result.applicability_score),
            retrieval_method='hybrid',
            bm25_score=bm25_result.bm25_score,
            matched_terms=bm25_result.matched_terms
        )

        # Fix: the previous formula used 1/(k + score*100), which DECREASES as
        # the component scores increase and caps dual-hit scores at ~0.23 even
        # with the +0.2 bonus - below typical single-method scores (~0.5-1.0).
        # That inverted the ranking and systematically buried results confirmed
        # by both retrievers. Instead, take the best component score and add a
        # consensus bonus so dual hits outrank comparable single-method hits.
        hybrid_result.hybrid_score = max(vector_result.hybrid_score, bm25_result.hybrid_score) + 0.2

        return hybrid_result
|
620 |
+
|
621 |
+
|
622 |
+
class MLAgenticRetriever:
    """Main agentic retriever orchestrating all components.

    Pipeline: LLM query optimization -> source (company/technique) selection
    -> hybrid vector+BM25 retrieval -> structured guidance dict. Failures at
    any stage fall back to a static guidance dict with the same schema.
    """

    def __init__(self, anthropic_client, knowledge_base_path: str = "./ml_knowledge_base"):
        self.client = anthropic_client
        self.knowledge_base = KnowledgeBaseStorage(knowledge_base_path)

        # Initialize agents
        self.query_optimizer = QueryOptimizer(anthropic_client)
        self.source_identifier = SourceIdentifier(self.knowledge_base)
        self.enhanced_retriever = EnhancedRetriever(self.knowledge_base)

        logger.info("ML Agentic Retriever initialized")

    def get_ml_guidance(self, threat_characteristics: ThreatCharacteristics, trace_exporter=None) -> Dict:
        """Get comprehensive ML guidance for threat detection.

        Returns a guidance dict (see _structure_ml_guidance); never raises -
        pipeline failures produce the fallback guidance instead.
        """

        try:
            logger.info(f"Getting ML guidance for: {threat_characteristics.threat_name}")

            # Step 1: Query Optimization
            optimized_query = self.query_optimizer.optimize_query(threat_characteristics)
            logger.info(f"Generated {len(optimized_query.optimized_queries)} optimized queries")

            # Step 2: Source Identification
            source_selection = self.source_identifier.identify_relevant_sources(
                optimized_query, threat_characteristics
            )
            logger.info(f"Identified {len(source_selection.relevant_companies)} relevant companies")

            # Step 3: Enhanced Retrieval
            ml_results = self.enhanced_retriever.retrieve_with_context(
                optimized_query, source_selection, max_results=8,
                trace_exporter=trace_exporter
            )
            logger.info(f"Retrieved {len(ml_results)} relevant ML approaches")

            # Step 4: Structure results
            guidance = self._structure_ml_guidance(
                threat_characteristics, optimized_query, source_selection, ml_results
            )

            return guidance

        except Exception as e:
            # Boundary handler: log and degrade to fallback rather than crash.
            logger.error(f"ML guidance generation failed: {e}")
            return self._create_fallback_guidance(threat_characteristics)

    def get_enhanced_ml_guidance(self, threat_characteristics: ThreatCharacteristics,
                                 complete_threat_data: Dict, trace_exporter=None) -> Dict:
        """Get enhanced ML guidance leveraging complete threat intelligence context.

        NOTE(review): relies on _optimize_query_with_context /
        _structure_enhanced_ml_guidance / _create_enhanced_fallback_guidance,
        which are attached to this class at module level - confirm the
        attachment happens before this method is called.
        """

        try:
            logger.info(f"Getting enhanced ML guidance for: {threat_characteristics.threat_name}")

            # Step 1: Enhanced Query Optimization with threat context
            optimized_query = self._optimize_query_with_context(threat_characteristics, complete_threat_data)
            logger.info(f"Generated {len(optimized_query.optimized_queries)} context-enhanced queries")

            # Step 2: Enhanced Source Identification
            source_selection = self.source_identifier.identify_relevant_sources(
                optimized_query, threat_characteristics
            )
            logger.info(f"Identified {len(source_selection.relevant_companies)} relevant companies")

            # Step 3: Enhanced Retrieval with threat context
            ml_results = self.enhanced_retriever.retrieve_with_context(
                optimized_query, source_selection, max_results=10,  # More results for enhanced mode
                trace_exporter=trace_exporter
            )
            logger.info(f"Retrieved {len(ml_results)} relevant ML approaches")

            # Step 4: Structure results with enhanced context
            guidance = self._structure_enhanced_ml_guidance(
                threat_characteristics, optimized_query, source_selection, ml_results, complete_threat_data
            )

            return guidance

        except Exception as e:
            logger.error(f"Enhanced ML guidance generation failed: {e}")
            return self._create_enhanced_fallback_guidance(threat_characteristics, complete_threat_data)

    def _structure_ml_guidance(self, threat_characteristics: ThreatCharacteristics,
                               optimized_query: OptimizedQuery,
                               source_selection: SourceSelection,
                               ml_results: List[MLRetrievalResult]) -> Dict:
        """Structure the ML guidance into organized sections.

        Output keys: threat_name, ml_approaches, implementation_considerations,
        source_papers, query_optimization, source_selection.
        """

        # Group results by ML technique
        techniques_map = {}
        for result in ml_results:
            for technique in result.ml_techniques:
                if technique not in techniques_map:
                    techniques_map[technique] = []
                techniques_map[technique].append(result)

        # Create structured guidance
        guidance = {
            'threat_name': threat_characteristics.threat_name,
            'ml_approaches': [],
            'implementation_considerations': [],
            'source_papers': [],
            'query_optimization': {
                'original_query': optimized_query.original_query,
                'optimized_queries': optimized_query.optimized_queries,
                'reasoning': optimized_query.reasoning
            },
            'source_selection': {
                'relevant_companies': source_selection.relevant_companies,
                'reasoning': source_selection.reasoning
            }
        }

        # Add ML approaches: for each technique, surface the most applicable result.
        for technique, results in techniques_map.items():
            if results:  # Only include techniques with results
                best_result = max(results, key=lambda x: x.applicability_score)

                approach = {
                    'technique': str(technique),
                    'description': best_result.implementation_details,
                    'source_company': str(best_result.metadata.get('company', '')),
                    'source_paper': str(best_result.source_paper),
                    'applicability_score': best_result.applicability_score,
                    'relevance_score': best_result.relevance_score,
                    'retrieval_method': best_result.retrieval_method,
                    'hybrid_score': best_result.hybrid_score,
                    'bm25_score': best_result.bm25_score,
                    'matched_terms': best_result.matched_terms if best_result.matched_terms else []
                }

                guidance['ml_approaches'].append(approach)

        # Add implementation considerations
        for result in ml_results[:3]:  # Top 3 results
            consideration = {
                'aspect': f"{result.metadata.get('company', '')} Implementation",
                'details': result.implementation_details,
                'source': result.source_paper
            }
            guidance['implementation_considerations'].append(consideration)

        # Add source papers (deduplicated by title, first occurrence wins)
        seen_papers = set()
        for result in ml_results:
            paper_title = result.source_paper
            if paper_title not in seen_papers:
                seen_papers.add(paper_title)
                paper_info = {
                    'title': paper_title,
                    'company': result.metadata.get('company', ''),
                    'year': result.metadata.get('year', ''),
                    'url': result.metadata.get('source_url', ''),
                    'techniques': result.ml_techniques
                }
                guidance['source_papers'].append(paper_info)

        return guidance

    def _create_fallback_guidance(self, threat_characteristics: ThreatCharacteristics) -> Dict:
        """Create fallback guidance when main pipeline fails.

        Fix: the fallback now carries the same top-level schema as the success
        path (query_optimization / source_selection keys, and the full
        per-approach key set), so consumers that index into the guidance dict
        don't KeyError exactly when the pipeline has already failed.
        """

        fallback_query = f"Machine learning approaches for detecting {threat_characteristics.threat_name}"

        return {
            'threat_name': threat_characteristics.threat_name,
            'ml_approaches': [{
                'technique': 'anomaly_detection',
                'description': 'General anomaly detection approaches using statistical methods and machine learning',
                'source_company': 'General',
                'source_paper': 'Fallback recommendation',
                'applicability_score': 0.5,
                'relevance_score': 0.5,
                # Keys always present on the success path; safe defaults here.
                'retrieval_method': 'fallback',
                'hybrid_score': 0.5,
                'bm25_score': None,
                'matched_terms': []
            }],
            'implementation_considerations': [{
                'aspect': 'General Implementation',
                'details': 'Consider implementing statistical anomaly detection as a baseline approach',
                'source': 'Fallback recommendation'
            }],
            'source_papers': [],
            'query_optimization': {
                'original_query': threat_characteristics.threat_name,
                'optimized_queries': [fallback_query],
                'reasoning': 'Fallback query due to guidance pipeline failure'
            },
            'source_selection': {
                'relevant_companies': [],
                'reasoning': 'Source selection unavailable - fallback recommendations provided'
            },
            'error': 'ML guidance generation failed - fallback recommendations provided'
        }
|
803 |
+
|
804 |
+
|
805 |
+
def create_test_threat_characteristics() -> ThreatCharacteristics:
    """Create test threat characteristics for validation"""

    # Fixed ShadowPad-style profile: a persistent network implant showing
    # lateral movement, persistence, exfiltration, and C2 behaviors.
    profile = {
        'threat_name': "ShadowPad",
        'threat_type': "malware",
        'attack_vectors': ["network", "lateral_movement"],
        'target_assets': ["corporate_networks", "sensitive_data"],
        'behavior_patterns': ["persistence", "data_exfiltration", "command_control"],
        'time_characteristics': "persistent",
    }
    return ThreatCharacteristics(**profile)
|
816 |
+
|
817 |
+
|
818 |
+
# Add enhanced methods to MLAgenticRetriever class
|
819 |
+
def _optimize_query_with_context(self, threat_characteristics: ThreatCharacteristics,
                                 complete_threat_data: Dict) -> OptimizedQuery:
    """Create enhanced queries using complete threat intelligence context"""

    def _labels(entries, key, limit):
        # Pull human-readable labels from a list of dicts/strings; non-list
        # (or missing) input contributes nothing.
        if not isinstance(entries, list):
            return []
        return [entry.get(key, str(entry)) if isinstance(entry, dict) else str(entry)
                for entry in entries[:limit]]

    context_elements = []

    # Technical capabilities
    tech_details = complete_threat_data.get('technicalDetails')
    if tech_details:
        capabilities = tech_details.get('capabilities')
        if capabilities:
            context_elements.extend(_labels(capabilities, 'name', 3))

    # C2 protocols
    c2_data = complete_threat_data.get('commandAndControl')
    if c2_data:
        methods = c2_data.get('communicationMethods')
        if methods:
            context_elements.extend(_labels(methods, 'protocol', 2))

    # Use the regular optimizer with enhanced threat characteristics:
    # the first two context elements widen the attack vectors, the next
    # two widen the behavior patterns.
    enhanced_characteristics = ThreatCharacteristics(
        threat_name=threat_characteristics.threat_name,
        threat_type=threat_characteristics.threat_type,
        attack_vectors=threat_characteristics.attack_vectors + context_elements[:2],
        target_assets=threat_characteristics.target_assets,
        behavior_patterns=threat_characteristics.behavior_patterns + context_elements[2:4],
        time_characteristics=threat_characteristics.time_characteristics
    )

    return self.query_optimizer.optimize_query(enhanced_characteristics)
|
855 |
+
|
856 |
+
def _structure_enhanced_ml_guidance(self, threat_characteristics: ThreatCharacteristics,
|
857 |
+
optimized_query: OptimizedQuery,
|
858 |
+
source_selection: SourceSelection,
|
859 |
+
ml_results: List, # MLRetrievalResult type
|
860 |
+
complete_threat_data: Dict) -> Dict:
|
861 |
+
"""Structure enhanced ML guidance with threat context"""
|
862 |
+
|
863 |
+
# Start with regular structuring
|
864 |
+
guidance = self._structure_ml_guidance(
|
865 |
+
threat_characteristics, optimized_query, source_selection, ml_results
|
866 |
+
)
|
867 |
+
|
868 |
+
# Enhance with threat context
|
869 |
+
guidance['threat_context_applied'] = True
|
870 |
+
guidance['context_sources'] = {
|
871 |
+
'technical_details': bool(complete_threat_data.get('technicalDetails')),
|
872 |
+
'command_and_control': bool(complete_threat_data.get('commandAndControl')),
|
873 |
+
'detection_and_mitigation': bool(complete_threat_data.get('detectionAndMitigation')),
|
874 |
+
'forensic_artifacts': bool(complete_threat_data.get('forensicArtifacts'))
|
875 |
+
}
|
876 |
+
|
877 |
+
# Add threat-specific implementation considerations
|
878 |
+
if tech_details := complete_threat_data.get('technicalDetails'):
|
879 |
+
if os_data := tech_details.get('operatingSystems'):
|
880 |
+
# Ensure os_data is a list before slicing
|
881 |
+
if isinstance(os_data, list):
|
882 |
+
os_names = [os.get('name', str(os)) if isinstance(os, dict) else str(os) for os in os_data[:2]]
|
883 |
+
guidance['implementation_considerations'].append({
|
884 |
+
'aspect': 'OS Compatibility',
|
885 |
+
'details': f'Ensure ML models are trained on {", ".join(os_names)} environments for optimal detection.',
|
886 |
+
'source': 'Threat Intelligence Profile'
|
887 |
+
})
|
888 |
+
|
889 |
+
return guidance
|
890 |
+
|
891 |
+
def _create_enhanced_fallback_guidance(self, threat_characteristics: ThreatCharacteristics,
|
892 |
+
complete_threat_data: Dict) -> Dict:
|
893 |
+
"""Create enhanced fallback guidance with threat context"""
|
894 |
+
|
895 |
+
fallback = self._create_fallback_guidance(threat_characteristics)
|
896 |
+
|
897 |
+
# Add context-aware recommendations
|
898 |
+
fallback['threat_context_applied'] = True
|
899 |
+
fallback['enhanced_fallback'] = True
|
900 |
+
|
901 |
+
# Add context-specific ML approaches
|
902 |
+
if complete_threat_data.get('commandAndControl'):
|
903 |
+
fallback['ml_approaches'].append({
|
904 |
+
'technique': 'C2 Traffic Analysis',
|
905 |
+
'source_company': 'Context-Derived',
|
906 |
+
'description': 'ML-based detection of command and control communication patterns identified in the threat profile.',
|
907 |
+
'applicability_score': 0.8
|
908 |
+
})
|
909 |
+
|
910 |
+
if complete_threat_data.get('forensicArtifacts'):
|
911 |
+
fallback['ml_approaches'].append({
|
912 |
+
'technique': 'Artifact-Based Detection',
|
913 |
+
'source_company': 'Context-Derived',
|
914 |
+
'description': 'Machine learning models trained on forensic artifacts specific to this threat.',
|
915 |
+
'applicability_score': 0.7
|
916 |
+
})
|
917 |
+
|
918 |
+
return fallback
|
919 |
+
|
920 |
+
# Monkey patch these methods onto the MLAgenticRetriever class
# NOTE(review): these functions are defined at module level and attached
# post-hoc; defining them inside the class body would be clearer — confirm
# nothing relies on them being importable as free functions.
MLAgenticRetriever._optimize_query_with_context = _optimize_query_with_context
MLAgenticRetriever._structure_enhanced_ml_guidance = _structure_enhanced_ml_guidance
MLAgenticRetriever._create_enhanced_fallback_guidance = _create_enhanced_fallback_guidance
|
924 |
+
|
925 |
+
|
926 |
+
def main():
|
927 |
+
"""Test the ML Agentic Retriever"""
|
928 |
+
|
929 |
+
# Initialize
|
930 |
+
api_key = os.getenv('ANTHROPIC_API_KEY')
|
931 |
+
if not api_key:
|
932 |
+
print("Error: ANTHROPIC_API_KEY environment variable not set")
|
933 |
+
return
|
934 |
+
|
935 |
+
print("π€ Testing ML Agentic Retriever")
|
936 |
+
print("=" * 40)
|
937 |
+
|
938 |
+
# Create components
|
939 |
+
anthropic_client = Anthropic(api_key=api_key)
|
940 |
+
retriever = MLAgenticRetriever(anthropic_client)
|
941 |
+
|
942 |
+
# Test with sample threat
|
943 |
+
threat = create_test_threat_characteristics()
|
944 |
+
|
945 |
+
print(f"π― Testing with threat: {threat.threat_name}")
|
946 |
+
print(f" Type: {threat.threat_type}")
|
947 |
+
print(f" Attack Vectors: {', '.join(threat.attack_vectors)}")
|
948 |
+
print(f" Behavior Patterns: {', '.join(threat.behavior_patterns)}")
|
949 |
+
|
950 |
+
# Get ML guidance
|
951 |
+
guidance = retriever.get_ml_guidance(threat)
|
952 |
+
|
953 |
+
print(f"\nπ§ ML Guidance Generated:")
|
954 |
+
print(f" ML Approaches: {len(guidance['ml_approaches'])}")
|
955 |
+
print(f" Implementation Considerations: {len(guidance['implementation_considerations'])}")
|
956 |
+
print(f" Source Papers: {len(guidance['source_papers'])}")
|
957 |
+
|
958 |
+
# Show details
|
959 |
+
if guidance['ml_approaches']:
|
960 |
+
print(f"\nπ Top ML Approaches:")
|
961 |
+
for i, approach in enumerate(guidance['ml_approaches'][:3], 1):
|
962 |
+
print(f" {i}. {approach['technique']} ({approach['source_company']})")
|
963 |
+
print(f" Applicability: {approach['applicability_score']:.2f}")
|
964 |
+
print(f" Description: {approach['description'][:100]}...")
|
965 |
+
|
966 |
+
if guidance['source_papers']:
|
967 |
+
print(f"\nπ Source Papers:")
|
968 |
+
for paper in guidance['source_papers'][:3]:
|
969 |
+
print(f" β’ {paper['company']} ({paper['year']}): {paper['title'][:60]}...")
|
970 |
+
|
971 |
+
print(f"\nβ
Agentic retrieval test complete!")
|
972 |
+
|
973 |
+
|
974 |
+
if __name__ == "__main__":
|
975 |
+
main()
|
src/{ml_workers_retriever.py β search/ml_workers_retriever.py}
RENAMED
File without changes
|
src/{app.py β ui/app.py}
RENAMED
@@ -2,8 +2,8 @@
|
|
2 |
π SentrySearch - Threat Intelligence Profile Generator
|
3 |
"""
|
4 |
import gradio as gr
|
5 |
-
from threat_intel_tool import ThreatIntelTool
|
6 |
-
from markdown_generator import generate_markdown
|
7 |
|
8 |
|
9 |
def generate_threat_profile(api_key, tool_name, enable_quality_control, progress=gr.Progress()):
|
|
|
2 |
π SentrySearch - Threat Intelligence Profile Generator
|
3 |
"""
|
4 |
import gradio as gr
|
5 |
+
from src.core.threat_intel_tool import ThreatIntelTool
|
6 |
+
from src.core.markdown_generator import generate_markdown
|
7 |
|
8 |
|
9 |
def generate_threat_profile(api_key, tool_name, enable_quality_control, progress=gr.Progress()):
|
wrangler.toml
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Cloudflare Workers configuration for SentrySearch Hybrid Search
|
2 |
+
|
3 |
+
name = "sentry-search-hybrid"
|
4 |
+
main = "worker.js"
|
5 |
+
compatibility_date = "2023-12-01"
|
6 |
+
|
7 |
+
# KV Namespace binding
|
8 |
+
[[kv_namespaces]]
|
9 |
+
binding = "SENTRY_KV"
|
10 |
+
id = "f97c6c83f96f4b548307ccb1ffaa2668"
|
11 |
+
preview_id = "3fdb32abdd9d472d9d0506297dde4587"
|
12 |
+
|
13 |
+
# Environment variables (non-secret)
|
14 |
+
[vars]
|
15 |
+
ENVIRONMENT = "production"
|
16 |
+
MAX_SEARCH_RESULTS = "50"
|
17 |
+
DEFAULT_SEARCH_TIMEOUT = "30000"
|
18 |
+
|
19 |
+
# Routes (configure after deployment)
|
20 |
+
# routes = [
|
21 |
+
# { pattern = "sentry-search.your-domain.com/*", zone_name = "your-domain.com" }
|
22 |
+
# ]
|
23 |
+
|
24 |
+
# Resource limits (removed for free plan compatibility)
|
25 |
+
# [limits]
|
26 |
+
# cpu_ms = 30000
|