Spaces:
Sleeping
Sleeping
| """Configuration loader — reads from config.yaml, falls back to defaults.""" | |
| import logging | |
| import os | |
| import re | |
| import sys | |
| from pathlib import Path | |
| # --------------------------------------------------------------------------- | |
| # Logging (always available, before config loads) | |
| # --------------------------------------------------------------------------- | |
| LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s" | |
| LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO").upper() | |
| logging.basicConfig( | |
| format=LOG_FORMAT, | |
| level=getattr(logging, LOG_LEVEL, logging.INFO), | |
| stream=sys.stdout, | |
| ) | |
| # Quiet noisy libraries | |
| logging.getLogger("httpx").setLevel(logging.WARNING) | |
| logging.getLogger("httpcore").setLevel(logging.WARNING) | |
| logging.getLogger("apscheduler").setLevel(logging.WARNING) | |
| log = logging.getLogger(__name__) | |
| # --------------------------------------------------------------------------- | |
| # HuggingFace Spaces detection | |
| # --------------------------------------------------------------------------- | |
| IS_HF_SPACE = bool(os.environ.get("SPACE_ID")) | |
| DEMO_MODE = bool(os.environ.get("DEMO_MODE")) | |
| def _spaces_data_dir() -> Path: | |
| """Return /data on HF Spaces (persistent storage), otherwise local data/.""" | |
| if IS_HF_SPACE and Path("/data").exists(): | |
| # Verify /data is actually writable (persistent storage enabled) | |
| try: | |
| test_file = Path("/data/.write_test") | |
| test_file.touch() | |
| test_file.unlink() | |
| return Path("/data") | |
| except OSError: | |
| pass # /data exists but not writable — no persistent storage | |
| return Path("data") | |
| SPACES_DATA_DIR = _spaces_data_dir() | |
| # --------------------------------------------------------------------------- | |
| # Config file path | |
| # --------------------------------------------------------------------------- | |
| _default_config = str(SPACES_DATA_DIR / "config.yaml") if IS_HF_SPACE else "config.yaml" | |
| CONFIG_PATH = Path(os.environ.get("CONFIG_PATH", _default_config)) | |
| FIRST_RUN = not CONFIG_PATH.exists() | |
| # --------------------------------------------------------------------------- | |
| # Environment | |
| # --------------------------------------------------------------------------- | |
| ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") | |
| GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "") | |
| def validate_env(): | |
| """Check required environment variables at startup. Warn on missing.""" | |
| if not ANTHROPIC_API_KEY: | |
| log.warning("ANTHROPIC_API_KEY not set — scoring will be disabled") | |
| if not GITHUB_TOKEN: | |
| log.info("GITHUB_TOKEN not set — GitHub API calls will be rate-limited") | |
| # --------------------------------------------------------------------------- | |
| # Load config.yaml (or defaults) | |
| # --------------------------------------------------------------------------- | |
| def _load_yaml() -> dict: | |
| """Load config.yaml if present, otherwise return empty dict.""" | |
| if CONFIG_PATH.exists(): | |
| try: | |
| import yaml | |
| with open(CONFIG_PATH) as f: | |
| data = yaml.safe_load(f) or {} | |
| log.info("Loaded config from %s", CONFIG_PATH) | |
| return data | |
| except Exception as e: | |
| log.error("Failed to load %s: %s — using defaults", CONFIG_PATH, e) | |
| return {} | |
| _cfg = _load_yaml() | |
| # --------------------------------------------------------------------------- | |
| # Claude API / Scoring models | |
| # --------------------------------------------------------------------------- | |
| _scoring_cfg = _cfg.get("scoring", {}) | |
| SCORING_MODEL = _scoring_cfg.get("model", _cfg.get("claude_model", "claude-haiku-4-5-20251001")) | |
| RESCORE_MODEL = _scoring_cfg.get("rescore_model", "claude-sonnet-4-5-20250929") | |
| RESCORE_TOP_N = _scoring_cfg.get("rescore_top_n", 15) | |
| BATCH_SIZE = _scoring_cfg.get("batch_size", _cfg.get("batch_size", 20)) | |
| # --------------------------------------------------------------------------- | |
| # Database | |
| # --------------------------------------------------------------------------- | |
| _default_db = str(SPACES_DATA_DIR / "researcher.db") if IS_HF_SPACE else "data/researcher.db" | |
| DB_PATH = Path(_cfg.get("database", {}).get("path", os.environ.get("DB_PATH", _default_db))) | |
| # --------------------------------------------------------------------------- | |
| # Web | |
| # --------------------------------------------------------------------------- | |
| WEB_HOST = _cfg.get("web", {}).get("host", "0.0.0.0") | |
| WEB_PORT = _cfg.get("web", {}).get("port", 8888) | |
| # --------------------------------------------------------------------------- | |
| # Schedule | |
| # --------------------------------------------------------------------------- | |
| SCHEDULE_CRON = _cfg.get("schedule", {}).get("cron", "0 22 * * 0") | |
| # --------------------------------------------------------------------------- | |
| # Domains from config | |
| # --------------------------------------------------------------------------- | |
| _domains_cfg = _cfg.get("domains", {}) | |
| # --------------------------------------------------------------------------- | |
| # Shared constants | |
| # --------------------------------------------------------------------------- | |
| HF_API = "https://huggingface.co/api" | |
| GITHUB_URL_RE = re.compile(r"https?://github\.com/[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+") | |
| MAX_ABSTRACT_CHARS_AIML = 2000 | |
| MAX_ABSTRACT_CHARS_SECURITY = 1500 | |
| HF_MAX_AGE_DAYS = 90 | |
| # --------------------------------------------------------------------------- | |
| # AI/ML pipeline constants | |
| # --------------------------------------------------------------------------- | |
| _aiml_cfg = _domains_cfg.get("aiml", {}) | |
| ARXIV_LARGE_CATS = _aiml_cfg.get("arxiv_categories", ["cs.CV", "cs.CL", "cs.LG"]) | |
| ARXIV_SMALL_CATS = ["eess.AS", "cs.SD"] | |
| _aiml_include = _aiml_cfg.get("include_patterns", []) | |
| _aiml_exclude = _aiml_cfg.get("exclude_patterns", []) | |
| _DEFAULT_INCLUDE = ( | |
| r"video.generat|world.model|image.generat|diffusion|text.to.image|text.to.video|" | |
| r"code.generat|foundation.model|open.weight|large.language|language.model|" | |
| r"text.to.speech|tts|speech.synth|voice.clon|audio.generat|" | |
| r"transformer|attention.mechanism|state.space|mamba|mixture.of.expert|\bmoe\b|" | |
| r"scaling.law|architecture|quantiz|distillat|pruning|" | |
| r"multimodal|vision.language|\bvlm\b|agent|reasoning|" | |
| r"reinforcement.learn|rlhf|dpo|preference.optim|" | |
| r"retrieval.augment|\brag\b|in.context.learn|" | |
| r"image.edit|video.edit|3d.generat|nerf|gaussian.splat|" | |
| r"robot|embodied|simulat|" | |
| r"benchmark|evaluat|leaderboard|" | |
| r"open.source|reproducib|" | |
| r"instruction.tun|fine.tun|align|" | |
| r"long.context|context.window|" | |
| r"token|vocab|embedding|" | |
| r"training.efficien|parallel|distributed.train|" | |
| r"synthetic.data|data.curat" | |
| ) | |
| _DEFAULT_EXCLUDE = ( | |
| r"medical.imag|clinical|radiology|pathology|histolog|" | |
| r"climate.model|weather.predict|meteorolog|" | |
| r"survey.of|comprehensive.survey|" | |
| r"sentiment.analysis|named.entity|" | |
| r"drug.discover|protein.fold|molecular.dock|" | |
| r"software.engineering.practice|code.smell|technical.debt|" | |
| r"autonomous.driv|traffic.signal|" | |
| r"remote.sens|satellite.imag|crop.yield|" | |
| r"stock.predict|financial.forecast|" | |
| r"electronic.health|patient.record|" | |
| r"seismic|geophys|oceanograph|" | |
| r"educational.data|student.perform|" | |
| r"blockchain|smart.contract|\bdefi\b|decentralized.finance|cryptocurrency|" | |
| r"jailbreak|guardrail|red.teaming|llm.safety|" | |
| r"safe.alignment|safety.tuning|harmful.content|toxicity" | |
| ) | |
| INCLUDE_RE = re.compile( | |
| "|".join(_aiml_include) if _aiml_include else _DEFAULT_INCLUDE, | |
| re.IGNORECASE, | |
| ) | |
| EXCLUDE_RE = re.compile( | |
| "|".join(_aiml_exclude) if _aiml_exclude else _DEFAULT_EXCLUDE, | |
| re.IGNORECASE, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Security pipeline constants | |
| # --------------------------------------------------------------------------- | |
| _sec_cfg = _domains_cfg.get("security", {}) | |
| SECURITY_KEYWORDS = re.compile( | |
| r"\b(?:attack|vulnerability|exploit|fuzzing|fuzz|malware|" | |
| r"intrusion|ransomware|phishing|adversarial|" | |
| r"defense|defence|secure|security|privacy|" | |
| r"cryptograph|authentication|authorization|" | |
| r"injection|xss|csrf|cve\-\d|penetration.test|" | |
| r"threat|anomaly.detect|ids\b|ips\b|firewall|" | |
| r"reverse.engineer|obfuscat|sandbox|" | |
| r"side.channel|buffer.overflow|zero.day|" | |
| r"botnet|rootkit|trojan|worm)\b", | |
| re.IGNORECASE, | |
| ) | |
| ADJACENT_CATEGORIES = ["cs.AI", "cs.SE", "cs.NI", "cs.DC", "cs.OS", "cs.LG"] | |
| SECURITY_EXCLUDE_RE = re.compile( | |
| r"blockchain|smart.contract|\bdefi\b|decentralized.finance|" | |
| r"memecoin|meme.coin|cryptocurrency.trading|\bnft\b|" | |
| r"comprehensive.survey|systematization.of.knowledge|" | |
| r"differential.privacy.(?:mechanism|framework)|" | |
| r"stock.predict|financial.forecast|crop.yield|" | |
| r"sentiment.analysis|educational.data", | |
| re.IGNORECASE, | |
| ) | |
| SECURITY_LLM_RE = re.compile( | |
| r"jailbreak|guardrail|red.teaming|" | |
| r"llm.safety|safe.alignment|safety.tuning|" | |
| r"harmful.(?:content|output)|toxicity|content.moderation|" | |
| r"prompt.injection|" | |
| r"reward.model.(?:for|safety|alignment)", | |
| re.IGNORECASE, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Dynamic scoring prompt builder | |
| # --------------------------------------------------------------------------- | |
| def _build_scoring_prompt(domain: str, axes: list[dict], preferences: dict) -> str: | |
| """Build a Claude scoring prompt from config axes + preferences.""" | |
| boost = preferences.get("boost_topics", []) | |
| penalize = preferences.get("penalize_topics", []) | |
| if domain == "aiml": | |
| return _build_aiml_prompt(axes, boost, penalize) | |
| elif domain == "security": | |
| return _build_security_prompt(axes, boost, penalize) | |
| return "" | |
| def _build_aiml_prompt(axes: list[dict], boost: list[str], penalize: list[str]) -> str: | |
| """Generate AI/ML scoring prompt from axes config.""" | |
| axis_fields = [] | |
| axis_section = [] | |
| for i, ax in enumerate(axes, 1): | |
| name = ax.get("name", f"axis_{i}") | |
| desc = ax.get("description", "") | |
| field = name.lower().replace(" ", "_").replace("&", "and").replace("/", "_").replace("-", "_") | |
| axis_fields.append(field) | |
| axis_section.append(f"{i}. **{field}** — {name}: {desc}") | |
| boost_line = ", ".join(boost) if boost else ( | |
| "New architectures, open-weight models, breakthrough methods, " | |
| "papers with code AND weights, efficiency improvements" | |
| ) | |
| penalize_line = ", ".join(penalize) if penalize else ( | |
| "Surveys, incremental SOTA, closed-model papers, " | |
| "medical/climate/remote sensing applications" | |
| ) | |
| return f"""\ | |
| You are an AI/ML research analyst. Score each paper on three axes (1-10): | |
| {chr(10).join(axis_section)} | |
| Scoring preferences: | |
| - Score UP: {boost_line} | |
| - Score DOWN: {penalize_line} | |
| Use HF ecosystem signals: hf_upvotes > 50 means community interest; hf_models present = weights available; | |
| hf_spaces = demo exists; github_repo = code available; source "both" = higher visibility. | |
| Also provide: | |
| - **summary**: 2-3 sentence practitioner-focused summary. | |
| - **reasoning**: 1-2 sentences explaining scoring. | |
| - **code_url**: Extract GitHub/GitLab URL from abstract/comments if present, else null. | |
| Respond with a JSON array of objects, one per paper, each with fields: | |
| arxiv_id, {", ".join(axis_fields)}, summary, reasoning, code_url | |
| """ | |
| def _build_security_prompt(axes: list[dict], boost: list[str], penalize: list[str]) -> str: | |
| """Generate security scoring prompt from axes config.""" | |
| axis_fields = [] | |
| axes_section = [] | |
| for i, ax in enumerate(axes, 1): | |
| name = ax.get("name", f"axis_{i}") | |
| desc = ax.get("description", "") | |
| field = name.lower().replace(" ", "_").replace("&", "and").replace("/", "_").replace("-", "_") | |
| axis_fields.append(field) | |
| axes_section.append(f"{i}. **{field}** (1-10) — {name}: {desc}") | |
| return f"""\ | |
| You are a security research analyst. Score each paper on three axes (1-10). | |
| === HARD RULES (apply BEFORE scoring) === | |
| 1. If the paper is primarily about LLM safety, alignment, jailbreaking, guardrails, | |
| red-teaming LLMs, or making AI models safer: cap ALL three axes at 3 max. | |
| Check the "llm_adjacent" field — if true, this rule almost certainly applies. | |
| 2. If the paper is a survey, SoK, or literature review: cap {axis_fields[1] if len(axis_fields) > 1 else 'axis_2'} at 2 max. | |
| 3. If the paper is about blockchain, DeFi, cryptocurrency, smart contracts: cap ALL three axes at 2 max. | |
| 4. If the paper is about theoretical differential privacy or federated learning | |
| without concrete security attacks: cap ALL three axes at 3 max. | |
| === SCORING AXES === | |
| {chr(10).join(axes_section)} | |
| === OUTPUT === | |
| For each paper also provide: | |
| - **summary**: 2-3 sentence practitioner-focused summary. | |
| - **reasoning**: 1-2 sentences explaining your scoring. | |
| - **code_url**: Extract GitHub/GitLab URL from abstract/comments if present, else null. | |
| Respond with a JSON array of objects, one per paper, each with fields: | |
| entry_id, {", ".join(axis_fields)}, summary, reasoning, code_url | |
| """ | |
| # --------------------------------------------------------------------------- | |
| # Scoring configs per domain | |
| # --------------------------------------------------------------------------- | |
| def _normalize_weights(weights: dict[str, float]) -> dict[str, float]: | |
| """Normalize weight values so they sum to 1.0. | |
| Falls back to equal distribution if all values are zero or negative. | |
| """ | |
| total = sum(weights.values()) | |
| if total <= 0: | |
| n = len(weights) or 1 | |
| return {k: 1.0 / n for k in weights} | |
| return {k: v / total for k, v in weights.items()} | |
| def _build_scoring_configs() -> dict: | |
| """Build SCORING_CONFIGS from config.yaml or defaults.""" | |
| configs = {} | |
| # AI/ML config | |
| aiml_axes_cfg = _aiml_cfg.get("scoring_axes", [ | |
| {"name": "Code & Weights", "weight": 0.30, "description": "Open weights on HF, code on GitHub"}, | |
| {"name": "Novelty", "weight": 0.35, "description": "Paradigm shifts over incremental"}, | |
| {"name": "Practical Applicability", "weight": 0.35, "description": "Usable by practitioners soon"}, | |
| ]) | |
| aiml_prefs = _aiml_cfg.get("preferences", {}) | |
| aiml_weight_keys = ["code_weights", "novelty", "practical"] | |
| aiml_weights = {} | |
| for i, ax in enumerate(aiml_axes_cfg): | |
| key = aiml_weight_keys[i] if i < len(aiml_weight_keys) else f"axis_{i+1}" | |
| aiml_weights[key] = ax.get("weight", 1.0 / len(aiml_axes_cfg)) | |
| aiml_weights = _normalize_weights(aiml_weights) | |
| aiml_axis_fields = [ | |
| ax.get("name", f"axis_{i+1}").lower().replace(" ", "_").replace("&", "and").replace("/", "_").replace("-", "_") | |
| for i, ax in enumerate(aiml_axes_cfg) | |
| ] | |
| configs["aiml"] = { | |
| "weights": aiml_weights, | |
| "axes": aiml_axis_fields, | |
| "axis_labels": [ax.get("name", f"Axis {i+1}") for i, ax in enumerate(aiml_axes_cfg)], | |
| "prompt": _build_scoring_prompt("aiml", aiml_axes_cfg, aiml_prefs), | |
| } | |
| # Security config | |
| sec_axes_cfg = _sec_cfg.get("scoring_axes", [ | |
| {"name": "Has Code/PoC", "weight": 0.25, "description": "Working tools, repos, artifacts"}, | |
| {"name": "Novel Attack Surface", "weight": 0.40, "description": "First-of-kind research"}, | |
| {"name": "Real-World Impact", "weight": 0.35, "description": "Affects production systems"}, | |
| ]) | |
| sec_prefs = _sec_cfg.get("preferences", {}) | |
| sec_weight_keys = ["code", "novelty", "impact"] | |
| sec_weights = {} | |
| for i, ax in enumerate(sec_axes_cfg): | |
| key = sec_weight_keys[i] if i < len(sec_weight_keys) else f"axis_{i+1}" | |
| sec_weights[key] = ax.get("weight", 1.0 / len(sec_axes_cfg)) | |
| sec_weights = _normalize_weights(sec_weights) | |
| sec_axis_fields = [ | |
| ax.get("name", f"axis_{i+1}").lower().replace(" ", "_").replace("&", "and").replace("/", "_").replace("-", "_") | |
| for i, ax in enumerate(sec_axes_cfg) | |
| ] | |
| configs["security"] = { | |
| "weights": sec_weights, | |
| "axes": sec_axis_fields, | |
| "axis_labels": [ax.get("name", f"Axis {i+1}") for i, ax in enumerate(sec_axes_cfg)], | |
| "prompt": _build_scoring_prompt("security", sec_axes_cfg, sec_prefs), | |
| } | |
| return configs | |
| SCORING_CONFIGS = _build_scoring_configs() | |
| # --------------------------------------------------------------------------- | |
| # Events config | |
| # --------------------------------------------------------------------------- | |
| RSS_FEEDS = _cfg.get("rss_feeds", [ | |
| {"name": "OpenAI Blog", "url": "https://openai.com/blog/rss.xml", "category": "news"}, | |
| {"name": "Anthropic Blog", "url": "https://www.anthropic.com/rss.xml", "category": "news"}, | |
| {"name": "Google DeepMind", "url": "https://deepmind.google/blog/rss.xml", "category": "news"}, | |
| {"name": "Meta AI", "url": "https://ai.meta.com/blog/rss/", "category": "news"}, | |
| {"name": "HuggingFace Blog", "url": "https://huggingface.co/blog/feed.xml", "category": "news"}, | |
| {"name": "Krebs on Security", "url": "https://krebsonsecurity.com/feed/", "category": "news"}, | |
| {"name": "The Record", "url": "https://therecord.media/feed", "category": "news"}, | |
| {"name": "Microsoft Security", "url": "https://www.microsoft.com/en-us/security/blog/feed/", "category": "news"}, | |
| ]) | |
| CONFERENCES = _cfg.get("conferences", [ | |
| {"name": "NeurIPS 2026", "url": "https://neurips.cc/", "domain": "aiml", | |
| "deadline": "2026-05-16", "date": "2026-12-07", | |
| "description": "Conference on Neural Information Processing Systems."}, | |
| {"name": "ICML 2026", "url": "https://icml.cc/", "domain": "aiml", | |
| "deadline": "2026-01-23", "date": "2026-07-19", | |
| "description": "International Conference on Machine Learning."}, | |
| {"name": "ICLR 2026", "url": "https://iclr.cc/", "domain": "aiml", | |
| "deadline": "2025-10-01", "date": "2026-04-24", | |
| "description": "International Conference on Learning Representations."}, | |
| {"name": "CVPR 2026", "url": "https://cvpr.thecvf.com/", "domain": "aiml", | |
| "deadline": "2025-11-14", "date": "2026-06-15", | |
| "description": "IEEE/CVF Conference on Computer Vision and Pattern Recognition."}, | |
| {"name": "ACL 2026", "url": "https://www.aclweb.org/", "domain": "aiml", | |
| "deadline": "2026-02-20", "date": "2026-08-02", | |
| "description": "Annual Meeting of the Association for Computational Linguistics."}, | |
| {"name": "IEEE S&P 2026", "url": "https://www.ieee-security.org/TC/SP/", "domain": "security", | |
| "deadline": "2026-06-05", "date": "2026-05-18", | |
| "description": "IEEE Symposium on Security and Privacy."}, | |
| {"name": "USENIX Security 2026", "url": "https://www.usenix.org/conference/usenixsecurity/", "domain": "security", | |
| "deadline": "2026-02-04", "date": "2026-08-12", | |
| "description": "USENIX Security Symposium."}, | |
| {"name": "CCS 2026", "url": "https://www.sigsac.org/ccs/", "domain": "security", | |
| "deadline": "2026-05-01", "date": "2026-11-09", | |
| "description": "ACM Conference on Computer and Communications Security."}, | |
| {"name": "Black Hat USA 2026", "url": "https://www.blackhat.com/", "domain": "security", | |
| "deadline": "2026-04-01", "date": "2026-08-04", | |
| "description": "Black Hat USA."}, | |
| {"name": "DEF CON 34", "url": "https://defcon.org/", "domain": "security", | |
| "deadline": "2026-05-01", "date": "2026-08-06", | |
| "description": "DEF CON hacker conference."}, | |
| ]) | |
| # --------------------------------------------------------------------------- | |
| # GitHub projects (OSSInsight) config | |
| # --------------------------------------------------------------------------- | |
| OSSINSIGHT_API = "https://api.ossinsight.io/v1" | |
| _github_cfg = _cfg.get("github", {}) | |
| OSSINSIGHT_COLLECTIONS = {} | |
| for _coll in _github_cfg.get("collections", []): | |
| if isinstance(_coll, dict): | |
| OSSINSIGHT_COLLECTIONS[_coll["id"]] = (_coll["name"], _coll.get("domain", "aiml")) | |
| elif isinstance(_coll, int): | |
| OSSINSIGHT_COLLECTIONS[_coll] = (str(_coll), "aiml") | |
| if not OSSINSIGHT_COLLECTIONS: | |
| OSSINSIGHT_COLLECTIONS = { | |
| 10010: ("Artificial Intelligence", "aiml"), | |
| 10076: ("LLM Tools", "aiml"), | |
| 10098: ("AI Agent Frameworks", "aiml"), | |
| 10087: ("LLM DevTools", "aiml"), | |
| 10079: ("Stable Diffusion Ecosystem", "aiml"), | |
| 10075: ("ChatGPT Alternatives", "aiml"), | |
| 10094: ("Vector Database", "aiml"), | |
| 10095: ("GraphRAG", "aiml"), | |
| 10099: ("MCP Client", "aiml"), | |
| 10058: ("MLOps Tools", "aiml"), | |
| 10051: ("Security Tool", "security"), | |
| 10082: ("Web Scanner", "security"), | |
| } | |
| OSSINSIGHT_TRENDING_LANGUAGES = ["Python", "Rust", "Go", "TypeScript", "C++"] | |
| GITHUB_AIML_KEYWORDS = re.compile( | |
| r"machine.learn|deep.learn|neural.net|transformer|llm|large.language|" | |
| r"diffusion|generat.ai|gpt|bert|llama|vision.model|multimodal|" | |
| r"reinforcement.learn|computer.vision|nlp|natural.language|" | |
| r"text.to|speech.to|image.generat|video.generat|" | |
| r"fine.tun|training|inference|quantiz|embedding|vector|" | |
| r"rag|retrieval.augment|agent|langchain|" | |
| r"hugging.?face|pytorch|tensorflow|jax|" | |
| r"stable.diffusion|comfyui|ollama|vllm|" | |
| r"tokeniz|dataset|benchmark|model.serv|mlops", | |
| re.IGNORECASE, | |
| ) | |
| GITHUB_SECURITY_KEYWORDS = re.compile( | |
| r"security|pentest|penetration.test|vulnerability|exploit|" | |
| r"fuzzing|fuzz|malware|scanner|scanning|" | |
| r"intrusion|ransomware|phishing|" | |
| r"reverse.engineer|decompil|disassembl|" | |
| r"ctf|capture.the.flag|" | |
| r"firewall|ids\b|ips\b|siem|" | |
| r"password|credential|auth|" | |
| r"xss|csrf|injection|" | |
| r"osint|reconnaissance|recon|" | |
| r"forensic|incident.response|" | |
| r"encryption|cryptograph|" | |
| r"burp|nuclei|nmap|metasploit|wireshark", | |
| re.IGNORECASE, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Helpers | |
| # --------------------------------------------------------------------------- | |
| def get_enabled_domains() -> list[str]: | |
| """Return list of enabled domain keys.""" | |
| if not _domains_cfg: | |
| return ["aiml", "security"] | |
| return [k for k, v in _domains_cfg.items() if v.get("enabled", True)] | |
| def is_pipeline_enabled(pipeline: str) -> bool: | |
| """Check if a pipeline is enabled. | |
| For 'aiml'/'security': checks domain enabled flag. | |
| For 'github'/'events': checks feature enabled flag. | |
| """ | |
| if pipeline in ("aiml", "security"): | |
| if not _domains_cfg: | |
| return True | |
| return _domains_cfg.get(pipeline, {}).get("enabled", True) | |
| if pipeline in ("github", "events"): | |
| return _cfg.get(pipeline, {}).get("enabled", True) | |
| return False | |
| def get_domain_label(domain: str) -> str: | |
| """Return human-readable label for a domain.""" | |
| if _domains_cfg and domain in _domains_cfg: | |
| return _domains_cfg[domain].get("label", domain.upper()) | |
| return {"aiml": "AI/ML", "security": "Security"}.get(domain, domain.upper()) | |
| def save_config(data: dict): | |
| """Write config data to config.yaml.""" | |
| import yaml | |
| with open(CONFIG_PATH, "w") as f: | |
| yaml.dump(data, f, default_flow_style=False, sort_keys=False) | |
| log.info("Config saved to %s", CONFIG_PATH) | |
| global _cfg, FIRST_RUN, SCORING_CONFIGS, SCORING_MODEL, RESCORE_MODEL, RESCORE_TOP_N, BATCH_SIZE | |
| _cfg = data | |
| FIRST_RUN = False | |
| # Reload scoring model settings | |
| _sc = data.get("scoring", {}) | |
| SCORING_MODEL = _sc.get("model", data.get("claude_model", "claude-haiku-4-5-20251001")) | |
| RESCORE_MODEL = _sc.get("rescore_model", "claude-sonnet-4-5-20250929") | |
| RESCORE_TOP_N = _sc.get("rescore_top_n", 15) | |
| BATCH_SIZE = _sc.get("batch_size", data.get("batch_size", 20)) | |
| SCORING_CONFIGS.update(_build_scoring_configs()) | |