| | """ |
| | Configuration for PubGuard classifier. |
| | |
| | Mirrors openalex_classifier.config with multi-head additions. |
| | """ |
| |
|
| | from dataclasses import dataclass, field |
| | from pathlib import Path |
| | from typing import Dict, List, Optional |
| | import os |
| |
|
| |
|
| | def _find_models_dir() -> Path: |
| | """Locate PubGuard models directory. |
| | |
| | Checks for 'head_doc_type.npz' to distinguish PubGuard models |
| | from other model directories (e.g. OpenAlex) that may exist nearby. |
| | """ |
| | marker = "head_doc_type.npz" |
| |
|
| | if env_dir := os.environ.get("PUBGUARD_MODELS_DIR"): |
| | path = Path(env_dir) |
| | if path.exists(): |
| | return path |
| |
|
| | |
| | pkg = Path(__file__).parent / "models" |
| | if (pkg / marker).exists(): |
| | return pkg |
| |
|
| | |
| | cwd = Path.cwd() / "pubguard_models" |
| | if (cwd / marker).exists(): |
| | return cwd |
| |
|
| | |
| | repo = Path(__file__).parent.parent.parent / "models" |
| | if (repo / marker).exists(): |
| | return repo |
| |
|
| | |
| | home = Path.home() / ".pubguard" / "models" |
| | if (home / marker).exists(): |
| | return home |
| |
|
| | |
| | home.mkdir(parents=True, exist_ok=True) |
| | return home |
| |
|
| |
|
| | |
| |
|
# Closed label sets for the three classifier heads.
# NOTE(review): index order presumably matches the trained head outputs —
# do not reorder without retraining/re-exporting the *.npz heads.

# Document-type head.
DOC_TYPE_LABELS: List[str] = [
    "scientific_paper",
    "poster",
    "abstract_only",
    "junk",
]

# AI-detection head (binary).
AI_DETECT_LABELS: List[str] = ["human", "ai_generated"]

# Toxicity head (binary).
TOXICITY_LABELS: List[str] = ["clean", "toxic"]
| |
|
| |
|
@dataclass
class PubGuardConfig:
    """Runtime configuration for PubGuard.

    All fields have sensible defaults; ``models_dir`` is auto-discovered
    via ``_find_models_dir()`` when not supplied.
    """

    # Embedding backbone.
    model_name: str = "minishlab/potion-base-32M"
    embedding_dim: int = 512

    # Per-head decision thresholds.
    doc_type_threshold: float = 0.50
    ai_detect_threshold: float = 0.55
    toxicity_threshold: float = 0.50

    # Filtering policy flags.
    require_scientific: bool = True
    block_ai_generated: bool = False
    block_toxic: bool = False

    # Inference settings.
    batch_size: int = 256
    max_text_chars: int = 4000

    # Directory holding model weights; discovered when left as None.
    models_dir: Optional[Path] = None

    def __post_init__(self):
        # Discover a directory when none was given, then normalize to Path
        # (accepts str or Path from the caller).
        resolved = _find_models_dir() if self.models_dir is None else self.models_dir
        self.models_dir = Path(resolved)

    @property
    def distilled_model_path(self) -> Path:
        """Directory of the distilled embedding model."""
        return Path(self.models_dir, "pubguard-embedding")

    @property
    def doc_type_head_path(self) -> Path:
        """Weights file for the document-type head."""
        return Path(self.models_dir, "head_doc_type.npz")

    @property
    def ai_detect_head_path(self) -> Path:
        """Weights file for the AI-detection head."""
        return Path(self.models_dir, "head_ai_detect.npz")

    @property
    def toxicity_head_path(self) -> Path:
        """Weights file for the toxicity head."""
        return Path(self.models_dir, "head_toxicity.npz")

    @property
    def label_schemas(self) -> Dict[str, List[str]]:
        """Mapping of head name to its ordered label list."""
        return {
            "doc_type": DOC_TYPE_LABELS,
            "ai_detect": AI_DETECT_LABELS,
            "toxicity": TOXICITY_LABELS,
        }
| |
|