API Reference — core/
core/__init__.py — public exports
from core import (
# helpers.py
run, jload, write_tmp, relpath, have_binary,
# models.py
make_finding, dedup_findings, sort_findings,
# hf.py
hf_space_to_git, list_user_spaces, comment_on_space,
# bootstrap.py
bootstrap_binaries,
# baseline.py
make_fingerprint, save_baseline, load_baseline,
filter_by_baseline, parse_ignore_file, apply_ignore_rules,
)
core/scanner.py
scan_repo()
def scan_repo(
repo_url: str,
hf_token: Optional[str] = None,
deep_history: bool = False,
run_security: bool = True,
run_performance: bool = True,
run_llm: bool = True,
max_workers: int = 8,
progress_cb: Optional[Callable[[float, str], None]] = None,
) -> Tuple[List[dict], List[str]]
Clone or copy the target, run all enabled scanners in parallel, return (findings, log).
Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| `repo_url` | `str` | — | HTTPS URL (HF Space or git repo) or local directory path |
| `hf_token` | `str` | `None` | HF Bearer token for private/gated repos |
| `deep_history` | `bool` | `False` | If `True`, run full git clone (no `--depth 1`) and include gitleaks |
| `run_security` | `bool` | `True` | Enable security scanners (Semgrep, bandit, pip-audit, …) |
| `run_performance` | `bool` | `True` | Enable performance scanners (Semgrep:Perf, ruff) |
| `run_llm` | `bool` | `True` | Enable LLM/agent scanners (Semgrep:LLM, agent-audit) |
| `max_workers` | `int` | `8` | Maximum thread-pool workers |
| `progress_cb` | `Callable` | `None` | Called with `(fraction: float, description: str)` as each scanner completes |
Returns: (findings, log) where findings is a deduplicated, sorted List[dict] and log is a List[str] of scanner messages (first entry is the summary "OK (N unique findings)").
Error handling: Never raises. Returns ([], [error_message]) on clone failure or invalid target.
Example:
from core.scanner import scan_repo
findings, log = scan_repo(
"https://huggingface.co/spaces/owner/myspace",
run_performance=False,
progress_cb=lambda f, d: print(f"{f:.0%} {d}"),
)
print(log[0]) # "OK (23 unique findings)"
core/baseline.py
make_fingerprint(finding)
def make_fingerprint(finding: dict) -> str
Return a 16-hex-char deterministic fingerprint:
sha256( tool:rule:file:line:message )[:16]
save_baseline(findings, path)
def save_baseline(findings: List[dict], path: Union[str, Path]) -> None
Persist fingerprints to a JSON file at path. Overwrites if it exists. The JSON format:
{
"created": "2025-01-01T12:00:00Z",
"scanner_version": "4.0.0",
"fingerprints": ["abc123...", ...]
}
load_baseline(path)
def load_baseline(path: Union[str, Path]) -> Set[str]
Return the set of fingerprint strings from a saved baseline JSON file.
filter_by_baseline(findings, baseline)
def filter_by_baseline(
findings: List[dict],
baseline: Set[str],
) -> Tuple[List[dict], List[dict]]
Return (kept, suppressed) — findings whose fingerprints are not in baseline vs those that are.
parse_ignore_file(path)
def parse_ignore_file(path: Union[str, Path]) -> List[IgnoreRule]
Parse a .hfscanignore file and return a list of IgnoreRule dataclass instances.
.hfscanignore syntax:
# comment
tests/ # suppress all findings under tests/
* rule:B101 # suppress rule everywhere
src/legacy/ severity:INFO # suppress INFO severity under path
src/gen/ rule:B608 # suppress rule under path
apply_ignore_rules(findings, rules)
def apply_ignore_rules(
findings: List[dict],
rules: List[IgnoreRule],
) -> Tuple[List[dict], int]
Return (kept_findings, ignored_count). Rules are evaluated in order; first match wins.
IgnoreRule dataclass
@dataclass
class IgnoreRule:
path_prefix: str # "" = wildcard (applies everywhere)
rule_id: str # "" = no rule filter
severity: str # "" = no severity filter
core/models.py
make_finding()
def make_finding(
tool: str,
rule: str,
severity: str,
file: str,
line: int,
message: str,
owasp: Union[str, List[str]],
category: str = "security",
confidence: str = None,
remediation: str = None,
) -> dict
Build a normalized finding dict. All scanner runners call this to ensure uniform output shape.
- `confidence`: if `None`, looked up from `TOOL_DEFAULT_CONFIDENCE`; falls back to `"possible"`.
- `remediation`: if `None`, looked up from `report.remediation.REMEDIATION` by `rule`; falls back to `""`.
- `owasp`: a `str` is automatically wrapped in a list.
sort_findings(findings)
def sort_findings(findings: List[dict]) -> List[dict]
Sort by severity (ERROR < WARNING < INFO) → confidence (confirmed < likely < possible) → file → line.
dedup_findings(findings)
def dedup_findings(findings: List[dict]) -> List[dict]
Remove duplicates keyed on (tool, file, line, message). Preserves first occurrence order.
Constants
SEVERITY_RANK: dict # {"ERROR": 0, "HIGH": 0, "WARNING": 1, ...}
CONFIDENCE_RANK: dict # {"confirmed": 0, "likely": 1, "possible": 2}
TOOL_DEFAULT_CONFIDENCE: dict # per-tool default confidence levels
FORBIDDEN_FILES: list # file names that are always flagged
core/helpers.py
run(cmd, cwd=None, timeout=300)
def run(cmd: List[str], cwd: str = None, timeout: int = 300) -> Tuple[str, int]
Run a subprocess. Returns (stdout_stripped, returncode). Never raises.
| Exit code | Meaning |
|---|---|
| Normal | Return code from the process |
| `124` | Timed out (`subprocess.TimeoutExpired`) |
| `127` | Binary not found (`FileNotFoundError`) |
jload(txt)
def jload(txt: str) -> Optional[Any]
Parse JSON from a string. Returns None for empty strings or parse errors.
write_tmp(content, suffix=".yaml")
def write_tmp(content: str, suffix: str = ".yaml") -> str
Write content to a temp file and return its absolute path.
relpath(base, p)
def relpath(base: str, p: str) -> str
Return p relative to base. If p is not under base, return str(p) unchanged.
have_binary(name)
def have_binary(name: str) -> bool
Return True if name is on PATH (via shutil.which).
core/hf.py
hf_space_to_git(url, token=None)
def hf_space_to_git(url: str, token: str = None) -> Optional[str]
Convert https://huggingface.co/spaces/<ns>/<name> to a git-cloneable URL. Returns None for non-HF URLs. If token is provided, embeds it as HTTP basic auth (USER:<token>@).
list_user_spaces(username, hf_token=None, limit=500)
def list_user_spaces(
username: str,
hf_token: str = None,
limit: int = 500,
) -> Tuple[List[str], str]
Return (space_urls, status_message). Queries https://huggingface.co/api/spaces?author=<username>. Returns ([], error_message) on HTTP error or network failure.
core/bootstrap.py
bootstrap_binaries()
def bootstrap_binaries() -> dict
Download gitleaks and hadolint binaries for the current platform if not already on PATH. Returns a dict with keys "gitleaks" and "hadolint", values "ok" / "already installed" / "error: ...".
Binaries are placed in:
- Windows: `<venv>\Scripts\` (next to `python.exe`, so `shutil.which` finds them)
- macOS/Linux: `~/.local/bin/`
Versions: GITLEAKS_VERSION = "8.18.4", HADOLINT_VERSION = "2.12.0" (defined as module constants).