Deploy OpenClaw PR API
Changed files:
- pyproject.toml +1 -8
- src/slop_farmer/__init__.py +1 -1
- src/slop_farmer/app/cli.py +307 -0
- src/slop_farmer/app/dataset_refresh.py +1021 -0
- src/slop_farmer/app/dataset_status.py +182 -0
- src/slop_farmer/app/deploy.py +11 -2
- src/slop_farmer/app/hf_checkpoint_import.py +10 -70
- src/slop_farmer/app/pipeline.py +12 -90
- src/slop_farmer/app/pr_search.py +74 -0
- src/slop_farmer/app/pr_search_api.py +61 -3
- src/slop_farmer/app/workflow.py +3 -0
- src/slop_farmer/app_config.py +22 -0
- src/slop_farmer/config.py +38 -0
- src/slop_farmer/data/dataset_card.py +107 -0
- src/slop_farmer/data/hf_dataset_repo.py +94 -0
- src/slop_farmer/data/search_duckdb.py +146 -0
- src/slop_farmer/data/snapshot_source.py +31 -0
- src/slop_farmer/reports/analysis.py +9 -17
- src/slop_farmer/reports/analysis_service.py +97 -25
- src/slop_farmer/reports/dashboard.py +9 -2
- src/slop_farmer/reports/new_contributor_report.py +11 -3
- src/slop_farmer/reports/pr_scope.py +9 -16
- src/slop_farmer/reports/pr_search_scope.py +12 -16
- src/slop_farmer/reports/pr_search_service.py +166 -1
- uv.lock +136 -136
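This PR wires two new top-level commands, refresh-dataset and dataset-status, into the existing argparse CLI (see the cli.py diff below). A minimal sketch of parsing one of them, assuming the package is importable and that the per-command defaults resolve without a config file; the flag names come from the diff, and the dataset repo id is illustrative:

# Sketch only: exercises the new "dataset-status" subparser added in this PR.
# "evalstate/transformers-pr" appears elsewhere in this repo's config; treat it
# as a placeholder for your own dataset repo id.
from slop_farmer.app.cli import build_parser

parser = build_parser()
args = parser.parse_args(
    [
        "dataset-status",
        "--hf-repo-id", "evalstate/transformers-pr",
        "--json",
    ]
)
print(args.command, args.hf_repo_id)  # -> dataset-status evalstate/transformers-pr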
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "slop-farmer"
-version = "0.1.
+version = "0.1.1"
 description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
 readme = "README.md"
 requires-python = ">=3.13.5"
@@ -60,13 +60,6 @@ select = [
 ]
 ignore = ["E501"]
 
-[tool.slop-farmer.analyze]
-output-dir = "eval_data"
-hf-repo-id = "evalstate/transformers-pr"
-ranking-backend = "hybrid"
-model = "gpt-5.4-mini"
-max-clusters = 10
-
 [tool.slop-farmer.dashboard-data]
 output-dir = "web/public/data"
 window-days = 14
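The [tool.slop-farmer.*] tables above feed the CLI through command_defaults (imported from slop_farmer.app_config in the diffs below). A rough sketch of the lookup, under the assumption that the helper surfaces these pyproject tables and returns the hyphenated keys used throughout cli.py:

from slop_farmer.app_config import command_defaults

# Assumption: command_defaults("dashboard-data", config_path=None) reads the
# [tool.slop-farmer.dashboard-data] table shown above.
defaults = command_defaults("dashboard-data", config_path=None)
print(defaults.get("output-dir"))   # expected: "web/public/data"
print(defaults.get("window-days"))  # expected: 14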
src/slop_farmer/__init__.py
CHANGED
@@ -1,3 +1,3 @@
 __all__ = ["__version__"]
 
-__version__ = "0.1.
+__version__ = "0.1.1"
src/slop_farmer/app/cli.py
CHANGED
@@ -13,6 +13,8 @@ from slop_farmer.config import (
     AnalysisOptions,
     CheckpointImportOptions,
     DashboardDataOptions,
+    DatasetRefreshOptions,
+    DatasetStatusOptions,
     DeployDashboardOptions,
     FullPipelineOptions,
     MarkdownReportOptions,
@@ -41,6 +43,7 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
     subparsers = parser.add_subparsers(dest="command", required=True)
 
     _add_scrape_parser(subparsers, defaults["scrape"])
+    _add_refresh_dataset_parser(subparsers, defaults["refresh-dataset"])
     _add_analyze_parser(subparsers, defaults["analyze"])
     _add_pr_scope_parser(subparsers, defaults["pr-scope"])
     _add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
@@ -52,6 +55,7 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
     _add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
     _add_publish_snapshot_parser(subparsers, defaults["publish-snapshot"])
     _add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
+    _add_dataset_status_parser(subparsers, defaults["dataset-status"])
     _add_full_pipeline_parser(subparsers, defaults["full-pipeline"])
     return parser
 
@@ -59,6 +63,7 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
 def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
     commands = (
         "scrape",
+        "refresh-dataset",
         "analyze",
         "import-hf-checkpoint",
         "pr-scope",
@@ -68,6 +73,7 @@ def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
         "dashboard-data",
         "publish-snapshot",
         "deploy-dashboard",
+        "dataset-status",
         "full-pipeline",
     )
     return {command: command_defaults(command, config_path=config_path) for command in commands}
@@ -184,6 +190,80 @@ def _add_scrape_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     )
 
 
+def _add_refresh_dataset_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
+    refresh = subparsers.add_parser(
+        "refresh-dataset",
+        help="Refresh the canonical Hugging Face dataset repo from remote watermark state.",
+    )
+    refresh.add_argument(
+        "--repo",
+        default=defaults.get("repo", "huggingface/transformers"),
+        help="GitHub repository in owner/name form.",
+    )
+    refresh.add_argument(
+        "--hf-repo-id",
+        default=defaults.get("hf-repo-id"),
+        required=defaults.get("hf-repo-id") is None,
+        help="Canonical Hugging Face dataset repo id to refresh.",
+    )
+    refresh.add_argument("--max-issues", type=int, default=defaults.get("max-issues"))
+    refresh.add_argument("--max-prs", type=int, default=defaults.get("max-prs"))
+    refresh.add_argument(
+        "--max-issue-comments", type=int, default=defaults.get("max-issue-comments")
+    )
+    refresh.add_argument(
+        "--max-reviews-per-pr", type=int, default=defaults.get("max-reviews-per-pr")
+    )
+    refresh.add_argument(
+        "--max-review-comments-per-pr",
+        type=int,
+        default=defaults.get("max-review-comments-per-pr"),
+    )
+    refresh.add_argument(
+        "--fetch-timeline",
+        action="store_true",
+        default=bool(defaults.get("fetch-timeline", False)),
+    )
+    refresh.add_argument(
+        "--new-contributor-report",
+        dest="new_contributor_report",
+        action="store_true",
+        default=bool(defaults.get("new-contributor-report", True)),
+    )
+    refresh.add_argument(
+        "--no-new-contributor-report",
+        dest="new_contributor_report",
+        action="store_false",
+    )
+    refresh.add_argument(
+        "--new-contributor-window-days",
+        type=int,
+        default=int(defaults.get("new-contributor-window-days", 42)),
+    )
+    refresh.add_argument(
+        "--new-contributor-max-authors",
+        type=int,
+        default=int(defaults.get("new-contributor-max-authors", 25)),
+    )
+    refresh.add_argument("--http-timeout", type=int, default=300)
+    refresh.add_argument("--http-max-retries", type=int, default=8)
+    refresh.add_argument("--checkpoint-every-comments", type=int, default=1000)
+    refresh.add_argument("--checkpoint-every-prs", type=int, default=25)
+    refresh.add_argument(
+        "--private-hf-repo",
+        dest="private_hf_repo",
+        action="store_true",
+        default=bool(defaults.get("private-hf-repo", False)),
+        help="Create the target dataset repo as private if needed.",
+    )
+    refresh.add_argument(
+        "--private",
+        dest="private_hf_repo",
+        action="store_true",
+        help=argparse.SUPPRESS,
+    )
+
+
 def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     analyze = subparsers.add_parser(
         "analyze", help="Analyze a local snapshot and write a shortlist JSON report."
@@ -637,6 +717,61 @@ def _add_pr_search_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     status.add_argument("--repo", help="Optional repo override.")
     status.add_argument("--json", action="store_true", help="Emit JSON.")
 
+    contributor = pr_search_subparsers.add_parser(
+        "contributor", help="Show indexed contributor summary for one author login."
+    )
+    contributor.add_argument("login", help="GitHub author login to query.")
+    contributor.add_argument(
+        "--db",
+        type=Path,
+        default=Path(defaults["db"]) if defaults.get("db") else None,
+        help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
+    )
+    contributor.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path(defaults.get("output-dir", "data")),
+    )
+    contributor.add_argument("--repo", help="Optional repo override.")
+    contributor.add_argument("--json", action="store_true", help="Emit JSON.")
+
+    contributor_prs = pr_search_subparsers.add_parser(
+        "contributor-prs", help="List indexed PRs for one contributor login."
+    )
+    contributor_prs.add_argument("login", help="GitHub author login to query.")
+    contributor_prs.add_argument(
+        "--db",
+        type=Path,
+        default=Path(defaults["db"]) if defaults.get("db") else None,
+        help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
+    )
+    contributor_prs.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path(defaults.get("output-dir", "data")),
+    )
+    contributor_prs.add_argument("--repo", help="Optional repo override.")
+    contributor_prs.add_argument("--limit", type=int, default=20, help="Maximum rows to show.")
+    contributor_prs.add_argument("--json", action="store_true", help="Emit JSON.")
+
+    pr_contributor = pr_search_subparsers.add_parser(
+        "pr-contributor", help="Show contributor summary for the author of one indexed PR."
+    )
+    pr_contributor.add_argument("pr_number", type=int, help="Pull request number to query.")
+    pr_contributor.add_argument(
+        "--db",
+        type=Path,
+        default=Path(defaults["db"]) if defaults.get("db") else None,
+        help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
+    )
+    pr_contributor.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path(defaults.get("output-dir", "data")),
+    )
+    pr_contributor.add_argument("--repo", help="Optional repo override.")
+    pr_contributor.add_argument("--json", action="store_true", help="Emit JSON.")
+
 
 def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     new_contributor = subparsers.add_parser(
@@ -659,6 +794,24 @@ def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     new_contributor.add_argument(
         "--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
     )
+    new_contributor.add_argument(
+        "--hf-repo-id",
+        default=defaults.get("hf-repo-id"),
+        help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
+    )
+    new_contributor.add_argument(
+        "--hf-revision",
+        default=defaults.get("hf-revision"),
+        help="Optional Hub revision for metadata and README download.",
+    )
+    new_contributor.add_argument(
+        "--hf-materialize-dir",
+        type=Path,
+        default=Path(defaults["hf-materialize-dir"])
+        if defaults.get("hf-materialize-dir")
+        else None,
+        help="Optional local directory used when materializing an HF dataset snapshot.",
+    )
     new_contributor.add_argument(
         "--window-days",
         type=int,
@@ -702,6 +855,24 @@ def _add_dashboard_data_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
         type=Path,
         help="Optional PR scope cluster JSON. Defaults to pr-scope-clusters.json in the snapshot.",
     )
+    dashboard.add_argument(
+        "--hf-repo-id",
+        default=defaults.get("hf-repo-id"),
+        help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
+    )
+    dashboard.add_argument(
+        "--hf-revision",
+        default=defaults.get("hf-revision"),
+        help="Optional Hub revision for metadata and README download.",
+    )
+    dashboard.add_argument(
+        "--hf-materialize-dir",
+        type=Path,
+        default=Path(defaults["hf-materialize-dir"])
+        if defaults.get("hf-materialize-dir")
+        else None,
+        help="Optional local directory used when materializing an HF dataset snapshot.",
+    )
     dashboard.add_argument(
         "--window-days",
         type=int,
@@ -761,6 +932,24 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     deploy_dashboard.add_argument(
         "--contributors-input", type=Path, help="Optional contributor report JSON override."
     )
+    deploy_dashboard.add_argument(
+        "--hf-repo-id",
+        default=defaults.get("hf-repo-id"),
+        help="Materialize a Hugging Face dataset repo instead of using the latest local snapshot.",
+    )
+    deploy_dashboard.add_argument(
+        "--hf-revision",
+        default=defaults.get("hf-revision"),
+        help="Optional Hub revision for metadata and README download.",
+    )
+    deploy_dashboard.add_argument(
+        "--hf-materialize-dir",
+        type=Path,
+        default=Path(defaults["hf-materialize-dir"])
+        if defaults.get("hf-materialize-dir")
+        else None,
+        help="Optional local directory used when materializing an HF dataset snapshot.",
+    )
     deploy_dashboard.add_argument(
         "--refresh-contributors",
         action="store_true",
@@ -817,6 +1006,31 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     )
 
 
+def _add_dataset_status_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
+    dataset_status = subparsers.add_parser(
+        "dataset-status",
+        help="Inspect canonical dataset freshness and the local latest pointer.",
+    )
+    dataset_status.add_argument("--repo", default=defaults.get("repo"))
+    dataset_status.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path(defaults.get("output-dir", "data")),
+        help="Local workspace root containing snapshots/latest.json.",
+    )
+    dataset_status.add_argument(
+        "--hf-repo-id",
+        default=defaults.get("hf-repo-id"),
+        help="Canonical Hugging Face dataset repo id to inspect.",
+    )
+    dataset_status.add_argument(
+        "--hf-revision",
+        default=defaults.get("hf-revision"),
+        help="Optional Hub revision for metadata and README download.",
+    )
+    dataset_status.add_argument("--json", action="store_true", help="Emit machine-readable JSON.")
+
+
 def _add_full_pipeline_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     full_pipeline = subparsers.add_parser(
         "full-pipeline",
@@ -933,6 +1147,33 @@ def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
     print(run_pipeline(options))
 
 
+def _run_refresh_dataset(args: argparse.Namespace, config_path: Path | None) -> None:
+    del config_path
+    from slop_farmer.app.dataset_refresh import run_dataset_refresh
+
+    result = run_dataset_refresh(
+        DatasetRefreshOptions(
+            repo=RepoRef.parse(args.repo),
+            hf_repo_id=args.hf_repo_id,
+            private_hf_repo=args.private_hf_repo,
+            max_issues=args.max_issues,
+            max_prs=args.max_prs,
+            max_issue_comments=args.max_issue_comments,
+            max_reviews_per_pr=args.max_reviews_per_pr,
+            max_review_comments_per_pr=args.max_review_comments_per_pr,
+            fetch_timeline=args.fetch_timeline,
+            new_contributor_report=args.new_contributor_report,
+            new_contributor_window_days=args.new_contributor_window_days,
+            new_contributor_max_authors=args.new_contributor_max_authors,
+            http_timeout=args.http_timeout,
+            http_max_retries=args.http_max_retries,
+            checkpoint_every_comments=args.checkpoint_every_comments,
+            checkpoint_every_prs=args.checkpoint_every_prs,
+        )
+    )
+    print(json.dumps(result, indent=2))
+
+
 def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
     from slop_farmer.reports.analysis import run_analysis
 
@@ -1041,12 +1282,18 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
         explain_pr_search_pair,
         format_pr_search_candidate_clusters,
        format_pr_search_cluster,
+        format_pr_search_contributor,
+        format_pr_search_contributor_pulls,
         format_pr_search_pair,
         format_pr_search_probe,
+        format_pr_search_pull_contributor,
         format_pr_search_similar,
         format_pr_search_status,
         get_pr_search_candidate_clusters,
         get_pr_search_cluster,
+        get_pr_search_contributor,
+        get_pr_search_contributor_pulls,
+        get_pr_search_pull_contributor,
         get_pr_search_similar,
         get_pr_search_status,
         probe_pr_search_github,
@@ -1140,6 +1387,36 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
         print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
         return
 
+    if args.pr_search_command == "contributor":
+        result = get_pr_search_contributor(db_path, author_login=args.login, repo=args.repo)
+        print(json.dumps(result, indent=2) if args.json else format_pr_search_contributor(result))
+        return
+
+    if args.pr_search_command == "contributor-prs":
+        result = get_pr_search_contributor_pulls(
+            db_path,
+            author_login=args.login,
+            repo=args.repo,
+            limit=args.limit,
+        )
+        print(
+            json.dumps(result, indent=2)
+            if args.json
+            else format_pr_search_contributor_pulls(result)
+        )
+        return
+
+    if args.pr_search_command == "pr-contributor":
+        result = get_pr_search_pull_contributor(
+            db_path,
+            pr_number=args.pr_number,
+            repo=args.repo,
+        )
+        print(
+            json.dumps(result, indent=2) if args.json else format_pr_search_pull_contributor(result)
+        )
+        return
+
     raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
 
 
@@ -1181,6 +1458,7 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | None) -> None:
     del config_path
     from slop_farmer.reports.new_contributor_report import run_new_contributor_report
 
+    hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
     print(
         run_new_contributor_report(
             NewContributorReportOptions(
@@ -1188,6 +1466,9 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | None) -> None:
                 output_dir=args.output_dir,
                 output=args.output,
                 json_output=args.json_output,
+                hf_repo_id=hf_repo_id,
+                hf_revision=hf_revision,
+                hf_materialize_dir=hf_materialize_dir,
                 window_days=args.window_days,
                 max_authors=args.max_authors,
             )
@@ -1199,6 +1480,7 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> None:
     from slop_farmer.reports.dashboard import run_dashboard_data
 
     dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
+    hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
     print(
         run_dashboard_data(
             DashboardDataOptions(
@@ -1207,6 +1489,9 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> None:
                 analysis_input=args.analysis_input,
                 contributors_input=args.contributors_input,
                 pr_scope_input=args.pr_scope_input,
+                hf_repo_id=hf_repo_id,
+                hf_revision=hf_revision,
+                hf_materialize_dir=hf_materialize_dir,
                 window_days=args.window_days,
                 snapshot_root=(
                     Path(dashboard_defaults["snapshot-root"])
@@ -1222,6 +1507,7 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) -> None:
     del config_path
     from slop_farmer.app.deploy import run_deploy_dashboard
 
+    hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
    run_deploy_dashboard(
         DeployDashboardOptions(
             pipeline_data_dir=args.pipeline_data_dir,
@@ -1229,6 +1515,9 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) -> None:
             snapshot_dir=args.snapshot_dir,
             analysis_input=args.analysis_input,
             contributors_input=args.contributors_input,
+            hf_repo_id=hf_repo_id,
+            hf_revision=hf_revision,
+            hf_materialize_dir=hf_materialize_dir,
             refresh_contributors=args.refresh_contributors,
             dashboard_window_days=args.dashboard_window_days,
             contributor_window_days=args.contributor_window_days,
@@ -1247,6 +1536,22 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) -> None:
     )
 
 
+def _run_dataset_status(args: argparse.Namespace, config_path: Path | None) -> None:
+    del config_path
+    from slop_farmer.app.dataset_status import format_dataset_status, get_dataset_status
+
+    result = get_dataset_status(
+        DatasetStatusOptions(
+            repo=args.repo,
+            output_dir=args.output_dir,
+            hf_repo_id=args.hf_repo_id,
+            hf_revision=args.hf_revision,
+            json_output=args.json,
+        )
+    )
+    print(json.dumps(result, indent=2) if args.json else format_dataset_status(result))
+
+
 def _run_publish_snapshot(args: argparse.Namespace, config_path: Path | None) -> None:
     del config_path
     from slop_farmer.app.publish import run_publish_snapshot
@@ -1296,6 +1601,7 @@ def main() -> None:
 
     handlers: dict[str, CommandHandler] = {
         "scrape": _run_scrape,
+        "refresh-dataset": _run_refresh_dataset,
         "analyze": _run_analyze,
         "markdown-report": _run_markdown_report,
         "duplicate-prs": _run_duplicate_prs,
@@ -1306,6 +1612,7 @@ def main() -> None:
         "new-contributor-report": _run_new_contributor_report,
         "dashboard-data": _run_dashboard_data,
         "deploy-dashboard": _run_deploy_dashboard,
+        "dataset-status": _run_dataset_status,
         "publish-snapshot": _run_publish_snapshot,
         "full-pipeline": _run_full_pipeline,
     }
src/slop_farmer/app/dataset_refresh.py
ADDED
@@ -0,0 +1,1021 @@
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import tempfile
+import time
+from collections import defaultdict
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from huggingface_hub import HfApi
+
+from slop_farmer.app_config import command_defaults, extract_cli_config_path
+from slop_farmer.config import (
+    DatasetRefreshOptions,
+    NewContributorReportOptions,
+    RepoRef,
+    resolve_github_token,
+)
+from slop_farmer.data.dataset_card import build_hf_dataset_card
+from slop_farmer.data.github_api import GitHubClient
+from slop_farmer.data.hf_dataset_repo import (
+    list_remote_paths,
+    load_remote_file,
+    load_remote_json_file,
+    stable_snapshot_candidates,
+)
+from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
+from slop_farmer.data.normalize import (
+    issue_url_to_number,
+    normalize_comment,
+    normalize_issue,
+    normalize_pr_diff,
+    normalize_pr_file,
+    normalize_pull_request,
+    normalize_review,
+    normalize_review_comment,
+    normalize_timeline_event,
+)
+from slop_farmer.data.parquet_io import (
+    SCHEMAS,
+    read_parquet_rows,
+    write_json,
+    write_parquet,
+    write_text,
+)
+from slop_farmer.reports.new_contributor_report import run_new_contributor_report
+
+PRIMARY_KEYS: dict[str, tuple[str, ...]] = {
+    "issues": ("github_id",),
+    "pull_requests": ("github_id",),
+    "comments": ("github_id",),
+    "reviews": ("github_id",),
+    "review_comments": ("github_id",),
+    "pr_files": ("repo", "pull_request_number", "filename"),
+    "pr_diffs": ("repo", "pull_request_number"),
+    "links": (
+        "repo",
+        "source_type",
+        "source_number",
+        "source_github_id",
+        "target_owner",
+        "target_repo",
+        "target_number",
+        "link_type",
+        "link_origin",
+    ),
+    "events": (
+        "repo",
+        "parent_kind",
+        "parent_number",
+        "event",
+        "created_at",
+        "actor_login",
+        "source_issue_number",
+        "source_issue_url",
+        "commit_id",
+        "label_name",
+    ),
+}
+CHECKPOINT_PREFIXES = ("_checkpoints", "checkpoints")
+
+
+def log(message: str) -> None:
+    stamp = datetime.now(tz=UTC).strftime("%H:%M:%SZ")
+    print(f"[{stamp}] {message}", flush=True)
+
+
+def iso_now() -> str:
+    return datetime.now(tz=UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+def snapshot_id() -> str:
+    return datetime.now(tz=UTC).strftime("%Y%m%dT%H%M%SZ")
+
+
+def row_key(row: dict[str, Any], fields: tuple[str, ...]) -> str:
+    return json.dumps([row.get(field) for field in fields], default=str)
+
+
+def merge_rows(
+    table_name: str,
+    previous_rows: list[dict[str, Any]],
+    delta_rows: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    if table_name == "pr_files":
+        refreshed_prs = {
+            (row.get("repo"), row.get("pull_request_number"))
+            for row in delta_rows
+            if row.get("pull_request_number") is not None
+        }
+        previous_rows = [
+            row
+            for row in previous_rows
+            if (row.get("repo"), row.get("pull_request_number")) not in refreshed_prs
+        ]
+    merged: dict[str, dict[str, Any]] = {}
+    for row in previous_rows:
+        merged[row_key(row, PRIMARY_KEYS[table_name])] = row
+    for row in delta_rows:
+        merged[row_key(row, PRIMARY_KEYS[table_name])] = row
+    return list(merged.values())
+
+
+def checkpoint_dirs(remote_paths: set[str]) -> list[tuple[str, str]]:
+    by_snapshot_id: dict[str, str] = {}
+    for path in remote_paths:
+        parts = path.split("/")
+        if len(parts) < 3 or parts[0] not in CHECKPOINT_PREFIXES:
+            continue
+        snapshot_key = parts[1]
+        prefix = parts[0]
+        current = by_snapshot_id.get(snapshot_key)
+        if current is None or current.startswith("checkpoints/"):
+            by_snapshot_id[snapshot_key] = f"{prefix}/{snapshot_key}"
+    return [(sid, by_snapshot_id[sid]) for sid in sorted(by_snapshot_id)]
+
+
+def copy_remote_file_from_candidates(
+    api: HfApi,
+    repo_id: str,
+    local_dir: Path,
+    destination: Path,
+    candidate_paths: list[str],
+) -> bool:
+    for candidate in candidate_paths:
+        downloaded = load_remote_file(api, repo_id, candidate, local_dir)
+        if downloaded is None:
+            continue
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(downloaded, destination)
+        return True
+    return False
+
+
+def materialize_previous_snapshot_dir(
+    *,
+    api: Any,
+    repo_id: str,
+    previous_root: Path,
+    stable_snapshot_id: str | None,
+    latest_pointer: dict[str, Any] | None,
+    previous_tables: dict[str, list[dict[str, Any]]],
+) -> Path | None:
+    if not stable_snapshot_id:
+        return None
+    snapshot_dir = (previous_root / "materialized-snapshots" / stable_snapshot_id).resolve()
+    snapshot_dir.mkdir(parents=True, exist_ok=True)
+    for table_name, rows in previous_tables.items():
+        write_parquet(rows, snapshot_dir / f"{table_name}.parquet", table_name)
+    for artifact_name in (
+        "manifest.json",
+        "new_contributors.parquet",
+        "new-contributors-report.json",
+        "new-contributors-report.md",
+    ):
+        copy_remote_file_from_candidates(
+            api,
+            repo_id,
+            previous_root,
+            snapshot_dir / artifact_name,
+            stable_snapshot_candidates(latest_pointer, artifact_name),
+        )
+    return snapshot_dir
+
+
+def load_remote_table_from_candidates(
+    api: HfApi,
+    repo_id: str,
+    table_name: str,
+    local_dir: Path,
+    candidate_paths: list[str],
+) -> list[dict[str, Any]]:
+    for candidate in candidate_paths:
+        downloaded = load_remote_file(api, repo_id, candidate, local_dir)
+        if downloaded is not None:
+            return read_parquet_rows(downloaded)
+    return []
+
+
+def viewer_comment_rows(
+    comments: list[dict[str, Any]],
+    pull_requests: list[dict[str, Any]],
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+    pr_numbers = {int(row["number"]) for row in pull_requests if row.get("number") is not None}
+    issue_comments: list[dict[str, Any]] = []
+    pr_comments: list[dict[str, Any]] = []
+    for row in comments:
+        parent_number = row.get("parent_number")
+        parent_kind = row.get("parent_kind")
+        if parent_kind == "pull_request" or parent_number in pr_numbers:
+            pr_comments.append(row)
+        else:
+            issue_comments.append(row)
+    return issue_comments, pr_comments
+
+
+def upload_delta_checkpoint(
+    *,
+    api: HfApi,
+    repo_id: str,
+    work_dir: Path,
+    repo_slug: str,
+    sid: str,
+    stage: str,
+    delta_tables: dict[str, list[dict[str, Any]]],
+    progress: dict[str, Any],
+) -> None:
+    checkpoint_root = work_dir / "checkpoint_upload"
+    if checkpoint_root.exists():
+        shutil.rmtree(checkpoint_root)
+    checkpoint_root.mkdir(parents=True, exist_ok=True)
+
+    for table_name, rows in delta_tables.items():
+        write_parquet(rows, checkpoint_root / f"{table_name}.parquet", table_name)
+    write_json(
+        {"repo": repo_slug, "snapshot_id": sid, **progress}, checkpoint_root / "progress.json"
+    )
+    write_json(
+        {"repo": repo_slug, "snapshot_id": sid, **progress},
+        checkpoint_root / "state" / "in_progress.json",
+    )
+    api.upload_folder(
+        folder_path=str(checkpoint_root),
+        path_in_repo=f"_checkpoints/{sid}",
+        repo_id=repo_id,
+        repo_type="dataset",
+        commit_message=f"Checkpoint {sid} ({stage})",
+    )
+
+
+def remaining_limit(limit: int | None, used: int) -> int | None:
+    if limit is None:
+        return None
+    return max(limit - used, 0)
+
+
+def _build_argument_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
+    defaults = command_defaults("refresh-dataset", config_path=config_path)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=Path, help="Optional repo config file.")
+    parser.add_argument("--repo", default=defaults.get("repo", "huggingface/transformers"))
+    parser.add_argument("--hf-repo-id", default=defaults.get("hf-repo-id"))
+    parser.add_argument("--max-issues", type=int, default=defaults.get("max-issues"))
+    parser.add_argument("--max-prs", type=int, default=defaults.get("max-prs"))
+    parser.add_argument(
+        "--max-issue-comments",
+        type=int,
+        default=defaults.get("max-issue-comments"),
+    )
+    parser.add_argument(
+        "--max-reviews-per-pr",
+        type=int,
+        default=defaults.get("max-reviews-per-pr"),
+    )
+    parser.add_argument(
+        "--max-review-comments-per-pr",
+        type=int,
+        default=defaults.get("max-review-comments-per-pr"),
+    )
+    parser.add_argument(
+        "--fetch-timeline",
+        action="store_true",
+        default=bool(defaults.get("fetch-timeline", False)),
+    )
+    parser.add_argument(
+        "--new-contributor-report",
+        dest="new_contributor_report",
+        action="store_true",
+        default=bool(defaults.get("new-contributor-report", True)),
+    )
+    parser.add_argument(
+        "--no-new-contributor-report",
+        dest="new_contributor_report",
+        action="store_false",
+    )
+    parser.add_argument(
+        "--new-contributor-window-days",
+        type=int,
+        default=int(defaults.get("new-contributor-window-days", 42)),
+    )
+    parser.add_argument(
+        "--new-contributor-max-authors",
+        type=int,
+        default=int(defaults.get("new-contributor-max-authors", 25)),
+    )
+    parser.add_argument("--http-timeout", type=int, default=300)
+    parser.add_argument("--http-max-retries", type=int, default=8)
+    parser.add_argument("--checkpoint-every-comments", type=int, default=1000)
+    parser.add_argument("--checkpoint-every-prs", type=int, default=25)
+    parser.add_argument(
+        "--private-hf-repo",
+        dest="private_hf_repo",
+        action="store_true",
+        default=bool(defaults.get("private-hf-repo", False)),
+    )
+    parser.add_argument("--private", dest="private_hf_repo", action="store_true")
+    return parser
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    config_path = extract_cli_config_path(argv)
+    parser = _build_argument_parser(config_path=config_path)
+    args = parser.parse_args(argv)
+    if not args.hf_repo_id:
+        parser.error("--hf-repo-id is required (or set dataset_id in --config)")
+    return args
+
+
+def run_dataset_refresh(options: DatasetRefreshOptions) -> dict[str, Any]:
+    hf_token = os.getenv("HF_TOKEN")
+    github_token = resolve_github_token()
+    if not github_token:
+        raise RuntimeError("GITHUB_TOKEN must be set or resolvable via gh auth/.env")
+
+    repo_slug = options.repo.slug
+    owner, repo_name = options.repo.owner, options.repo.name
+    sid = snapshot_id()
+    crawl_started_at = iso_now()
+    extracted_at = iso_now()
+
+    api = HfApi(token=hf_token)
+    api.create_repo(
+        repo_id=options.hf_repo_id,
+        repo_type="dataset",
+        private=options.private_hf_repo,
+        exist_ok=True,
+    )
+
+    with tempfile.TemporaryDirectory(prefix="slop-farmer-job-") as tmp:
+        root = Path(tmp)
+        previous_root = root / "previous"
+        output_root = root / "output"
+        previous_root.mkdir(parents=True, exist_ok=True)
+        output_root.mkdir(parents=True, exist_ok=True)
+
+        remote_paths = list_remote_paths(api, options.hf_repo_id)
+        previous_watermark = load_remote_json_file(
+            api, options.hf_repo_id, "state/watermark.json", previous_root
+        )
+        remote_manifest = load_remote_json_file(
+            api, options.hf_repo_id, "manifest.json", previous_root
+        )
+        latest_pointer = (
+            load_remote_json_file(api, options.hf_repo_id, "snapshots/latest.json", previous_root)
+            if "snapshots/latest.json" in remote_paths
+            else None
+        )
+        stable_snapshot_id = None
+        if previous_watermark:
+            stable_snapshot_id = previous_watermark.get("last_successful_snapshot_id")
+        elif latest_pointer:
+            stable_snapshot_id = latest_pointer.get("latest_snapshot_id")
+        elif remote_manifest:
+            stable_snapshot_id = remote_manifest.get("snapshot_id")
+
+        log(f"Starting dataset refresh for {repo_slug}")
+        log(f"Target dataset repo: {options.hf_repo_id}")
+        previous_tables = {
+            table_name: [] for table_name in SCHEMAS if table_name != "new_contributors"
+        }
+        for table_name in previous_tables:
+            previous_tables[table_name] = load_remote_table_from_candidates(
+                api,
+                options.hf_repo_id,
+                table_name,
+                previous_root,
+                stable_snapshot_candidates(latest_pointer, f"{table_name}.parquet"),
+            )
+
+        checkpoint_progress: dict[str, Any] | None = None
+        best_comment_checkpoint_progress: dict[str, Any] | None = None
+        for checkpoint_sid, checkpoint_dir in checkpoint_dirs(remote_paths):
+            if stable_snapshot_id is not None and checkpoint_sid <= str(stable_snapshot_id):
+                continue
+            progress_payload = load_remote_json_file(
+                api, options.hf_repo_id, f"{checkpoint_dir}/progress.json", previous_root
+            ) or load_remote_json_file(
+                api,
+                options.hf_repo_id,
+                f"{checkpoint_dir}/state/in_progress.json",
+                previous_root,
+            )
+            if progress_payload is not None:
+                checkpoint_progress = progress_payload
+                if (
+                    progress_payload.get("effective_since") is None
+                    and (progress_payload.get("counts") or {}).get("comments", 0) > 0
+                    and (
+                        best_comment_checkpoint_progress is None
+                        or (progress_payload.get("counts") or {}).get("comments", 0)
+                        > (best_comment_checkpoint_progress.get("counts") or {}).get("comments", 0)
+                    )
+                ):
+                    best_comment_checkpoint_progress = progress_payload
+            for table_name in previous_tables:
+                checkpoint_rows = load_remote_table_from_candidates(
+                    api,
+                    options.hf_repo_id,
+                    table_name,
+                    previous_root,
+                    [f"{checkpoint_dir}/{table_name}.parquet"],
+                )
+                if checkpoint_rows:
+                    previous_tables[table_name] = merge_rows(
+                        table_name,
+                        previous_tables[table_name],
+                        checkpoint_rows,
+                    )
+
+        effective_since = None
+        if checkpoint_progress and checkpoint_progress.get("effective_since") is not None:
+            effective_since = checkpoint_progress.get("effective_since")
+            log(f"Resuming from incomplete checkpoint window starting at {effective_since}")
+        elif previous_watermark and previous_watermark.get("next_since") is not None:
+            effective_since = previous_watermark.get("next_since")
+            log(f"Resuming from remote watermark {effective_since}")
+        elif (
+            remote_manifest
+            and isinstance(remote_manifest.get("watermark"), dict)
+            and remote_manifest["watermark"].get("next_since") is not None
+        ):
+            effective_since = remote_manifest["watermark"].get("next_since")
+            log(f"Bootstrapping remote watermark from root manifest {effective_since}")
+        else:
+            log("No successful watermark found; running full snapshot")
+
+        client = GitHubClient(
+            token=github_token,
+            timeout=options.http_timeout,
+            max_retries=options.http_max_retries,
+            log=log,
+        )
+        previous_snapshot_dir = materialize_previous_snapshot_dir(
+            api=api,
+            repo_id=options.hf_repo_id,
+            previous_root=previous_root,
+            stable_snapshot_id=str(stable_snapshot_id) if stable_snapshot_id is not None else None,
+            latest_pointer=latest_pointer,
+            previous_tables=previous_tables,
+        )
+
+        rate_limit = client.get_json("/rate_limit")
+        core = (rate_limit.get("resources") or {}).get("core") or {}
+        limit = core.get("limit")
+        remaining = core.get("remaining")
+        reset_at = core.get("reset")
+        log(f"GitHub core rate limit: limit={limit} remaining={remaining} reset={reset_at}")
+        if limit is not None and int(limit) <= 60:
+            raise RuntimeError("GITHUB_TOKEN appears to be missing, invalid, or not being applied")
+        if remaining == 0 and reset_at:
+            sleep_for = max(int(reset_at) - int(time.time()), 1)
+            log(f"GitHub token exhausted before bootstrap; sleeping {sleep_for}s until reset")
+            time.sleep(sleep_for)
+
+        log("Fetching changed issue and pull request stubs from GitHub")
+        issue_stubs = list(
+            client.iter_repo_issues(owner, repo_name, effective_since, options.max_issues)
+        )
+        issues = [item for item in issue_stubs if "pull_request" not in item]
+        pr_stubs = [item for item in issue_stubs if "pull_request" in item]
+        if options.max_prs is not None:
+            pr_stubs = pr_stubs[: options.max_prs]
+        log(f"Fetched {len(issue_stubs)} changed stubs")
+
+        issue_number_to_kind = {
+            item["number"]: ("pull_request" if "pull_request" in item else "issue")
+            for item in issue_stubs
+        }
+        issue_rows = [normalize_issue(repo_slug, item, sid, extracted_at) for item in issues]
+
+        comment_rows: list[dict[str, Any]] = []
+        next_comment_checkpoint = options.checkpoint_every_comments
+        reuse_checkpoint_comments = (
+            stable_snapshot_id is None
+            and effective_since is None
+            and best_comment_checkpoint_progress is not None
+            and bool(previous_tables["comments"])
+        )
+        if reuse_checkpoint_comments:
+            log(
+                f"Reusing {len(previous_tables['comments'])} checkpoint comments from prior partial runs"
+            )
+        else:
+            for index, item in enumerate(issue_stubs, start=1):
+                if not item.get("comments"):
+                    continue
+                remaining_comments = remaining_limit(options.max_issue_comments, len(comment_rows))
+                if remaining_comments == 0:
+                    break
+                if index == 1 or index % 25 == 0:
+                    log(f"Collecting discussion comments; {len(comment_rows)} collected so far")
+                for comment in client.iter_issue_comments_for_number(
+                    owner,
+                    repo_name,
+                    int(item["number"]),
+                    effective_since,
+                    remaining_comments,
+                ):
+                    parent_number = issue_url_to_number(comment.get("issue_url"))
+                    parent_kind = issue_number_to_kind.get(parent_number, "issue_or_pr")
parent_kind = issue_number_to_kind.get(parent_number, "issue_or_pr")
|
| 525 |
+
comment_rows.append(
|
| 526 |
+
normalize_comment(
|
| 527 |
+
repo_slug,
|
| 528 |
+
comment,
|
| 529 |
+
parent_kind,
|
| 530 |
+
parent_number,
|
| 531 |
+
sid,
|
| 532 |
+
extracted_at,
|
| 533 |
+
)
|
| 534 |
+
)
|
| 535 |
+
remaining_comments = remaining_limit(
|
| 536 |
+
options.max_issue_comments,
|
| 537 |
+
len(comment_rows),
|
| 538 |
+
)
|
| 539 |
+
if (
|
| 540 |
+
options.checkpoint_every_comments
|
| 541 |
+
and len(comment_rows) >= next_comment_checkpoint
|
| 542 |
+
):
|
| 543 |
+
log(f"Pushing comment checkpoint to Hub at {len(comment_rows)} comments")
|
| 544 |
+
upload_delta_checkpoint(
|
| 545 |
+
api=api,
|
| 546 |
+
repo_id=options.hf_repo_id,
|
| 547 |
+
work_dir=root,
|
| 548 |
+
repo_slug=repo_slug,
|
| 549 |
+
sid=sid,
|
| 550 |
+
stage="comments",
|
| 551 |
+
delta_tables={
|
| 552 |
+
"issues": issue_rows,
|
| 553 |
+
"pull_requests": [],
|
| 554 |
+
"comments": comment_rows,
|
| 555 |
+
"reviews": [],
|
| 556 |
+
"review_comments": [],
|
| 557 |
+
"pr_files": [],
|
| 558 |
+
"pr_diffs": [],
|
| 559 |
+
"links": [],
|
| 560 |
+
"events": [],
|
| 561 |
+
},
|
| 562 |
+
progress={
|
| 563 |
+
"stage": "comments",
|
| 564 |
+
"effective_since": effective_since,
|
| 565 |
+
"counts": {
|
| 566 |
+
"issues": len(issue_rows),
|
| 567 |
+
"comments": len(comment_rows),
|
| 568 |
+
"pull_requests": 0,
|
| 569 |
+
"reviews": 0,
|
| 570 |
+
"review_comments": 0,
|
| 571 |
+
"pr_files": 0,
|
| 572 |
+
"pr_diffs": 0,
|
| 573 |
+
"links": 0,
|
| 574 |
+
"events": 0,
|
| 575 |
+
},
|
| 576 |
+
},
|
| 577 |
+
)
|
| 578 |
+
next_comment_checkpoint += options.checkpoint_every_comments
|
| 579 |
+
if remaining_comments == 0:
|
| 580 |
+
break
|
| 581 |
+
|
| 582 |
+
pr_rows: list[dict[str, Any]] = []
|
| 583 |
+
review_rows: list[dict[str, Any]] = []
|
| 584 |
+
review_comment_rows: list[dict[str, Any]] = []
|
| 585 |
+
pr_file_rows: list[dict[str, Any]] = []
|
| 586 |
+
pr_diff_rows: list[dict[str, Any]] = []
|
| 587 |
+
event_rows: list[dict[str, Any]] = []
|
| 588 |
+
next_pr_checkpoint = options.checkpoint_every_prs
|
| 589 |
+
|
| 590 |
+
previous_pr_rows_by_number = {
|
| 591 |
+
int(row["number"]): row
|
| 592 |
+
for row in previous_tables["pull_requests"]
|
| 593 |
+
if row.get("number") is not None
|
| 594 |
+
}
|
| 595 |
+
previous_review_rows_by_number: defaultdict[int, list[dict[str, Any]]] = defaultdict(list)
|
| 596 |
+
for row in previous_tables["reviews"]:
|
| 597 |
+
if row.get("pull_request_number") is not None:
|
| 598 |
+
previous_review_rows_by_number[int(row["pull_request_number"])].append(row)
|
| 599 |
+
previous_review_comment_rows_by_number: defaultdict[int, list[dict[str, Any]]] = (
|
| 600 |
+
defaultdict(list)
|
| 601 |
+
)
|
| 602 |
+
for row in previous_tables["review_comments"]:
|
| 603 |
+
if row.get("pull_request_number") is not None:
|
| 604 |
+
previous_review_comment_rows_by_number[int(row["pull_request_number"])].append(row)
|
| 605 |
+
previous_pr_file_rows_by_number: defaultdict[int, list[dict[str, Any]]] = defaultdict(list)
|
| 606 |
+
for row in previous_tables["pr_files"]:
|
| 607 |
+
if row.get("pull_request_number") is not None:
|
| 608 |
+
previous_pr_file_rows_by_number[int(row["pull_request_number"])].append(row)
|
| 609 |
+
previous_pr_diff_rows_by_number = {
|
| 610 |
+
int(row["pull_request_number"]): row
|
| 611 |
+
for row in previous_tables["pr_diffs"]
|
| 612 |
+
if row.get("pull_request_number") is not None
|
| 613 |
+
}
|
| 614 |
+
previous_pr_event_rows_by_number: defaultdict[int, list[dict[str, Any]]] = defaultdict(list)
|
| 615 |
+
for row in previous_tables["events"]:
|
| 616 |
+
if row.get("parent_kind") == "pull_request" and row.get("parent_number") is not None:
|
| 617 |
+
previous_pr_event_rows_by_number[int(row["parent_number"])].append(row)
|
| 618 |
+
|
| 619 |
+
hydration_pr_stubs: list[dict[str, Any]] = []
|
| 620 |
+
for pr_stub in pr_stubs:
|
| 621 |
+
number = int(pr_stub["number"])
|
| 622 |
+
previous_pr_row = previous_pr_rows_by_number.get(number)
|
| 623 |
+
if previous_pr_row and previous_pr_row.get("updated_at") == pr_stub.get("updated_at"):
|
| 624 |
+
pr_rows.append(previous_pr_row)
|
| 625 |
+
review_rows.extend(previous_review_rows_by_number[number])
|
| 626 |
+
review_comment_rows.extend(previous_review_comment_rows_by_number[number])
|
| 627 |
+
pr_file_rows.extend(previous_pr_file_rows_by_number[number])
|
| 628 |
+
if number in previous_pr_diff_rows_by_number:
|
| 629 |
+
pr_diff_rows.append(previous_pr_diff_rows_by_number[number])
|
| 630 |
+
event_rows.extend(previous_pr_event_rows_by_number[number])
|
| 631 |
+
continue
|
| 632 |
+
hydration_pr_stubs.append(pr_stub)
|
| 633 |
+
|
| 634 |
+
reused_pr_count = len(pr_rows)
|
| 635 |
+
if reused_pr_count:
|
| 636 |
+
log(f"Reusing hydrated data for {reused_pr_count} pull requests from prior checkpoints")
|
| 637 |
+
if options.checkpoint_every_prs:
|
| 638 |
+
while reused_pr_count >= next_pr_checkpoint:
|
| 639 |
+
next_pr_checkpoint += options.checkpoint_every_prs
|
| 640 |
+
|
| 641 |
+
total_prs = len(pr_stubs)
|
| 642 |
+
remaining_prs = len(hydration_pr_stubs)
|
| 643 |
+
for index, pr_stub in enumerate(hydration_pr_stubs, start=1):
|
| 644 |
+
number = int(pr_stub["number"])
|
| 645 |
+
hydrated_count = reused_pr_count + index
|
| 646 |
+
if index == 1 or hydrated_count % 10 == 0 or index == remaining_prs:
|
| 647 |
+
log(f"Hydrating pull requests: {hydrated_count}/{total_prs}")
|
| 648 |
+
detail = client.get_pull_request(owner, repo_name, number)
|
| 649 |
+
pr_rows.append(normalize_pull_request(repo_slug, pr_stub, detail, sid, extracted_at))
|
| 650 |
+
for review in client.iter_pull_reviews(
|
| 651 |
+
owner, repo_name, number, options.max_reviews_per_pr
|
| 652 |
+
):
|
| 653 |
+
review_rows.append(normalize_review(repo_slug, number, review, sid, extracted_at))
|
| 654 |
+
for comment in client.iter_pull_review_comments(
|
| 655 |
+
owner,
|
| 656 |
+
repo_name,
|
| 657 |
+
number,
|
| 658 |
+
options.max_review_comments_per_pr,
|
| 659 |
+
):
|
| 660 |
+
review_comment_rows.append(
|
| 661 |
+
normalize_review_comment(repo_slug, number, comment, sid, extracted_at)
|
| 662 |
+
)
|
| 663 |
+
for pr_file in client.iter_pull_files(owner, repo_name, number):
|
| 664 |
+
pr_file_rows.append(
|
| 665 |
+
normalize_pr_file(repo_slug, number, pr_file, sid, extracted_at)
|
| 666 |
+
)
|
| 667 |
+
pr_diff_rows.append(
|
| 668 |
+
normalize_pr_diff(
|
| 669 |
+
repo_slug,
|
| 670 |
+
number,
|
| 671 |
+
pr_stub.get("html_url"),
|
| 672 |
+
pr_stub.get("url"),
|
| 673 |
+
client.get_pull_request_diff(owner, repo_name, number),
|
| 674 |
+
sid,
|
| 675 |
+
extracted_at,
|
| 676 |
+
)
|
| 677 |
+
)
|
| 678 |
+
if options.fetch_timeline:
|
| 679 |
+
for event in client.iter_issue_timeline(owner, repo_name, number):
|
| 680 |
+
event_rows.append(
|
| 681 |
+
normalize_timeline_event(
|
| 682 |
+
repo_slug,
|
| 683 |
+
number,
|
| 684 |
+
"pull_request",
|
| 685 |
+
event,
|
| 686 |
+
sid,
|
| 687 |
+
extracted_at,
|
| 688 |
+
)
|
| 689 |
+
)
|
| 690 |
+
if options.checkpoint_every_prs and len(pr_rows) >= next_pr_checkpoint:
|
| 691 |
+
log(f"Pushing PR checkpoint to Hub at {len(pr_rows)} hydrated PRs")
|
| 692 |
+
upload_delta_checkpoint(
|
| 693 |
+
api=api,
|
| 694 |
+
repo_id=options.hf_repo_id,
|
| 695 |
+
work_dir=root,
|
| 696 |
+
repo_slug=repo_slug,
|
| 697 |
+
sid=sid,
|
| 698 |
+
stage="pull_requests",
|
| 699 |
+
delta_tables={
|
| 700 |
+
"issues": issue_rows,
|
| 701 |
+
"pull_requests": pr_rows,
|
| 702 |
+
"comments": comment_rows,
|
| 703 |
+
"reviews": review_rows,
|
| 704 |
+
"review_comments": review_comment_rows,
|
| 705 |
+
"pr_files": pr_file_rows,
|
| 706 |
+
"pr_diffs": pr_diff_rows,
|
| 707 |
+
"links": [],
|
| 708 |
+
"events": event_rows,
|
| 709 |
+
},
|
| 710 |
+
progress={
|
| 711 |
+
"stage": "pull_requests",
|
| 712 |
+
"effective_since": effective_since,
|
| 713 |
+
"counts": {
|
| 714 |
+
"issues": len(issue_rows),
|
| 715 |
+
"comments": len(comment_rows),
|
| 716 |
+
"pull_requests": len(pr_rows),
|
| 717 |
+
"reviews": len(review_rows),
|
| 718 |
+
"review_comments": len(review_comment_rows),
|
| 719 |
+
"pr_files": len(pr_file_rows),
|
| 720 |
+
"pr_diffs": len(pr_diff_rows),
|
| 721 |
+
"links": 0,
|
| 722 |
+
"events": len(event_rows),
|
| 723 |
+
},
|
| 724 |
+
},
|
| 725 |
+
)
|
| 726 |
+
next_pr_checkpoint += options.checkpoint_every_prs
|
| 727 |
+
|
| 728 |
+
if options.fetch_timeline:
|
| 729 |
+
log(f"Fetching issue timelines for {len(issues)} changed issues")
|
| 730 |
+
for issue in issues:
|
| 731 |
+
for event in client.iter_issue_timeline(owner, repo_name, int(issue["number"])):
|
| 732 |
+
event_rows.append(
|
| 733 |
+
normalize_timeline_event(
|
| 734 |
+
repo_slug,
|
| 735 |
+
int(issue["number"]),
|
| 736 |
+
"issue",
|
| 737 |
+
event,
|
| 738 |
+
sid,
|
| 739 |
+
extracted_at,
|
| 740 |
+
)
|
| 741 |
+
)
|
| 742 |
+
|
| 743 |
+
link_rows: list[dict[str, Any]] = []
|
| 744 |
+
for row in issue_rows:
|
| 745 |
+
link_rows.extend(
|
| 746 |
+
build_text_link_rows(
|
| 747 |
+
repo=repo_slug,
|
| 748 |
+
owner=owner,
|
| 749 |
+
repo_name=repo_name,
|
| 750 |
+
source_type="issue",
|
| 751 |
+
source_number=row["number"],
|
| 752 |
+
source_id=row["github_id"],
|
| 753 |
+
body=row["body"],
|
| 754 |
+
snapshot_id=sid,
|
| 755 |
+
extracted_at=extracted_at,
|
| 756 |
+
)
|
| 757 |
+
)
|
| 758 |
+
for row in pr_rows:
|
| 759 |
+
link_rows.extend(
|
| 760 |
+
build_text_link_rows(
|
| 761 |
+
repo=repo_slug,
|
| 762 |
+
owner=owner,
|
| 763 |
+
repo_name=repo_name,
|
| 764 |
+
source_type="pull_request",
|
| 765 |
+
source_number=row["number"],
|
| 766 |
+
source_id=row["github_id"],
|
| 767 |
+
body=row["body"],
|
| 768 |
+
snapshot_id=sid,
|
| 769 |
+
extracted_at=extracted_at,
|
| 770 |
+
)
|
| 771 |
+
)
|
| 772 |
+
for row in comment_rows or previous_tables["comments"]:
|
| 773 |
+
if row["parent_number"] is None:
|
| 774 |
+
continue
|
| 775 |
+
link_rows.extend(
|
| 776 |
+
build_text_link_rows(
|
| 777 |
+
repo=repo_slug,
|
| 778 |
+
owner=owner,
|
| 779 |
+
repo_name=repo_name,
|
| 780 |
+
source_type="comment",
|
| 781 |
+
source_number=row["parent_number"],
|
| 782 |
+
source_id=row["github_id"],
|
| 783 |
+
body=row["body"],
|
| 784 |
+
snapshot_id=sid,
|
| 785 |
+
extracted_at=extracted_at,
|
| 786 |
+
)
|
| 787 |
+
)
|
| 788 |
+
for row in review_rows:
|
| 789 |
+
link_rows.extend(
|
| 790 |
+
build_text_link_rows(
|
| 791 |
+
repo=repo_slug,
|
| 792 |
+
owner=owner,
|
| 793 |
+
repo_name=repo_name,
|
| 794 |
+
source_type="review",
|
| 795 |
+
source_number=row["pull_request_number"],
|
| 796 |
+
source_id=row["github_id"],
|
| 797 |
+
body=row["body"],
|
| 798 |
+
snapshot_id=sid,
|
| 799 |
+
extracted_at=extracted_at,
|
| 800 |
+
)
|
| 801 |
+
)
|
| 802 |
+
for row in review_comment_rows:
|
| 803 |
+
link_rows.extend(
|
| 804 |
+
build_text_link_rows(
|
| 805 |
+
repo=repo_slug,
|
| 806 |
+
owner=owner,
|
| 807 |
+
repo_name=repo_name,
|
| 808 |
+
source_type="review_comment",
|
| 809 |
+
source_number=row["pull_request_number"],
|
| 810 |
+
source_id=row["github_id"],
|
| 811 |
+
body=row["body"],
|
| 812 |
+
snapshot_id=sid,
|
| 813 |
+
extracted_at=extracted_at,
|
| 814 |
+
)
|
| 815 |
+
)
|
| 816 |
+
link_rows.extend(
|
| 817 |
+
build_pr_duplicate_candidate_rows(
|
| 818 |
+
repo=repo_slug,
|
| 819 |
+
pull_requests=pr_rows,
|
| 820 |
+
link_rows=link_rows,
|
| 821 |
+
snapshot_id=sid,
|
| 822 |
+
extracted_at=extracted_at,
|
| 823 |
+
)
|
| 824 |
+
)
|
| 825 |
+
for event in event_rows:
|
| 826 |
+
if event.get("source_issue_number"):
|
| 827 |
+
link_rows.append(
|
| 828 |
+
{
|
| 829 |
+
"repo": repo_slug,
|
| 830 |
+
"source_type": event["parent_kind"],
|
| 831 |
+
"source_number": event["parent_number"],
|
| 832 |
+
"source_github_id": None,
|
| 833 |
+
"target_owner": owner,
|
| 834 |
+
"target_repo": repo_name,
|
| 835 |
+
"target_number": event["source_issue_number"],
|
| 836 |
+
"link_type": f"timeline:{event['event']}",
|
| 837 |
+
"link_origin": "timeline",
|
| 838 |
+
"snapshot_id": sid,
|
| 839 |
+
"extracted_at": extracted_at,
|
| 840 |
+
}
|
| 841 |
+
)
|
| 842 |
+
|
| 843 |
+
delta_tables = {
|
| 844 |
+
"issues": issue_rows,
|
| 845 |
+
"pull_requests": pr_rows,
|
| 846 |
+
"comments": comment_rows,
|
| 847 |
+
"reviews": review_rows,
|
| 848 |
+
"review_comments": review_comment_rows,
|
| 849 |
+
"pr_files": pr_file_rows,
|
| 850 |
+
"pr_diffs": pr_diff_rows,
|
| 851 |
+
"links": link_rows,
|
| 852 |
+
"events": event_rows,
|
| 853 |
+
}
|
| 854 |
+
if any(delta_tables.values()):
|
| 855 |
+
log("Pushing final delta checkpoint to Hub before merge upload")
|
| 856 |
+
upload_delta_checkpoint(
|
| 857 |
+
api=api,
|
| 858 |
+
repo_id=options.hf_repo_id,
|
| 859 |
+
work_dir=root,
|
| 860 |
+
repo_slug=repo_slug,
|
| 861 |
+
sid=sid,
|
| 862 |
+
stage="final-delta",
|
| 863 |
+
delta_tables=delta_tables,
|
| 864 |
+
progress={
|
| 865 |
+
"stage": "final-delta",
|
| 866 |
+
"effective_since": effective_since,
|
| 867 |
+
"counts": {name: len(rows) for name, rows in delta_tables.items()},
|
| 868 |
+
},
|
| 869 |
+
)
|
| 870 |
+
|
| 871 |
+
final_tables = {
|
| 872 |
+
table_name: merge_rows(table_name, previous_tables[table_name], delta_rows)
|
| 873 |
+
for table_name, delta_rows in delta_tables.items()
|
| 874 |
+
}
|
| 875 |
+
manifest = {
|
| 876 |
+
"repo": repo_slug,
|
| 877 |
+
"snapshot_id": sid,
|
| 878 |
+
"crawl_started_at": crawl_started_at,
|
| 879 |
+
"extracted_at": extracted_at,
|
| 880 |
+
"watermark": {
|
| 881 |
+
"effective_since": effective_since,
|
| 882 |
+
"next_since": crawl_started_at,
|
| 883 |
+
"previous_snapshot_dir": (
|
| 884 |
+
str(previous_snapshot_dir) if previous_snapshot_dir is not None else None
|
| 885 |
+
),
|
| 886 |
+
},
|
| 887 |
+
"delta_counts": {
|
| 888 |
+
"issue_stubs": len(issue_stubs),
|
| 889 |
+
"issues": len(issue_rows),
|
| 890 |
+
"pull_requests": len(pr_rows),
|
| 891 |
+
"comments": len(comment_rows),
|
| 892 |
+
"reviews": len(review_rows),
|
| 893 |
+
"review_comments": len(review_comment_rows),
|
| 894 |
+
"pr_files": len(pr_file_rows),
|
| 895 |
+
"pr_diffs": len(pr_diff_rows),
|
| 896 |
+
"timeline_events": len(event_rows),
|
| 897 |
+
"links": len(link_rows),
|
| 898 |
+
},
|
| 899 |
+
"counts": {
|
| 900 |
+
"issues": len(final_tables["issues"]),
|
| 901 |
+
"pull_requests": len(final_tables["pull_requests"]),
|
| 902 |
+
"comments": len(final_tables["comments"]),
|
| 903 |
+
"reviews": len(final_tables["reviews"]),
|
| 904 |
+
"review_comments": len(final_tables["review_comments"]),
|
| 905 |
+
"pr_files": len(final_tables["pr_files"]),
|
| 906 |
+
"pr_diffs": len(final_tables["pr_diffs"]),
|
| 907 |
+
"timeline_events": len(final_tables["events"]),
|
| 908 |
+
"links": len(final_tables["links"]),
|
| 909 |
+
},
|
| 910 |
+
}
|
| 911 |
+
|
| 912 |
+
log("Writing updated dataset files")
|
| 913 |
+
for table_name, rows in final_tables.items():
|
| 914 |
+
write_parquet(rows, output_root / f"{table_name}.parquet", table_name)
|
| 915 |
+
issue_comment_rows, pr_comment_rows = viewer_comment_rows(
|
| 916 |
+
final_tables["comments"],
|
| 917 |
+
final_tables["pull_requests"],
|
| 918 |
+
)
|
| 919 |
+
write_parquet(issue_comment_rows, output_root / "issue_comments.parquet", "comments")
|
| 920 |
+
write_parquet(pr_comment_rows, output_root / "pr_comments.parquet", "comments")
|
| 921 |
+
if options.new_contributor_report:
|
| 922 |
+
write_json(manifest, output_root / "manifest.json")
|
| 923 |
+
log("Generating new contributor dataset/report artifacts")
|
| 924 |
+
run_new_contributor_report(
|
| 925 |
+
NewContributorReportOptions(
|
| 926 |
+
snapshot_dir=output_root,
|
| 927 |
+
output_dir=output_root,
|
| 928 |
+
output=None,
|
| 929 |
+
json_output=None,
|
| 930 |
+
hf_repo_id=None,
|
| 931 |
+
hf_revision=None,
|
| 932 |
+
hf_materialize_dir=None,
|
| 933 |
+
window_days=options.new_contributor_window_days,
|
| 934 |
+
max_authors=options.new_contributor_max_authors,
|
| 935 |
+
)
|
| 936 |
+
)
|
| 937 |
+
manifest["counts"]["new_contributors"] = len(
|
| 938 |
+
read_parquet_rows(output_root / "new_contributors.parquet")
|
| 939 |
+
)
|
| 940 |
+
manifest["artifacts"] = {
|
| 941 |
+
"new_contributors_parquet": "new_contributors.parquet",
|
| 942 |
+
"new_contributors_json": "new-contributors-report.json",
|
| 943 |
+
"new_contributors_markdown": "new-contributors-report.md",
|
| 944 |
+
}
|
| 945 |
+
manifest["watermark"].pop("previous_snapshot_dir", None)
|
| 946 |
+
write_json(manifest, output_root / "manifest.json")
|
| 947 |
+
write_text(
|
| 948 |
+
build_hf_dataset_card(
|
| 949 |
+
repo_slug,
|
| 950 |
+
sid,
|
| 951 |
+
include_new_contributors=options.new_contributor_report,
|
| 952 |
+
),
|
| 953 |
+
output_root / "README.md",
|
| 954 |
+
)
|
| 955 |
+
write_json(
|
| 956 |
+
{
|
| 957 |
+
"repo": repo_slug,
|
| 958 |
+
"last_successful_snapshot_id": sid,
|
| 959 |
+
"effective_since": effective_since,
|
| 960 |
+
"next_since": crawl_started_at,
|
| 961 |
+
"updated_at": extracted_at,
|
| 962 |
+
},
|
| 963 |
+
output_root / "state" / "watermark.json",
|
| 964 |
+
)
|
| 965 |
+
write_json(manifest, output_root / "snapshots" / sid / "manifest.json")
|
| 966 |
+
write_json(
|
| 967 |
+
{
|
| 968 |
+
"repo": repo_slug,
|
| 969 |
+
"latest_snapshot_id": sid,
|
| 970 |
+
"snapshot_dir": f"snapshots/{sid}",
|
| 971 |
+
"manifest_path": "manifest.json",
|
| 972 |
+
"archived_manifest_path": f"snapshots/{sid}/manifest.json",
|
| 973 |
+
"next_since": crawl_started_at,
|
| 974 |
+
},
|
| 975 |
+
output_root / "snapshots" / "latest.json",
|
| 976 |
+
)
|
| 977 |
+
|
| 978 |
+
log("Uploading updated dataset to the Hub")
|
| 979 |
+
api.upload_folder(
|
| 980 |
+
folder_path=str(output_root),
|
| 981 |
+
repo_id=options.hf_repo_id,
|
| 982 |
+
repo_type="dataset",
|
| 983 |
+
commit_message=f"Refresh {repo_name} dataset snapshot {sid}",
|
| 984 |
+
)
|
| 985 |
+
log(f"Dataset refresh complete for {options.hf_repo_id}")
|
| 986 |
+
return {
|
| 987 |
+
"repo": repo_slug,
|
| 988 |
+
"dataset_id": options.hf_repo_id,
|
| 989 |
+
"snapshot_id": sid,
|
| 990 |
+
"effective_since": effective_since,
|
| 991 |
+
"counts": manifest["counts"],
|
| 992 |
+
}
|
| 993 |
+
|
| 994 |
+
|
| 995 |
+
def main(argv: list[str] | None = None) -> None:
|
| 996 |
+
args = parse_args(argv)
|
| 997 |
+
result = run_dataset_refresh(
|
| 998 |
+
DatasetRefreshOptions(
|
| 999 |
+
repo=RepoRef.parse(args.repo),
|
| 1000 |
+
hf_repo_id=args.hf_repo_id,
|
| 1001 |
+
private_hf_repo=args.private_hf_repo,
|
| 1002 |
+
max_issues=args.max_issues,
|
| 1003 |
+
max_prs=args.max_prs,
|
| 1004 |
+
max_issue_comments=args.max_issue_comments,
|
| 1005 |
+
max_reviews_per_pr=args.max_reviews_per_pr,
|
| 1006 |
+
max_review_comments_per_pr=args.max_review_comments_per_pr,
|
| 1007 |
+
fetch_timeline=args.fetch_timeline,
|
| 1008 |
+
new_contributor_report=args.new_contributor_report,
|
| 1009 |
+
new_contributor_window_days=args.new_contributor_window_days,
|
| 1010 |
+
new_contributor_max_authors=args.new_contributor_max_authors,
|
| 1011 |
+
http_timeout=args.http_timeout,
|
| 1012 |
+
http_max_retries=args.http_max_retries,
|
| 1013 |
+
checkpoint_every_comments=args.checkpoint_every_comments,
|
| 1014 |
+
checkpoint_every_prs=args.checkpoint_every_prs,
|
| 1015 |
+
)
|
| 1016 |
+
)
|
| 1017 |
+
print(json.dumps(result, indent=2))
|
| 1018 |
+
|
| 1019 |
+
|
| 1020 |
+
if __name__ == "__main__":
|
| 1021 |
+
main()
|
src/slop_farmer/app/dataset_status.py
ADDED
@@ -0,0 +1,182 @@
+from __future__ import annotations
+
+import tempfile
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from huggingface_hub import HfApi
+
+from slop_farmer.config import DatasetStatusOptions
+from slop_farmer.data.hf_dataset_repo import (
+    list_remote_paths,
+    load_remote_file,
+    load_remote_json_file,
+    stable_snapshot_candidates,
+)
+from slop_farmer.data.parquet_io import read_json
+
+
+def _coerce_datetime(value: Any) -> datetime | None:
+    if not isinstance(value, str) or not value:
+        return None
+    try:
+        return datetime.fromisoformat(value.replace("Z", "+00:00"))
+    except ValueError:
+        return None
+
+
+def _age_summary(value: str | None) -> dict[str, Any]:
+    timestamp = _coerce_datetime(value)
+    if timestamp is None:
+        return {"seconds": None, "summary": "unknown", "staleness": "unknown"}
+    age_seconds = max(int((datetime.now(tz=UTC) - timestamp).total_seconds()), 0)
+    if age_seconds <= 6 * 3600:
+        staleness = "fresh"
+    elif age_seconds <= 24 * 3600:
+        staleness = "aging"
+    else:
+        staleness = "stale"
+    if age_seconds < 3600:
+        summary = f"{age_seconds // 60}m"
+    elif age_seconds < 24 * 3600:
+        summary = f"{age_seconds // 3600}h"
+    else:
+        summary = f"{age_seconds // 86400}d"
+    return {"seconds": age_seconds, "summary": summary, "staleness": staleness}
+
+
+def _local_status(output_dir: Path) -> dict[str, Any] | None:
+    latest_path = output_dir.resolve() / "snapshots" / "latest.json"
+    if not latest_path.exists():
+        return None
+    payload = read_json(latest_path)
+    snapshot_dir = payload.get("snapshot_dir")
+    manifest = {}
+    if isinstance(snapshot_dir, str) and snapshot_dir:
+        manifest_path = Path(snapshot_dir).resolve() / "manifest.json"
+        if manifest_path.exists():
+            manifest = read_json(manifest_path)
+    return {
+        "latest_path": str(latest_path),
+        "latest_pointer": payload,
+        "snapshot_dir": snapshot_dir,
+        "snapshot_id": manifest.get("snapshot_id") or payload.get("latest_snapshot_id"),
+    }
+
+
+def _remote_status(repo_id: str, revision: str | None) -> dict[str, Any]:
+    api = HfApi()
+    with tempfile.TemporaryDirectory(prefix="slop-farmer-dataset-status-") as tmp:
+        root = Path(tmp)
+        remote_paths = list_remote_paths(api, repo_id, revision=revision)
+        latest_pointer = load_remote_json_file(
+            api,
+            repo_id,
+            "snapshots/latest.json",
+            root,
+            revision=revision,
+        )
+        watermark = load_remote_json_file(
+            api,
+            repo_id,
+            "state/watermark.json",
+            root,
+            revision=revision,
+        )
+        manifest = None
+        if latest_pointer is not None:
+            for candidate in stable_snapshot_candidates(latest_pointer, "manifest.json"):
+                downloaded = load_remote_file(
+                    api,
+                    repo_id,
+                    candidate,
+                    root,
+                    revision=revision,
+                )
+                if downloaded is None:
+                    continue
+                manifest = read_json(downloaded)
+                break
+        snapshot_prefix = (
+            str(latest_pointer.get("snapshot_dir") or "").strip("/")
+            if isinstance(latest_pointer, dict)
+            else ""
+        )
+        contributors_present = any(
+            path in remote_paths
+            for path in (
+                "new_contributors.parquet",
+                "new-contributors-report.json",
+                "new-contributors-report.md",
+            )
+        )
+        if snapshot_prefix:
+            contributors_present = contributors_present or any(
+                path in remote_paths
+                for path in (
+                    f"{snapshot_prefix}/new_contributors.parquet",
+                    f"{snapshot_prefix}/new-contributors-report.json",
+                    f"{snapshot_prefix}/new-contributors-report.md",
+                )
+            )
+        extracted_at = manifest.get("extracted_at") if manifest else None
+        return {
+            "dataset_id": repo_id,
+            "revision": revision,
+            "latest_pointer": latest_pointer,
+            "watermark": watermark,
+            "manifest": manifest,
+            "contributors_present": contributors_present,
+            "remote_path_count": len(remote_paths),
+            "age": _age_summary(extracted_at),
+        }
+
+
+def get_dataset_status(options: DatasetStatusOptions) -> dict[str, Any]:
+    remote = _remote_status(options.hf_repo_id, options.hf_revision) if options.hf_repo_id else None
+    local = _local_status(options.output_dir)
+    repo = options.repo
+    if repo is None and remote and remote.get("manifest"):
+        repo = remote["manifest"].get("repo")
+    if repo is None and local and isinstance(local.get("latest_pointer"), dict):
+        repo = local["latest_pointer"].get("repo")
+    return {
+        "repo": repo,
+        "dataset_id": options.hf_repo_id,
+        "remote": remote,
+        "local": local,
+    }
+
+
+def format_dataset_status(status: dict[str, Any]) -> str:
+    remote = status.get("remote") or {}
+    local = status.get("local") or {}
+    manifest = remote.get("manifest") or {}
+    watermark = remote.get("watermark") or {}
+    latest_pointer = remote.get("latest_pointer") or {}
+    age = remote.get("age") or {}
+    lines = [
+        f"Repo: {status.get('repo') or '?'}",
+        f"Dataset: {status.get('dataset_id') or 'not configured'}",
+    ]
+    if remote:
+        lines.extend(
+            [
+                f"Remote latest snapshot: {manifest.get('snapshot_id') or latest_pointer.get('latest_snapshot_id') or '?'}",
+                f"Remote extracted at: {manifest.get('extracted_at') or '?'}",
+                f"Remote next_since: {watermark.get('next_since') or latest_pointer.get('next_since') or '?'}",
+                f"Contributor artifacts: {'yes' if remote.get('contributors_present') else 'no'}",
+                f"Freshness: {age.get('summary') or 'unknown'} ({age.get('staleness') or 'unknown'})",
+            ]
+        )
+    if local:
+        lines.extend(
+            [
+                f"Local latest pointer: {local.get('latest_path')}",
+                f"Local snapshot id: {local.get('snapshot_id') or '?'}",
+            ]
+        )
+    else:
+        lines.append("Local latest pointer: none")
+    return "\n".join(lines)
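
A matching sketch for the status helper. `DatasetStatusOptions` is assumed to carry exactly the four fields `get_dataset_status` reads above (`repo`, `hf_repo_id`, `hf_revision`, `output_dir`); the dataset id and output dir are placeholders.

# Sketch: field names mirror what get_dataset_status reads; values are placeholders.
from pathlib import Path

from slop_farmer.app.dataset_status import format_dataset_status, get_dataset_status
from slop_farmer.config import DatasetStatusOptions

status = get_dataset_status(
    DatasetStatusOptions(
        repo=None,  # resolved from the remote manifest or local pointer when omitted
        hf_repo_id="your-org/transformers-pr-data",
        hf_revision=None,
        output_dir=Path("pipeline_data"),
    )
)
print(format_dataset_status(status))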
src/slop_farmer/app/deploy.py
CHANGED
@@ -5,6 +5,7 @@ import subprocess
 from pathlib import Path
 
 from slop_farmer.config import DeployDashboardOptions
+from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 
 
 def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
@@ -17,6 +18,16 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
         {
             "PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
             "WEB_DIR": str(options.web_dir),
+            "SNAPSHOT_DIR": str(
+                resolve_snapshot_source_dir(
+                    snapshot_dir=options.snapshot_dir,
+                    local_snapshots_root=options.pipeline_data_dir.resolve() / "snapshots",
+                    hf_repo_id=options.hf_repo_id,
+                    hf_revision=options.hf_revision,
+                    hf_materialize_dir=options.hf_materialize_dir,
+                    hf_output_dir=options.pipeline_data_dir,
+                )
+            ),
             "DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
             "CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
             "CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
@@ -28,8 +39,6 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
             "SPACE_SHORT_DESCRIPTION": options.space_short_description,
         }
     )
-    if options.snapshot_dir is not None:
-        env["SNAPSHOT_DIR"] = str(options.snapshot_dir)
     if options.analysis_input is not None:
        env["ANALYSIS_INPUT"] = str(options.analysis_input)
    if options.contributors_input is not None:
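
The deploy change swaps the old conditional `SNAPSHOT_DIR` export for an unconditional resolution step, so the dashboard build always receives a concrete snapshot directory, whether it comes from an explicit path, the local snapshots root, or a materialized HF dataset. A sketch of the call in isolation (paths are placeholders; the fallback order is inferred from the parameter names, not confirmed by this diff):

# Sketch: mirrors the call site added above; fallback behavior is an assumption.
from pathlib import Path

from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir

pipeline_data_dir = Path("pipeline_data")  # placeholder
snapshot_dir = resolve_snapshot_source_dir(
    snapshot_dir=None,  # no explicit override
    local_snapshots_root=pipeline_data_dir.resolve() / "snapshots",
    hf_repo_id=None,
    hf_revision=None,
    hf_materialize_dir=None,
    hf_output_dir=pipeline_data_dir,
)
print(snapshot_dir)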
src/slop_farmer/app/hf_checkpoint_import.py
CHANGED
@@ -28,6 +28,7 @@ from huggingface_hub import HfApi, hf_hub_download
 
 from slop_farmer.app.publish import publish_snapshot
 from slop_farmer.config import CheckpointImportOptions
+from slop_farmer.data.dataset_card import build_hf_dataset_card
 from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
 from slop_farmer.data.parquet_io import (
     SCHEMAS,
@@ -455,76 +456,15 @@ def _viewer_comment_rows(
 def _dataset_card(
     repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
 ) -> str:
-    return f"""---
-pretty_name: Transformers PR Slop Dataset
-configs:
-- config_name: issues
-  data_files:
-  - split: train
-    path: issues.parquet
-  default: true
-- config_name: prs
-  data_files:
-  - split: train
-    path: pull_requests.parquet
-- config_name: issue_comments
-  data_files:
-  - split: train
-    path: issue_comments.parquet
-- config_name: pr_comments
-  data_files:
-  - split: train
-    path: pr_comments.parquet
-- config_name: pr_reviews
-  data_files:
-  - split: train
-    path: reviews.parquet
-- config_name: pr_files
-  data_files:
-  - split: train
-    path: pr_files.parquet
-- config_name: pr_diffs
-  data_files:
-  - split: train
-    path: pr_diffs.parquet
-- config_name: review_comments
-  data_files:
-  - split: train
-    path: review_comments.parquet
-- config_name: links
-  data_files:
-  - split: train
-    path: links.parquet
-- config_name: events
-  data_files:
-  - split: train
-    path: events.parquet
----
----
-
-# Transformers PR Slop Dataset
-
-Imported checkpoint snapshot for `{repo_slug}`.
-
-Files:
-- `issues.parquet`
-- `pull_requests.parquet`
-- `comments.parquet`
-- `issue_comments.parquet`
-- `pr_comments.parquet`
-- `reviews.parquet`
-- `pr_files.parquet`
-- `pr_diffs.parquet`
-- `review_comments.parquet`
-- `links.parquet`
-- `events.parquet`
-
-Notes:
-- source HF dataset: `{source_repo_id}`
-- source checkpoint root: `{checkpoint_root}`
-- latest imported checkpoint: `{snapshot_id}`
-- links were regenerated locally from text references and timeline events
-"""
+    return build_hf_dataset_card(
+        repo_slug,
+        snapshot_id,
+        notes=[
+            f"source HF dataset: `{source_repo_id}`",
+            f"source checkpoint root: `{checkpoint_root}`",
+            "links were regenerated locally from text references and timeline events",
+        ],
+    )
 
 
 def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:
src/slop_farmer/app/pipeline.py
CHANGED
@@ -9,6 +9,7 @@ from typing import Any, Protocol
 
 from slop_farmer.app.publish import publish_snapshot
 from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
+from slop_farmer.data.dataset_card import build_hf_dataset_card
 from slop_farmer.data.github_api import GitHubClient
 from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
 from slop_farmer.data.normalize import (
@@ -112,96 +113,14 @@ def _reference_time_for_age_caps(crawl_started_at: str) -> datetime:
 def _dataset_card(
     repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
 ) -> str:
-    new_contributor_config = ""
-    new_contributor_file = ""
-    if include_new_contributors:
-        new_contributor_config = """- config_name: new_contributors
-  data_files:
-  - split: train
-    path: new_contributors.parquet
-"""
-        new_contributor_file = """- `new_contributors.parquet`
-- `new-contributors-report.json`
-- `new-contributors-report.md`
-"""
-    return f"""---
-pretty_name: Transformers PR Slop Dataset
-configs:
-- config_name: issues
-  data_files:
-  - split: train
-    path: issues.parquet
-  default: true
-- config_name: prs
-  data_files:
-  - split: train
-    path: pull_requests.parquet
-- config_name: issue_comments
-  data_files:
-  - split: train
-    path: issue_comments.parquet
-- config_name: pr_comments
-  data_files:
-  - split: train
-    path: pr_comments.parquet
-- config_name: pr_reviews
-  data_files:
-  - split: train
-    path: reviews.parquet
-- config_name: pr_files
-  data_files:
-  - split: train
-    path: pr_files.parquet
-- config_name: pr_diffs
-  data_files:
-  - split: train
-    path: pr_diffs.parquet
-- config_name: review_comments
-  data_files:
-  - split: train
-    path: review_comments.parquet
-- config_name: links
-  data_files:
-  - split: train
-    path: links.parquet
-- config_name: events
-  data_files:
-  - split: train
-    path: events.parquet
-{new_contributor_config}---
----
-
-# Transformers PR Slop Dataset
-
-Normalized snapshots of issues, pull requests, comments, reviews, and linkage data from `{repo}`.
-
-Files:
-- `issues.parquet`
-- `pull_requests.parquet`
-- `comments.parquet`
-- `issue_comments.parquet` (derived view of issue discussion comments)
-- `pr_comments.parquet` (derived view of pull request discussion comments)
-- `reviews.parquet`
-- `pr_files.parquet`
-- `pr_diffs.parquet`
-- `review_comments.parquet`
-- `links.parquet`
-- `events.parquet`
-{new_contributor_file}
-
-Use:
-- duplicate PR and issue analysis
-- triage and ranking experiments
-- eval set creation
-
-Notes:
-- updated daily
-- latest snapshot: `{snapshot_id}`
-- raw data only; no labels or moderation decisions
-- PR metadata, file-level patch hunks, and full unified diffs are included
-- new contributor reviewer artifacts are included when generated for the snapshot
-- full file contents for changed files are not included
-"""
+    notes = ["new contributor reviewer artifacts are included"] if include_new_contributors else []
+    del manifest
+    return build_hf_dataset_card(
+        repo,
+        snapshot_id,
+        include_new_contributors=include_new_contributors,
+        notes=notes,
+    )
 
 
 def _viewer_comment_rows(
@@ -1045,6 +964,9 @@ def run_pipeline(options: PipelineOptions, client: GitHubClientLike | None = None
             output_dir=options.output_dir,
             output=None,
             json_output=None,
+            hf_repo_id=None,
+            hf_revision=None,
+            hf_materialize_dir=None,
             window_days=options.new_contributor_window_days,
             max_authors=options.new_contributor_max_authors,
         )
src/slop_farmer/app/pr_search.py
CHANGED
@@ -10,9 +10,12 @@ get_pr_search_status = pr_search_service.get_pr_search_status
 get_pr_search_similar = pr_search_service.get_pr_search_similar
 get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
 get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
+get_pr_search_contributor = pr_search_service.get_pr_search_contributor
+get_pr_search_contributor_pulls = pr_search_service.get_pr_search_contributor_pulls
 get_pr_search_clusters = pr_search_service.get_pr_search_clusters
 list_pr_search_clusters = pr_search_service.list_pr_search_clusters
 get_pr_search_cluster = pr_search_service.get_pr_search_cluster
+get_pr_search_pull_contributor = pr_search_service.get_pr_search_pull_contributor
 explain_pr_search_pair = pr_search_service.explain_pr_search_pair
 probe_pr_search_live = pr_search_service.probe_pr_search_live
 probe_pr_search_github = pr_search_service.probe_pr_search_github
@@ -31,6 +34,7 @@ def format_pr_search_status(result: Mapping[str, Any]) -> str:
         (
             "Rows: "
             f"documents={counts['documents']} "
+            f"contributors={counts.get('contributors', 0)} "
            f"features={counts['features']} "
             f"neighbors={counts['neighbors']} "
             f"clusters={counts['clusters']} "
@@ -245,3 +249,73 @@ def format_pr_search_probe(result: Mapping[str, Any]) -> str:
         if row.get("reason"):
             lines.append(f" reason: {row['reason']}")
     return "\n".join(lines)
+
+
+def format_pr_search_contributor(result: Mapping[str, Any]) -> str:
+    contributor = result["contributor"]
+    lines = [
+        f"Contributor {contributor['author_login']}",
+        f"Repo: {result['repo']}",
+        f"Snapshot: {result['snapshot_id']}",
+        f"Name: {contributor.get('name') or '-'}",
+        f"Profile: {contributor.get('profile_url') or '-'}",
+        f"Association: {contributor.get('repo_association') or '-'}",
+        f"First seen in snapshot: {'yes' if contributor.get('first_seen_in_snapshot') else 'no'}",
+        (
+            "Scores: "
+            f"follow-through={contributor.get('follow_through_score') or '-'} "
+            f"breadth={contributor.get('breadth_score') or '-'} "
+            f"risk={contributor.get('automation_risk_signal') or '-'}"
+        ),
+        f"Heuristic: {contributor.get('heuristic_note') or '-'}",
+        f"Public orgs: {', '.join(contributor.get('public_orgs') or []) or '-'}",
+        "",
+        "Recent indexed PRs:",
+    ]
+    pulls = result.get("pulls") or []
+    if not pulls:
+        lines.append("- none")
+        return "\n".join(lines)
+    for row in pulls:
+        lines.append(
+            f"- PR #{row['pr_number']}: {row.get('title') or ''} "
+            f"[state={row.get('state') or '-'} merged={'yes' if row.get('merged') else 'no'}]"
+        )
+    return "\n".join(lines)
+
+
+def format_pr_search_contributor_pulls(result: Mapping[str, Any]) -> str:
+    contributor = result["contributor"]
+    lines = [
+        f"Contributor PRs: {contributor['author_login']}",
+        f"Repo: {result['repo']}",
+        f"Snapshot: {result['snapshot_id']}",
+        f"Pull requests: {result.get('pull_count', len(result.get('pulls') or []))}",
+        "",
+    ]
+    pulls = result.get("pulls") or []
+    if not pulls:
+        lines.append("No indexed PRs found for that contributor.")
+        return "\n".join(lines)
+    for row in pulls:
+        lines.append(
+            f"- PR #{row['pr_number']}: {row.get('title') or ''} "
+            f"(updated={row.get('updated_at') or '-'}, state={row.get('state') or '-'})"
+        )
+    return "\n".join(lines)
+
+
+def format_pr_search_pull_contributor(result: Mapping[str, Any]) -> str:
+    pr = result["pr"]
+    contributor = result["contributor"]
+    return "\n".join(
+        [
+            f"PR #{pr['pr_number']}: {pr.get('title') or ''}",
+            f"Author: {contributor['author_login']}",
+            f"Risk: {contributor.get('automation_risk_signal') or '-'}",
+            f"Follow-through: {contributor.get('follow_through_score') or '-'}",
+            f"Breadth: {contributor.get('breadth_score') or '-'}",
+            f"Heuristic: {contributor.get('heuristic_note') or '-'}",
+            f"Profile: {contributor.get('profile_url') or '-'}",
+        ]
+    )
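
The new formatters only read from plain mappings, so they can be smoke-tested without a built index. A minimal payload for `format_pr_search_contributor` (every value below is made up; the keys are exactly the ones the function reads above):

# Demo payload: values are fabricated; keys mirror what the formatter reads.
from slop_farmer.app.pr_search import format_pr_search_contributor

demo = {
    "repo": "huggingface/transformers",
    "snapshot_id": "2025-01-01T00-00-00Z",  # made-up snapshot id format
    "contributor": {
        "author_login": "octocat",
        "name": "Octo Cat",
        "profile_url": "https://github.com/octocat",
        "repo_association": "NONE",
        "first_seen_in_snapshot": True,
        "follow_through_score": 0.4,
        "breadth_score": 0.2,
        "automation_risk_signal": "medium",
        "heuristic_note": "many small doc-only PRs",
        "public_orgs": [],
    },
    "pulls": [
        {"pr_number": 123, "title": "Fix typo in docs", "state": "closed", "merged": True},
    ],
}
print(format_pr_search_contributor(demo))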
src/slop_farmer/app/pr_search_api.py
CHANGED
|
@@ -22,6 +22,9 @@ from slop_farmer.reports.analysis_service import (
|
|
| 22 |
from slop_farmer.reports.pr_search_service import (
|
| 23 |
get_pr_search_cluster,
|
| 24 |
get_pr_search_clusters,
|
|
|
|
|
|
|
|
|
|
| 25 |
get_pr_search_similar_lookup,
|
| 26 |
get_pr_search_status,
|
| 27 |
list_pr_search_clusters,
|
|
@@ -34,6 +37,7 @@ class PrSearchApiSettings:
|
|
| 34 |
default_repo: str | None
|
| 35 |
index_path: Path
|
| 36 |
output_dir: Path
|
|
|
|
| 37 |
snapshot_dir: Path | None = None
|
| 38 |
hf_repo_id: str | None = None
|
| 39 |
hf_revision: str | None = None
|
|
@@ -66,6 +70,7 @@ class PrSearchApiSettings:
|
|
| 66 |
default_repo=os.environ.get("DEFAULT_REPO"),
|
| 67 |
index_path=index_path,
|
| 68 |
output_dir=output_dir,
|
|
|
|
| 69 |
snapshot_dir=snapshot_dir,
|
| 70 |
hf_repo_id=os.environ.get("HF_REPO_ID"),
|
| 71 |
hf_revision=os.environ.get("HF_REVISION"),
|
|
@@ -103,7 +108,7 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 103 |
app.state.startup_error = str(exc)
|
| 104 |
yield
|
| 105 |
|
| 106 |
-
app = FastAPI(title="slop PR search API", version="0.1.
|
| 107 |
|
| 108 |
@app.exception_handler(ValueError)
|
| 109 |
async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
|
|
@@ -212,6 +217,44 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 212 |
),
|
| 213 |
)
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
@app.get("/v1/repos/{owner}/{repo}/analysis/status")
|
| 216 |
async def analysis_status(
|
| 217 |
owner: str,
|
|
@@ -221,7 +264,12 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 221 |
) -> dict[str, Any]:
|
| 222 |
settings = request.app.state.settings
|
| 223 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 224 |
-
return get_analysis_status(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/analysis")
|
| 227 |
async def pr_analysis(
|
|
@@ -238,6 +286,7 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 238 |
repo=repo_slug,
|
| 239 |
pr_number=number,
|
| 240 |
variant=variant,
|
|
|
|
| 241 |
)
|
| 242 |
|
| 243 |
@app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs")
|
|
@@ -254,6 +303,7 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 254 |
settings.index_path,
|
| 255 |
repo=repo_slug,
|
| 256 |
variant=variant,
|
|
|
|
| 257 |
limit=_limit(
|
| 258 |
limit,
|
| 259 |
default=settings.cluster_list_limit_default,
|
|
@@ -276,6 +326,7 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 276 |
repo=repo_slug,
|
| 277 |
cluster_id=cluster_id,
|
| 278 |
variant=variant,
|
|
|
|
| 279 |
)
|
| 280 |
|
| 281 |
@app.get("/v1/repos/{owner}/{repo}/analysis/duplicate-prs")
|
|
@@ -292,6 +343,7 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 292 |
settings.index_path,
|
| 293 |
repo=repo_slug,
|
| 294 |
variant=variant,
|
|
|
|
| 295 |
limit=_limit(
|
| 296 |
limit,
|
| 297 |
default=settings.cluster_list_limit_default,
|
|
@@ -308,7 +360,12 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 308 |
) -> dict[str, Any]:
|
| 309 |
settings = request.app.state.settings
|
| 310 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 311 |
-
return get_analysis_best(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
return app
|
| 314 |
|
|
@@ -395,6 +452,7 @@ def _looks_not_found(exc: ValueError) -> bool:
|
|
| 395 |
message = str(exc).lower()
|
| 396 |
return (
|
| 397 |
"not found" in message
|
|
|
|
| 398 |
or "no analysis report was found" in message
|
| 399 |
or "no active pr search run" in message
|
| 400 |
or "was not found in the active indexed universe" in message
|
|
|
|
| 22 |
from slop_farmer.reports.pr_search_service import (
|
| 23 |
get_pr_search_cluster,
|
| 24 |
get_pr_search_clusters,
|
| 25 |
+
get_pr_search_contributor,
|
| 26 |
+
get_pr_search_contributor_pulls,
|
| 27 |
+
get_pr_search_pull_contributor,
|
| 28 |
get_pr_search_similar_lookup,
|
| 29 |
get_pr_search_status,
|
| 30 |
list_pr_search_clusters,
|
|
|
|
| 37 |
default_repo: str | None
|
| 38 |
index_path: Path
|
| 39 |
output_dir: Path
|
| 40 |
+
analysis_dir: Path | None = None
|
| 41 |
snapshot_dir: Path | None = None
|
| 42 |
hf_repo_id: str | None = None
|
| 43 |
hf_revision: str | None = None
|
|
|
|
| 70 |
default_repo=os.environ.get("DEFAULT_REPO"),
|
| 71 |
index_path=index_path,
|
| 72 |
output_dir=output_dir,
|
| 73 |
+
analysis_dir=_env_path("ANALYSIS_DIR") or (output_dir / "analysis"),
|
| 74 |
snapshot_dir=snapshot_dir,
|
| 75 |
hf_repo_id=os.environ.get("HF_REPO_ID"),
|
| 76 |
hf_revision=os.environ.get("HF_REVISION"),
|
|
|
|
| 108 |
app.state.startup_error = str(exc)
|
| 109 |
yield
|
| 110 |
|
| 111 |
+
app = FastAPI(title="slop PR search API", version="0.1.1", lifespan=lifespan)
|
| 112 |
|
| 113 |
@app.exception_handler(ValueError)
|
| 114 |
async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
|
|
|
|
| 217 |
),
|
| 218 |
)
|
| 219 |
|
| 220 |
+
@app.get("/v1/repos/{owner}/{repo}/contributors/{login}")
|
| 221 |
+
async def contributor_view(
|
| 222 |
+
owner: str, repo: str, login: str, request: Request
|
| 223 |
+
) -> dict[str, Any]:
|
| 224 |
+
settings = request.app.state.settings
|
| 225 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 226 |
+
return get_pr_search_contributor(settings.index_path, repo=repo_slug, author_login=login)
|
| 227 |
+
|
| 228 |
+
@app.get("/v1/repos/{owner}/{repo}/contributors/{login}/pulls")
|
| 229 |
+
async def contributor_pulls(
|
| 230 |
+
owner: str,
|
| 231 |
+
repo: str,
|
| 232 |
+
login: str,
|
| 233 |
+
request: Request,
|
| 234 |
+
limit: int | None = None,
|
| 235 |
+
) -> dict[str, Any]:
|
| 236 |
+
settings = request.app.state.settings
|
| 237 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 238 |
+
return get_pr_search_contributor_pulls(
|
| 239 |
+
settings.index_path,
|
| 240 |
+
repo=repo_slug,
|
| 241 |
+
author_login=login,
|
| 242 |
+
limit=_limit(
|
| 243 |
+
limit, default=settings.similar_limit_default, maximum=settings.similar_limit_max
|
| 244 |
+
),
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/contributor")
|
| 248 |
+
async def pull_contributor(
|
| 249 |
+
owner: str,
|
| 250 |
+
repo: str,
|
| 251 |
+
number: int,
|
| 252 |
+
request: Request,
|
| 253 |
+
) -> dict[str, Any]:
|
| 254 |
+
settings = request.app.state.settings
|
| 255 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 256 |
+
return get_pr_search_pull_contributor(settings.index_path, repo=repo_slug, pr_number=number)
|
| 257 |
+
|
| 258 |
@app.get("/v1/repos/{owner}/{repo}/analysis/status")
|
| 259 |
async def analysis_status(
|
| 260 |
owner: str,
|
|
|
|
| 264 |
) -> dict[str, Any]:
|
| 265 |
settings = request.app.state.settings
|
| 266 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 267 |
+
return get_analysis_status(
|
| 268 |
+
settings.index_path,
|
| 269 |
+
repo=repo_slug,
|
| 270 |
+
variant=variant,
|
| 271 |
+
analysis_root=settings.analysis_dir,
|
| 272 |
+
)
|
| 273 |
|
| 274 |
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/analysis")
|
| 275 |
async def pr_analysis(
|
|
|
|
| 286 |
repo=repo_slug,
|
| 287 |
pr_number=number,
|
| 288 |
variant=variant,
|
| 289 |
+
analysis_root=settings.analysis_dir,
|
| 290 |
)
|
| 291 |
|
| 292 |
@app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs")
|
|
|
|
| 303 |
settings.index_path,
|
| 304 |
repo=repo_slug,
|
| 305 |
variant=variant,
|
| 306 |
+
analysis_root=settings.analysis_dir,
|
| 307 |
limit=_limit(
|
| 308 |
limit,
|
| 309 |
default=settings.cluster_list_limit_default,
|
|
|
|
| 326 |
repo=repo_slug,
|
| 327 |
cluster_id=cluster_id,
|
| 328 |
variant=variant,
|
| 329 |
+
analysis_root=settings.analysis_dir,
|
| 330 |
)
|
| 331 |
|
| 332 |
@app.get("/v1/repos/{owner}/{repo}/analysis/duplicate-prs")
|
|
|
|
| 343 |
settings.index_path,
|
| 344 |
repo=repo_slug,
|
| 345 |
variant=variant,
|
| 346 |
+
analysis_root=settings.analysis_dir,
|
| 347 |
limit=_limit(
|
| 348 |
limit,
|
| 349 |
default=settings.cluster_list_limit_default,
|
|
|
|
| 360 |
) -> dict[str, Any]:
|
| 361 |
settings = request.app.state.settings
|
| 362 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 363 |
+
return get_analysis_best(
|
| 364 |
+
settings.index_path,
|
| 365 |
+
repo=repo_slug,
|
| 366 |
+
variant=variant,
|
| 367 |
+
analysis_root=settings.analysis_dir,
|
| 368 |
+
)
|
| 369 |
|
| 370 |
return app
|
| 371 |
|
|
|
|
| 452 |
message = str(exc).lower()
|
| 453 |
return (
|
| 454 |
"not found" in message
|
| 455 |
+
or "analysis report was not found" in message
|
| 456 |
or "no analysis report was found" in message
|
| 457 |
or "no active pr search run" in message
|
| 458 |
or "was not found in the active indexed universe" in message
|
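The three contributor routes added above are plain GET endpoints, so a quick smoke test is FastAPI's TestClient. A minimal sketch, not part of this commit: the repo slug, login, and PR number are placeholders, and it assumes the environment variables read by create_app() (index path, output dir, and so on) already point at a populated index.

# Sketch only; names below are placeholders, not values from this commit.
from fastapi.testclient import TestClient

from slop_farmer.app.pr_search_api import create_app

# The "with" block runs the app lifespan so settings are loaded.
with TestClient(create_app()) as client:
    # Contributor profile row for one author in the active run.
    profile = client.get("/v1/repos/huggingface/transformers/contributors/octocat")
    # That author's PRs, newest first, capped via the similar-limit settings.
    pulls = client.get(
        "/v1/repos/huggingface/transformers/contributors/octocat/pulls",
        params={"limit": 20},
    )
    # Reverse lookup: contributor record for the author of a given PR number.
    author = client.get("/v1/repos/huggingface/transformers/pulls/12345/contributor")
    for response in (profile, pulls, author):
        print(response.status_code, response.json())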
src/slop_farmer/app/workflow.py CHANGED
@@ -74,6 +74,9 @@ def run_full_pipeline(options: FullPipelineOptions) -> str:
  74              analysis_input=analysis_path,
  75              contributors_input=snapshot_dir / "new-contributors-report.json",
  76              pr_scope_input=snapshot_dir / "pr-scope-clusters.json",
  77 +            hf_repo_id=None,
  78 +            hf_revision=None,
  79 +            hf_materialize_dir=None,
  80              window_days=options.dashboard_window_days,
  81          )
  82      )
src/slop_farmer/app_config.py CHANGED
@@ -184,6 +184,18 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
 184              "new-contributor-window-days": contributor_window_days,
 185              "new-contributor-max-authors": contributor_max_authors,
 186          },
 187 +        "refresh-dataset": {
 188 +            "repo": repo,
 189 +            "hf-repo-id": dataset_id,
 190 +            "fetch-timeline": scrape.get("fetch-timeline"),
 191 +            "max-issues": scrape.get("max-issues"),
 192 +            "max-prs": scrape.get("max-prs"),
 193 +            "max-issue-comments": scrape.get("max-issue-comments"),
 194 +            "max-reviews-per-pr": scrape.get("max-reviews-per-pr"),
 195 +            "max-review-comments-per-pr": scrape.get("max-review-comments-per-pr"),
 196 +            "new-contributor-window-days": contributor_window_days,
 197 +            "new-contributor-max-authors": contributor_max_authors,
 198 +        },
 199          "analyze": {
 200              "output-dir": str(data_dir) if data_dir else None,
 201              "hf-repo-id": analysis.get("hf-repo-id", dataset_id),

@@ -201,6 +213,7 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
 213          },
 214          "pr-scope": {
 215              "output-dir": str(data_dir) if data_dir else None,
 216 +            "hf-repo-id": dataset_id,
 217              "cluster-suppression-rules": cluster_suppression_rules,
 218          },
 219          "pr-search": {

@@ -210,12 +223,14 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
 223          },
 224          "new-contributor-report": {
 225              "output-dir": str(data_dir) if data_dir else None,
 226 +            "hf-repo-id": dataset_id,
 227              "window-days": contributor_window_days,
 228              "max-authors": contributor_max_authors,
 229          },
 230          "dashboard-data": {
 231              "output-dir": str(dashboard_dir) if dashboard_dir else None,
 232              "snapshot-root": str(data_dir / "snapshots") if data_dir else None,
 233 +            "hf-repo-id": dataset_id,
 234              "window-days": dashboard_window_days,
 235          },
 236          "publish-snapshot": {

@@ -236,6 +251,7 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
 251          "deploy-dashboard": {
 252              "pipeline-data-dir": str(data_dir) if data_dir else None,
 253              "web-dir": str(web_dir) if web_dir else None,
 254 +            "hf-repo-id": dataset_id,
 255              "dashboard-window-days": dashboard_window_days,
 256              "contributor-window-days": contributor_window_days,
 257              "contributor-max-authors": contributor_max_authors,

@@ -248,6 +264,11 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
 264              "dataset-id": dataset_id,
 265              "space-tags": tags_value,
 266          },
 267 +        "dataset-status": {
 268 +            "repo": repo,
 269 +            "output-dir": str(data_dir) if data_dir else None,
 270 +            "hf-repo-id": dataset_id,
 271 +        },
 272      }
 273      for command, values in defaults.items():
 274          defaults[command] = {key: value for key, value in values.items() if value is not None}

@@ -259,6 +280,7 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
 280          defaults[command].update(_resolve_command_paths(config_path, values))
 281
 282      defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
 283 +    defaults["refresh-dataset"].update(_resolve_command_paths(config_path, scrape))
 284      defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
 285      defaults["full-pipeline"].update(_resolve_command_paths(config_path, full_pipeline))
 286      return defaults
src/slop_farmer/config.py CHANGED
@@ -127,6 +127,9 @@ class NewContributorReportOptions:
 127      json_output: Path | None
 128      window_days: int
 129      max_authors: int
 130 +    hf_repo_id: str | None = None
 131 +    hf_revision: str | None = None
 132 +    hf_materialize_dir: Path | None = None
 133
 134
 135  @dataclass(slots=True)

@@ -137,6 +140,9 @@ class DashboardDataOptions:
 140      contributors_input: Path | None
 141      pr_scope_input: Path | None
 142      window_days: int
 143 +    hf_repo_id: str | None = None
 144 +    hf_revision: str | None = None
 145 +    hf_materialize_dir: Path | None = None
 146      snapshot_root: Path | None = None
 147
 148

@@ -155,6 +161,9 @@ class DeployDashboardOptions:
 161      snapshot_dir: Path | None
 162      analysis_input: Path | None
 163      contributors_input: Path | None
 164 +    hf_repo_id: str | None
 165 +    hf_revision: str | None
 166 +    hf_materialize_dir: Path | None
 167      refresh_contributors: bool
 168      dashboard_window_days: int
 169      contributor_window_days: int

@@ -233,3 +242,32 @@ class FullPipelineOptions:
 242      max_issues: int | None
 243      max_prs: int | None
 244      open_prs_only: bool = False
 245 +
 246 +
 247 +@dataclass(slots=True)
 248 +class DatasetRefreshOptions:
 249 +    repo: RepoRef
 250 +    hf_repo_id: str
 251 +    private_hf_repo: bool
 252 +    max_issues: int | None
 253 +    max_prs: int | None
 254 +    max_issue_comments: int | None
 255 +    max_reviews_per_pr: int | None
 256 +    max_review_comments_per_pr: int | None
 257 +    fetch_timeline: bool
 258 +    new_contributor_report: bool
 259 +    new_contributor_window_days: int
 260 +    new_contributor_max_authors: int
 261 +    http_timeout: int
 262 +    http_max_retries: int
 263 +    checkpoint_every_comments: int
 264 +    checkpoint_every_prs: int
 265 +
 266 +
 267 +@dataclass(slots=True)
 268 +class DatasetStatusOptions:
 269 +    output_dir: Path
 270 +    hf_repo_id: str | None
 271 +    hf_revision: str | None
 272 +    repo: str | None = None
 273 +    json_output: bool = False
src/slop_farmer/data/dataset_card.py ADDED
@@ -0,0 +1,107 @@
   1 +from __future__ import annotations
   2 +
   3 +
   4 +def _repo_title(repo_slug: str) -> str:
   5 +    name = repo_slug.split("/", 1)[-1]
   6 +    return name.replace("-", " ").replace("_", " ").title()
   7 +
   8 +
   9 +def build_hf_dataset_card(
  10 +    repo_slug: str,
  11 +    snapshot_id: str,
  12 +    *,
  13 +    include_new_contributors: bool = False,
  14 +    notes: list[str] | None = None,
  15 +) -> str:
  16 +    repo_title = _repo_title(repo_slug)
  17 +    dataset_title = f"{repo_title} PR Dataset"
  18 +    new_contributor_config = ""
  19 +    new_contributor_files = ""
  20 +    if include_new_contributors:
  21 +        new_contributor_config = """- config_name: new_contributors
  22 +  data_files:
  23 +    - split: train
  24 +      path: new_contributors.parquet
  25 +"""
  26 +        new_contributor_files = """- `new_contributors.parquet`
  27 +- `new-contributors-report.json`
  28 +- `new-contributors-report.md`
  29 +"""
  30 +    note_lines = "\n".join(f"- {note}" for note in (notes or []))
  31 +    if note_lines:
  32 +        note_lines = f"{note_lines}\n"
  33 +    return f"""---
  34 +pretty_name: {dataset_title}
  35 +configs:
  36 +- config_name: issues
  37 +  data_files:
  38 +    - split: train
  39 +      path: issues.parquet
  40 +  default: true
  41 +- config_name: prs
  42 +  data_files:
  43 +    - split: train
  44 +      path: pull_requests.parquet
  45 +- config_name: issue_comments
  46 +  data_files:
  47 +    - split: train
  48 +      path: issue_comments.parquet
  49 +- config_name: pr_comments
  50 +  data_files:
  51 +    - split: train
  52 +      path: pr_comments.parquet
  53 +- config_name: pr_reviews
  54 +  data_files:
  55 +    - split: train
  56 +      path: reviews.parquet
  57 +- config_name: pr_files
  58 +  data_files:
  59 +    - split: train
  60 +      path: pr_files.parquet
  61 +- config_name: pr_diffs
  62 +  data_files:
  63 +    - split: train
  64 +      path: pr_diffs.parquet
  65 +- config_name: review_comments
  66 +  data_files:
  67 +    - split: train
  68 +      path: review_comments.parquet
  69 +- config_name: links
  70 +  data_files:
  71 +    - split: train
  72 +      path: links.parquet
  73 +- config_name: events
  74 +  data_files:
  75 +    - split: train
  76 +      path: events.parquet
  77 +{new_contributor_config}---
  78 +---
  79 +
  80 +# {dataset_title}
  81 +
  82 +Normalized snapshots of issues, pull requests, comments, reviews, and linkage data from `{repo_slug}`.
  83 +
  84 +Files:
  85 +- `issues.parquet`
  86 +- `pull_requests.parquet`
  87 +- `comments.parquet`
  88 +- `issue_comments.parquet` (derived view of issue discussion comments)
  89 +- `pr_comments.parquet` (derived view of pull request discussion comments)
  90 +- `reviews.parquet`
  91 +- `pr_files.parquet`
  92 +- `pr_diffs.parquet`
  93 +- `review_comments.parquet`
  94 +- `links.parquet`
  95 +- `events.parquet`
  96 +{new_contributor_files}
  97 +Use:
  98 +- duplicate PR and issue analysis
  99 +- triage and ranking experiments
 100 +- eval set creation
 101 +
 102 +Notes:
 103 +- latest snapshot: `{snapshot_id}`
 104 +- raw data only; no labels or moderation decisions
 105 +- PR metadata, file-level patch hunks, and full unified diffs are included
 106 +- full file contents for changed files are not included
 107 +{note_lines}"""
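A sketch of how the new card builder might be invoked; the slug, snapshot id, and note below are placeholders:

# Sketch only; arguments are illustrative placeholders.
from slop_farmer.data.dataset_card import build_hf_dataset_card

card = build_hf_dataset_card(
    "huggingface/transformers",
    "2024-06-01T000000Z",
    include_new_contributors=True,
    notes=["refreshed nightly"],
)
# The YAML front matter opens with the derived title, e.g.
# "pretty_name: Transformers PR Dataset".
print(card.splitlines()[1])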
src/slop_farmer/data/hf_dataset_repo.py ADDED
@@ -0,0 +1,94 @@
   1 +from __future__ import annotations
   2 +
   3 +import json
   4 +import os
   5 +from pathlib import Path
   6 +from typing import Any
   7 +
   8 +from huggingface_hub import HfApi, hf_hub_download
   9 +
  10 +
  11 +def load_remote_file(
  12 +    api: HfApi,
  13 +    repo_id: str,
  14 +    path_in_repo: str,
  15 +    local_dir: Path,
  16 +    *,
  17 +    revision: str | None = None,
  18 +) -> Path | None:
  19 +    del api
  20 +    try:
  21 +        downloaded = hf_hub_download(
  22 +            repo_id=repo_id,
  23 +            filename=path_in_repo,
  24 +            repo_type="dataset",
  25 +            revision=revision,
  26 +            local_dir=str(local_dir),
  27 +            token=os.getenv("HF_TOKEN"),
  28 +        )
  29 +    except Exception:
  30 +        return None
  31 +    return Path(downloaded)
  32 +
  33 +
  34 +def load_remote_json_file(
  35 +    api: HfApi,
  36 +    repo_id: str,
  37 +    path_in_repo: str,
  38 +    local_dir: Path,
  39 +    *,
  40 +    revision: str | None = None,
  41 +) -> dict[str, Any] | None:
  42 +    downloaded = load_remote_file(
  43 +        api,
  44 +        repo_id,
  45 +        path_in_repo,
  46 +        local_dir,
  47 +        revision=revision,
  48 +    )
  49 +    if downloaded is None:
  50 +        return None
  51 +    return json.loads(downloaded.read_text(encoding="utf-8"))
  52 +
  53 +
  54 +def list_remote_paths(api: HfApi, repo_id: str, *, revision: str | None = None) -> set[str]:
  55 +    try:
  56 +        info = api.dataset_info(repo_id=repo_id, revision=revision, files_metadata=True)
  57 +    except TypeError:
  58 +        info = api.dataset_info(repo_id=repo_id, revision=revision)
  59 +    except Exception:
  60 +        return set()
  61 +    return {sibling.rfilename for sibling in getattr(info, "siblings", [])}
  62 +
  63 +
  64 +def stable_snapshot_candidates(latest_payload: dict[str, Any] | None, filename: str) -> list[str]:
  65 +    if latest_payload is None:
  66 +        return [filename]
  67 +    candidates: list[str] = []
  68 +    manifest_path = str(latest_payload.get("manifest_path") or "").strip("/")
  69 +    snapshot_dir = str(latest_payload.get("snapshot_dir") or "").strip("/")
  70 +    latest_snapshot_id = str(latest_payload.get("latest_snapshot_id") or "").strip()
  71 +
  72 +    if filename == "manifest.json" and manifest_path:
  73 +        candidates.append(manifest_path)
  74 +    if snapshot_dir and snapshot_dir not in {".", "/"}:
  75 +        candidates.append(f"{snapshot_dir}/{filename}")
  76 +    archived_manifest_path = str(latest_payload.get("archived_manifest_path") or "").strip("/")
  77 +    if filename == "manifest.json" and archived_manifest_path:
  78 +        candidates.append(archived_manifest_path)
  79 +    if manifest_path and "/" in manifest_path:
  80 +        manifest_dir = manifest_path.rsplit("/", 1)[0]
  81 +        candidates.append(f"{manifest_dir}/{filename}")
  82 +    if latest_snapshot_id:
  83 +        candidates.append(f"snapshots/{latest_snapshot_id}/{filename}")
  84 +    candidates.append(filename)
  85 +
  86 +    deduped: list[str] = []
  87 +    seen: set[str] = set()
  88 +    for candidate in candidates:
  89 +        normalized = candidate.lstrip("./")
  90 +        if not normalized or normalized in seen:
  91 +            continue
  92 +        seen.add(normalized)
  93 +        deduped.append(normalized)
  94 +    return deduped
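stable_snapshot_candidates turns a latest.json-style payload into an ordered, deduplicated list of repo paths to try. A sketch with a hypothetical payload:

# Sketch only; the payload values are hypothetical.
from slop_farmer.data.hf_dataset_repo import stable_snapshot_candidates

payload = {
    "manifest_path": "snapshots/2024-06-01/manifest.json",
    "snapshot_dir": "snapshots/2024-06-01",
    "latest_snapshot_id": "2024-06-01",
}
# Directory-qualified candidates come first, the bare filename last:
# ['snapshots/2024-06-01/issues.parquet', 'issues.parquet']
print(stable_snapshot_candidates(payload, "issues.parquet"))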
src/slop_farmer/data/search_duckdb.py CHANGED
@@ -31,6 +31,7 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
  31          "repo",
  32          "pr_number",
  33          "github_id",
  34 +        "author_login",
  35          "state",
  36          "draft",
  37          "merged",

@@ -46,6 +47,48 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
  47          "review_comments_count",
  48          "html_url",
  49      ),
  50 +    "pr_search_contributors": (
  51 +        "run_id",
  52 +        "repo",
  53 +        "snapshot_id",
  54 +        "report_generated_at",
  55 +        "window_days",
  56 +        "author_login",
  57 +        "name",
  58 +        "profile_url",
  59 +        "repo_pull_requests_url",
  60 +        "repo_issues_url",
  61 +        "repo_first_seen_at",
  62 +        "repo_last_seen_at",
  63 +        "repo_primary_artifact_count",
  64 +        "repo_artifact_count",
  65 +        "snapshot_issue_count",
  66 +        "snapshot_pr_count",
  67 +        "snapshot_comment_count",
  68 +        "snapshot_review_count",
  69 +        "snapshot_review_comment_count",
  70 +        "repo_association",
  71 +        "new_to_repo",
  72 +        "first_seen_in_snapshot",
  73 +        "report_reason",
  74 +        "account_age_days",
  75 +        "young_account",
  76 +        "follow_through_score",
  77 +        "breadth_score",
  78 +        "automation_risk_signal",
  79 +        "heuristic_note",
  80 +        "public_orgs_json",
  81 +        "visible_authored_pr_count",
  82 +        "merged_pr_count",
  83 +        "closed_unmerged_pr_count",
  84 +        "open_pr_count",
  85 +        "merged_pr_rate",
  86 +        "closed_unmerged_pr_rate",
  87 +        "still_open_pr_rate",
  88 +        "distinct_repos_with_authored_prs",
  89 +        "distinct_repos_with_open_prs",
  90 +        "fetch_error",
  91 +    ),
  92      "pr_scope_features": (
  93          "run_id",
  94          "repo",

@@ -144,6 +187,7 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
 187      repo VARCHAR,
 188      pr_number BIGINT,
 189      github_id BIGINT,
 190 +    author_login VARCHAR,
 191      state VARCHAR,
 192      draft BOOLEAN,
 193      merged BOOLEAN,

@@ -159,6 +203,48 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
 203      review_comments_count BIGINT,
 204      html_url VARCHAR
 205  );
 206 +CREATE TABLE IF NOT EXISTS pr_search_contributors (
 207 +    run_id VARCHAR,
 208 +    repo VARCHAR,
 209 +    snapshot_id VARCHAR,
 210 +    report_generated_at VARCHAR,
 211 +    window_days BIGINT,
 212 +    author_login VARCHAR,
 213 +    name VARCHAR,
 214 +    profile_url VARCHAR,
 215 +    repo_pull_requests_url VARCHAR,
 216 +    repo_issues_url VARCHAR,
 217 +    repo_first_seen_at VARCHAR,
 218 +    repo_last_seen_at VARCHAR,
 219 +    repo_primary_artifact_count BIGINT,
 220 +    repo_artifact_count BIGINT,
 221 +    snapshot_issue_count BIGINT,
 222 +    snapshot_pr_count BIGINT,
 223 +    snapshot_comment_count BIGINT,
 224 +    snapshot_review_count BIGINT,
 225 +    snapshot_review_comment_count BIGINT,
 226 +    repo_association VARCHAR,
 227 +    new_to_repo BOOLEAN,
 228 +    first_seen_in_snapshot BOOLEAN,
 229 +    report_reason VARCHAR,
 230 +    account_age_days BIGINT,
 231 +    young_account BOOLEAN,
 232 +    follow_through_score VARCHAR,
 233 +    breadth_score VARCHAR,
 234 +    automation_risk_signal VARCHAR,
 235 +    heuristic_note VARCHAR,
 236 +    public_orgs_json VARCHAR,
 237 +    visible_authored_pr_count BIGINT,
 238 +    merged_pr_count BIGINT,
 239 +    closed_unmerged_pr_count BIGINT,
 240 +    open_pr_count BIGINT,
 241 +    merged_pr_rate DOUBLE,
 242 +    closed_unmerged_pr_rate DOUBLE,
 243 +    still_open_pr_rate DOUBLE,
 244 +    distinct_repos_with_authored_prs BIGINT,
 245 +    distinct_repos_with_open_prs BIGINT,
 246 +    fetch_error VARCHAR
 247 +);
 248  CREATE TABLE IF NOT EXISTS pr_scope_features (
 249      run_id VARCHAR,
 250      repo VARCHAR,

@@ -232,6 +318,8 @@ CREATE TABLE IF NOT EXISTS pr_scope_cluster_candidates (
 318  CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
 319  CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
 320  CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
 321 +CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_author ON pr_search_documents (run_id, author_login);
 322 +CREATE INDEX IF NOT EXISTS idx_pr_search_contributors_run_author ON pr_search_contributors (run_id, author_login);
 323  CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
 324  CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
 325  CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);

@@ -256,6 +344,9 @@ def connect_pr_search_db(path: Path, *, read_only: bool = False) -> duckdb.DuckD
 344
 345  def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
 346      connection.execute(SCHEMA_SQL)
 347 +    connection.execute(
 348 +        "ALTER TABLE pr_search_documents ADD COLUMN IF NOT EXISTS author_login VARCHAR"
 349 +    )
 350
 351
 352  def insert_rows(

@@ -353,6 +444,7 @@ def resolve_active_run(
 444  def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
 445      return {
 446          "documents": _count(connection, "pr_search_documents", run_id),
 447 +        "contributors": _count(connection, "pr_search_contributors", run_id),
 448          "features": _count(connection, "pr_scope_features", run_id),
 449          "run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
 450          "neighbors": _count(connection, "pr_scope_neighbors", run_id),

@@ -375,6 +467,60 @@ def get_document(
 467      )
 468
 469
 470 +def get_contributor(
 471 +    connection: duckdb.DuckDBPyConnection,
 472 +    *,
 473 +    run_id: str,
 474 +    author_login: str,
 475 +) -> dict[str, Any] | None:
 476 +    return fetch_one(
 477 +        connection,
 478 +        """
 479 +        SELECT *
 480 +        FROM pr_search_contributors
 481 +        WHERE run_id = ? AND lower(author_login) = lower(?)
 482 +        """,
 483 +        [run_id, author_login],
 484 +    )
 485 +
 486 +
 487 +def get_contributor_pulls(
 488 +    connection: duckdb.DuckDBPyConnection,
 489 +    *,
 490 +    run_id: str,
 491 +    author_login: str,
 492 +    limit: int,
 493 +) -> list[dict[str, Any]]:
 494 +    return fetch_rows(
 495 +        connection,
 496 +        """
 497 +        SELECT
 498 +            pr_number,
 499 +            github_id,
 500 +            author_login,
 501 +            state,
 502 +            draft,
 503 +            merged,
 504 +            title,
 505 +            base_ref,
 506 +            created_at,
 507 +            updated_at,
 508 +            merged_at,
 509 +            additions,
 510 +            deletions,
 511 +            changed_files,
 512 +            comments_count,
 513 +            review_comments_count,
 514 +            html_url
 515 +        FROM pr_search_documents
 516 +        WHERE run_id = ? AND lower(author_login) = lower(?)
 517 +        ORDER BY updated_at DESC NULLS LAST, pr_number DESC
 518 +        LIMIT ?
 519 +        """,
 520 +        [run_id, author_login, limit],
 521 +    )
 522 +
 523 +
 524  def get_feature(
 525      connection: duckdb.DuckDBPyConnection,
 526      *,
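A sketch of querying the new contributor tables directly; the database path and run id below are placeholders, and an already indexed run (with the new schema) is assumed. Note that both lookups match author_login case-insensitively via lower() on each side.

# Sketch only; path and run_id are placeholders.
from pathlib import Path

from slop_farmer.data.search_duckdb import (
    connect_pr_search_db,
    get_contributor,
    get_contributor_pulls,
)

connection = connect_pr_search_db(Path("data/pr_search.duckdb"), read_only=True)
# One profile row, or None if the login is not in this run.
row = get_contributor(connection, run_id="run-123", author_login="OctoCat")
# Up to 10 of the author's PRs, most recently updated first.
pulls = get_contributor_pulls(
    connection, run_id="run-123", author_login="OctoCat", limit=10
)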
src/slop_farmer/data/snapshot_source.py ADDED
@@ -0,0 +1,31 @@
   1 +from __future__ import annotations
   2 +
   3 +from pathlib import Path
   4 +
   5 +from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
   6 +from slop_farmer.data.snapshot_paths import (
   7 +    default_hf_materialize_dir,
   8 +    resolve_snapshot_dir_from_snapshots_root,
   9 +)
  10 +
  11 +
  12 +def resolve_snapshot_source_dir(
  13 +    *,
  14 +    snapshot_dir: Path | None,
  15 +    local_snapshots_root: Path,
  16 +    hf_repo_id: str | None,
  17 +    hf_revision: str | None,
  18 +    hf_materialize_dir: Path | None,
  19 +    hf_output_dir: Path | None = None,
  20 +) -> Path:
  21 +    if snapshot_dir is not None:
  22 +        return snapshot_dir.resolve()
  23 +    if hf_repo_id:
  24 +        output_dir = (hf_output_dir or local_snapshots_root.parent).resolve()
  25 +        return materialize_hf_dataset_snapshot(
  26 +            repo_id=hf_repo_id,
  27 +            local_dir=hf_materialize_dir
  28 +            or default_hf_materialize_dir(output_dir, hf_repo_id, hf_revision),
  29 +            revision=hf_revision,
  30 +        ).resolve()
  31 +    return resolve_snapshot_dir_from_snapshots_root(local_snapshots_root.resolve(), None)
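resolve_snapshot_source_dir centralizes the precedence the per-report resolvers previously duplicated: an explicit snapshot_dir wins, then a Hub dataset is materialized locally, and otherwise the local snapshots root is used. A sketch with placeholder paths and a placeholder dataset id:

# Sketch only; paths and the dataset id are placeholders.
from pathlib import Path

from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir

snapshot = resolve_snapshot_source_dir(
    snapshot_dir=None,                       # no explicit override
    local_snapshots_root=Path("data/snapshots"),
    hf_repo_id="your-org/your-pr-dataset",   # set -> materialize from the Hub
    hf_revision=None,
    hf_materialize_dir=None,                 # default dir derived from the repo id
    hf_output_dir=Path("data"),
)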
src/slop_farmer/reports/analysis.py CHANGED
@@ -19,11 +19,7 @@ from rank_bm25 import BM25Okapi
  19  from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
  20  from slop_farmer.data.links import build_text_link_rows
  21  from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
  22 -from slop_farmer.data.
  23 -from slop_farmer.data.snapshot_paths import (
  24 -    default_hf_materialize_dir,
  25 -    resolve_snapshot_dir_from_output,
  26 -)
  22 +from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
  23  from slop_farmer.reports.analysis_cache import (
  24      HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
  25      PREPARED_REVIEW_UNIT_SCHEMA_VERSION,

@@ -766,18 +762,14 @@ def _artifact_suffix(row: dict[str, Any] | None, kind: str) -> str:
 762
 763
 764  def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
 769 -
 770 -
 771 -
 772 -
 773 -
 774 -
 775 -
 776 -
 777 -            local_dir=materialize_dir,
 778 -            revision=options.hf_revision,
 779 -        ).resolve()
 780 -    return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
 765 +    return resolve_snapshot_source_dir(
 766 +        snapshot_dir=options.snapshot_dir,
 767 +        local_snapshots_root=options.output_dir.resolve() / "snapshots",
 768 +        hf_repo_id=options.hf_repo_id,
 769 +        hf_revision=options.hf_revision,
 770 +        hf_materialize_dir=options.hf_materialize_dir,
 771 +        hf_output_dir=options.output_dir,
 772 +    )
 773
 774
 775  def _load_snapshot(snapshot_dir: Path) -> SnapshotData:
src/slop_farmer/reports/analysis_service.py CHANGED
@@ -24,6 +24,8 @@ class ActiveSnapshotContext:
  24  class AnalysisContext:
  25      active_run: dict[str, Any]
  26      report: dict[str, Any]
  27 +    report_path: Path
  28 +    report_source: str
  29      variant_requested: str
  30      variant_used: str
  31

@@ -33,26 +35,31 @@ def get_analysis_status(
  35      *,
  36      repo: str | None = None,
  37      variant: str = "auto",
  38 +    analysis_root: Path | None = None,
  39  ) -> dict[str, Any]:
  40      active = _resolve_active_snapshot_context(db_path, repo=repo)
  38 -    report_path, variant_used = _resolve_analysis_report_path(
  41 +    report_path, variant_used, report_source = _resolve_analysis_report_path(
  42          active.snapshot_dir,
  43 +        str(active.active_run["repo"]),
  44          variant,
  45 +        analysis_root=analysis_root,
  46          required=False,
  47      )
  48      payload = {
  49          "repo": str(active.active_run["repo"]),
  45 -        "
  50 +        "active_snapshot_id": str(active.active_run["snapshot_id"]),
  51          "run_id": str(active.active_run["id"]),
  52          "variant_requested": _normalize_analysis_variant(variant),
  53          "available": report_path is not None,
  54      }
  50 -    if report_path is None or variant_used is None:
  55 +    if report_path is None or variant_used is None or report_source is None:
  56          return payload
  57      report = _load_report(report_path)
  58      return {
  59          **payload,
  60 +        "snapshot_id": str(report.get("snapshot_id") or active.active_run["snapshot_id"]),
  61          "variant_used": variant_used,
  62 +        "analysis_source": report_source,
  63          "llm_enrichment": bool(report.get("llm_enrichment")),
  64          "generated_at": report.get("generated_at"),
  65          "counts": _analysis_counts(report),

@@ -65,8 +72,14 @@ def get_pr_analysis(
  72      pr_number: int,
  73      repo: str | None = None,
  74      variant: str = "auto",
  75 +    analysis_root: Path | None = None,
  76  ) -> dict[str, Any]:
  69 -    context = _load_analysis_context(
  77 +    context = _load_analysis_context(
  78 +        db_path,
  79 +        repo=repo,
  80 +        variant=variant,
  81 +        analysis_root=analysis_root,
  82 +    )
  83      meta_bug, rank = _find_meta_bug_for_pr(context.report, pr_number)
  84      duplicate_pr = _find_duplicate_pr_for_pr(context.report, pr_number)
  85      return {

@@ -84,8 +97,14 @@ def list_analysis_meta_bugs(
  97      repo: str | None = None,
  98      variant: str = "auto",
  99      limit: int = 50,
 100 +    analysis_root: Path | None = None,
 101  ) -> dict[str, Any]:
  88 -    context = _load_analysis_context(
 102 +    context = _load_analysis_context(
 103 +        db_path,
 104 +        repo=repo,
 105 +        variant=variant,
 106 +        analysis_root=analysis_root,
 107 +    )
 108      meta_bugs = [
 109          _meta_bug_payload(cluster, rank=index)
 110          for index, cluster in enumerate(context.report.get("meta_bugs", [])[:limit], start=1)

@@ -103,8 +122,14 @@ def get_analysis_meta_bug(
 122      cluster_id: str,
 123      repo: str | None = None,
 124      variant: str = "auto",
 125 +    analysis_root: Path | None = None,
 126  ) -> dict[str, Any]:
 107 -    context = _load_analysis_context(
 127 +    context = _load_analysis_context(
 128 +        db_path,
 129 +        repo=repo,
 130 +        variant=variant,
 131 +        analysis_root=analysis_root,
 132 +    )
 133      for index, cluster in enumerate(context.report.get("meta_bugs", []), start=1):
 134          if str(cluster.get("cluster_id")) != cluster_id:
 135              continue

@@ -113,7 +138,7 @@
 138              "meta_bug": _meta_bug_payload(cluster, rank=index),
 139              "duplicate_pr": _find_duplicate_pr_by_cluster_id(context.report, cluster_id),
 140          }
 116 -    raise ValueError(f"Analysis cluster {cluster_id!r} was not found in the active
 141 +    raise ValueError(f"Analysis cluster {cluster_id!r} was not found in the active analysis view.")
 142
 143
 144  def list_analysis_duplicate_prs(

@@ -122,8 +147,14 @@ def list_analysis_duplicate_prs(
 147      repo: str | None = None,
 148      variant: str = "auto",
 149      limit: int = 50,
 150 +    analysis_root: Path | None = None,
 151  ) -> dict[str, Any]:
 126 -    context = _load_analysis_context(
 152 +    context = _load_analysis_context(
 153 +        db_path,
 154 +        repo=repo,
 155 +        variant=variant,
 156 +        analysis_root=analysis_root,
 157 +    )
 158      duplicate_prs = [
 159          {"rank": index, **dict(entry)}
 160          for index, entry in enumerate(context.report.get("duplicate_prs", [])[:limit], start=1)

@@ -140,8 +171,14 @@ def get_analysis_best(
 171      *,
 172      repo: str | None = None,
 173      variant: str = "auto",
 174 +    analysis_root: Path | None = None,
 175  ) -> dict[str, Any]:
 144 -    context = _load_analysis_context(
 176 +    context = _load_analysis_context(
 177 +        db_path,
 178 +        repo=repo,
 179 +        variant=variant,
 180 +        analysis_root=analysis_root,
 181 +    )
 182      return {
 183          **_analysis_base_payload(context),
 184          "best_issue": _best_entry_with_cluster_id(

@@ -180,18 +217,24 @@ def _load_analysis_context(
 217      *,
 218      repo: str | None,
 219      variant: str,
 220 +    analysis_root: Path | None,
 221  ) -> AnalysisContext:
 222      active = _resolve_active_snapshot_context(db_path, repo=repo)
 185 -    report_path, variant_used = _resolve_analysis_report_path(
 223 +    report_path, variant_used, report_source = _resolve_analysis_report_path(
 224          active.snapshot_dir,
 225 +        str(active.active_run["repo"]),
 226          variant,
 227 +        analysis_root=analysis_root,
 228          required=True,
 229      )
 230      assert report_path is not None
 231      assert variant_used is not None
 232 +    assert report_source is not None
 233      return AnalysisContext(
 234          active_run=active.active_run,
 235          report=_load_report(report_path),
 236 +        report_path=report_path,
 237 +        report_source=report_source,
 238          variant_requested=_normalize_analysis_variant(variant),
 239          variant_used=variant_used,
 240      )

@@ -199,31 +242,56 @@ def _load_analysis_context(
 242
 243  def _resolve_analysis_report_path(
 244      snapshot_dir: Path,
 245 +    repo: str,
 246      variant: str,
 247      *,
 248 +    analysis_root: Path | None,
 249      required: bool,
 205 -) -> tuple[Path | None, str | None]:
 250 +) -> tuple[Path | None, str | None, str | None]:
 251      normalized = _normalize_analysis_variant(variant)
 252 +    candidate_dirs = _candidate_analysis_dirs(
 253 +        snapshot_dir=snapshot_dir,
 254 +        repo=repo,
 255 +        analysis_root=analysis_root,
 256 +    )
 257      if normalized == "auto":
 208 -
 209 -
 210 -
 211 -
 212 -
 213 -
 258 +        for source, directory in candidate_dirs:
 259 +            hybrid_path = directory / ANALYSIS_REPORT_FILENAMES["hybrid"]
 260 +            if hybrid_path.exists():
 261 +                return hybrid_path, "hybrid", source
 262 +            deterministic_path = directory / ANALYSIS_REPORT_FILENAMES["deterministic"]
 263 +            if deterministic_path.exists():
 264 +                return deterministic_path, "deterministic", source
 265          if not required:
 215 -            return None, None
 266 +            return None, None, None
 216 -        raise ValueError(
 217 -
 218 -
 219 -
 267 +        raise ValueError(
 268 +            "No analysis report was found for the current analysis path or active snapshot."
 269 +        )
 270 +    for source, directory in candidate_dirs:
 271 +        report_path = directory / ANALYSIS_REPORT_FILENAMES[normalized]
 272 +        if report_path.exists():
 273 +            return report_path, normalized, source
 274      if not required:
 221 -        return None, None
 275 +        return None, None, None
 276      raise ValueError(
 223 -        f"{normalized.capitalize()} analysis report was not found for the active snapshot."
 277 +        f"{normalized.capitalize()} analysis report was not found for the current analysis path or active snapshot."
 278      )
 279
 280
 281 +def _candidate_analysis_dirs(
 282 +    *,
 283 +    snapshot_dir: Path,
 284 +    repo: str,
 285 +    analysis_root: Path | None,
 286 +) -> list[tuple[str, Path]]:
 287 +    owner, name = repo.split("/", 1)
 288 +    candidates: list[tuple[str, Path]] = []
 289 +    if analysis_root is not None:
 290 +        candidates.append(("current", analysis_root / owner / name / "current"))
 291 +    candidates.append(("snapshot", snapshot_dir))
 292 +    return candidates
 293 +
 294 +
 295  def _normalize_analysis_variant(variant: str) -> str:
 296      normalized = variant.strip().lower()
 297      if normalized not in ANALYSIS_VARIANTS:

@@ -234,12 +302,16 @@ def _normalize_analysis_variant(variant: str) -> str:
 302
 303
 304  def _analysis_base_payload(context: AnalysisContext) -> dict[str, Any]:
 305 +    active_snapshot_id = str(context.active_run["snapshot_id"])
 306 +    snapshot_id = str(context.report.get("snapshot_id") or active_snapshot_id)
 307      return {
 308          "repo": str(context.active_run["repo"]),
 239 -        "snapshot_id":
 309 +        "snapshot_id": snapshot_id,
 310 +        "active_snapshot_id": active_snapshot_id,
 311          "run_id": str(context.active_run["id"]),
 312          "variant_requested": context.variant_requested,
 313          "variant_used": context.variant_used,
 314 +        "analysis_source": context.report_source,
 315          "llm_enrichment": bool(context.report.get("llm_enrichment")),
 316          "generated_at": context.report.get("generated_at"),
 317      }
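The effect of _candidate_analysis_dirs is a two-step lookup: a "current" directory under the configured analysis root (when one is set) is consulted before the active snapshot directory, and within each directory the hybrid report beats the deterministic one in "auto" mode. A standalone sketch of that ordering; the paths and report filenames here are illustrative, not the module's constants:

# Sketch only; directories and filenames are illustrative.
from pathlib import Path

candidate_dirs = [
    ("current", Path("analysis/huggingface/transformers/current")),
    ("snapshot", Path("data/snapshots/2024-06-01")),
]
for source, directory in candidate_dirs:
    for variant in ("hybrid", "deterministic"):
        report = directory / f"analysis-report-{variant}.json"  # illustrative name
        if report.exists():
            print(f"would serve the {variant} report from the {source} dir: {report}")
            break
    else:
        continue  # nothing in this directory; try the next candidate
    break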
src/slop_farmer/reports/dashboard.py CHANGED
@@ -8,7 +8,7 @@ from typing import Any
   8
   9  from slop_farmer.config import DashboardDataOptions
  10  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
  11 -from slop_farmer.data.
  11 +from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
  12
  13
  14  def run_dashboard_data(options: DashboardDataOptions) -> Path:

@@ -88,7 +88,14 @@ def _resolve_snapshot_dir(options: DashboardDataOptions) -> Path:
  88          if options.snapshot_root is not None
  89          else (Path("data") / "snapshots").resolve()
  90      )
  91 -    return
  91 +    return resolve_snapshot_source_dir(
  92 +        snapshot_dir=options.snapshot_dir,
  93 +        local_snapshots_root=snapshots_root,
  94 +        hf_repo_id=options.hf_repo_id,
  95 +        hf_revision=options.hf_revision,
  96 +        hf_materialize_dir=options.hf_materialize_dir,
  97 +        hf_output_dir=snapshots_root.parent,
  98 +    )
  99
 100
 101  def _read_optional_json(path: Path) -> dict[str, Any]:
src/slop_farmer/reports/new_contributor_report.py CHANGED
@@ -12,7 +12,7 @@
  12  from slop_farmer.config import NewContributorReportOptions, resolve_github_token
  13  from slop_farmer.data.http import urlopen_with_retry
  14  from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
  15 -from slop_farmer.data.
  15 +from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
  16  from slop_farmer.reports.user_activity import summarize_user
  17
  18  GRAPHQL_URL = "https://api.github.com/graphql"

@@ -131,7 +131,14 @@ def run_new_contributor_report(options: NewContributorReportOptions) -> Path:
 131
 132
 133  def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
 134 -    return
 134 +    return resolve_snapshot_source_dir(
 135 +        snapshot_dir=options.snapshot_dir,
 136 +        local_snapshots_root=options.output_dir.resolve() / "snapshots",
 137 +        hf_repo_id=options.hf_repo_id,
 138 +        hf_revision=options.hf_revision,
 139 +        hf_materialize_dir=options.hf_materialize_dir,
 140 +        hf_output_dir=options.output_dir,
 141 +    )
 142
 143
 144  def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:

@@ -244,7 +251,6 @@ def _report_contributors(
 251              previous_report_reusable
 252              and previous_entry is not None
 253              and not previous_entry.get("fetch_error")
 247 -            and not known_via_prior_merged_pr
 254          ):
 255              contributors.append(
 256                  _reused_previous_report_entry(

@@ -256,6 +262,8 @@
 262                  )
 263              )
 264              reused_previous_report += 1
 265 +            if known_via_prior_merged_pr:
 266 +                reused_known_merged += 1
 267              continue
 268          try:
 269              summary = summarize_user(row["author_login"], options.window_days, None)
src/slop_farmer/reports/pr_scope.py CHANGED
@@ -42,11 +42,7 @@
  42  from pydantic import BaseModel, Field
  43
  44  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
  45 -from slop_farmer.data.
  46 -from slop_farmer.data.snapshot_paths import (
  47 -    default_hf_materialize_dir,
  48 -    resolve_snapshot_dir_from_output,
  49 -)
  45 +from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
  46  from slop_farmer.reports.pr_heuristics import (
  47      compile_cluster_suppression_rules,
  48      suppressed_pull_request_reasons,

@@ -260,17 +256,14 @@ def run_pr_scope_report(options: Any) -> Path:
 256
 257
 258  def _resolve_snapshot_dir(options: Any) -> Path:
 263 -
 264 -        snapshot_dir
 265 -
 266 -
 267 -
 268 -
 269 -
 270 -
 271 -        )
 272 -        return snapshot_dir.resolve()
 273 -    return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
 259 +    return resolve_snapshot_source_dir(
 260 +        snapshot_dir=options.snapshot_dir,
 261 +        local_snapshots_root=options.output_dir.resolve() / "snapshots",
 262 +        hf_repo_id=options.hf_repo_id,
 263 +        hf_revision=options.hf_revision,
 264 +        hf_materialize_dir=options.hf_materialize_dir,
 265 +        hf_output_dir=options.output_dir,
 266 +    )
 267
 268
 269  def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:
src/slop_farmer/reports/pr_search_scope.py
CHANGED

@@ -10,11 +10,7 @@ from typing import Any

 from slop_farmer.config import PrSearchRefreshOptions
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows
-from slop_farmer.data.
-from slop_farmer.data.snapshot_paths import (
-    default_hf_materialize_dir,
-    resolve_snapshot_dir_from_output,
-)
+from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 from slop_farmer.reports.pr_heuristics import (
     compile_cluster_suppression_rules,
     suppressed_pull_request_reasons,

@@ -36,17 +32,14 @@ DEFAULT_CANDIDATE_LIMIT = 5


 def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
-
-    snapshot_dir
-
-
-
-
-
-
-    )
-    return snapshot_dir.resolve()
-    return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
+    return resolve_snapshot_source_dir(
+        snapshot_dir=options.snapshot_dir,
+        local_snapshots_root=options.output_dir.resolve() / "snapshots",
+        hf_repo_id=options.hf_repo_id,
+        hf_revision=options.hf_revision,
+        hf_materialize_dir=options.hf_materialize_dir,
+        hf_output_dir=options.output_dir,
+    )


 def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:

@@ -54,6 +47,7 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
     manifest = read_json(manifest_path) if manifest_path.exists() else {}
     pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
     pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
+    contributors = read_parquet_rows(snapshot_dir / "new_contributors.parquet")
     repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
     snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
     return {

@@ -62,6 +56,7 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
         "manifest": manifest,
         "pull_requests": pull_requests,
         "pr_files": pr_files,
+        "contributors": contributors,
     }


@@ -412,6 +407,7 @@ def _document_row(row: Mapping[str, Any]) -> dict[str, Any]:
     return {
         "pr_number": int(row["number"]),
         "github_id": row.get("github_id"),
+        "author_login": row.get("author_login"),
         "state": row.get("state"),
         "draft": bool(row.get("draft")),
         "merged": bool(row.get("merged")),
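With the new `contributors` key and the `author_login` field on documents, a caller can join contributor rows onto PR rows straight from a snapshot. A small usage sketch, assuming the snapshot path shown is illustrative and that the `pull_requests` parquet carries an `author_login` column as the document rows suggest:

```python
from pathlib import Path

from slop_farmer.reports.pr_search_scope import load_pr_search_snapshot

# Illustrative path; snapshots normally sit under <output_dir>/snapshots/.
snapshot = load_pr_search_snapshot(Path("out/snapshots/latest"))

# Index contributors by login so each PR row can be paired with its author.
by_login = {c.get("author_login"): c for c in snapshot["contributors"]}
for pr in snapshot["pull_requests"]:
    author = by_login.get(pr.get("author_login"))
    if author is not None:
        print(pr.get("number"), author.get("repo_association"))
```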
src/slop_farmer/reports/pr_search_service.py
CHANGED

@@ -1,7 +1,7 @@
 from __future__ import annotations

 import json
-from collections.abc import Iterable, Mapping
+from collections.abc import Iterable, Mapping, Sequence
 from contextlib import suppress
 from pathlib import Path
 from typing import Any, Protocol

@@ -17,6 +17,8 @@ from slop_farmer.data.search_duckdb import (
     get_cluster,
     get_cluster_ids_for_prs,
     get_cluster_members,
+    get_contributor,
+    get_contributor_pulls,
     get_document,
     get_feature,
     get_pair_neighbor_row,

@@ -99,6 +101,16 @@ def run_pr_search_refresh(options: PrSearchRefreshOptions) -> dict[str, Any]:
         "pr_search_documents",
         _scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
     )
+    insert_rows(
+        connection,
+        "pr_search_contributors",
+        _contributor_rows(
+            snapshot["contributors"],
+            run_id=run_id,
+            repo=repo,
+            snapshot_id=str(snapshot["snapshot_id"]),
+        ),
+    )
     insert_rows(
         connection,
         "pr_scope_features",

@@ -290,6 +302,85 @@ def get_pr_search_candidate_clusters(
         connection.close()


+def get_pr_search_contributor(
+    db_path: Path,
+    *,
+    author_login: str,
+    repo: str | None = None,
+) -> dict[str, Any]:
+    connection = connect_pr_search_db(db_path, read_only=True)
+    try:
+        active_run = resolve_active_run(connection, repo=repo)
+        run_id = str(active_run["id"])
+        contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
+        pulls = _document_rows(
+            get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=20)
+        )
+        return {
+            "repo": active_run["repo"],
+            "snapshot_id": active_run["snapshot_id"],
+            "run_id": run_id,
+            "contributor": contributor,
+            "pulls": pulls,
+            "pull_count": len(pulls),
+        }
+    finally:
+        connection.close()
+
+
+def get_pr_search_contributor_pulls(
+    db_path: Path,
+    *,
+    author_login: str,
+    repo: str | None = None,
+    limit: int = 20,
+) -> dict[str, Any]:
+    connection = connect_pr_search_db(db_path, read_only=True)
+    try:
+        active_run = resolve_active_run(connection, repo=repo)
+        run_id = str(active_run["id"])
+        contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
+        pulls = _document_rows(
+            get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=limit)
+        )
+        return {
+            "repo": active_run["repo"],
+            "snapshot_id": active_run["snapshot_id"],
+            "run_id": run_id,
+            "contributor": contributor,
+            "pulls": pulls,
+            "pull_count": len(pulls),
+        }
+    finally:
+        connection.close()
+
+
+def get_pr_search_pull_contributor(
+    db_path: Path,
+    *,
+    pr_number: int,
+    repo: str | None = None,
+) -> dict[str, Any]:
+    connection = connect_pr_search_db(db_path, read_only=True)
+    try:
+        active_run = resolve_active_run(connection, repo=repo)
+        run_id = str(active_run["id"])
+        document = _require_document(connection, run_id=run_id, pr_number=pr_number)
+        author_login = str(document.get("author_login") or "").strip()
+        if not author_login:
+            raise ValueError(f"PR #{pr_number} does not have an indexed author_login.")
+        contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
+        return {
+            "repo": active_run["repo"],
+            "snapshot_id": active_run["snapshot_id"],
+            "run_id": run_id,
+            "pr": _without_json_fields(document),
+            "contributor": contributor,
+        }
+    finally:
+        connection.close()
+
+
 def get_pr_search_similar_lookup(
     db_path: Path,
     *,

@@ -801,6 +892,15 @@ def _require_feature(connection: Any, *, run_id: str, pr_number: int) -> dict[str, Any]:
     return feature


+def _require_contributor(connection: Any, *, run_id: str, author_login: str) -> dict[str, Any]:
+    contributor = get_contributor(connection, run_id=run_id, author_login=author_login)
+    if contributor is None:
+        raise ValueError(
+            f"Contributor {author_login!r} was not found in the active indexed universe."
+        )
+    return _contributor_row(contributor)
+
+
 def _json_list(raw: Any) -> list[str]:
     if isinstance(raw, list):
         return [str(item) for item in raw]

@@ -838,6 +938,71 @@ def _without_json_fields(row: Mapping[str, Any]) -> dict[str, Any]:
     return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}


+def _document_rows(rows: Sequence[Mapping[str, Any]]) -> list[dict[str, Any]]:
+    return [_without_json_fields(row) for row in rows]
+
+
+def _contributor_rows(
+    rows: list[Mapping[str, Any]],
+    *,
+    run_id: str,
+    repo: str,
+    snapshot_id: str,
+) -> list[dict[str, Any]]:
+    return [
+        {
+            "run_id": run_id,
+            "repo": repo,
+            "snapshot_id": snapshot_id,
+            "report_generated_at": row.get("report_generated_at"),
+            "window_days": row.get("window_days"),
+            "author_login": row.get("author_login"),
+            "name": row.get("name"),
+            "profile_url": row.get("profile_url"),
+            "repo_pull_requests_url": row.get("repo_pull_requests_url"),
+            "repo_issues_url": row.get("repo_issues_url"),
+            "repo_first_seen_at": row.get("repo_first_seen_at"),
+            "repo_last_seen_at": row.get("repo_last_seen_at"),
+            "repo_primary_artifact_count": row.get("repo_primary_artifact_count"),
+            "repo_artifact_count": row.get("repo_artifact_count"),
+            "snapshot_issue_count": row.get("snapshot_issue_count"),
+            "snapshot_pr_count": row.get("snapshot_pr_count"),
+            "snapshot_comment_count": row.get("snapshot_comment_count"),
+            "snapshot_review_count": row.get("snapshot_review_count"),
+            "snapshot_review_comment_count": row.get("snapshot_review_comment_count"),
+            "repo_association": row.get("repo_association"),
+            "new_to_repo": row.get("new_to_repo"),
+            "first_seen_in_snapshot": row.get("first_seen_in_snapshot"),
+            "report_reason": row.get("report_reason"),
+            "account_age_days": row.get("account_age_days"),
+            "young_account": row.get("young_account"),
+            "follow_through_score": row.get("follow_through_score"),
+            "breadth_score": row.get("breadth_score"),
+            "automation_risk_signal": row.get("automation_risk_signal"),
+            "heuristic_note": row.get("heuristic_note"),
+            "public_orgs_json": row.get("public_orgs"),
+            "visible_authored_pr_count": row.get("visible_authored_pr_count"),
+            "merged_pr_count": row.get("merged_pr_count"),
+            "closed_unmerged_pr_count": row.get("closed_unmerged_pr_count"),
+            "open_pr_count": row.get("open_pr_count"),
+            "merged_pr_rate": row.get("merged_pr_rate"),
+            "closed_unmerged_pr_rate": row.get("closed_unmerged_pr_rate"),
+            "still_open_pr_rate": row.get("still_open_pr_rate"),
+            "distinct_repos_with_authored_prs": row.get("distinct_repos_with_authored_prs"),
+            "distinct_repos_with_open_prs": row.get("distinct_repos_with_open_prs"),
+            "fetch_error": row.get("fetch_error"),
+        }
+        for row in rows
+    ]
+
+
+def _contributor_row(row: Mapping[str, Any]) -> dict[str, Any]:
+    return {
+        **_without_json_fields(row),
+        "public_orgs": _json_list(row.get("public_orgs_json")),
+    }
+
+
 def _normalize_lookup_mode(mode: str) -> str:
     normalized = mode.strip().lower()
     if normalized not in {"auto", "indexed", "live"}:
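The three new read paths share one shape: open the DuckDB file read-only, resolve the active run, and return plain dicts. A usage sketch (the database path, login, and PR number below are illustrative, not values from this commit):

```python
from pathlib import Path

from slop_farmer.reports.pr_search_service import (
    get_pr_search_contributor,
    get_pr_search_pull_contributor,
)

DB = Path("out/pr_search.duckdb")  # illustrative location for the index file

# Look a contributor up by login within the active indexed run...
profile = get_pr_search_contributor(DB, author_login="octocat")
print(profile["pull_count"], profile["contributor"].get("merged_pr_rate"))

# ...or start from a PR number and hop to its author's profile.
linked = get_pr_search_pull_contributor(DB, pr_number=1234)
print(linked["pr"]["pr_number"], linked["contributor"].get("author_login"))
```

Note the design choice: `get_pr_search_contributor` is effectively `get_pr_search_contributor_pulls` with `limit` fixed at 20, so the two functions share the same response shape and can back separate API endpoints without divergent payloads.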
uv.lock
CHANGED
|
@@ -4,7 +4,7 @@ requires-python = ">=3.13.5"
|
|
| 4 |
|
| 5 |
[[package]]
|
| 6 |
name = "a2a-sdk"
|
| 7 |
-
version = "0.3.
|
| 8 |
source = { registry = "https://pypi.org/simple" }
|
| 9 |
dependencies = [
|
| 10 |
{ name = "google-api-core" },
|
|
@@ -13,9 +13,9 @@ dependencies = [
|
|
| 13 |
{ name = "protobuf" },
|
| 14 |
{ name = "pydantic" },
|
| 15 |
]
|
| 16 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 17 |
wheels = [
|
| 18 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 19 |
]
|
| 20 |
|
| 21 |
[[package]]
|
|
@@ -53,7 +53,7 @@ wheels = [
|
|
| 53 |
|
| 54 |
[[package]]
|
| 55 |
name = "aiohttp"
|
| 56 |
-
version = "3.13.
|
| 57 |
source = { registry = "https://pypi.org/simple" }
|
| 58 |
dependencies = [
|
| 59 |
{ name = "aiohappyeyeballs" },
|
|
@@ -64,59 +64,59 @@ dependencies = [
|
|
| 64 |
{ name = "propcache" },
|
| 65 |
{ name = "yarl" },
|
| 66 |
]
|
| 67 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 68 |
-
wheels = [
|
| 69 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 70 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 71 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 72 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 73 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 74 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 75 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 76 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 77 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 78 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 79 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 80 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 81 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 82 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 83 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 84 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 85 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 86 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 87 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 88 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 89 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 90 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 91 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 92 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 93 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 94 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 95 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 96 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 97 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 98 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 99 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 100 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 101 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 102 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 103 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 104 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 105 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 106 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 107 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 108 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 109 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 110 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 111 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 112 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 113 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 114 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 115 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 116 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 117 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 118 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 119 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 120 |
]
|
| 121 |
|
| 122 |
[[package]]
|
|
@@ -151,7 +151,7 @@ wheels = [
|
|
| 151 |
|
| 152 |
[[package]]
|
| 153 |
name = "anthropic"
|
| 154 |
-
version = "0.
|
| 155 |
source = { registry = "https://pypi.org/simple" }
|
| 156 |
dependencies = [
|
| 157 |
{ name = "anyio" },
|
|
@@ -163,9 +163,9 @@ dependencies = [
|
|
| 163 |
{ name = "sniffio" },
|
| 164 |
{ name = "typing-extensions" },
|
| 165 |
]
|
| 166 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 167 |
wheels = [
|
| 168 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 169 |
]
|
| 170 |
|
| 171 |
[package.optional-dependencies]
|
|
@@ -539,15 +539,15 @@ wheels = [
|
|
| 539 |
|
| 540 |
[[package]]
|
| 541 |
name = "email-validator"
|
| 542 |
-
version = "2.
|
| 543 |
source = { registry = "https://pypi.org/simple" }
|
| 544 |
dependencies = [
|
| 545 |
{ name = "dnspython" },
|
| 546 |
{ name = "idna" },
|
| 547 |
]
|
| 548 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 549 |
wheels = [
|
| 550 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 551 |
]
|
| 552 |
|
| 553 |
[[package]]
|
|
@@ -561,7 +561,7 @@ wheels = [
|
|
| 561 |
|
| 562 |
[[package]]
|
| 563 |
name = "fast-agent-mcp"
|
| 564 |
-
version = "0.6.
|
| 565 |
source = { registry = "https://pypi.org/simple" }
|
| 566 |
dependencies = [
|
| 567 |
{ name = "a2a-sdk" },
|
|
@@ -598,14 +598,14 @@ dependencies = [
|
|
| 598 |
{ name = "uvloop", marker = "sys_platform != 'win32'" },
|
| 599 |
{ name = "watchfiles" },
|
| 600 |
]
|
| 601 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 602 |
wheels = [
|
| 603 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 604 |
]
|
| 605 |
|
| 606 |
[[package]]
|
| 607 |
name = "fastapi"
|
| 608 |
-
version = "0.
|
| 609 |
source = { registry = "https://pypi.org/simple" }
|
| 610 |
dependencies = [
|
| 611 |
{ name = "annotated-doc" },
|
|
@@ -614,14 +614,14 @@ dependencies = [
|
|
| 614 |
{ name = "typing-extensions" },
|
| 615 |
{ name = "typing-inspection" },
|
| 616 |
]
|
| 617 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 618 |
wheels = [
|
| 619 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 620 |
]
|
| 621 |
|
| 622 |
[[package]]
|
| 623 |
name = "fastmcp"
|
| 624 |
-
version = "3.2.
|
| 625 |
source = { registry = "https://pypi.org/simple" }
|
| 626 |
dependencies = [
|
| 627 |
{ name = "authlib" },
|
|
@@ -646,9 +646,9 @@ dependencies = [
|
|
| 646 |
{ name = "watchfiles" },
|
| 647 |
{ name = "websockets" },
|
| 648 |
]
|
| 649 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 650 |
wheels = [
|
| 651 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 652 |
]
|
| 653 |
|
| 654 |
[[package]]
|
|
@@ -778,7 +778,7 @@ requests = [
|
|
| 778 |
|
| 779 |
[[package]]
|
| 780 |
name = "google-genai"
|
| 781 |
-
version = "1.
|
| 782 |
source = { registry = "https://pypi.org/simple" }
|
| 783 |
dependencies = [
|
| 784 |
{ name = "anyio" },
|
|
@@ -792,9 +792,9 @@ dependencies = [
|
|
| 792 |
{ name = "typing-extensions" },
|
| 793 |
{ name = "websockets" },
|
| 794 |
]
|
| 795 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 796 |
wheels = [
|
| 797 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 798 |
]
|
| 799 |
|
| 800 |
[[package]]
|
|
@@ -1082,7 +1082,7 @@ wheels = [
|
|
| 1082 |
|
| 1083 |
[[package]]
|
| 1084 |
name = "jsonschema"
|
| 1085 |
-
version = "4.
|
| 1086 |
source = { registry = "https://pypi.org/simple" }
|
| 1087 |
dependencies = [
|
| 1088 |
{ name = "attrs" },
|
|
@@ -1090,9 +1090,9 @@ dependencies = [
|
|
| 1090 |
{ name = "referencing" },
|
| 1091 |
{ name = "rpds-py" },
|
| 1092 |
]
|
| 1093 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 1094 |
wheels = [
|
| 1095 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1096 |
]
|
| 1097 |
|
| 1098 |
[[package]]
|
|
@@ -1870,7 +1870,7 @@ wheels = [
|
|
| 1870 |
|
| 1871 |
[[package]]
|
| 1872 |
name = "pydantic"
|
| 1873 |
-
version = "2.
|
| 1874 |
source = { registry = "https://pypi.org/simple" }
|
| 1875 |
dependencies = [
|
| 1876 |
{ name = "annotated-types" },
|
|
@@ -1878,9 +1878,9 @@ dependencies = [
|
|
| 1878 |
{ name = "typing-extensions" },
|
| 1879 |
{ name = "typing-inspection" },
|
| 1880 |
]
|
| 1881 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 1882 |
wheels = [
|
| 1883 |
-
{ url = "https://files.pythonhosted.org/packages/5a/
|
| 1884 |
]
|
| 1885 |
|
| 1886 |
[package.optional-dependencies]
|
|
@@ -1890,69 +1890,72 @@ email = [
|
|
| 1890 |
|
| 1891 |
[[package]]
|
| 1892 |
name = "pydantic-core"
|
| 1893 |
-
version = "2.
|
| 1894 |
source = { registry = "https://pypi.org/simple" }
|
| 1895 |
dependencies = [
|
| 1896 |
{ name = "typing-extensions" },
|
| 1897 |
]
|
| 1898 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 1899 |
-
wheels = [
|
| 1900 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1901 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1902 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1903 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1904 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1905 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1906 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1907 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1908 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1909 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1910 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1911 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1912 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1913 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1914 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1915 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1916 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1917 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1918 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1919 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1920 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1921 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1922 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1923 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1924 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1925 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1926 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1927 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1928 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1929 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1930 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1931 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1932 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1933 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1934 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1935 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1936 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1937 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1938 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1939 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1940 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1941 |
-
{ url = "https://files.pythonhosted.org/packages/
|
|
|
|
|
|
|
|
|
|
| 1942 |
]
|
| 1943 |
|
| 1944 |
[[package]]
|
| 1945 |
name = "pydantic-settings"
|
| 1946 |
-
version = "2.13.
|
| 1947 |
source = { registry = "https://pypi.org/simple" }
|
| 1948 |
dependencies = [
|
| 1949 |
{ name = "pydantic" },
|
| 1950 |
{ name = "python-dotenv" },
|
| 1951 |
{ name = "typing-inspection" },
|
| 1952 |
]
|
| 1953 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 1954 |
wheels = [
|
| 1955 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1956 |
]
|
| 1957 |
|
| 1958 |
[[package]]
|
|
@@ -1993,12 +1996,9 @@ crypto = [
|
|
| 1993 |
|
| 1994 |
[[package]]
|
| 1995 |
name = "pyperclip"
|
| 1996 |
-
version = "1.
|
| 1997 |
source = { registry = "https://pypi.org/simple" }
|
| 1998 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 1999 |
-
wheels = [
|
| 2000 |
-
{ url = "https://files.pythonhosted.org/packages/df/80/fc9d01d5ed37ba4c42ca2b55b4339ae6e200b456be3a1aaddf4a9fa99b8c/pyperclip-1.11.0-py3-none-any.whl", hash = "sha256:299403e9ff44581cb9ba2ffeed69c7aa96a008622ad0c46cb575ca75b5b84273", size = 11063, upload-time = "2025-09-26T14:40:36.069Z" },
|
| 2001 |
-
]
|
| 2002 |
|
| 2003 |
[[package]]
|
| 2004 |
name = "pytest"
|
|
@@ -2366,7 +2366,7 @@ wheels = [
|
|
| 2366 |
|
| 2367 |
[[package]]
|
| 2368 |
name = "slop-farmer"
|
| 2369 |
-
version = "0.1.
|
| 2370 |
source = { editable = "." }
|
| 2371 |
dependencies = [
|
| 2372 |
{ name = "duckdb" },
|
|
@@ -2394,7 +2394,7 @@ llm = [
|
|
| 2394 |
[package.metadata]
|
| 2395 |
requires-dist = [
|
| 2396 |
{ name = "duckdb", specifier = ">=1.2.2" },
|
| 2397 |
-
{ name = "fast-agent-mcp", specifier = ">=0.6.
|
| 2398 |
{ name = "fast-agent-mcp", marker = "python_full_version >= '3.13.5' and extra == 'llm'", specifier = ">=0.6.16" },
|
| 2399 |
{ name = "fastapi", specifier = ">=0.115.0" },
|
| 2400 |
{ name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
|
|
|
|
| 4 |
|
| 5 |
[[package]]
|
| 6 |
name = "a2a-sdk"
|
| 7 |
+
version = "0.3.26"
|
| 8 |
source = { registry = "https://pypi.org/simple" }
|
| 9 |
dependencies = [
|
| 10 |
{ name = "google-api-core" },
|
|
|
|
| 13 |
{ name = "protobuf" },
|
| 14 |
{ name = "pydantic" },
|
| 15 |
]
|
| 16 |
+
sdist = { url = "https://files.pythonhosted.org/packages/be/97/a6840e01795b182ce751ca165430d46459927cde9bfab838087cbb24aef7/a2a_sdk-0.3.26.tar.gz", hash = "sha256:44068e2d037afbb07ab899267439e9bc7eaa7ac2af94f1e8b239933c993ad52d", size = 274598, upload-time = "2026-04-09T15:21:13.902Z" }
|
| 17 |
wheels = [
|
| 18 |
+
{ url = "https://files.pythonhosted.org/packages/dd/d5/51f4ee1bf3b736add42a542d3c8a3fd3fa85f3d36c17972127defc46c26f/a2a_sdk-0.3.26-py3-none-any.whl", hash = "sha256:754e0573f6d33b225c1d8d51f640efa69cbbed7bdfb06ce9c3540ea9f58d4a91", size = 151016, upload-time = "2026-04-09T15:21:12.35Z" },
|
| 19 |
]
|
| 20 |
|
| 21 |
[[package]]
|
|
|
|
| 53 |
|
| 54 |
[[package]]
|
| 55 |
name = "aiohttp"
|
| 56 |
+
version = "3.13.5"
|
| 57 |
source = { registry = "https://pypi.org/simple" }
|
| 58 |
dependencies = [
|
| 59 |
{ name = "aiohappyeyeballs" },
|
|
|
|
| 64 |
{ name = "propcache" },
|
| 65 |
{ name = "yarl" },
|
| 66 |
]
|
| 67 |
+
sdist = { url = "https://files.pythonhosted.org/packages/77/9a/152096d4808df8e4268befa55fba462f440f14beab85e8ad9bf990516918/aiohttp-3.13.5.tar.gz", hash = "sha256:9d98cc980ecc96be6eb4c1994ce35d28d8b1f5e5208a23b421187d1209dbb7d1", size = 7858271, upload-time = "2026-03-31T22:01:03.343Z" }
|
| 68 |
+
wheels = [
|
| 69 |
+
{ url = "https://files.pythonhosted.org/packages/78/e9/d76bf503005709e390122d34e15256b88f7008e246c4bdbe915cd4f1adce/aiohttp-3.13.5-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5029cc80718bbd545123cd8fe5d15025eccaaaace5d0eeec6bd556ad6163d61", size = 742930, upload-time = "2026-03-31T21:58:13.155Z" },
|
| 70 |
+
{ url = "https://files.pythonhosted.org/packages/57/00/4b7b70223deaebd9bb85984d01a764b0d7bd6526fcdc73cca83bcbe7243e/aiohttp-3.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4bb6bf5811620003614076bdc807ef3b5e38244f9d25ca5fe888eaccea2a9832", size = 496927, upload-time = "2026-03-31T21:58:15.073Z" },
|
| 71 |
+
{ url = "https://files.pythonhosted.org/packages/9c/f5/0fb20fb49f8efdcdce6cd8127604ad2c503e754a8f139f5e02b01626523f/aiohttp-3.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a84792f8631bf5a94e52d9cc881c0b824ab42717165a5579c760b830d9392ac9", size = 497141, upload-time = "2026-03-31T21:58:17.009Z" },
|
| 72 |
+
{ url = "https://files.pythonhosted.org/packages/3b/86/b7c870053e36a94e8951b803cb5b909bfbc9b90ca941527f5fcafbf6b0fa/aiohttp-3.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57653eac22c6a4c13eb22ecf4d673d64a12f266e72785ab1c8b8e5940d0e8090", size = 1732476, upload-time = "2026-03-31T21:58:18.925Z" },
|
| 73 |
+
{ url = "https://files.pythonhosted.org/packages/b5/e5/4e161f84f98d80c03a238671b4136e6530453d65262867d989bbe78244d0/aiohttp-3.13.5-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5e5f7debc7a57af53fdf5c5009f9391d9f4c12867049d509bf7bb164a6e295b", size = 1706507, upload-time = "2026-03-31T21:58:21.094Z" },
|
| 74 |
+
{ url = "https://files.pythonhosted.org/packages/d4/56/ea11a9f01518bd5a2a2fcee869d248c4b8a0cfa0bb13401574fa31adf4d4/aiohttp-3.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c719f65bebcdf6716f10e9eff80d27567f7892d8988c06de12bbbd39307c6e3a", size = 1773465, upload-time = "2026-03-31T21:58:23.159Z" },
|
| 75 |
+
{ url = "https://files.pythonhosted.org/packages/eb/40/333ca27fb74b0383f17c90570c748f7582501507307350a79d9f9f3c6eb1/aiohttp-3.13.5-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d97f93fdae594d886c5a866636397e2bcab146fd7a132fd6bb9ce182224452f8", size = 1873523, upload-time = "2026-03-31T21:58:25.59Z" },
|
| 76 |
+
{ url = "https://files.pythonhosted.org/packages/f0/d2/e2f77eef1acb7111405433c707dc735e63f67a56e176e72e9e7a2cd3f493/aiohttp-3.13.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3df334e39d4c2f899a914f1dba283c1aadc311790733f705182998c6f7cae665", size = 1754113, upload-time = "2026-03-31T21:58:27.624Z" },
|
| 77 |
+
{ url = "https://files.pythonhosted.org/packages/fb/56/3f653d7f53c89669301ec9e42c95233e2a0c0a6dd051269e6e678db4fdb0/aiohttp-3.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe6970addfea9e5e081401bcbadf865d2b6da045472f58af08427e108d618540", size = 1562351, upload-time = "2026-03-31T21:58:29.918Z" },
|
| 78 |
+
{ url = "https://files.pythonhosted.org/packages/ec/a6/9b3e91eb8ae791cce4ee736da02211c85c6f835f1bdfac0594a8a3b7018c/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7becdf835feff2f4f335d7477f121af787e3504b48b449ff737afb35869ba7bb", size = 1693205, upload-time = "2026-03-31T21:58:32.214Z" },
|
| 79 |
+
{ url = "https://files.pythonhosted.org/packages/98/fc/bfb437a99a2fcebd6b6eaec609571954de2ed424f01c352f4b5504371dd3/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:676e5651705ad5d8a70aeb8eb6936c436d8ebbd56e63436cb7dd9bb36d2a9a46", size = 1730618, upload-time = "2026-03-31T21:58:34.728Z" },
|
| 80 |
+
{ url = "https://files.pythonhosted.org/packages/e4/b6/c8534862126191a034f68153194c389addc285a0f1347d85096d349bbc15/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:9b16c653d38eb1a611cc898c41e76859ca27f119d25b53c12875fd0474ae31a8", size = 1745185, upload-time = "2026-03-31T21:58:36.909Z" },
|
| 81 |
+
{ url = "https://files.pythonhosted.org/packages/0b/93/4ca8ee2ef5236e2707e0fd5fecb10ce214aee1ff4ab307af9c558bda3b37/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:999802d5fa0389f58decd24b537c54aa63c01c3219ce17d1214cbda3c2b22d2d", size = 1557311, upload-time = "2026-03-31T21:58:39.38Z" },
|
| 82 |
+
{ url = "https://files.pythonhosted.org/packages/57/ae/76177b15f18c5f5d094f19901d284025db28eccc5ae374d1d254181d33f4/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:ec707059ee75732b1ba130ed5f9580fe10ff75180c812bc267ded039db5128c6", size = 1773147, upload-time = "2026-03-31T21:58:41.476Z" },
|
| 83 |
+
{ url = "https://files.pythonhosted.org/packages/01/a4/62f05a0a98d88af59d93b7fcac564e5f18f513cb7471696ac286db970d6a/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d6d44a5b48132053c2f6cd5c8cb14bc67e99a63594e336b0f2af81e94d5530c", size = 1730356, upload-time = "2026-03-31T21:58:44.049Z" },
|
| 84 |
+
{ url = "https://files.pythonhosted.org/packages/e4/85/fc8601f59dfa8c9523808281f2da571f8b4699685f9809a228adcc90838d/aiohttp-3.13.5-cp313-cp313-win32.whl", hash = "sha256:329f292ed14d38a6c4c435e465f48bebb47479fd676a0411936cc371643225cc", size = 432637, upload-time = "2026-03-31T21:58:46.167Z" },
|
| 85 |
+
{ url = "https://files.pythonhosted.org/packages/c0/1b/ac685a8882896acf0f6b31d689e3792199cfe7aba37969fa91da63a7fa27/aiohttp-3.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:69f571de7500e0557801c0b51f4780482c0ec5fe2ac851af5a92cfce1af1cb83", size = 458896, upload-time = "2026-03-31T21:58:48.119Z" },
|
| 86 |
+
{ url = "https://files.pythonhosted.org/packages/5d/ce/46572759afc859e867a5bc8ec3487315869013f59281ce61764f76d879de/aiohttp-3.13.5-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:eb4639f32fd4a9904ab8fb45bf3383ba71137f3d9d4ba25b3b3f3109977c5b8c", size = 745721, upload-time = "2026-03-31T21:58:50.229Z" },
|
| 87 |
+
{ url = "https://files.pythonhosted.org/packages/13/fe/8a2efd7626dbe6049b2ef8ace18ffda8a4dfcbe1bcff3ac30c0c7575c20b/aiohttp-3.13.5-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:7e5dc4311bd5ac493886c63cbf76ab579dbe4641268e7c74e48e774c74b6f2be", size = 497663, upload-time = "2026-03-31T21:58:52.232Z" },
|
| 88 |
+
{ url = "https://files.pythonhosted.org/packages/9b/91/cc8cc78a111826c54743d88651e1687008133c37e5ee615fee9b57990fac/aiohttp-3.13.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:756c3c304d394977519824449600adaf2be0ccee76d206ee339c5e76b70ded25", size = 499094, upload-time = "2026-03-31T21:58:54.566Z" },
|
| 89 |
+
{ url = "https://files.pythonhosted.org/packages/0a/33/a8362cb15cf16a3af7e86ed11962d5cd7d59b449202dc576cdc731310bde/aiohttp-3.13.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecc26751323224cf8186efcf7fbcbc30f4e1d8c7970659daf25ad995e4032a56", size = 1726701, upload-time = "2026-03-31T21:58:56.864Z" },
|
| 90 |
+
{ url = "https://files.pythonhosted.org/packages/45/0c/c091ac5c3a17114bd76cbf85d674650969ddf93387876cf67f754204bd77/aiohttp-3.13.5-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10a75acfcf794edf9d8db50e5a7ec5fc818b2a8d3f591ce93bc7b1210df016d2", size = 1683360, upload-time = "2026-03-31T21:58:59.072Z" },
|
| 91 |
+
{ url = "https://files.pythonhosted.org/packages/23/73/bcee1c2b79bc275e964d1446c55c54441a461938e70267c86afaae6fba27/aiohttp-3.13.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0f7a18f258d124cd678c5fe072fe4432a4d5232b0657fca7c1847f599233c83a", size = 1773023, upload-time = "2026-03-31T21:59:01.776Z" },
|
| 92 |
+
{ url = "https://files.pythonhosted.org/packages/c7/ef/720e639df03004fee2d869f771799d8c23046dec47d5b81e396c7cda583a/aiohttp-3.13.5-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:df6104c009713d3a89621096f3e3e88cc323fd269dbd7c20afe18535094320be", size = 1853795, upload-time = "2026-03-31T21:59:04.568Z" },
|
| 93 |
+
{ url = "https://files.pythonhosted.org/packages/bd/c9/989f4034fb46841208de7aeeac2c6d8300745ab4f28c42f629ba77c2d916/aiohttp-3.13.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:241a94f7de7c0c3b616627aaad530fe2cb620084a8b144d3be7b6ecfe95bae3b", size = 1730405, upload-time = "2026-03-31T21:59:07.221Z" },
|
| 94 |
+
{ url = "https://files.pythonhosted.org/packages/ce/75/ee1fd286ca7dc599d824b5651dad7b3be7ff8d9a7e7b3fe9820d9180f7db/aiohttp-3.13.5-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c974fb66180e58709b6fc402846f13791240d180b74de81d23913abe48e96d94", size = 1558082, upload-time = "2026-03-31T21:59:09.484Z" },
|
| 95 |
+
{ url = "https://files.pythonhosted.org/packages/c3/20/1e9e6650dfc436340116b7aa89ff8cb2bbdf0abc11dfaceaad8f74273a10/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6e27ea05d184afac78aabbac667450c75e54e35f62238d44463131bd3f96753d", size = 1692346, upload-time = "2026-03-31T21:59:12.068Z" },
|
| 96 |
+
{ url = "https://files.pythonhosted.org/packages/d8/40/8ebc6658d48ea630ac7903912fe0dd4e262f0e16825aa4c833c56c9f1f56/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a79a6d399cef33a11b6f004c67bb07741d91f2be01b8d712d52c75711b1e07c7", size = 1698891, upload-time = "2026-03-31T21:59:14.552Z" },
|
| 97 |
+
{ url = "https://files.pythonhosted.org/packages/d8/78/ea0ae5ec8ba7a5c10bdd6e318f1ba5e76fcde17db8275188772afc7917a4/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c632ce9c0b534fbe25b52c974515ed674937c5b99f549a92127c85f771a78772", size = 1742113, upload-time = "2026-03-31T21:59:17.068Z" },
|
| 98 |
+
{ url = "https://files.pythonhosted.org/packages/8a/66/9d308ed71e3f2491be1acb8769d96c6f0c47d92099f3bc9119cada27b357/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:fceedde51fbd67ee2bcc8c0b33d0126cc8b51ef3bbde2f86662bd6d5a6f10ec5", size = 1553088, upload-time = "2026-03-31T21:59:19.541Z" },
|
| 99 |
+
{ url = "https://files.pythonhosted.org/packages/da/a6/6cc25ed8dfc6e00c90f5c6d126a98e2cf28957ad06fa1036bd34b6f24a2c/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f92995dfec9420bb69ae629abf422e516923ba79ba4403bc750d94fb4a6c68c1", size = 1757976, upload-time = "2026-03-31T21:59:22.311Z" },
|
| 100 |
+
{ url = "https://files.pythonhosted.org/packages/c1/2b/cce5b0ffe0de99c83e5e36d8f828e4161e415660a9f3e58339d07cce3006/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:20ae0ff08b1f2c8788d6fb85afcb798654ae6ba0b747575f8562de738078457b", size = 1712444, upload-time = "2026-03-31T21:59:24.635Z" },
|
| 101 |
+
{ url = "https://files.pythonhosted.org/packages/6c/cf/9e1795b4160c58d29421eafd1a69c6ce351e2f7c8d3c6b7e4ca44aea1a5b/aiohttp-3.13.5-cp314-cp314-win32.whl", hash = "sha256:b20df693de16f42b2472a9c485e1c948ee55524786a0a34345511afdd22246f3", size = 438128, upload-time = "2026-03-31T21:59:27.291Z" },
|
| 102 |
+
{ url = "https://files.pythonhosted.org/packages/22/4d/eaedff67fc805aeba4ba746aec891b4b24cebb1a7d078084b6300f79d063/aiohttp-3.13.5-cp314-cp314-win_amd64.whl", hash = "sha256:f85c6f327bf0b8c29da7d93b1cabb6363fb5e4e160a32fa241ed2dce21b73162", size = 464029, upload-time = "2026-03-31T21:59:29.429Z" },
|
| 103 |
+
{ url = "https://files.pythonhosted.org/packages/79/11/c27d9332ee20d68dd164dc12a6ecdef2e2e35ecc97ed6cf0d2442844624b/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:1efb06900858bb618ff5cee184ae2de5828896c448403d51fb633f09e109be0a", size = 778758, upload-time = "2026-03-31T21:59:31.547Z" },
|
| 104 |
+
{ url = "https://files.pythonhosted.org/packages/04/fb/377aead2e0a3ba5f09b7624f702a964bdf4f08b5b6728a9799830c80041e/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:fee86b7c4bd29bdaf0d53d14739b08a106fdda809ca5fe032a15f52fae5fe254", size = 512883, upload-time = "2026-03-31T21:59:34.098Z" },
|
| 105 |
+
{ url = "https://files.pythonhosted.org/packages/bb/a6/aa109a33671f7a5d3bd78b46da9d852797c5e665bfda7d6b373f56bff2ec/aiohttp-3.13.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:20058e23909b9e65f9da62b396b77dfa95965cbe840f8def6e572538b1d32e36", size = 516668, upload-time = "2026-03-31T21:59:36.497Z" },
|
| 106 |
+
{ url = "https://files.pythonhosted.org/packages/79/b3/ca078f9f2fa9563c36fb8ef89053ea2bb146d6f792c5104574d49d8acb63/aiohttp-3.13.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cf20a8d6868cb15a73cab329ffc07291ba8c22b1b88176026106ae39aa6df0f", size = 1883461, upload-time = "2026-03-31T21:59:38.723Z" },
|
| 107 |
+
{ url = "https://files.pythonhosted.org/packages/b7/e3/a7ad633ca1ca497b852233a3cce6906a56c3225fb6d9217b5e5e60b7419d/aiohttp-3.13.5-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:330f5da04c987f1d5bdb8ae189137c77139f36bd1cb23779ca1a354a4b027800", size = 1747661, upload-time = "2026-03-31T21:59:41.187Z" },
|
| 108 |
+
{ url = "https://files.pythonhosted.org/packages/33/b9/cd6fe579bed34a906d3d783fe60f2fa297ef55b27bb4538438ee49d4dc41/aiohttp-3.13.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6f1cbf0c7926d315c3c26c2da41fd2b5d2fe01ac0e157b78caefc51a782196cf", size = 1863800, upload-time = "2026-03-31T21:59:43.84Z" },
|
| 109 |
+
{ url = "https://files.pythonhosted.org/packages/c0/3f/2c1e2f5144cefa889c8afd5cf431994c32f3b29da9961698ff4e3811b79a/aiohttp-3.13.5-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:53fc049ed6390d05423ba33103ded7281fe897cf97878f369a527070bd95795b", size = 1958382, upload-time = "2026-03-31T21:59:46.187Z" },
|
| 110 |
+
{ url = "https://files.pythonhosted.org/packages/66/1d/f31ec3f1013723b3babe3609e7f119c2c2fb6ef33da90061a705ef3e1bc8/aiohttp-3.13.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:898703aa2667e3c5ca4c54ca36cd73f58b7a38ef87a5606414799ebce4d3fd3a", size = 1803724, upload-time = "2026-03-31T21:59:48.656Z" },
|
| 111 |
+
{ url = "https://files.pythonhosted.org/packages/0e/b4/57712dfc6f1542f067daa81eb61da282fab3e6f1966fca25db06c4fc62d5/aiohttp-3.13.5-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0494a01ca9584eea1e5fbd6d748e61ecff218c51b576ee1999c23db7066417d8", size = 1640027, upload-time = "2026-03-31T21:59:51.284Z" },
|
| 112 |
+
{ url = "https://files.pythonhosted.org/packages/25/3c/734c878fb43ec083d8e31bf029daae1beafeae582d1b35da234739e82ee7/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6cf81fe010b8c17b09495cbd15c1d35afbc8fb405c0c9cf4738e5ae3af1d65be", size = 1806644, upload-time = "2026-03-31T21:59:53.753Z" },
|
| 113 |
+
{ url = "https://files.pythonhosted.org/packages/20/a5/f671e5cbec1c21d044ff3078223f949748f3a7f86b14e34a365d74a5d21f/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:c564dd5f09ddc9d8f2c2d0a301cd30a79a2cc1b46dd1a73bef8f0038863d016b", size = 1791630, upload-time = "2026-03-31T21:59:56.239Z" },
|
| 114 |
+
{ url = "https://files.pythonhosted.org/packages/0b/63/fb8d0ad63a0b8a99be97deac8c04dacf0785721c158bdf23d679a87aa99e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:2994be9f6e51046c4f864598fd9abeb4fba6e88f0b2152422c9666dcd4aea9c6", size = 1809403, upload-time = "2026-03-31T21:59:59.103Z" },
|
| 115 |
+
{ url = "https://files.pythonhosted.org/packages/59/0c/bfed7f30662fcf12206481c2aac57dedee43fe1c49275e85b3a1e1742294/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:157826e2fa245d2ef46c83ea8a5faf77ca19355d278d425c29fda0beb3318037", size = 1634924, upload-time = "2026-03-31T22:00:02.116Z" },
|
| 116 |
+
{ url = "https://files.pythonhosted.org/packages/17/d6/fd518d668a09fd5a3319ae5e984d4d80b9a4b3df4e21c52f02251ef5a32e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:a8aca50daa9493e9e13c0f566201a9006f080e7c50e5e90d0b06f53146a54500", size = 1836119, upload-time = "2026-03-31T22:00:04.756Z" },
|
| 117 |
+
{ url = "https://files.pythonhosted.org/packages/78/b7/15fb7a9d52e112a25b621c67b69c167805cb1f2ab8f1708a5c490d1b52fe/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3b13560160d07e047a93f23aaa30718606493036253d5430887514715b67c9d9", size = 1772072, upload-time = "2026-03-31T22:00:07.494Z" },
|
| 118 |
+
{ url = "https://files.pythonhosted.org/packages/7e/df/57ba7f0c4a553fc2bd8b6321df236870ec6fd64a2a473a8a13d4f733214e/aiohttp-3.13.5-cp314-cp314t-win32.whl", hash = "sha256:9a0f4474b6ea6818b41f82172d799e4b3d29e22c2c520ce4357856fced9af2f8", size = 471819, upload-time = "2026-03-31T22:00:10.277Z" },
|
| 119 |
+
{ url = "https://files.pythonhosted.org/packages/62/29/2f8418269e46454a26171bfdd6a055d74febf32234e474930f2f60a17145/aiohttp-3.13.5-cp314-cp314t-win_amd64.whl", hash = "sha256:18a2f6c1182c51baa1d28d68fea51513cb2a76612f038853c0ad3c145423d3d9", size = 505441, upload-time = "2026-03-31T22:00:12.791Z" },
|
| 120 |
]
|
| 121 |
|
| 122 |
[[package]]
|
|
|
|
| 151 |
|
| 152 |
[[package]]
|
| 153 |
name = "anthropic"
|
| 154 |
+
version = "0.96.0"
|
| 155 |
source = { registry = "https://pypi.org/simple" }
|
| 156 |
dependencies = [
|
| 157 |
{ name = "anyio" },
|
|
|
|
| 163 |
{ name = "sniffio" },
|
| 164 |
{ name = "typing-extensions" },
|
| 165 |
]
|
| 166 |
+
sdist = { url = "https://files.pythonhosted.org/packages/b9/7e/672f533dee813028d2c699bfd2a7f52c9118d7353680d9aa44b9e23f717f/anthropic-0.96.0.tar.gz", hash = "sha256:9de947b737f39452f68aa520f1c2239d44119c9b73b0fb6d4e6ca80f00279ee6", size = 658210, upload-time = "2026-04-16T14:28:02.846Z" }
|
| 167 |
wheels = [
|
| 168 |
+
{ url = "https://files.pythonhosted.org/packages/48/5a/72f33204064b6e87601a71a6baf8d855769f8a0c1eaae8d06a1094872371/anthropic-0.96.0-py3-none-any.whl", hash = "sha256:9a6e335a354602a521cd9e777e92bfd46ba6e115bf9bbfe6135311e8fb2015b2", size = 635930, upload-time = "2026-04-16T14:28:01.436Z" },
|
| 169 |
]
|
| 170 |
|
| 171 |
[package.optional-dependencies]
|
|
|
|
| 539 |
|
| 540 |
[[package]]
|
| 541 |
name = "email-validator"
|
| 542 |
+
version = "2.2.0"
|
| 543 |
source = { registry = "https://pypi.org/simple" }
|
| 544 |
dependencies = [
|
| 545 |
{ name = "dnspython" },
|
| 546 |
{ name = "idna" },
|
| 547 |
]
|
| 548 |
+
sdist = { url = "https://files.pythonhosted.org/packages/48/ce/13508a1ec3f8bb981ae4ca79ea40384becc868bfae97fd1c942bb3a001b1/email_validator-2.2.0.tar.gz", hash = "sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7", size = 48967, upload-time = "2024-06-20T11:30:30.034Z" }
|
| 549 |
wheels = [
|
| 550 |
+
{ url = "https://files.pythonhosted.org/packages/d7/ee/bf0adb559ad3c786f12bcbc9296b3f5675f529199bef03e2df281fa1fadb/email_validator-2.2.0-py3-none-any.whl", hash = "sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631", size = 33521, upload-time = "2024-06-20T11:30:28.248Z" },
|
| 551 |
]
|
| 552 |
|
| 553 |
[[package]]
|
|
|
|
| 561 |
|
| 562 |
[[package]]
|
| 563 |
name = "fast-agent-mcp"
|
| 564 |
+
version = "0.6.17"
|
| 565 |
source = { registry = "https://pypi.org/simple" }
|
| 566 |
dependencies = [
|
| 567 |
{ name = "a2a-sdk" },
|
|
|
|
| 598 |
{ name = "uvloop", marker = "sys_platform != 'win32'" },
|
| 599 |
{ name = "watchfiles" },
|
| 600 |
]
|
| 601 |
+
sdist = { url = "https://files.pythonhosted.org/packages/8c/a1/b6b1045345d38b342da3def7723a2dc6a44faff9c01fee6d81afbd272d62/fast_agent_mcp-0.6.17.tar.gz", hash = "sha256:a920113d47ef2ab82be1bd63b77d3bf78f8f862a5a6e91f1fd0aa931850fb25f", size = 2091401, upload-time = "2026-04-16T21:48:43.334Z" }
|
| 602 |
wheels = [
|
| 603 |
+
{ url = "https://files.pythonhosted.org/packages/b4/ef/47e05d6fa95e04ed8ad60afac3ae29d8205894fb220ffde193bd33578f3a/fast_agent_mcp-0.6.17-py3-none-any.whl", hash = "sha256:a23c5a5ed8924e38809dabd31f994e5cc81b8c084e84632bb1eb246b257c4752", size = 1573794, upload-time = "2026-04-16T21:48:38.999Z" },
|
| 604 |
]
|
| 605 |
|
| 606 |
[[package]]
|
| 607 |
name = "fastapi"
|
| 608 |
+
version = "0.136.0"
|
| 609 |
source = { registry = "https://pypi.org/simple" }
|
| 610 |
dependencies = [
|
| 611 |
{ name = "annotated-doc" },
|
|
|
|
| 614 |
{ name = "typing-extensions" },
|
| 615 |
{ name = "typing-inspection" },
|
| 616 |
]
|
| 617 |
+
sdist = { url = "https://files.pythonhosted.org/packages/4e/d9/e66315807e41e69e7f6a1b42a162dada2f249c5f06ad3f1a95f84ab336ef/fastapi-0.136.0.tar.gz", hash = "sha256:cf08e067cc66e106e102d9ba659463abfac245200752f8a5b7b1e813de4ff73e", size = 396607, upload-time = "2026-04-16T11:47:13.623Z" }
|
| 618 |
wheels = [
|
| 619 |
+
{ url = "https://files.pythonhosted.org/packages/26/a3/0bd5f0cdb0bbc92650e8dc457e9250358411ee5d1b65e42b6632387daf81/fastapi-0.136.0-py3-none-any.whl", hash = "sha256:8793d44ec7378e2be07f8a013cf7f7aa47d6327d0dfe9804862688ec4541a6b4", size = 117556, upload-time = "2026-04-16T11:47:11.922Z" },
|
| 620 |
]
|
| 621 |
|
| 622 |
[[package]]
|
| 623 |
name = "fastmcp"
|
| 624 |
+
version = "3.2.3"
|
| 625 |
source = { registry = "https://pypi.org/simple" }
|
| 626 |
dependencies = [
|
| 627 |
{ name = "authlib" },
|
|
|
|
| 646 |
{ name = "watchfiles" },
|
| 647 |
{ name = "websockets" },
|
| 648 |
]
|
| 649 |
+
sdist = { url = "https://files.pythonhosted.org/packages/b9/42/7eed0a38e3b7a386805fecacf8a5a9353a2b3040395ef9e30e585d8549ac/fastmcp-3.2.3.tar.gz", hash = "sha256:4f02ae8b00227285a0cf6544dea1db29b022c8cdd8d3dfdec7118540210ae60a", size = 26328743, upload-time = "2026-04-09T22:05:03.402Z" }
|
| 650 |
wheels = [
|
| 651 |
+
{ url = "https://files.pythonhosted.org/packages/f5/48/84b6dcba793178a44b9d99b4def6cd62f870dcfc5bb7b9153ac390135812/fastmcp-3.2.3-py3-none-any.whl", hash = "sha256:cc50af6eed1f62ed8b6ebf4987286d8d1d006f08d5bec739d5c7fb76160e0911", size = 707260, upload-time = "2026-04-09T22:05:01.225Z" },
|
| 652 |
]
|
| 653 |
|
| 654 |
[[package]]
|
|
|
|
| 778 |
|
| 779 |
[[package]]
|
| 780 |
name = "google-genai"
|
| 781 |
+
version = "1.66.0"
|
| 782 |
source = { registry = "https://pypi.org/simple" }
|
| 783 |
dependencies = [
|
| 784 |
{ name = "anyio" },
|
|
|
|
| 792 |
{ name = "typing-extensions" },
|
| 793 |
{ name = "websockets" },
|
| 794 |
]
|
| 795 |
+
sdist = { url = "https://files.pythonhosted.org/packages/9b/ba/0b343b0770d4710ad2979fd9301d7caa56c940174d5361ed4a7cc4979241/google_genai-1.66.0.tar.gz", hash = "sha256:ffc01647b65046bca6387320057aa51db0ad64bcc72c8e3e914062acfa5f7c49", size = 504386, upload-time = "2026-03-04T22:15:28.156Z" }
|
| 796 |
wheels = [
|
| 797 |
+
{ url = "https://files.pythonhosted.org/packages/d1/dd/403949d922d4e261b08b64aaa132af4e456c3b15c8e2a2d9e6ef693f66e2/google_genai-1.66.0-py3-none-any.whl", hash = "sha256:7f127a39cf695277104ce4091bb26e417c59bb46e952ff3699c3a982d9c474ee", size = 732174, upload-time = "2026-03-04T22:15:26.63Z" },
|
| 798 |
]
|
| 799 |
|
| 800 |
[[package]]
|
|
|
|
| 1082 |
|
| 1083 |
[[package]]
|
| 1084 |
name = "jsonschema"
|
| 1085 |
+
version = "4.25.1"
|
| 1086 |
source = { registry = "https://pypi.org/simple" }
|
| 1087 |
dependencies = [
|
| 1088 |
{ name = "attrs" },
|
|
|
|
| 1090 |
{ name = "referencing" },
|
| 1091 |
{ name = "rpds-py" },
|
| 1092 |
]
|
| 1093 |
+
sdist = { url = "https://files.pythonhosted.org/packages/74/69/f7185de793a29082a9f3c7728268ffb31cb5095131a9c139a74078e27336/jsonschema-4.25.1.tar.gz", hash = "sha256:e4a9655ce0da0c0b67a085847e00a3a51449e1157f4f75e9fb5aa545e122eb85", size = 357342, upload-time = "2025-08-18T17:03:50.038Z" }
|
| 1094 |
wheels = [
|
| 1095 |
+
{ url = "https://files.pythonhosted.org/packages/bf/9c/8c95d856233c1f82500c2450b8c68576b4cf1c871db3afac5c34ff84e6fd/jsonschema-4.25.1-py3-none-any.whl", hash = "sha256:3fba0169e345c7175110351d456342c364814cfcf3b964ba4587f22915230a63", size = 90040, upload-time = "2025-08-18T17:03:48.373Z" },
|
| 1096 |
]
|
| 1097 |
|
| 1098 |
[[package]]
|
|
|
|
| 1870 |
|
| 1871 |
[[package]]
|
| 1872 |
name = "pydantic"
|
| 1873 |
+
version = "2.13.1"
|
| 1874 |
source = { registry = "https://pypi.org/simple" }
|
| 1875 |
dependencies = [
|
| 1876 |
{ name = "annotated-types" },
|
|
|
|
| 1878 |
{ name = "typing-extensions" },
|
| 1879 |
{ name = "typing-inspection" },
|
| 1880 |
]
|
| 1881 |
+
sdist = { url = "https://files.pythonhosted.org/packages/f3/6b/1353beb3d1cd5cf61cdec5b6f87a9872399de3bc5cae0b7ce07ff4de2ab0/pydantic-2.13.1.tar.gz", hash = "sha256:a0f829b279ddd1e39291133fe2539d2aa46cc6b150c1706a270ff0879e3774d2", size = 843746, upload-time = "2026-04-15T14:57:19.398Z" }
|
| 1882 |
wheels = [
|
| 1883 |
+
{ url = "https://files.pythonhosted.org/packages/81/5a/2225f4c176dbfed0d809e848b50ef08f70e61daa667b7fa14b0d311ae44d/pydantic-2.13.1-py3-none-any.whl", hash = "sha256:9557ecc2806faaf6037f85b1fbd963d01e30511c48085f0d573650fdeaad378a", size = 471917, upload-time = "2026-04-15T14:57:17.277Z" },
|
| 1884 |
]
|
| 1885 |
|
| 1886 |
[package.optional-dependencies]
|
|
|
|
| 1890 |
|
| 1891 |
[[package]]
|
| 1892 |
name = "pydantic-core"
|
| 1893 |
+
version = "2.46.1"
|
| 1894 |
source = { registry = "https://pypi.org/simple" }
|
| 1895 |
dependencies = [
|
| 1896 |
{ name = "typing-extensions" },
|
| 1897 |
]
|
| 1898 |
+
sdist = { url = "https://files.pythonhosted.org/packages/a1/93/f97a86a7eb28faa1d038af2fd5d6166418b4433659108a4c311b57128b2d/pydantic_core-2.46.1.tar.gz", hash = "sha256:d408153772d9f298098fb5d620f045bdf0f017af0d5cb6e309ef8c205540caa4", size = 471230, upload-time = "2026-04-15T14:49:34.52Z" }
|
| 1899 |
+
wheels = [
|
| 1900 |
+
{ url = "https://files.pythonhosted.org/packages/ff/d2/bda39bad2f426cb5078e6ad28076614d3926704196efe0d7a2a19a99025d/pydantic_core-2.46.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:cdc8a5762a9c4b9d86e204d555444e3227507c92daba06259ee66595834de47a", size = 2119092, upload-time = "2026-04-15T14:49:50.392Z" },
|
| 1901 |
+
{ url = "https://files.pythonhosted.org/packages/ee/f3/69631e64d69cb3481494b2bddefe0ddd07771209f74e9106d066f9138c2a/pydantic_core-2.46.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ba381dfe9c85692c566ecb60fa5a77a697a2a8eebe274ec5e4d6ec15fafad799", size = 1951400, upload-time = "2026-04-15T14:51:06.588Z" },
|
| 1902 |
+
{ url = "https://files.pythonhosted.org/packages/53/1c/21cb3db6ae997df31be8e91f213081f72ffa641cb45c89b8a1986832b1f9/pydantic_core-2.46.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1593d8de98207466dc070118322fef68307a0cc6a5625e7b386f6fdae57f9ab6", size = 1976864, upload-time = "2026-04-15T14:50:54.804Z" },
|
| 1903 |
+
{ url = "https://files.pythonhosted.org/packages/91/9c/05c819f734318ce5a6ca24da300d93696c105af4adb90494ee571303afd8/pydantic_core-2.46.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8262c74a1af5b0fdf795f5537f7145785a63f9fbf9e15405f547440c30017ed8", size = 2066669, upload-time = "2026-04-15T14:51:42.346Z" },
|
| 1904 |
+
{ url = "https://files.pythonhosted.org/packages/cb/23/fadddf1c7f2f517f58731aea9b35c914e6005250f08dac9b8e53904cdbaa/pydantic_core-2.46.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b88949a24182e83fbbb3f7ca9b7858d0d37b735700ea91081434b7d37b3b444", size = 2238737, upload-time = "2026-04-15T14:50:45.558Z" },
|
| 1905 |
+
{ url = "https://files.pythonhosted.org/packages/23/07/0cd4f95cb0359c8b1ec71e89c3777e7932c8dfeb9cd54740289f310aaead/pydantic_core-2.46.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8f3708cd55537aeaf3fd0ea55df0d68d0da51dcb07cbc8508745b34acc4c6e0", size = 2316258, upload-time = "2026-04-15T14:51:08.471Z" },
|
| 1906 |
+
{ url = "https://files.pythonhosted.org/packages/0c/40/6fc24c3766a19c222a0d60d652b78f0283339d4cd4c173fab06b7ee76571/pydantic_core-2.46.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f79292435fff1d4f0c18d9cfaf214025cc88e4f5104bfaed53f173621da1c743", size = 2097474, upload-time = "2026-04-15T14:49:56.543Z" },
|
| 1907 |
+
{ url = "https://files.pythonhosted.org/packages/4b/af/f39795d1ce549e35d0841382b9c616ae211caffb88863147369a8d74fba9/pydantic_core-2.46.1-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:a2e607aeb59cf4575bb364470288db3b9a1f0e7415d053a322e3e154c1a0802e", size = 2168383, upload-time = "2026-04-15T14:51:29.269Z" },
|
| 1908 |
+
{ url = "https://files.pythonhosted.org/packages/e6/32/0d563f74582795779df6cc270c3fc220f49f4daf7860d74a5a6cda8491ff/pydantic_core-2.46.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ec5ca190b75878a9f6ae1fc8f5eb678497934475aef3d93204c9fa01e97370b6", size = 2186182, upload-time = "2026-04-15T14:50:19.097Z" },
|
| 1909 |
+
{ url = "https://files.pythonhosted.org/packages/5c/07/1c10d5ce312fc4cf86d1e50bdcdbb8ef248409597b099cab1b4bb3a093f7/pydantic_core-2.46.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:1f80535259dcdd517d7b8ca588d5ca24b4f337228e583bebedf7a3adcdf5f721", size = 2187859, upload-time = "2026-04-15T14:49:22.974Z" },
|
| 1910 |
+
{ url = "https://files.pythonhosted.org/packages/92/01/e1f62d4cb39f0913dbf5c95b9b119ef30ddba9493dff8c2b012f0cdd67dc/pydantic_core-2.46.1-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:24820b3c82c43df61eca30147e42853e6c127d8b868afdc0c162df829e011eb4", size = 2338372, upload-time = "2026-04-15T14:49:53.316Z" },
|
| 1911 |
+
{ url = "https://files.pythonhosted.org/packages/44/ed/218dfeea6127fb1781a6ceca241ec6edf00e8a8933ff331af2215975a534/pydantic_core-2.46.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:f12794b1dd8ac9fb66619e0b3a0427189f5d5638e55a3de1385121a9b7bf9b39", size = 2384039, upload-time = "2026-04-15T14:53:04.929Z" },
|
| 1912 |
+
{ url = "https://files.pythonhosted.org/packages/6c/1e/011e763cd059238249fbd5780e0f8d0b04b47f86c8925e22784f3e5fc977/pydantic_core-2.46.1-cp313-cp313-win32.whl", hash = "sha256:9bc09aed935cdf50f09e908923f9efbcca54e9244bd14a5a0e2a6c8d2c21b4e9", size = 1977943, upload-time = "2026-04-15T14:52:17.969Z" },
|
| 1913 |
+
{ url = "https://files.pythonhosted.org/packages/8c/06/b559a490d3ed106e9b1777b8d5c8112dd8d31716243cd662616f66c1f8ea/pydantic_core-2.46.1-cp313-cp313-win_amd64.whl", hash = "sha256:fac2d6c8615b8b42bee14677861ba09d56ee076ba4a65cfb9c3c3d0cc89042f2", size = 2068729, upload-time = "2026-04-15T14:53:07.288Z" },
|
| 1914 |
+
{ url = "https://files.pythonhosted.org/packages/9f/52/32a198946e2e19508532aa9da02a61419eb15bd2d96bab57f810f2713e31/pydantic_core-2.46.1-cp313-cp313-win_arm64.whl", hash = "sha256:f978329f12ace9f3cb814a5e44d98bbeced2e36f633132bafa06d2d71332e33e", size = 2029550, upload-time = "2026-04-15T14:52:22.707Z" },
|
| 1915 |
+
{ url = "https://files.pythonhosted.org/packages/bd/2b/6793fe89ab66cb2d3d6e5768044eab80bba1d0fae8fd904d0a1574712e17/pydantic_core-2.46.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:9917cb61effac7ec0f448ef491ec7584526d2193be84ff981e85cbf18b68c42a", size = 2118110, upload-time = "2026-04-15T14:50:52.947Z" },
|
| 1916 |
+
{ url = "https://files.pythonhosted.org/packages/d2/87/e9a905ddfcc2fd7bd862b340c02be6ab1f827922822d425513635d0ac774/pydantic_core-2.46.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0e749679ca9f8a9d0bff95fb7f6b57bb53f2207fa42ffcc1ec86de7e0029ab89", size = 1948645, upload-time = "2026-04-15T14:51:55.577Z" },
|
| 1917 |
+
{ url = "https://files.pythonhosted.org/packages/15/23/26e67f86ed62ac9d6f7f3091ee5220bf14b5ac36fb811851d601365ef896/pydantic_core-2.46.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2ecacee70941e233a2dad23f7796a06f86cc10cc2fbd1c97c7dd5b5a79ffa4f", size = 1977576, upload-time = "2026-04-15T14:49:37.58Z" },
|
| 1918 |
+
{ url = "https://files.pythonhosted.org/packages/b8/78/813c13c0de323d4de54ee2e6fdd69a0271c09ac8dd65a8a000931aa487a5/pydantic_core-2.46.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:647d0a2475b8ed471962eed92fa69145b864942f9c6daa10f95ac70676637ae7", size = 2060358, upload-time = "2026-04-15T14:51:40.087Z" },
|
| 1919 |
+
{ url = "https://files.pythonhosted.org/packages/09/5e/4caf2a15149271fbd2b4d968899a450853c800b85152abcf54b11531417f/pydantic_core-2.46.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac9cde61965b0697fce6e6cc372df9e1ad93734828aac36e9c1c42a22ad02897", size = 2235980, upload-time = "2026-04-15T14:50:34.535Z" },
|
| 1920 |
+
{ url = "https://files.pythonhosted.org/packages/c2/c1/a2cdabb5da6f5cb63a3558bcafffc20f790fa14ccffbefbfb1370fadc93f/pydantic_core-2.46.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0a2eb0864085f8b641fb3f54a2fb35c58aff24b175b80bc8a945050fcde03204", size = 2316800, upload-time = "2026-04-15T14:52:46.999Z" },
|
| 1921 |
+
{ url = "https://files.pythonhosted.org/packages/76/fd/19d711e4e9331f9d77f222bffc202bf30ea0d74f6419046376bb82f244c8/pydantic_core-2.46.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b83ce9fede4bc4fb649281d9857f06d30198b8f70168f18b987518d713111572", size = 2101762, upload-time = "2026-04-15T14:49:24.278Z" },
|
| 1922 |
+
{ url = "https://files.pythonhosted.org/packages/dc/64/ce95625448e1a4e219390a2923fd594f3fa368599c6b42ac71a5df7238c9/pydantic_core-2.46.1-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:cb33192753c60f269d2f4a1db8253c95b0df6e04f2989631a8cc1b0f4f6e2e92", size = 2167737, upload-time = "2026-04-15T14:50:41.637Z" },
|
| 1923 |
+
{ url = "https://files.pythonhosted.org/packages/ad/31/413572d03ca3e73b408f00f54418b91a8be6401451bc791eaeff210328e5/pydantic_core-2.46.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:96611d51f953f87e1ae97637c01ee596a08b7f494ea00a5afb67ea6547b9f53b", size = 2185658, upload-time = "2026-04-15T14:51:46.799Z" },
|
| 1924 |
+
{ url = "https://files.pythonhosted.org/packages/36/09/e4f581353bdf3f0c7de8a8b27afd14fc761da29d78146376315a6fedc487/pydantic_core-2.46.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:9b176fa55f9107db5e6c86099aa5bfd934f1d3ba6a8b43f714ddeebaed3f42b7", size = 2184154, upload-time = "2026-04-15T14:52:49.629Z" },
|
| 1925 |
+
{ url = "https://files.pythonhosted.org/packages/1a/a4/d0d52849933f5a4bf1ad9d8da612792f96469b37e286a269e3ee9c60bbb1/pydantic_core-2.46.1-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:79a59f63a4ce4f3330e27e6f3ce281dd1099453b637350e97d7cf24c207cd120", size = 2332379, upload-time = "2026-04-15T14:49:55.009Z" },
|
| 1926 |
+
{ url = "https://files.pythonhosted.org/packages/30/93/25bfb08fdbef419f73290e573899ce938a327628c34e8f3a4bafeea30126/pydantic_core-2.46.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:f200fce071808a385a314b7343f5e3688d7c45746be3d64dc71ee2d3e2a13268", size = 2377964, upload-time = "2026-04-15T14:51:59.649Z" },
|
| 1927 |
+
{ url = "https://files.pythonhosted.org/packages/15/36/b777766ff83fef1cf97473d64764cd44f38e0d8c269ed06faace9ae17666/pydantic_core-2.46.1-cp314-cp314-win32.whl", hash = "sha256:3a07eccc0559fb9acc26d55b16bf8ebecd7f237c74a9e2c5741367db4e6d8aff", size = 1976450, upload-time = "2026-04-15T14:51:57.665Z" },
|
| 1928 |
+
{ url = "https://files.pythonhosted.org/packages/7b/4b/4cd19d2437acfc18ca166db5a2067040334991eb862c4ecf2db098c91fbf/pydantic_core-2.46.1-cp314-cp314-win_amd64.whl", hash = "sha256:1706d270309ac7d071ffe393988c471363705feb3d009186e55d17786ada9622", size = 2067750, upload-time = "2026-04-15T14:49:38.941Z" },
|
| 1929 |
+
{ url = "https://files.pythonhosted.org/packages/7f/a0/490751c0ef8f5b27aae81731859aed1508e72c1a9b5774c6034269db773b/pydantic_core-2.46.1-cp314-cp314-win_arm64.whl", hash = "sha256:22d4e7457ade8af06528012f382bc994a97cc2ce6e119305a70b3deff1e409d6", size = 2021109, upload-time = "2026-04-15T14:50:27.728Z" },
|
| 1930 |
+
{ url = "https://files.pythonhosted.org/packages/36/3a/2a018968245fffd25d5f1972714121ad309ff2de19d80019ad93494844f9/pydantic_core-2.46.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:607ff9db0b7e2012e7eef78465e69f9a0d7d1c3e7c6a84cf0c4011db0fcc3feb", size = 2111548, upload-time = "2026-04-15T14:52:08.273Z" },
|
| 1931 |
+
{ url = "https://files.pythonhosted.org/packages/77/5b/4103b6192213217e874e764e5467d2ff10d8873c1147d01fa432ac281880/pydantic_core-2.46.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:8cda3eacaea13bd02a1bea7e457cc9fc30b91c5a91245cef9b215140f80dd78c", size = 1926745, upload-time = "2026-04-15T14:50:03.045Z" },
|
| 1932 |
+
{ url = "https://files.pythonhosted.org/packages/c3/70/602a667cf4be4bec6c3334512b12ae4ea79ce9bfe41dc51be1fd34434453/pydantic_core-2.46.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9493279cdc7997fe19e5ed9b41f30cbc3806bd4722adb402fedb6f6d41bd72a", size = 1965922, upload-time = "2026-04-15T14:51:12.555Z" },
|
| 1933 |
+
{ url = "https://files.pythonhosted.org/packages/a9/24/06a89ce5323e755b7d2812189f9706b87aaebe49b34d247b380502f7992c/pydantic_core-2.46.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3644e5e10059999202355b6c6616e624909e23773717d8f76deb8a6e2a72328c", size = 2043221, upload-time = "2026-04-15T14:51:18.995Z" },
|
| 1934 |
+
{ url = "https://files.pythonhosted.org/packages/2c/6e/b1d9ad907d9d76964903903349fd2e33c87db4b993cc44713edcad0fc488/pydantic_core-2.46.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4ad6c9de57683e26c92730991960c0c3571b8053263b042de2d3e105930b2767", size = 2243655, upload-time = "2026-04-15T14:50:10.718Z" },
|
| 1935 |
+
{ url = "https://files.pythonhosted.org/packages/ef/73/787abfaad51174641abb04c8aa125322279b40ad7ce23c495f5a69f76554/pydantic_core-2.46.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:557ebaa27c7617e7088002318c679a8ce685fa048523417cd1ca52b7f516d955", size = 2295976, upload-time = "2026-04-15T14:53:09.694Z" },
|
| 1936 |
+
{ url = "https://files.pythonhosted.org/packages/56/0b/b7c5a631b6d5153d4a1ea4923b139aea256dc3bd99c8e6c7b312c7733146/pydantic_core-2.46.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cd37e39b22b796ba0298fe81e9421dd7b65f97acfbb0fb19b33ffdda7b9a7b4", size = 2103439, upload-time = "2026-04-15T14:50:08.32Z" },
|
| 1937 |
+
{ url = "https://files.pythonhosted.org/packages/2a/3f/952ee470df69e5674cdec1cbde22331adf643b5cc2ff79f4292d80146ee4/pydantic_core-2.46.1-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:6689443b59714992e67d62505cdd2f952d6cf1c14cc9fd9aeec6719befc6f23b", size = 2132871, upload-time = "2026-04-15T14:50:24.445Z" },
|
| 1938 |
+
{ url = "https://files.pythonhosted.org/packages/e3/8b/1dea3b1e683c60c77a60f710215f90f486755962aa8939dbcb7c0f975ac3/pydantic_core-2.46.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6f32c41ca1e3456b5dd691827b7c1433c12d5f0058cc186afbb3615bc07d97b8", size = 2168658, upload-time = "2026-04-15T14:52:24.897Z" },
|
| 1939 |
+
{ url = "https://files.pythonhosted.org/packages/67/97/32ae283810910d274d5ba9f48f856f5f2f612410b78b249f302d297816f5/pydantic_core-2.46.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:88cd1355578852db83954dc36e4f58f299646916da976147c20cf6892ba5dc43", size = 2171184, upload-time = "2026-04-15T14:52:34.854Z" },
|
| 1940 |
+
{ url = "https://files.pythonhosted.org/packages/a2/57/c9a855527fe56c2072070640221f53095b0b19eaf651f3c77643c9cabbe3/pydantic_core-2.46.1-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:a170fefdb068279a473cc9d34848b85e61d68bfcc2668415b172c5dfc6f213bf", size = 2316573, upload-time = "2026-04-15T14:52:12.871Z" },
|
| 1941 |
+
{ url = "https://files.pythonhosted.org/packages/37/b3/14c39ffc7399819c5448007c7bcb4e6da5669850cfb7dcbb727594290b48/pydantic_core-2.46.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:556a63ff1006934dba4eed7ea31b58274c227e29298ec398e4275eda4b905e95", size = 2378340, upload-time = "2026-04-15T14:51:02.619Z" },
|
| 1942 |
+
{ url = "https://files.pythonhosted.org/packages/01/55/a37461fbb29c053ea4e62cfc5c2d56425cb5efbef8316e63f6d84ae45718/pydantic_core-2.46.1-cp314-cp314t-win32.whl", hash = "sha256:3b146d8336a995f7d7da6d36e4a779b7e7dff2719ac00a1eb8bd3ded00bec87b", size = 1960843, upload-time = "2026-04-15T14:52:06.103Z" },
|
| 1943 |
+
{ url = "https://files.pythonhosted.org/packages/22/d7/97e1221197d17a27f768363f87ec061519eeeed15bbd315d2e9d1429ff03/pydantic_core-2.46.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f1bc856c958e6fe9ec071e210afe6feb695f2e2e81fd8d2b102f558d364c4c17", size = 2048696, upload-time = "2026-04-15T14:52:52.154Z" },
|
| 1944 |
+
{ url = "https://files.pythonhosted.org/packages/19/d5/4eac95255c7d35094b46a32ec1e4d80eac94729c694726ee1d69948bd5f0/pydantic_core-2.46.1-cp314-cp314t-win_arm64.whl", hash = "sha256:21a5bfd8a1aa4de60494cdf66b0c912b1495f26a8899896040021fbd6038d989", size = 2022343, upload-time = "2026-04-15T14:49:49.036Z" },
|
| 1945 |
]

[[package]]
name = "pydantic-settings"
+version = "2.13.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "pydantic" },
    { name = "python-dotenv" },
    { name = "typing-inspection" },
]
+sdist = { url = "https://files.pythonhosted.org/packages/96/a1/ae859ffac5a3338a66b74c5e29e244fd3a3cc483c89feaf9f56c39898d75/pydantic_settings-2.13.0.tar.gz", hash = "sha256:95d875514610e8595672800a5c40b073e99e4aae467fa7c8f9c263061ea2e1fe", size = 222450, upload-time = "2026-02-15T12:11:23.476Z" }
wheels = [
+    { url = "https://files.pythonhosted.org/packages/b0/1a/dd1b9d7e627486cf8e7523d09b70010e05a4bc41414f4ae6ce184cf0afb6/pydantic_settings-2.13.0-py3-none-any.whl", hash = "sha256:d67b576fff39cd086b595441bf9c75d4193ca9c0ed643b90360694d0f1240246", size = 58429, upload-time = "2026-02-15T12:11:22.133Z" },
]

[[package]]
...

[[package]]
name = "pyperclip"
+version = "1.9.0"
source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/30/23/2f0a3efc4d6a32f3b63cdff36cd398d9701d26cda58e3ab97ac79fb5e60d/pyperclip-1.9.0.tar.gz", hash = "sha256:b7de0142ddc81bfc5c7507eea19da920b92252b548b96186caf94a5e2527d310", size = 20961, upload-time = "2024-06-18T20:38:48.401Z" }

[[package]]
name = "pytest"
...

[[package]]
name = "slop-farmer"
+version = "0.1.1"
source = { editable = "." }
dependencies = [
    { name = "duckdb" },
...

[package.metadata]
requires-dist = [
    { name = "duckdb", specifier = ">=1.2.2" },
+    { name = "fast-agent-mcp", specifier = ">=0.6.17" },
    { name = "fast-agent-mcp", marker = "python_full_version >= '3.13.5' and extra == 'llm'", specifier = ">=0.6.16" },
    { name = "fastapi", specifier = ">=0.115.0" },
    { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },