"""Push the generated SFT JSONL splits to HuggingFace Hub as a proper dataset.
After upload, consumers can load it with:
from datasets import load_dataset
ds = load_dataset("<your-user>/aws-rl-sft")
ds["train"] # 1500 rows
ds["validation"] # 150 rows
ds["reserve"] # 200 held-out rows
Prerequisites:
pip install datasets>=2.19 huggingface_hub
export HF_TOKEN=hf_... # or `huggingface-cli login`
Usage:
python data/upload_sft_to_hf.py --repo-id <user>/aws-rl-sft
python data/upload_sft_to_hf.py --repo-id <user>/aws-rl-sft --private
python data/upload_sft_to_hf.py --repo-id <user>/aws-rl-sft --skip-push # dry run
python data/upload_sft_to_hf.py --repo-id Sizzing/aws-rl-sft --private --token hf_**** # upload to an org repo with explicit token
"""
from __future__ import annotations
import argparse
import json
import os
from pathlib import Path
def _find_repo_root(start: Path) -> Path:
    """Walk up from `start` looking for server/services/tasks/ as a sentinel."""
    for p in [start, *start.parents]:
        if (p / "server" / "services" / "tasks").is_dir():
            return p
    return start

REPO_ROOT = _find_repo_root(Path(__file__).resolve().parent)
SFT_DIR = REPO_ROOT / "data" / "sft"
SPLIT_FILES: dict[str, str] = {
    "train": "aws_rl_sft.train.jsonl",
    "validation": "aws_rl_sft.val.jsonl",
    "reserve": "aws_rl_sft.reserve.jsonl",
}

def load_jsonl(path: Path) -> list[dict]:
    rows: list[dict] = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

def build_dataset_dict(sft_dir: Path):
    """Build a DatasetDict from the JSONL splits."""
    from datasets import Dataset, DatasetDict

    splits = {}
    for split, fname in SPLIT_FILES.items():
        path = sft_dir / fname
        if not path.exists():
            print(f" skip split '{split}' — {path} not found")
            continue
        rows = load_jsonl(path)
        ds = Dataset.from_list(rows)
        splits[split] = ds
        print(f" loaded '{split}': {len(rows)} rows, columns={list(ds.column_names)}")
    return DatasetDict(splits)

DATASET_CARD = """---
language:
- en
license: apache-2.0
size_categories:
- 1K<n<10K
task_categories:
- text-generation
tags:
- aws
- aws-cli
- sft
- lora
- agentic
- rl
- grpo
- tool-use
pretty_name: AWS RL Env SFT
---
# AWS RL Env — SFT Dataset
Supervised fine-tuning dataset for training an LLM agent that operates AWS
infrastructure via the CLI. Built for the **aws-rl-env** reinforcement-learning
environment, which emulates 34 AWS services in-container (MiniStack) and rewards
agents for completing cloud-operations tasks via single-command steps.
Designed as the **cold-start phase** of an SFT → GRPO pipeline:
1. **SFT with LoRA** (this dataset) — command-only assistant targets, lock output format
2. **GRPO on curriculum** — refine the policy with online env reward, optionally letting `<think>` reasoning emerge
## Schema
Each row is one `(state → command)` decision, formatted as HuggingFace chat messages.
Directly compatible with `trl.SFTTrainer` (auto-detects `messages` column and
applies the tokenizer's chat template).
```python
{
"task_id": int,
"difficulty": "warmup" | "beginner" | "intermediate" | "advanced" | "expert",
"source": "success_first_step" | "multi_step_continuation" | "failure_recovery" | "verification" | "hint_usage",
"step_idx": int,
"messages": [
{"role": "system", "content": "<system prompt>"},
{"role": "user", "content": "TASK: ... Step: N ..."},
{"role": "assistant", "content": "aws ..."},
],
}
```
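
To preview how one row renders through a chat template (an illustrative snippet; the tokenizer is the one used in the Quickstart below, but any chat-capable tokenizer behaves the same way):

```python
from datasets import load_dataset
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
row = load_dataset("<your-user>/aws-rl-sft", split="train")[0]

# Render the stored chat messages with the model's chat template;
# this mirrors what trl.SFTTrainer does for a `messages` column.
print(tok.apply_chat_template(row["messages"], tokenize=False))
```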
## Composition (by source)
| Source | Share | What it teaches |
|---|---:|---|
| `success_first_step` | ~55% | Canonical command at step 0 given empty state |
| `multi_step_continuation` | ~20% | Step N>0 with prior command history |
| `failure_recovery` | ~15% | Correct command after a plausible mistake (wrong-op, missing-arg, s3-vs-s3api, typo, etc.) |
| `verification` | ~5% | Read-only verify command after task completion |
| `hint_usage` | ~5% | Edge case: assistant requests hint via `aws help --task-hint` |
## Composition (by tier)
| Tier | Share |
|---|---:|
| warmup | ~30% |
| beginner | ~25% |
| intermediate | ~44% |
| advanced | 0% — deferred to GRPO (dynamic resource IDs can't be safely synthesized offline) |
| expert | 0% — deferred to GRPO (policy-crafting / security audits benefit from env reward) |
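
To reproduce these share figures from the downloaded splits (a minimal sketch; column names as in the schema above):

```python
from collections import Counter
from datasets import load_dataset

train = load_dataset("<your-user>/aws-rl-sft", split="train")
for column in ("source", "difficulty"):
    print(column)
    # Count rows per value and print each share of the split.
    for value, n in Counter(train[column]).most_common():
        print(f"  {value}: {n} rows ({n / len(train):.0%})")
```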
## Splits
| Split | Rows | Purpose |
|---|---:|---|
| `train` | 1500 | LoRA SFT training |
| `validation` | 150 | Eval loss, early stopping |
| `reserve` | 200 | Held-out; use only if train proves insufficient |
## Quickstart
### Load
```python
from datasets import load_dataset
ds = load_dataset("<your-user>/aws-rl-sft")
print(ds)
```
### Filter by source or tier
```python
easy = ds["train"].filter(lambda r: r["difficulty"] in ("warmup", "beginner"))
recovery = ds["train"].filter(lambda r: r["source"] == "failure_recovery")
```
### Train with `trl.SFTTrainer` + LoRA
```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="bfloat16")
ds = load_dataset("<your-user>/aws-rl-sft")
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=ds["train"],
eval_dataset=ds["validation"],
peft_config=LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_dropout=0.05,
task_type="CAUSAL_LM",
),
args=SFTConfig(
output_dir="./sft-ckpt",
max_seq_length=2048,
num_train_epochs=3,
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
learning_rate=2e-4,
warmup_ratio=0.03,
lr_scheduler_type="cosine",
eval_strategy="steps",
eval_steps=100,
save_steps=200,
logging_steps=10,
bf16=True,
packing=False,
),
)
trainer.train()
trainer.save_model("./sft-ckpt/final")
```
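
After training, the LoRA adapter saved above can be folded into the base model for adapter-free inference (a sketch assuming the `./sft-ckpt/final` path from the example; adjust to your checkpoint layout):

```python
from peft import AutoPeftModelForCausalLM

# Loads the base model plus the saved adapter, then merges the LoRA
# weights into the base model so it can be served without peft.
model = AutoPeftModelForCausalLM.from_pretrained("./sft-ckpt/final", torch_dtype="bfloat16")
merged = model.merge_and_unload()
merged.save_pretrained("./sft-ckpt/merged")
```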
## Generation notes
- **Fully synthetic, no teacher LLM required.** Canonical commands were pulled from
the env's own test suite (`tests_tasks/test_*.py`), where each task's command
sequence is already verified to pass the grader with reward 1.0.
- **Failure-recovery rows** use a 5-mistake catalog (wrong-op, missing-arg,
wrong-service, s3-vs-s3api confusion, character-swap typo) paired with realistic
AWS CLI error messages.
- **Prompt variance** injected via reward jitter (±0.1), history-window trimming,
and sampled reset-state outputs so dedup-on-exact-prompt still produces enough
unique rows.
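
A quick way to sanity-check that dedup left enough unique prompts (a sketch that collects the user turn of each row):

```python
from datasets import load_dataset

train = load_dataset("<your-user>/aws-rl-sft", split="train")
# Gather the user message of every row into a set to measure uniqueness.
unique_prompts = {
    next(m["content"] for m in row["messages"] if m["role"] == "user")
    for row in train
}
print(f"{len(unique_prompts)} unique user prompts across {len(train)} rows")
```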
## License
Apache 2.0. The AWS CLI commands themselves are a public interface; the assistant
targets were generated deterministically from the env's grader test suite.
"""

def main() -> None:
    ap = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    ap.add_argument("--repo-id", required=True, help="HF repo id, e.g. username/aws-rl-sft")
    ap.add_argument("--private", action="store_true", help="Create as private repo")
    ap.add_argument("--sft-dir", type=Path, default=SFT_DIR)
    ap.add_argument("--token", default=None, help="HF token (falls back to HF_TOKEN env var)")
    ap.add_argument(
        "--skip-push",
        action="store_true",
        help="Build + save locally, don't upload (useful for testing)",
    )
    args = ap.parse_args()

    try:
        from datasets import DatasetDict  # noqa: F401
    except ImportError:
        raise SystemExit(
            "The 'datasets' library is required. Install it with:\n"
            ' pip install "datasets>=2.19"'
        )

    token = args.token or os.getenv("HF_TOKEN")
    if not token and not args.skip_push:
        raise SystemExit(
            "No HF token found. Either:\n"
            " export HF_TOKEN=hf_...\n"
            " # or\n"
            " huggingface-cli login\n"
            " # or pass --token explicitly\n"
            " # or use --skip-push to build locally without uploading"
        )

    print(f"Loading JSONL splits from {args.sft_dir}...")
    ds_dict = build_dataset_dict(args.sft_dir)
    if not ds_dict:
        raise SystemExit(
            f"No splits loaded from {args.sft_dir}. "
            "Run build_sft_dataset.py first."
        )

    if args.skip_push:
        local_path = args.sft_dir / "hf_dataset_preview"
        ds_dict.save_to_disk(str(local_path))
        print(f"\n--skip-push: saved DatasetDict to {local_path}")
        print("Inspect with: datasets.load_from_disk('{0}')".format(local_path))
        print("\nOne sample row from 'train':")
        print(json.dumps(ds_dict["train"][0], indent=2)[:800])
        return

    from huggingface_hub import HfApi, login

    login(token=token)

    print(f"\nPushing to https://huggingface.co/datasets/{args.repo_id}")
    print(f" private={args.private}")
    ds_dict.push_to_hub(args.repo_id, private=args.private, token=token)

    # Upload the dataset card as the repo README alongside the data.
    api = HfApi(token=token)
    readme_bytes = DATASET_CARD.encode("utf-8")
    api.upload_file(
        path_or_fileobj=readme_bytes,
        path_in_repo="README.md",
        repo_id=args.repo_id,
        repo_type="dataset",
        commit_message="Add dataset card",
    )

    print(f"\nDone. https://huggingface.co/datasets/{args.repo_id}")
    print("\nConsumer usage:")
    print(" from datasets import load_dataset")
    print(f" ds = load_dataset('{args.repo_id}')")


if __name__ == "__main__":
    main()