File size: 1,522 Bytes
7453f13
 
 
 
 
 
 
a578ad9
 
 
 
 
7453f13
a578ad9
 
 
7453f13
 
a578ad9
7453f13
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Baked SageMaker training image for the Composer-replication RL stack (F3 §3.2).
#
# The repeatable path: bake trl+vllm+the framework into an image so jobs don't
# pip-install at startup (saves ~5-10 min/job and removes a flaky failure
# surface). The one-shot smoke can instead use the stock DLC + source_dir
# (run_sagemaker_launch.py --image dlc), which needs no local build.
#
# Base image: the AWS PyTorch training DLC. The tag below was RESOLVED LIVE
# against the us-west-2 registry.
# It MUST be the torch-2.7 image: trl 1.5 → transformers>=4.56.2 →
# torch.float8_e8m0fnu, a dtype that requires torch>=2.7. On the torch-2.6 DLC,
# AutoModel.from_pretrained fails on that dtype.
# The tag has to spell out cu128 and the -v1.26 build suffix; no bare floating
# tag exists.
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.7.1-gpu-py312-cu128-ubuntu22.04-sagemaker-v1.26

# RL stack baked in. torch 2.7 + CUDA 12.8 already ship in the DLC — do NOT
# reinstall torch. vllm>=0.9 is the torch-2.7 line (0.8.x hard-pins torch 2.6
# and would fight this base), but a bare lower bound also admits later vllm
# releases that pin a different torch, and pip would then replace the DLC's
# torch to satisfy them (wheel/CUDA mismatch).
# To make the "torch-2.7-matched vllm" rule enforceable, the torch version
# already installed in the base is written to a pip constraints file: the
# resolver must choose a vllm (and everything else) compatible with that exact
# torch, or fail the build instead of silently swapping it out.
# The local build tag (e.g. "+cu128") is stripped; a public "==" constraint
# still matches an install that carries a local tag.
RUN python -c "import torch; print('torch==' + torch.__version__.split('+')[0])" \
      > /tmp/torch-constraint.txt \
 && pip install --no-cache-dir --constraint /tmp/torch-constraint.txt \
      "accelerate>=1.0" \
      "datasets>=3.0" \
      "fsspec>=2024.6" \
      "hf_transfer>=0.1.6" \
      "peft>=0.13" \
      "s3fs>=2024.6" \
      "trl>=1.5,<2" \
      "vllm>=0.9" \
 && rm -f /tmp/torch-constraint.txt

# The framework itself, installed from the build context. The train +
# serverless extras bring in the trainer, loss, executors, replica_entrypoint
# and s3fs.
# The entire build context is copied to /opt/composer_replication.
# NOTE(review): confirm a .dockerignore keeps .git, caches and any credentials
# out of the context, since everything in it lands in the image.
COPY . /opt/composer_replication
# Editable (-e) install, so the package is imported from the copied tree.
# NOTE(review): this resolve places no constraint on torch — confirm the
# project's [train,serverless] extras cannot pull a torch other than the one
# shipped in the base image.
RUN pip install --no-cache-dir -e "/opt/composer_replication[train,serverless]"

# Runtime Hugging Face settings baked into the image:
#   HF_HUB_ENABLE_HF_TRANSFER=1  turns on the hf_transfer download backend
#                                (the hf_transfer package is installed with
#                                the RL stack above).
#   HF_HOME                      places the Hugging Face cache under
#                                /opt/ml/input.
ENV HF_HUB_ENABLE_HF_TRANSFER="1" \
    HF_HOME="/opt/ml/input/hf_cache"