| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
name: GPU unit tests

# Trigger rules. Both events are limited to the main and v0.4.x branches and
# are further narrowed by path filters.
on:
  push:
    branches:
      - main
      - v0.4.x
    paths:
      - "**/*.py"
      - ".github/workflows/gpu_unit_tests.yml"
  pull_request:
    branches:
      - main
      - v0.4.x
    paths:
      # Any Python file change is a candidate...
      - "**/*.py"
      # ...except examples and these trainer entry points ("!" negates a match).
      - "!examples/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # This workflow file itself and the test modules.
      - ".github/workflows/gpu_unit_tests.yml"
      - "tests/**test_*.py"
      # Test files ending in _on_cpu.py directly under tests/ are excluded.
      - "!tests/*_on_cpu.py"
|
|
| |
# One concurrency group per workflow + ref. A newer run cancels the in-progress
# one for every ref except refs/heads/main, where the expression is false.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
|
| |
# Token permissions for this workflow: read-only access to repository contents.
permissions:
  contents: read
|
|
# Workflow-level environment shared by all jobs.
env:
  # Passed as `mlp-image` to the runner action in the setup job.
  IMAGE: 'verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl059.dev2'
  # Passed as `faas-url` to the runner action in the setup and cleanup jobs.
  DYNAMIC_RUNNER_ENDPOINT: 'https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner'
|
|
jobs:
  # Calls the vemlp-github-runner action in "create" mode and publishes the
  # resulting runner label and task id as job outputs for the jobs below.
  setup:
    if: github.repository_owner == 'verl-project'
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.create-runner.outputs.runner-label }}
      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
    steps:
      # Pinned to the same full commit SHA that gpu_unit_tests uses, instead of
      # the floating v4 tag, so both jobs run an identical, immutable checkout.
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - id: create-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "create"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-image: "${{ env.IMAGE }}"

  # Installs the repository and runs the GPU test suite on the runner reported
  # by setup, falling back to the 'L20x8' label when that output is empty.
  gpu_unit_tests:
    if: github.repository_owner == 'verl-project'
    needs: setup
    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
    timeout-minutes: 60
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      # Quoted: environment values are strings, not YAML integers.
      HF_HUB_ENABLE_HF_TRANSFER: "1"
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install hf_transfer
          pip3 install -r requirements-test.txt
          pip3 install --no-deps -e .
          pip3 install cupy-cuda12x==13.6.0 pytest-asyncio
          pip3 install --ignore-installed blinker
          pip3 install --ignore-installed mlflow "numpy<2.0"
      # The test_special_* files are ignored here and launched individually via
      # torchrun in the steps that follow; -x stops at the first failure.
      - name: Run all GPU unit tests
        run: |
          pytest -s -x --ignore-glob="*on_npu.py" --ignore-glob="*test_special_*.py" --ignore-glob='*on_cpu.py' --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob='tests/special*' --ignore-glob="tests/experimental" --ignore-glob="tests/workers/reward_model" --ignore-glob="*test_shared_memory*" --ignore-glob="tests/workers/rollout/rollout_trtllm" --ignore-glob="*test_bucketed_weight_transfer*" tests/
      - name: Testing LinearCrossEntropyTP Correctness, Computation Time and Memory Consumption
        run: |
          LOW_MEMORY=True torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/utils/test_special_linear_cross_entropy_tp.py
      - name: Testing FSDP2 actor functionality
        run: |
          torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/workers/actor/test_special_dp_actor.py
      - name: Testing FSDP2 critic functionality
        run: |
          torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/workers/critic/test_special_dp_critic.py

  # Calls the runner action in "destroy" mode with the task id from setup.
  cleanup:
    runs-on: ubuntu-latest
    needs: [setup, gpu_unit_tests]
    # always() keeps this job running after a failed or cancelled test job.
    # The owner check mirrors setup/gpu_unit_tests: elsewhere setup is skipped,
    # no task id exists, and there is nothing to destroy.
    if: always() && github.repository_owner == 'verl-project'
    steps:
      - id: destroy-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "destroy"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
|
|