interactSpeech / docs /transformers /.github /workflows /self-push-amd.yml

Add files using upload-large-folder tool

0947ff8 verified 4 months ago

15 kB

	name: Self-hosted runner AMD GPU (push)

	# Reusable workflow: it is only started by another workflow (`workflow_call`).
	# The caller must supply `gpu_flavor`, which the jobs below use as a runner label.
	on:
	  workflow_call:
	    inputs:
	      gpu_flavor:
	        required: true
	        type: string

	env:
	  # Workflow-level environment, visible to every job and step.
	  # Scalars that look like booleans or integers are quoted so they are read as
	  # strings by any YAML loader; environment variables are strings at runtime.
	  HF_HOME: /mnt/cache
	  TRANSFORMERS_IS_CI: 'yes'
	  OMP_NUM_THREADS: '8'
	  MKL_NUM_THREADS: '8'
	  PYTEST_TIMEOUT: '60'
	  TF_FORCE_GPU_ALLOW_GROWTH: 'true'
	  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}

	jobs:
	  # Runs on a GitHub-hosted machine and calls `utils/check_self_hosted_runner.py`
	  # for the runner named in `--target_runners`.
	  # NOTE(review): presumably the script fails when that runner is offline - confirm.
	  check_runner_status:
	    name: Check Runner Status
	    runs-on: ubuntu-22.04
	    steps:
	      - name: Checkout transformers
	        uses: actions/checkout@v4
	        with:
	          fetch-depth: 2

	      - name: Check Runner Status
	        # The token is handed over through the environment instead of being
	        # template-expanded into the script text.
	        env:
	          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
	        run: python utils/check_self_hosted_runner.py --target_runners amd-mi210-single-gpu-ci-runner-docker --token "$ACCESS_REPO_INFO_TOKEN"

	  # Prints ROCm device information from inside the CI container, once per machine type.
	  check_runners:
	    name: Check Runners
	    needs: check_runner_status
	    strategy:
	      matrix:
	        machine_type: [single-gpu, multi-gpu]
	    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
	    container:
	      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
	      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	    steps:
	      - name: ROCM-SMI
	        run: |
	          rocm-smi
	      - name: ROCM-INFO
	        run: |
	          rocminfo  | grep "Agent" -A 14
	      - name: Show ROCR environment
	        run: |
	          echo "ROCR: $ROCR_VISIBLE_DEVICES"

	  # Checks out the commit under test inside the container, runs the test fetcher and
	  # publishes two outputs consumed by `run_models_gpu` and `send_results`:
	  # `matrix` (the test folders) and `test_map` (the test files under each folder).
	  setup_gpu:
	    name: Setup
	    needs: check_runners
	    strategy:
	      matrix:
	        machine_type: [single-gpu, multi-gpu]
	    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
	    container:
	      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
	      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	    outputs:
	      matrix: ${{ steps.set-matrix.outputs.matrix }}
	      test_map: ${{ steps.set-matrix.outputs.test_map }}
	    env:
	      # `CI_BRANCH_PUSH`: The branch name from the push event
	      # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event
	      # `CI_SHA_PUSH`: The commit SHA from the push event
	      # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event
	      CI_BRANCH_PUSH: ${{ github.event.ref }}
	      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
	      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
	      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
	    steps:
	      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
	      # We also take into account the `push` event (we might want to test some changes in a branch)
	      - name: Prepare custom environment variables
	        shell: bash
	        # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty)
	        # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty)
	        run: |
	          # Strip the `refs/heads/` prefix from the pushed ref to get a plain branch name.
	          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
	          echo "$CI_BRANCH_PUSH"
	          echo "$CI_BRANCH_WORKFLOW_RUN"
	          echo "$CI_SHA_PUSH"
	          echo "$CI_SHA_WORKFLOW_RUN"
	          # `${A:-B}` yields A when it is non-empty and B otherwise.
	          echo "CI_BRANCH=${CI_BRANCH_PUSH:-$CI_BRANCH_WORKFLOW_RUN}" >> "$GITHUB_ENV"
	          echo "CI_SHA=${CI_SHA_PUSH:-$CI_SHA_WORKFLOW_RUN}" >> "$GITHUB_ENV"

	      - name: print environment variables
	        # Values written to `GITHUB_ENV` above are ordinary environment variables in
	        # later steps, so they are read at runtime rather than template-expanded.
	        run: |
	          echo "env.CI_BRANCH = $CI_BRANCH"
	          echo "env.CI_SHA = $CI_SHA"

	      - name: Update clone using environment variables
	        working-directory: /transformers
	        # `${VAR:+"$VAR"}` expands to nothing when VAR is empty and to the quoted value
	        # otherwise, so an empty value still results in a bare `git checkout`.
	        run: |
	          echo "original branch = $(git branch --show-current)"
	          git fetch && git checkout ${CI_BRANCH:+"$CI_BRANCH"}
	          echo "updated branch = $(git branch --show-current)"
	          git checkout ${CI_SHA:+"$CI_SHA"}
	          echo "log = $(git log -n 1)"

	      - name: Cleanup
	        working-directory: /transformers
	        run: |
	          rm -rf tests/__pycache__
	          rm -rf tests/models/__pycache__
	          rm -rf reports

	      - name: Show installed libraries and their versions
	        working-directory: /transformers
	        run: pip freeze

	      - name: Fetch the tests to run
	        working-directory: /transformers
	        # TODO: add `git-python` in the docker images
	        run: |
	          pip install --upgrade git-python
	          python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt

	      - name: Report fetched tests
	        uses: actions/upload-artifact@v4
	        with:
	          name: test_fetched
	          path: /transformers/test_preparation.txt

	      - id: set-matrix
	        name: Organize tests into models
	        working-directory: /transformers
	        # The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc.
	        # The `test_map` is used to get the actual identified test files under each key.
	        # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail)
	        run: |
	          if [ -f test_map.json ]; then
	              keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)')
	              test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)')
	          else
	              keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
	              test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
	          fi
	          echo "$keys"
	          echo "$test_map"
	          echo "matrix=$keys" >> "$GITHUB_OUTPUT"
	          echo "test_map=$test_map" >> "$GITHUB_OUTPUT"

	  # Runs pytest once per (test folder, machine type) pair taken from the `setup_gpu`
	  # outputs, and uploads the report directory as an artifact.
	  run_models_gpu:
	    name: Model tests
	    needs: setup_gpu
	    # `dummy` means there is no test to run
	    if: contains(fromJson(needs.setup_gpu.outputs.matrix), 'dummy') != true
	    strategy:
	      fail-fast: false
	      matrix:
	        folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
	        machine_type: [single-gpu, multi-gpu]
	    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
	    container:
	      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
	      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	    env:
	      # For the meaning of these environment variables, see the job `Setup`
	      CI_BRANCH_PUSH: ${{ github.event.ref }}
	      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
	      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
	      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
	    steps:
	      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
	      # We also take into account the `push` event (we might want to test some changes in a branch)
	      - name: Prepare custom environment variables
	        shell: bash
	        # For the meaning of these environment variables, see the job `Setup`
	        run: |
	          # Strip the `refs/heads/` prefix from the pushed ref to get a plain branch name.
	          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
	          echo "$CI_BRANCH_PUSH"
	          echo "$CI_BRANCH_WORKFLOW_RUN"
	          echo "$CI_SHA_PUSH"
	          echo "$CI_SHA_WORKFLOW_RUN"
	          # `${A:-B}` yields A when it is non-empty and B otherwise.
	          echo "CI_BRANCH=${CI_BRANCH_PUSH:-$CI_BRANCH_WORKFLOW_RUN}" >> "$GITHUB_ENV"
	          echo "CI_SHA=${CI_SHA_PUSH:-$CI_SHA_WORKFLOW_RUN}" >> "$GITHUB_ENV"

	      - name: print environment variables
	        # Read at runtime from the variables exported through `GITHUB_ENV` above.
	        run: |
	          echo "env.CI_BRANCH = $CI_BRANCH"
	          echo "env.CI_SHA = $CI_SHA"

	      - name: Update clone using environment variables
	        working-directory: /transformers
	        # `${VAR:+"$VAR"}` expands to nothing when VAR is empty and to the quoted value
	        # otherwise, so an empty value still results in a bare `git checkout`.
	        run: |
	          echo "original branch = $(git branch --show-current)"
	          git fetch && git checkout ${CI_BRANCH:+"$CI_BRANCH"}
	          echo "updated branch = $(git branch --show-current)"
	          git checkout ${CI_SHA:+"$CI_SHA"}
	          echo "log = $(git log -n 1)"

	      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
	        working-directory: /transformers
	        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

	      - name: Echo folder ${{ matrix.folders }}
	        shell: bash
	        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
	        # set the artifact folder names (because the character `/` is not allowed).
	        run: |
	          echo "${{ matrix.folders }}"
	          echo "${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}"
	          matrix_folders=${{ matrix.folders }}
	          matrix_folders=${matrix_folders/'models/'/'models_'}
	          echo "$matrix_folders"
	          echo "matrix_folders=$matrix_folders" >> "$GITHUB_ENV"

	      - name: ROCM-SMI
	        run: |
	          rocm-smi
	      - name: ROCM-INFO
	        run: |
	          rocminfo  | grep "Agent" -A 14
	      - name: Show ROCR environment
	        run: |
	          echo "ROCR: $ROCR_VISIBLE_DEVICES"

	      - name: Environment
	        working-directory: /transformers
	        run: |
	          python3 utils/print_env.py

	      - name: Show installed libraries and their versions
	        working-directory: /transformers
	        run: pip freeze

	      - name: Run all non-slow selected tests on GPU
	        working-directory: /transformers
	        # The `test_map` entry for this folder is expanded unquoted on purpose so that
	        # it reaches pytest as separate arguments.
	        run: |
	          python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test"

	      - name: Failure short reports
	        if: ${{ failure() }}
	        continue-on-error: true
	        run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt

	      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
	        if: ${{ always() }}
	        uses: actions/upload-artifact@v4
	        with:
	          # The artifact name uses `env.matrix_folders` (with `_`), the path uses `matrix.folders` (with `/`).
	          name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
	          path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports

	  # Always runs, whatever the result of the jobs it depends on: downloads the
	  # artifacts and calls `utils/notification_service.py` with the Slack settings below.
	  send_results:
	    name: Send results to webhook
	    runs-on: ubuntu-22.04
	    if: always()
	    needs:
	      - check_runner_status
	      - check_runners
	      - setup_gpu
	      - run_models_gpu
	      # - run_tests_torch_cuda_extensions_single_gpu
	      # - run_tests_torch_cuda_extensions_multi_gpu
	    env:
	      # For the meaning of these environment variables, see the job `Setup`
	      CI_BRANCH_PUSH: ${{ github.event.ref }}
	      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
	      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
	      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
	    steps:
	      - name: Preliminary job status
	        shell: bash
	        # For the meaning of these environment variables, see the job `Setup`
	        run: |
	          echo "Runner availability: ${{ needs.check_runner_status.result }}"
	          echo "Setup status: ${{ needs.setup_gpu.result }}"
	          echo "Runner status: ${{ needs.check_runners.result }}"

	      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
	      # We also take into account the `push` event (we might want to test some changes in a branch)
	      - name: Prepare custom environment variables
	        shell: bash
	        # For the meaning of these environment variables, see the job `Setup`
	        run: |
	          # Strip the `refs/heads/` prefix from the pushed ref to get a plain branch name.
	          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
	          echo "$CI_BRANCH_PUSH"
	          echo "$CI_BRANCH_WORKFLOW_RUN"
	          echo "$CI_SHA_PUSH"
	          echo "$CI_SHA_WORKFLOW_RUN"
	          # `${A:-B}` yields A when it is non-empty and B otherwise.
	          echo "CI_BRANCH=${CI_BRANCH_PUSH:-$CI_BRANCH_WORKFLOW_RUN}" >> "$GITHUB_ENV"
	          echo "CI_SHA=${CI_SHA_PUSH:-$CI_SHA_WORKFLOW_RUN}" >> "$GITHUB_ENV"

	      - name: print environment variables
	        # Read at runtime from the variables exported through `GITHUB_ENV` above.
	        run: |
	          echo "env.CI_BRANCH = $CI_BRANCH"
	          echo "env.CI_SHA = $CI_SHA"

	      - uses: actions/checkout@v4
	        # To avoid failure when multiple commits are merged into `main` in a short period of time.
	        # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ...`
	        # (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit)
	        with:
	          fetch-depth: 20

	      - name: Update clone using environment variables
	        # `${VAR:+"$VAR"}` expands to nothing when VAR is empty and to the quoted value
	        # otherwise, so an empty value still results in a bare `git checkout`.
	        run: |
	          echo "original branch = $(git branch --show-current)"
	          git fetch && git checkout ${CI_BRANCH:+"$CI_BRANCH"}
	          echo "updated branch = $(git branch --show-current)"
	          git checkout ${CI_SHA:+"$CI_SHA"}
	          echo "log = $(git log -n 1)"

	      - uses: actions/download-artifact@v4
	      - name: Send message to Slack
	        env:
	          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
	          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
	          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
	          CI_SLACK_CHANNEL_ID_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
	          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
	          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
	          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
	          CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
	          CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
	          CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
	          CI_SHA: ${{ env.CI_SHA }}
	          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
	          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
	          SETUP_STATUS: ${{ needs.setup_gpu.result }}

	        # We pass `needs.setup_gpu.outputs.matrix` as the argument. A processing in `notification_service.py` to change
	        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
	        run: |
	          pip install huggingface_hub
	          pip install slack_sdk
	          pip show slack_sdk
	          python utils/notification_service.py "${{ needs.setup_gpu.outputs.matrix }}"