File size: 6,385 Bytes
9e1c398
c3d82b0
 
 
 
 
9e1c398
c3d82b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e1c398
 
 
c3d82b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c0bcea
 
 
 
 
 
 
 
 
c3d82b0
 
4c0bcea
 
c3d82b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e1c398
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3d82b0
 
 
 
 
9e1c398
c3d82b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# Workflow display name shown in the Actions tab.
name: Deploy PyTorch Training with Hugging Face Sync

# Runs on every push to either default-branch name, and can also be
# started manually (workflow_dispatch).
on:
  workflow_dispatch:
  push:
    branches: [master, main]

jobs:
  # Starts an EC2-backed self-hosted runner via machulav/ec2-github-runner.
  # The runner label and instance id are exported as job outputs: `deploy`
  # uses the label in its `runs-on`, and `stop-runner` uses both to stop it.
  start-runner:
    name: "Start self-hosted EC2 runner"
    runs-on: ubuntu-latest
    steps:
      # AWS credentials must be configured before the runner action can
      # talk to EC2.
      - uses: aws-actions/configure-aws-credentials@v4
        name: "Configure AWS credentials"
        with:
          aws-region: ${{ secrets.AWS_REGION }}
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

      # `id` is referenced by the job-level `outputs` mapping below.
      - id: start-ec2-runner
        name: "Start EC2 runner"
        uses: machulav/ec2-github-runner@v2
        with:
          mode: "start"
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # Instance placement and shape are hard-coded for this project.
          ec2-instance-type: "t2.xlarge"
          ec2-image-id: "ami-044b0717aadbc9dfa"
          subnet-id: "subnet-024811dee81325f1c"
          security-group-id: "sg-0646c2a337a355a31"
    outputs:
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
      label: ${{ steps.start-ec2-runner.outputs.label }}

  # Runs on the EC2 runner started by `start-runner`: builds the compose
  # services, runs the `train` and `eval` services to completion, then builds
  # the image and pushes it to Amazon ECR under the commit SHA and `latest`.
  deploy:
    name: Deploy PyTorch Training Pipeline
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    outputs:
      ecr-registry: ${{ steps.login-ecr.outputs.registry }}
      image-tag: ${{ github.sha }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Log in to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      - name: Create .env file
        # Secrets are handed to the shell as environment variables instead of
        # being expanded into the script text, so a value containing quotes or
        # shell metacharacters cannot alter the script. GitHub already redacts
        # secret values in logs, so no explicit `::add-mask::` is needed.
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
        run: |
          {
            echo "HYDRA_FULL_ERROR=1"
            echo "AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}"
            echo "AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}"
            echo "AWS_REGION=${AWS_REGION}"
          } >> .env

      - name: Run Docker Compose for all services
        run: |
          docker-compose --env-file .env build --no-cache
          docker-compose --env-file .env up -d
          # Streams logs until the train/eval containers stop. This command
          # exits 0 regardless of how the containers exited, so their exit
          # codes are checked explicitly below.
          docker-compose logs --follow train eval

          failed=0
          for service in train eval; do
            # Compose labels every container with its service name; `-a`
            # includes containers that have already exited.
            for container in $(docker ps -aq --filter "label=com.docker.compose.service=${service}"); do
              exit_code=$(docker inspect --format '{{.State.ExitCode}}' "${container}")
              if [ "${exit_code}" != "0" ]; then
                echo "::error::Service '${service}' exited with code ${exit_code}"
                failed=1
              fi
            done
          done

          docker-compose down --remove-orphans
          # Fail the job (and skip the ECR push) if any service failed.
          exit "${failed}"

      - name: Build, tag, and push Docker image to Amazon ECR
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker build -t "$REGISTRY/$REPOSITORY:$IMAGE_TAG" .
          docker push "$REGISTRY/$REPOSITORY:$IMAGE_TAG"
          docker tag "$REGISTRY/$REPOSITORY:$IMAGE_TAG" "$REGISTRY/$REPOSITORY:latest"
          docker push "$REGISTRY/$REPOSITORY:latest"

      - name: Pull Docker image from ECR and verify
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker pull "$REGISTRY/$REPOSITORY:$IMAGE_TAG"
          docker images | grep "$REGISTRY/$REPOSITORY"

      - name: Clean up environment
        # Runs even when an earlier step failed, so containers, images and
        # the credentials file are not left behind on the runner.
        if: ${{ always() }}
        run: |
          docker-compose down --remove-orphans || true
          rm -f .env
          docker system prune -af --volumes

  # After `deploy` succeeds, commits a Space README (front matter only) and
  # force-pushes the checked-out commit to the `main` branch of the
  # Hugging Face Space.
  sync-to-hub:
    name: Sync to Hugging Face Hub
    needs: deploy
    runs-on: ubuntu-latest
    env:
      # Space coordinates, defined once for every step. Named HF_USER rather
      # than USER so the shell's own USER variable is not shadowed.
      HF_USER: soutrik
      HF_SPACE: gradio_demo_MNIST_Classifier
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history (not a shallow clone) — presumably so the push to
          # the Space is not rejected as a shallow update.
          fetch-depth: 0
          lfs: true

      - name: Install Git LFS
        # The checkout above already ran with `lfs: true`, so the git-lfs
        # binary is assumed to be present on the runner; only the hooks are
        # (re)installed here instead of piping a remote script into sudo.
        run: git lfs install

      - name: Configure Git identity
        run: |
          git config --global user.name "soutrik"
          git config --global user.email "soutrik.chowdhury@ab-inbev.com"

      - name: Add remote
        run: |
          git remote add space "https://${HF_USER}:${HF_TOKEN}@huggingface.co/spaces/${HF_USER}/${HF_SPACE}"
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}

      - name: Ensure LFS objects are present
        run: git lfs checkout

      - name: Add README.md
        # Quoted heredoc delimiter: the front matter is written verbatim with
        # no shell expansion.
        run: |
          cat <<'EOF' > README.md
          ---
          title: My Gradio App MNIST Classifier
          emoji: 🚀
          colorFrom: blue
          colorTo: green
          sdk: gradio
          sdk_version: "5.7.1"
          app_file: app.py
          pinned: false
          ---
          EOF
          git add README.md
          git commit -m "Add README.md" || echo "Skip commit if no changes"

      - name: Push to hub
        # Push whatever commit is checked out (HEAD) to the Space's `main`
        # branch. The workflow also triggers on `master`, where no local
        # `main` branch exists, so a bare `main` refspec would fail there.
        run: git push --force space HEAD:main

  # Stops the EC2 runner created by `start-runner`, whether or not the
  # `deploy` and `sync-to-hub` jobs succeeded.
  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - deploy
      - sync-to-hub
    runs-on: ubuntu-latest
    # always(): run even when an upstream job failed or was cancelled.
    # The id check skips the job only when no instance id was ever reported,
    # i.e. there is nothing to stop.
    if: ${{ always() && needs.start-runner.outputs.ec2-instance-id != '' }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

      - name: Validate EC2 termination
        env:
          # Passed through the environment rather than expanded into the
          # script text.
          INSTANCE_ID: ${{ needs.start-runner.outputs.ec2-instance-id }}
        run: |
          # An instance does not reach `terminated` instantly, so poll with
          # the AWS CLI waiter instead of checking the state once. Fail the
          # job if the instance never terminates, so a leaked instance is
          # visible.
          if ! aws ec2 wait instance-terminated --instance-ids "${INSTANCE_ID}"; then
            echo "::error::Runner not terminated."
            exit 1
          fi
          aws ec2 describe-instances --instance-ids "${INSTANCE_ID}" \
            --query "Reservations[].Instances[].State.Name" --output text