soutrik committed on
Commit
c3d82b0
0 Parent(s):

orphan branch

Browse files
.gitattributes ADDED
@@ -0,0 +1,4 @@
1
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
2
+ checkpoints/best_model.ckpt filter=lfs diff=lfs merge=lfs -text
3
+ checkpoints/last.ckpt filter=lfs diff=lfs merge=lfs -text
4
+ checkpoints/*.ckpt filter=lfs diff=lfs merge=lfs -text
.github/workflows/deployment.yaml ADDED
@@ -0,0 +1,113 @@
1
+ name: Deploy PyTorch Training with EC2 Runner and Docker Compose
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - master
7
+
8
+ workflow_dispatch:
9
+
10
+ jobs:
11
+ start-runner:
12
+ name: Start self-hosted EC2 runner
13
+ runs-on: ubuntu-latest
14
+ outputs:
15
+ label: ${{ steps.start-ec2-runner.outputs.label }}
16
+ ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
17
+ steps:
18
+ - name: Configure AWS credentials
19
+ uses: aws-actions/configure-aws-credentials@v4
20
+ with:
21
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
22
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
23
+ aws-region: ${{ secrets.AWS_REGION }}
24
+
25
+ - name: Start EC2 runner
26
+ id: start-ec2-runner
27
+ uses: machulav/ec2-github-runner@v2
28
+ with:
29
+ mode: start
30
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
31
+ ec2-image-id: ami-044b0717aadbc9dfa
32
+ ec2-instance-type: t2.xlarge
33
+ subnet-id: subnet-024811dee81325f1c
34
+ security-group-id: sg-0646c2a337a355a31
35
+
36
+ deploy:
37
+ name: Deploy PyTorch Training Pipeline
38
+ needs: start-runner
39
+ runs-on: ${{ needs.start-runner.outputs.label }}
40
+ steps:
41
+ - name: Checkout repository
42
+ uses: actions/checkout@v4
43
+
44
+ - name: Set up Docker Buildx
45
+ uses: docker/setup-buildx-action@v3
46
+
47
+ - name: Configure AWS credentials
48
+ uses: aws-actions/configure-aws-credentials@v4
49
+ with:
50
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
51
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
52
+ aws-region: ${{ secrets.AWS_REGION }}
53
+
54
+ - name: Log in to Amazon ECR
55
+ id: login-ecr
56
+ uses: aws-actions/amazon-ecr-login@v2
57
+
58
+ - name: Create .env file
59
+ run: |
60
+ echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
61
+ echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
62
+ echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
63
+
64
+ - name: Run Docker Compose for train and eval service
65
+ run: |
66
+ docker-compose stop
67
+ docker-compose up --build
68
+ docker-compose logs --follow
69
+ docker-compose down --remove-orphans
70
+
71
+
72
+ - name: Build, tag, and push Docker image to Amazon ECR
73
+ env:
74
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
75
+ REPOSITORY: soutrik71/mnist
76
+ IMAGE_TAG: ${{ github.sha }}
77
+ run: |
78
+ docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
79
+ docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
80
+ docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
81
+ docker push $REGISTRY/$REPOSITORY:latest
82
+
83
+ - name: Pull Docker image from ECR and verify
84
+ env:
85
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
86
+ REPOSITORY: soutrik71/mnist
87
+ IMAGE_TAG: ${{ github.sha }}
88
+ run: |
89
+ docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
90
+ docker images | grep "$REGISTRY/$REPOSITORY"
91
+
92
+ stop-runner:
93
+ name: Stop self-hosted EC2 runner
94
+ needs:
95
+ - start-runner
96
+ - deploy
97
+ runs-on: ubuntu-latest
98
+ if: ${{ always() }}
99
+ steps:
100
+ - name: Configure AWS credentials
101
+ uses: aws-actions/configure-aws-credentials@v4
102
+ with:
103
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
104
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
105
+ aws-region: ${{ secrets.AWS_REGION }}
106
+
107
+ - name: Stop EC2 runner
108
+ uses: machulav/ec2-github-runner@v2
109
+ with:
110
+ mode: stop
111
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
112
+ label: ${{ needs.start-runner.outputs.label }}
113
+ ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
.github/workflows/deployment_advanced.yaml ADDED
@@ -0,0 +1,130 @@
1
+ name: Deploy PyTorch Training with EC2 Runner and Docker Compose with Advanced Deployment
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - master
7
+
8
+ jobs:
9
+ start-runner:
10
+ name: Start self-hosted EC2 runner
11
+ runs-on: ubuntu-latest
12
+ outputs:
13
+ label: ${{ steps.start-ec2-runner.outputs.label }}
14
+ ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
15
+ steps:
16
+ - name: Configure AWS credentials
17
+ uses: aws-actions/configure-aws-credentials@v4
18
+ with:
19
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
20
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
21
+ aws-region: ${{ secrets.AWS_REGION }}
22
+
23
+ - name: Start EC2 runner
24
+ id: start-ec2-runner
25
+ uses: machulav/ec2-github-runner@v2
26
+ with:
27
+ mode: start
28
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
29
+ ec2-image-id: ami-044b0717aadbc9dfa
30
+ ec2-instance-type: t2.xlarge
31
+ subnet-id: subnet-024811dee81325f1c
32
+ security-group-id: sg-0646c2a337a355a31
33
+
34
+ deploy:
35
+ name: Deploy PyTorch Training Pipeline
36
+ needs: start-runner
37
+ runs-on: ${{ needs.start-runner.outputs.label }}
38
+ steps:
39
+ - name: Checkout repository
40
+ uses: actions/checkout@v4
41
+
42
+ - name: Set up Docker Buildx
43
+ uses: docker/setup-buildx-action@v3
44
+
45
+ - name: Configure AWS credentials
46
+ uses: aws-actions/configure-aws-credentials@v4
47
+ with:
48
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
49
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
50
+ aws-region: ${{ secrets.AWS_REGION }}
51
+
52
+ - name: Cache Docker layers
53
+ uses: actions/cache@v3
54
+ with:
55
+ path: /tmp/.buildx-cache
56
+ key: ${{ runner.os }}-docker-${{ github.sha }}
57
+ restore-keys: |
58
+ ${{ runner.os }}-docker-
59
+
60
+ - name: Log in to Amazon ECR
61
+ id: login-ecr
62
+ uses: aws-actions/amazon-ecr-login@v2
63
+
64
+ - name: Create .env file
65
+ run: |
66
+ echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
67
+ echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
68
+ echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
69
+ echo "::add-mask::${{ secrets.AWS_ACCESS_KEY_ID }}"
70
+ echo "::add-mask::${{ secrets.AWS_SECRET_ACCESS_KEY }}"
71
+
72
+ - name: Run Docker Compose for all services
73
+ run: |
74
+ docker-compose build --no-cache
75
+ docker-compose up -d
76
+ docker-compose logs --follow train eval
77
+ docker-compose down --remove-orphans
78
+
79
+ - name: Build, tag, and push Docker image to Amazon ECR
80
+ env:
81
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
82
+ REPOSITORY: soutrik71/mnist
83
+ IMAGE_TAG: ${{ github.sha }}
84
+ run: |
85
+ docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
86
+ docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
87
+ docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
88
+ docker push $REGISTRY/$REPOSITORY:latest
89
+
90
+ - name: Pull Docker image from ECR and verify
91
+ env:
92
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
93
+ REPOSITORY: soutrik71/mnist
94
+ IMAGE_TAG: ${{ github.sha }}
95
+ run: |
96
+ docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
97
+ docker images | grep "$REGISTRY/$REPOSITORY"
98
+
99
+ - name: Clean up environment
100
+ run: |
101
+ rm -f .env
102
+ docker system prune -af --volumes
103
+
104
+ stop-runner:
105
+ name: Stop self-hosted EC2 runner
106
+ needs:
107
+ - start-runner
108
+ - deploy
109
+ runs-on: ubuntu-latest
110
+ if: ${{ always() }}
111
+ steps:
112
+ - name: Configure AWS credentials
113
+ uses: aws-actions/configure-aws-credentials@v4
114
+ with:
115
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
116
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
117
+ aws-region: ${{ secrets.AWS_REGION }}
118
+
119
+ - name: Stop EC2 runner
120
+ uses: machulav/ec2-github-runner@v2
121
+ with:
122
+ mode: stop
123
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
124
+ label: ${{ needs.start-runner.outputs.label }}
125
+ ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
126
+
127
+ - name: Validate EC2 termination
128
+ run: |
129
+ aws ec2 describe-instances --instance-ids ${{ needs.start-runner.outputs.ec2-instance-id }} \
130
+ --query "Reservations[].Instances[].State.Name" --output text | grep "terminated" || echo "Runner not terminated."
.github/workflows/do-the-job.yml ADDED
@@ -0,0 +1,66 @@
1
+ name: do-the-job
2
+ on:
3
+ push:
4
+ branches:
5
+ - master
6
+
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ start-runner:
11
+ name: Start self-hosted EC2 runner
12
+ runs-on: ubuntu-latest
13
+ outputs:
14
+ label: ${{ steps.start-ec2-runner.outputs.label }}
15
+ ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
16
+ steps:
17
+ - name: Configure AWS credentials
18
+ uses: aws-actions/configure-aws-credentials@v4
19
+ with:
20
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
21
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
22
+ aws-region: ${{ secrets.AWS_REGION }}
23
+ - name: Start EC2 runner
24
+ id: start-ec2-runner
25
+ uses: machulav/ec2-github-runner@v2
26
+ with:
27
+ mode: start
28
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
29
+ ec2-image-id: ami-044b0717aadbc9dfa
30
+ ec2-instance-type: t2.xlarge
31
+ subnet-id: subnet-024811dee81325f1c
32
+ security-group-id: sg-0646c2a337a355a31
33
+
34
+ do-the-job:
35
+ name: Do the job on the runner
36
+ needs: start-runner
37
+ runs-on: ${{ needs.start-runner.outputs.label }}
38
+ steps:
39
+ - name: Clone the repository
40
+ uses: actions/checkout@v4
41
+ with:
42
+ fetch-depth: 0
43
+ - name: Run custom command
44
+ run: echo 'Hello World!'
45
+
46
+ stop-runner:
47
+ name: Stop self-hosted EC2 runner
48
+ needs:
49
+ - start-runner
50
+ - do-the-job
51
+ runs-on: ubuntu-latest
52
+ if: ${{ always() }}
53
+ steps:
54
+ - name: Configure AWS credentials
55
+ uses: aws-actions/configure-aws-credentials@v4
56
+ with:
57
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
58
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
59
+ aws-region: ${{ secrets.AWS_REGION }}
60
+ - name: Stop EC2 runner
61
+ uses: machulav/ec2-github-runner@v2
62
+ with:
63
+ mode: stop
64
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
65
+ label: ${{ needs.start-runner.outputs.label }}
66
+ ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
.github/workflows/hf_deploy.yaml ADDED
@@ -0,0 +1,79 @@
1
+ name: Sync to Hugging Face Hub for Gradio App MNIST Classifier # this is not working due to lfs issue
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - master
7
+
8
+ jobs:
9
+ sync-to-hub:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ with:
14
+ fetch-depth: 0
15
+ lfs: true
16
+
17
+ - name: Install Git LFS
18
+ run: |
19
+ curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
20
+ sudo apt-get install git-lfs
21
+ git lfs install
22
+
23
+ - name: Configure Git identity
24
+ run: |
25
+ git config --global user.name "soutrik"
26
+ git config --global user.email "soutrik.chowdhury@ab-inbev.com"
27
+
28
+ - name: Add remote
29
+ run: |
30
+ git remote add space https://$USER:$HF_TOKEN@huggingface.co/spaces/$USER/$SPACE
31
+ env:
32
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
33
+ USER: soutrik
34
+ SPACE: gradio_demo_MNIST_Classifier
35
+
36
+ # # Track individual files with LFS
37
+ # - name: Track last.ckpt with Git LFS
38
+ # run: |
39
+ # git lfs track "checkpoints/last.ckpt"
40
+ # git add .gitattributes
41
+ # git commit -m "Track last.ckpt with Git LFS" || echo "Skip commit if no changes"
42
+
43
+ # - name: Track best_model.ckpt with Git LFS
44
+ # run: |
45
+ # git lfs track "checkpoints/best_model.ckpt"
46
+ # git add .gitattributes
47
+ # git commit -m "Track best_model.ckpt with Git LFS" || echo "Skip commit if no changes"
48
+
49
+ # Ensure LFS objects are checked out
50
+ - name: Ensure LFS objects are present
51
+ run: git lfs checkout
52
+
53
+ - name: Add README.md
54
+ run: |
55
+ cat <<EOF > README.md
56
+ ---
57
+ title: My Gradio App MNIST Classifier
58
+ emoji: 🚀
59
+ colorFrom: blue
60
+ colorTo: green
61
+ sdk: gradio
62
+ sdk_version: "5.7.1"
63
+ app_file: app.py
64
+ pinned: false
65
+ ---
66
+ EOF
67
+ git add README.md
68
+ git commit -m "Add README.md" || echo "Skip commit if no changes"
69
+
70
+ - name: Push to hub
71
+ run: |
72
+ git push --force https://$USER:$HF_TOKEN@huggingface.co/spaces/$USER/$SPACE HEAD:main
73
+ env:
74
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
75
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
76
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
77
+ AWS_REGION: ${{ secrets.AWS_REGION }}
78
+ USER: soutrik
79
+ SPACE: gradio_demo_MNIST_Classifier
.github/workflows/main_cd.yml ADDED
@@ -0,0 +1,131 @@
1
+ name: Deploy PyTorch Training with all advanced features like self-hosted EC2 runner, Docker Buildx, Amazon ECR, Hugging Face Spaces
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - master
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ start-runner:
11
+ name: Start self-hosted EC2 runner
12
+ runs-on: ubuntu-latest
13
+ outputs:
14
+ label: ${{ steps.start-ec2-runner.outputs.label }}
15
+ ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
16
+ steps:
17
+ - name: Configure AWS credentials
18
+ uses: aws-actions/configure-aws-credentials@v4
19
+ with:
20
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
21
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
22
+ aws-region: ${{ secrets.AWS_REGION }}
23
+
24
+ - name: Start EC2 runner
25
+ id: start-ec2-runner
26
+ uses: machulav/ec2-github-runner@v2
27
+ with:
28
+ mode: start
29
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
30
+ ec2-image-id: ami-044b0717aadbc9dfa
31
+ ec2-instance-type: t2.xlarge
32
+ subnet-id: subnet-024811dee81325f1c
33
+ security-group-id: sg-0646c2a337a355a31
34
+
35
+ deploy:
36
+ name: Deploy PyTorch Training Pipeline
37
+ needs: start-runner
38
+ runs-on: ${{ needs.start-runner.outputs.label }}
39
+ steps:
40
+ - name: Checkout repository
41
+ uses: actions/checkout@v4
42
+
43
+ - name: Set up Docker Buildx
44
+ uses: docker/setup-buildx-action@v3
45
+
46
+ - name: Configure AWS credentials
47
+ uses: aws-actions/configure-aws-credentials@v4
48
+ with:
49
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
50
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
51
+ aws-region: ${{ secrets.AWS_REGION }}
52
+
53
+ - name: Cache Docker layers
54
+ uses: actions/cache@v3
55
+ with:
56
+ path: /tmp/.buildx-cache
57
+ key: ${{ runner.os }}-docker-${{ github.sha }}
58
+ restore-keys: |
59
+ ${{ runner.os }}-docker-
60
+
61
+ - name: Log in to Amazon ECR
62
+ id: login-ecr
63
+ uses: aws-actions/amazon-ecr-login@v2
64
+
65
+ - name: Create .env file
66
+ run: |
67
+ echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
68
+ echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
69
+ echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
70
+ echo "::add-mask::${{ secrets.AWS_ACCESS_KEY_ID }}"
71
+ echo "::add-mask::${{ secrets.AWS_SECRET_ACCESS_KEY }}"
72
+
73
+ - name: Run Docker Compose for all services
74
+ run: |
75
+ docker-compose build --no-cache
76
+ docker-compose up -d
77
+ docker-compose logs --follow train eval
78
+ docker-compose down --remove-orphans
79
+
80
+ - name: Build, tag, and push Docker image to Amazon ECR
81
+ env:
82
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
83
+ REPOSITORY: soutrik71/mnist
84
+ IMAGE_TAG: ${{ github.sha }}
85
+ run: |
86
+ docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
87
+ docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
88
+ docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
89
+ docker push $REGISTRY/$REPOSITORY:latest
90
+
91
+ - name: Pull Docker image from ECR and verify
92
+ env:
93
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
94
+ REPOSITORY: soutrik71/mnist
95
+ IMAGE_TAG: ${{ github.sha }}
96
+ run: |
97
+ docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
98
+ docker images | grep "$REGISTRY/$REPOSITORY"
99
+
100
+ - name: Clean up environment
101
+ run: |
102
+ rm -f .env
103
+ docker system prune -af --volumes
104
+
105
+ stop-runner:
106
+ name: Stop self-hosted EC2 runner
107
+ needs:
108
+ - start-runner
109
+ - deploy
110
+ runs-on: ubuntu-latest
111
+ if: ${{ always() }}
112
+ steps:
113
+ - name: Configure AWS credentials
114
+ uses: aws-actions/configure-aws-credentials@v4
115
+ with:
116
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
117
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
118
+ aws-region: ${{ secrets.AWS_REGION }}
119
+
120
+ - name: Stop EC2 runner
121
+ uses: machulav/ec2-github-runner@v2
122
+ with:
123
+ mode: stop
124
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
125
+ label: ${{ needs.start-runner.outputs.label }}
126
+ ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
127
+
128
+ - name: Validate EC2 termination
129
+ run: |
130
+ aws ec2 describe-instances --instance-ids ${{ needs.start-runner.outputs.ec2-instance-id }} \
131
+ --query "Reservations[].Instances[].State.Name" --output text | grep "terminated" || echo "Runner not terminated."
.gitignore ADDED
@@ -0,0 +1,128 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # celery beat schedule file
88
+ celerybeat-schedule
89
+
90
+ # SageMath parsed files
91
+ *.sage.py
92
+
93
+ # Environments
94
+ .env
95
+ .venv
96
+ env/
97
+ venv/
98
+ ENV/
99
+ env.bak/
100
+ venv.bak/
101
+
102
+ # Spyder project settings
103
+ .spyderproject
104
+ .spyderworkspace
105
+
106
+ # Rope project settings
107
+ .ropeproject
108
+
109
+ # mkdocs documentation
110
+ /site
111
+
112
+ # mypy
113
+ .mypy_cache/
114
+ .dmypy.json
115
+ dmypy.json
116
+
117
+ # Pyre type checker
118
+ .pyre/
119
+
120
+ # PyTorch
121
+ *.pt
122
+ *.pth
123
+ logs/
124
+ data/
125
+ checkpoints/
126
+ checkpoints/*
127
+ checkpoints/best_model.ckpt
128
+ checkpoints/last.ckpt
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
.project-root ADDED
File without changes
Dockerfile ADDED
@@ -0,0 +1,54 @@
1
+ # Stage 1: Build environment with Poetry and dependencies
2
+ FROM python:3.10.15-slim AS builder
3
+
4
+ LABEL maintainer="Soutrik soutrik1991@gmail.com" \
5
+ description="Docker image for running a Python app with dependencies managed by Poetry."
6
+
7
+ # Install Poetry and necessary system dependencies
8
+ RUN apt-get update && apt-get install -y --no-install-recommends curl && \
9
+ curl -sSL https://install.python-poetry.org | python3 - && \
10
+ apt-get clean && rm -rf /var/lib/apt/lists/*
11
+
12
+ # Add Poetry to the PATH explicitly
13
+ ENV PATH="/root/.local/bin:$PATH"
14
+
15
+ # Set the working directory to /app
16
+ WORKDIR /app
17
+
18
+ # Copy pyproject.toml and poetry.lock to install dependencies
19
+ COPY pyproject.toml poetry.lock /app/
20
+
21
+ # Configure Poetry environment
22
+ ENV POETRY_NO_INTERACTION=1 \
23
+ POETRY_VIRTUALENVS_IN_PROJECT=1 \
24
+ POETRY_CACHE_DIR=/tmp/poetry_cache
25
+
26
+ # Install dependencies without installing the package itself
27
+ RUN --mount=type=cache,target=/tmp/poetry_cache poetry install --only main --no-root
28
+
29
+ # Additional steps: Uninstall and re-add cryptography
30
+ RUN poetry run pip uninstall -y cryptography && \
31
+ poetry add cryptography --lock
32
+
33
+ # Stage 2: Runtime environment
34
+ FROM python:3.10.15-slim as runner
35
+
36
+ # Install curl for health check script
37
+ RUN apt-get update && apt-get install -y --no-install-recommends curl && \
38
+ apt-get clean && rm -rf /var/lib/apt/lists/*
39
+
40
+ # Copy application source code and necessary files
41
+ COPY src /app/src
42
+ COPY main.py /app/main.py
43
+
44
+ # Copy virtual environment from the builder stage
45
+ COPY --from=builder /app/.venv /app/.venv
46
+
47
+ # Set the working directory to /app
48
+ WORKDIR /app
49
+
50
+ # Set the environment path to use the virtual environment
51
+ ENV PATH="/app/.venv/bin:$PATH"
52
+
53
+ # Default command
54
+ CMD ["python", "-m", "main"]
README.md ADDED
@@ -0,0 +1,286 @@
1
+ ---
2
+ title: My Gradio App Mnist Classifier
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "5.7.1"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # aws_ec2_automation
13
+ Here is a detailed explanation of the GitHub Actions (GHA) pipeline:
14
+
15
+ ---
16
+
17
+ # GitHub Actions Pipeline Documentation
18
+
19
+ ## Name: Deploy PyTorch Training with EC2 Runner and Docker Compose
20
+
21
+ This pipeline automates the following tasks:
22
+ 1. Starts an EC2 instance as a self-hosted GitHub runner.
23
+ 2. Deploys a PyTorch training pipeline using Docker Compose.
24
+ 3. Builds, tags, and pushes Docker images to Amazon ECR.
25
+ 4. Stops the EC2 instance after the job is completed.
26
+
27
+ ---
28
+
29
+ ### Workflow Triggers
30
+
31
+ ```yaml
32
+ on:
33
+ push:
34
+ branches:
35
+ - master
36
+ ```
37
+
38
+ - **Trigger**: This workflow runs whenever a push is made to the `master` branch.
39
+
40
+ ---
41
+
42
+ ## Jobs Overview
43
+
44
+ ### 1. **start-runner**
45
+ Starts a self-hosted EC2 runner using the GitHub Actions Runner.
46
+
47
+ #### Steps:
48
+ 1. **Configure AWS Credentials**:
49
+ ```yaml
50
+ - name: Configure AWS credentials
51
+ uses: aws-actions/configure-aws-credentials@v4
52
+ with:
53
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
54
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
55
+ aws-region: ${{ secrets.AWS_REGION }}
56
+ ```
57
+ - Authenticates with AWS using access keys and the region specified in the secrets.
58
+ - Required for creating and managing the EC2 instance.
59
+
60
+ 2. **Start EC2 Runner**:
61
+ ```yaml
62
+ - name: Start EC2 runner
63
+ id: start-ec2-runner
64
+ uses: machulav/ec2-github-runner@v2
65
+ with:
66
+ mode: start
67
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
68
+ ec2-image-id: ami-044b0717aadbc9dfa
69
+ ec2-instance-type: t2.xlarge
70
+ subnet-id: subnet-024811dee81325f1c
71
+ security-group-id: sg-0646c2a337a355a31
72
+ ```
73
+ - Starts an EC2 instance with the specified AMI, instance type, subnet, and security group.
74
+ - Outputs:
75
+ - `label`: A unique label for the EC2 runner.
76
+ - `ec2-instance-id`: The ID of the created EC2 instance.
77
+
78
+ ---
79
+
80
+ ### 2. **deploy**
81
+ Deploys the PyTorch training pipeline using the EC2 runner started in the previous step.
82
+
83
+ #### Dependencies:
84
+ ```yaml
85
+ needs: start-runner
86
+ runs-on: ${{ needs.start-runner.outputs.label }}
87
+ ```
88
+ - **Depends on** the `start-runner` job and runs on the newly created EC2 instance.
89
+
90
+ #### Steps:
91
+ 1. **Checkout Repository**:
92
+ ```yaml
93
+ - name: Checkout repository
94
+ uses: actions/checkout@v4
95
+ ```
96
+ - Clones the current repository to the runner.
97
+
98
+ 2. **Set Up Docker Buildx**:
99
+ ```yaml
100
+ - name: Set up Docker Buildx
101
+ uses: docker/setup-buildx-action@v3
102
+ ```
103
+ - Configures Docker Buildx for building multi-platform Docker images.
104
+
105
+ 3. **Configure AWS Credentials**:
106
+ ```yaml
107
+ - name: Configure AWS credentials
108
+ uses: aws-actions/configure-aws-credentials@v4
109
+ with:
110
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
111
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
112
+ aws-region: ${{ secrets.AWS_REGION }}
113
+ ```
114
+ - Reconfigures AWS credentials for Docker ECR authentication and resource management.
115
+
116
+ 4. **Log in to Amazon ECR**:
117
+ ```yaml
118
+ - name: Log in to Amazon ECR
119
+ id: login-ecr
120
+ uses: aws-actions/amazon-ecr-login@v2
121
+ ```
122
+ - Logs into Amazon ECR for pushing and pulling Docker images.
123
+
124
+ 5. **Create `.env` File**:
125
+ ```yaml
126
+ - name: Create .env file
127
+ run: |
128
+ echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
129
+ echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
130
+ echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
131
+ ```
132
+ - Generates a `.env` file for the application with AWS credentials and region.
133
+
134
+ 6. **Run Docker Compose for Train and Eval Services**:
135
+ ```yaml
136
+ - name: Run Docker Compose for train and eval service
137
+ run: |
138
+ docker-compose build
139
+ docker-compose up --build
140
+ docker-compose logs --follow
141
+ docker-compose down --remove-orphans
142
+ ```
143
+ - **Build**: Builds all services defined in the `docker-compose.yaml` file.
144
+ - **Up**: Runs all services, including training and evaluation.
145
+ - **Logs**: Outputs logs for debugging purposes.
146
+ - **Down**: Stops all services and removes orphaned containers (the train/eval coordination behind these services is sketched below).
147
+
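+ The coordination behind these services lives in the committed `docker-compose.yaml` (shown in full later in this commit): `train` runs `src.train`, which writes `checkpoints/train_done.flag` when training finishes, and `eval` polls for that flag file before running `src.test`. A condensed sketch of that pattern:
+
+ ```yaml
+ services:
+   train:
+     build:
+       context: .
+     command: python -m src.train    # src/train.py writes checkpoints/train_done.flag on completion
+     volumes:
+       - ./checkpoints:/app/checkpoints
+
+   eval:
+     build:
+       context: .
+     command: sh -c 'while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done && python -m src.test'
+     volumes:
+       - ./checkpoints:/app/checkpoints
+ ```
+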
148
+ 7. **Build, Tag, and Push Docker Image to Amazon ECR**:
149
+ ```yaml
150
+ - name: Build, tag, and push Docker image to Amazon ECR
151
+ env:
152
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
153
+ REPOSITORY: soutrik71/mnist
154
+ IMAGE_TAG: ${{ github.sha }}
155
+ run: |
156
+ docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
157
+ docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
158
+ docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
159
+ docker push $REGISTRY/$REPOSITORY:latest
160
+ ```
161
+ - **Build**: Creates a Docker image with the repository and tag.
162
+ - **Push**: Pushes the image to Amazon ECR.
163
+ - **Tag**: Updates the `latest` tag.
164
+
165
+ 8. **Pull and Verify Docker Image from ECR**:
166
+ ```yaml
167
+ - name: Pull Docker image from ECR and verify
168
+ env:
169
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
170
+ REPOSITORY: soutrik71/mnist
171
+ IMAGE_TAG: ${{ github.sha }}
172
+ run: |
173
+ docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
174
+ docker images | grep "$REGISTRY/$REPOSITORY"
175
+ ```
176
+ - **Pull**: Pulls the built image from ECR.
177
+ - **Verify**: Ensures the image exists locally.
178
+
179
+ 9. **Clean Up Environment**:
180
+ ```yaml
181
+ - name: Clean up environment
182
+ run: |
183
+ rm -f .env
184
+ docker system prune -af
185
+ ```
186
+ - Deletes the `.env` file and removes unused Docker resources.
187
+
188
+ ---
189
+
190
+ ### 3. **stop-runner**
191
+ Stops and terminates the EC2 runner created in the `start-runner` job.
192
+
193
+ #### Dependencies:
194
+ ```yaml
195
+ needs:
196
+ - start-runner
197
+ - deploy
198
+ ```
199
+
200
+ #### Steps:
201
+ 1. **Configure AWS Credentials**:
202
+ ```yaml
203
+ - name: Configure AWS credentials
204
+ uses: aws-actions/configure-aws-credentials@v4
205
+ with:
206
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
207
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
208
+ aws-region: ${{ secrets.AWS_REGION }}
209
+ ```
210
+
211
+ 2. **Stop EC2 Runner**:
212
+ ```yaml
213
+ - name: Stop EC2 runner
214
+ uses: machulav/ec2-github-runner@v2
215
+ with:
216
+ mode: stop
217
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
218
+ label: ${{ needs.start-runner.outputs.label }}
219
+ ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
220
+ ```
221
+ - Stops the EC2 runner instance created in the first job.
222
+
223
+ 3. **Validate EC2 Termination**:
224
+ ```yaml
225
+ - name: Validate EC2 termination
226
+ run: aws ec2 describe-instances --instance-ids ${{ needs.start-runner.outputs.ec2-instance-id }}
227
+ ```
228
+ - Ensures the EC2 instance has been properly terminated.
229
+
230
+ ---
231
+
232
+ ### Key Highlights
233
+ 1. **Sequential Execution**:
234
+ - The `start-runner`, `deploy`, and `stop-runner` jobs are executed sequentially.
235
+
236
+ 2. **Error Handling**:
237
+ - The `stop-runner` job runs even if previous jobs fail (`if: ${{ always() }}`).
238
+
239
+ 3. **Efficiency**:
240
+ - Docker layer caching can speed up builds (one way to wire the restored cache into the image build is sketched after this list).
241
+ - Cleanup steps maintain a clean environment.
242
+
243
+ 4. **Security**:
244
+ - Secrets are masked and removed after use.
245
+ - Proper resource cleanup ensures cost efficiency.
246
+
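+ A note on the caching highlight above: the advanced workflows restore and save `/tmp/.buildx-cache` with `actions/cache`, but the committed build commands never reference that directory, so the restored cache is not actually consumed. One possible way to wire it in, shown here as an assumption rather than as part of this commit (and reusing the `REGISTRY`/`REPOSITORY`/`IMAGE_TAG` variables from the existing build step), is to point `docker buildx build` at the local cache directory:
+
+ ```yaml
+ - name: Build image using the restored local cache
+   run: |
+     docker buildx build \
+       --cache-from=type=local,src=/tmp/.buildx-cache \
+       --cache-to=type=local,dest=/tmp/.buildx-cache-new,mode=max \
+       --load \
+       -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
+     # swap in the fresh cache so it does not grow without bound
+     rm -rf /tmp/.buildx-cache
+     mv /tmp/.buildx-cache-new /tmp/.buildx-cache
+ ```
+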
247
+ ---
248
+
249
+ This pipeline ensures robust deployment with error handling, logging, and cleanup mechanisms. So far we have covered the basic structure of the GitHub Actions pipeline and the steps involved in it.
250
+ Next, we build an interdependent pipeline in which the output of one job is used as input for the next; a minimal sketch of that job wiring follows the step-by-step outline below.
251
+
252
+ ---
253
+ ## Advanced Pipeline: Key Features
254
+ * Sequential Flow: Each job has clear dependencies, ensuring no step runs out of order.
255
+ * Code Checkout: Explicit repository checkout in each job ensures consistent source code.
256
+ * Secure Credential Handling: Sensitive credentials are masked and stored securely.
257
+ * Resource Cleanup: Includes Docker clean-up and EC2 instance termination validation.
258
+ * Logging: Added detailed logs to improve debugging and monitoring.
259
+
260
+
261
+ ### Step 1: Start EC2 Runner
262
+ **Purpose**: Initializes a self-hosted EC2 runner for running subsequent jobs.
263
+ **Key Actions**:
264
+ - Configures AWS credentials.
265
+ - Launches an EC2 instance using the specified AMI, instance type, and networking configuration.
266
+ - Outputs the runner label and instance ID for downstream jobs.
267
+ ### Step 2: Test PyTorch Code Using Docker Compose
268
+ **Purpose**: Tests the PyTorch training and evaluation services.
269
+ **Key Actions**:
270
+ - Checks out the repository.
271
+ - Sets up Docker Buildx for advanced build capabilities.
272
+ - Configures AWS credentials and creates a masked `.env` file for secure credential sharing.
273
+ - Runs all services (train, eval) using Docker Compose, monitors logs, and cleans up containers.
274
+ ### Step 3: Build, Tag, and Push Docker Image
275
+ **Purpose**: Builds a Docker image, tags it, and pushes it to Amazon ECR after successful tests.
276
+ **Key Actions**:
277
+ - Checks out the repository again to ensure consistency.
278
+ - Logs into Amazon ECR using AWS credentials.
279
+ - Builds and tags the Docker image with `latest` and SHA-based tags.
280
+ - Pushes the image to Amazon ECR and verifies it by pulling it back.
281
+ ### Step 4: Stop and Delete EC2 Runner
282
+ **Purpose**: Stops and terminates the EC2 instance to ensure cost efficiency and cleanup.
283
+ **Key Actions**:
284
+ - Configures AWS credentials.
285
+ - Stops the EC2 instance using the label and instance ID from `start-runner`.
286
+ - Validates the termination state of the EC2 instance to ensure proper cleanup.
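+
+ The interdependence above is implemented with GitHub Actions job outputs: `start-runner` exposes the runner label and EC2 instance ID, `deploy` consumes the label as its `runs-on` target, and `stop-runner` consumes both to tear the instance down even when earlier jobs fail. A minimal sketch of that wiring, trimmed down from the workflows in this commit (the AWS-credentials step that precedes each runner operation is omitted for brevity):
+
+ ```yaml
+ jobs:
+   start-runner:
+     runs-on: ubuntu-latest
+     outputs:
+       label: ${{ steps.start-ec2-runner.outputs.label }}
+       ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+     steps:
+       - id: start-ec2-runner
+         uses: machulav/ec2-github-runner@v2
+         with:
+           mode: start
+           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+           ec2-image-id: ami-044b0717aadbc9dfa
+           ec2-instance-type: t2.xlarge
+           subnet-id: subnet-024811dee81325f1c
+           security-group-id: sg-0646c2a337a355a31
+
+   deploy:
+     needs: start-runner
+     runs-on: ${{ needs.start-runner.outputs.label }}   # runs on the freshly started EC2 runner
+     steps:
+       - run: echo "training, image build, and push happen here"
+
+   stop-runner:
+     needs: [start-runner, deploy]
+     if: ${{ always() }}    # stop the runner even if deploy fails
+     runs-on: ubuntu-latest
+     steps:
+       - uses: machulav/ec2-github-runner@v2
+         with:
+           mode: stop
+           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+           label: ${{ needs.start-runner.outputs.label }}
+           ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
+ ```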
app.py ADDED
@@ -0,0 +1,95 @@
1
+ import gradio as gr
2
+ import torch
3
+ import torchvision.transforms as transforms
4
+ from PIL import Image
5
+ from pathlib import Path
6
+ from loguru import logger
7
+ from src.model import LitEfficientNet
8
+ from src.utils.aws_s3_services import S3Handler
9
+
10
+ # Configure Loguru for logging
11
+ logger.add("logs/inference.log", rotation="1 MB", level="INFO")
12
+
13
+
14
+ class MNISTClassifier:
15
+ def __init__(self, checkpoint_path="./checkpoints/best_model.ckpt"):
16
+ self.checkpoint_path = checkpoint_path
17
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
+ logger.info(f"Inference will run on device: {self.device}")
19
+
20
+ # Load the model
21
+ self.model = self.load_model()
22
+ self.model.eval()
23
+
24
+ # Define transforms
25
+ self.transform = transforms.Compose(
26
+ [
27
+ transforms.Resize((28, 28)),
28
+ transforms.ToTensor(),
29
+ transforms.Normalize((0.5,), (0.5,)),
30
+ ]
31
+ )
32
+ self.labels = [str(i) for i in range(10)] # MNIST labels are 0-9
33
+
34
+ def load_model(self):
35
+ """
36
+ Loads the model checkpoint for inference.
37
+ """
38
+ if not Path(self.checkpoint_path).exists():
39
+ logger.error(f"Checkpoint not found: {self.checkpoint_path}")
40
+ raise FileNotFoundError(f"Checkpoint not found: {self.checkpoint_path}")
41
+
42
+ logger.info(f"Loading model from checkpoint: {self.checkpoint_path}")
43
+ return LitEfficientNet.load_from_checkpoint(self.checkpoint_path).to(
44
+ self.device
45
+ )
46
+
47
+ @torch.no_grad()
48
+ def predict(self, image):
49
+ """
50
+ Perform inference on a single image.
51
+
52
+ Args:
53
+ image: Input image in PIL format.
54
+
55
+ Returns:
56
+ dict: Predicted class probabilities.
57
+ """
58
+ if image is None:
59
+ logger.error("No image provided for prediction.")
60
+ return None
61
+
62
+ # Convert to tensor and preprocess
63
+ img_tensor = self.transform(image).unsqueeze(0).to(self.device)
64
+
65
+ # Perform inference
66
+ output = self.model(img_tensor)
67
+ probabilities = torch.nn.functional.softmax(output[0], dim=0)
68
+
69
+ # Map probabilities to labels
70
+ return {self.labels[idx]: float(prob) for idx, prob in enumerate(probabilities)}
71
+
72
+
73
+ # Instantiate the classifier
74
+ checkpoint_path = "./checkpoints/best_model.ckpt"
75
+
76
+ # Download checkpoint from S3 (if needed)
77
+ s3_handler = S3Handler(bucket_name="deep-bucket-s3")
78
+ s3_handler.download_folder(
79
+ "checkpoints_test",
80
+ "checkpoints",
81
+ )
82
+
83
+ classifier = MNISTClassifier(checkpoint_path=checkpoint_path)
84
+
85
+ # Define Gradio interface
86
+ demo = gr.Interface(
87
+ fn=classifier.predict,
88
+ inputs=gr.Image(height=160, width=160, image_mode="L", type="pil"),
89
+ outputs=gr.Label(num_top_classes=1),
90
+ title="MNIST Classifier",
91
+ description="Upload a handwritten digit image to classify it (0-9).",
92
+ )
93
+
94
+ if __name__ == "__main__":
95
+ demo.launch(share=True)
docker-compose.yaml ADDED
@@ -0,0 +1,63 @@
1
+ services:
2
+ train:
3
+ build:
4
+ context: .
5
+ command: |
6
+ python -m src.train && \
7
+ touch ./checkpoints/train_done.flag
8
+ volumes:
9
+ - ./data:/app/data
10
+ - ./checkpoints:/app/checkpoints
11
+ - ./logs:/app/logs
12
+ environment:
13
+ - PYTHONUNBUFFERED=1
14
+ - PYTHONPATH=/app
15
+ - NUM_WORKERS=4 # Set the number of workers
16
+ shm_size: '4g'
17
+ deploy:
18
+ resources:
19
+ limits:
20
+ memory: 8g # Limit to 8GB RAM
21
+ cpus: '4.0' # Use up to 4 CPU cores
22
+ reservations:
23
+ memory: 6g # Reserve 6GB RAM
24
+ cpus: '4.0' # Reserve 4 CPU cores
25
+ networks:
26
+ - default
27
+ env_file:
28
+ - .env
29
+
30
+ eval:
31
+ build:
32
+ context: .
33
+ command: |
34
+ sh -c 'while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done && python -m src.test'
35
+ volumes:
36
+ - ./data:/app/data
37
+ - ./checkpoints:/app/checkpoints
38
+ - ./logs:/app/logs
39
+ environment:
40
+ - PYTHONUNBUFFERED=1
41
+ - PYTHONPATH=/app
42
+ - NUM_WORKERS=2 # Set the number of workers
43
+ shm_size: '4g'
44
+ deploy:
45
+ resources:
46
+ limits:
47
+ memory: 4g # Limit to 4GB RAM
48
+ cpus: '4.0' # Use up to 4 CPU cores
49
+ reservations:
50
+ memory: 2g # Reserve 2GB RAM
51
+ cpus: '2' # Reserve 2 CPU cores
52
+ networks:
53
+ - default
54
+ env_file:
55
+ - .env
56
+
57
+ volumes:
58
+ data:
59
+ checkpoints:
60
+ logs:
61
+
62
+ networks:
63
+ default:
main.py ADDED
@@ -0,0 +1,3 @@
1
+ import os
2
+
3
+ print(os.getcwd())
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,91 @@
1
+ [tool.poetry]
2
+ name = "pytorch_fastapi_project"
3
+ version = "0.1.0"
4
+ description = "Consolidated PyTorch and FastAPI project for AWS deployment and GHA testing"
5
+ authors = ["soutrik71 <soutrik.chowdhury@ab-inbev.com>"]
6
+ license = "Apache-2.0"
7
+ readme = "README.md"
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "3.10.15"
11
+ black = "24.8.0"
12
+ coverage = ">=7.6.1"
13
+ hydra-colorlog = "1.2.0"
14
+ hydra-core = "1.3.2"
15
+ lightning = {version = "2.4.0", extras = ["extra"]}
16
+ loguru = "0.7.2"
17
+ pytest = "^8.3.3"
18
+ rich = "13.8.1"
19
+ rootutils = "1.0.7"
20
+ tensorboard = "2.17.1"
21
+ timm = "1.0.9"
22
+ pandas = "^2.2.3"
23
+ numpy = "^1.26.0"
24
+ ruff = "*"
25
+ torch = {version = "^2.4.1+cpu", source = "pytorch_cpu"}
26
+ torchvision = {version = "^0.19.1+cpu", source = "pytorch_cpu"}
27
+ seaborn = "^0.13.2"
28
+ pydantic = "^2.9.2"
29
+ kaggle = "^1.6.17"
30
+ pytest-cov = "^5.0.0"
31
+ pytest-mock = "^3.14.0"
32
+ flake8 = "^7.1.1"
33
+ dvc-gdrive = "^3.0.1"
34
+ dvc-azure = "^3.1.0"
35
+ transformers = "^4.45.2"
36
+ fastapi = "^0.115.4"
37
+ pydantic-settings = "^2.6.1"
38
+ uvicorn = "^0.32.0"
39
+ tenacity = "^9.0.0"
40
+ gunicorn = "^23.0.0"
41
+ aim = "^3.25.0"
42
+ mlflow = "^2.17.1"
43
+ hydra-optuna-sweeper = "^1.2.0"
44
+ dvc = "^3.56.0"
45
+ platformdirs = "3.10"
46
+ fastapi-utils = "^0.7.0"
47
+ httpx = "^0.27.2"
48
+ typing-inspect = "^0.9.0"
49
+ requests = "^2.32.3"
50
+ fastapi-restful = {extras = ["all"], version = "^0.6.0"}
51
+ aioredis = "^2.0.1"
52
+ psycopg2-binary = "^2.9.10"
53
+ asyncpg = "^0.30.0"
54
+ confluent-kafka = "^2.6.0"
55
+ aiokafka = "^0.12.0"
56
+ azure-servicebus = "^7.12.3"
57
+ aiohttp = "^3.10.10"
58
+ aiofiles = "*"
59
+ aiologger = "^0.7.0"
60
+ pyyaml = "^6.0.2"
61
+ sqlalchemy-utils = "^0.41.2"
62
+ sqlalchemy = "^2.0.36"
63
+ alembic = "^1.13.3"
64
+ fastapi-limiter = "^0.1.6"
65
+ redis = "5.0.8"
66
+ redisearch = "2.0.0"
67
+ python-multipart = "*"
68
+ python-dotenv = "^1.0.1"
69
+ celery = "^5.4.0"
70
+ fastapi-cache2 = "^0.2.2"
71
+ aiocache = "^0.12.3"
72
+ dvc-s3 = "^3.2.0"
73
+ litserve = "^0.2.4"
74
+ gpustat = "^1.1.1"
75
+ nvitop = "^1.3.2"
76
+ pyopenssl = "^23.0.0"
77
+ cryptography = "^41.0.0"
78
+ accelerate = "^1.1.1"
79
+ gradio = "5.7.1"
80
+
81
+ [tool.poetry.dev-dependencies]
82
+ pytest-asyncio = "^0.20.3"
83
+
84
+ [[tool.poetry.source]]
85
+ name = "pytorch_cpu"
86
+ url = "https://download.pytorch.org/whl/cpu"
87
+ priority = "explicit"
88
+
89
+ [build-system]
90
+ requires = ["poetry-core"]
91
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,28 @@
1
+ torch==2.4.1
2
+ torchvision==0.19.1
3
+ hydra-colorlog==1.2.0
4
+ hydra-core==1.3.2
5
+ lightning[extra]==2.4.0
6
+ loguru==0.7.2
7
+ rich==13.8.1
8
+ rootutils==1.0.7
9
+ tensorboard==2.17.1
10
+ timm==1.0.9
11
+ pandas>=2.2.3
12
+ numpy>=1.26.0
13
+ transformers>=4.45.2
14
+ aim>=3.25.0
15
+ mlflow>=2.17.1
16
+ hydra-optuna-sweeper>=1.2.0
17
+ aiologger>=0.7.0
18
+ pyyaml>=6.0.2
19
+ dvc-s3>=3.2.0
20
+ litserve>=0.2.4
21
+ gpustat>=1.1.1
22
+ nvitop>=1.3.2
23
+ gradio==5.7.1
24
+ gradio-client>=1.5.0
25
+ accelerate>=1.1.1
26
+ cryptography>=44.0.0
27
+ boto3
28
+ pyopenssl>=24.3.0
src/__init__.py ADDED
File without changes
src/dataloader.py ADDED
@@ -0,0 +1,122 @@
1
+ from loguru import logger
2
+ import torch
3
+ from torch.utils.data import DataLoader, Subset
4
+ from torchvision import datasets, transforms
5
+ import lightning as pl
6
+ from typing import Optional
7
+ from multiprocessing import cpu_count
8
+ from sklearn.model_selection import train_test_split
9
+
10
+ # Configure Loguru to save logs to the logs/ directory
11
+ logger.add("logs/dataloader.log", rotation="1 MB", level="INFO")
12
+
13
+
14
+ class MNISTDataModule(pl.LightningDataModule):
15
+ def __init__(
16
+ self,
17
+ batch_size: int = 64,
18
+ data_dir: str = "./data",
19
+ num_workers: int = int(cpu_count()),
20
+ train_subset_fraction: float = 0.25, # Fraction of training data to use
21
+ ):
22
+ """
23
+ Initializes the MNIST Data Module with configurations for dataloaders.
24
+
25
+ Args:
26
+ batch_size (int): Batch size for training, validation, and testing.
27
+ data_dir (str): Directory to download and store the dataset.
28
+ num_workers (int): Number of workers for data loading.
29
+ train_subset_fraction (float): Fraction of training data to use (0.0 < fraction <= 1.0).
30
+ """
31
+ super().__init__()
32
+ self.batch_size = batch_size
33
+ self.data_dir = data_dir
34
+ self.num_workers = num_workers
35
+ self.train_subset_fraction = train_subset_fraction
36
+ self.transform = transforms.Compose(
37
+ [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
38
+ )
39
+ logger.info(f"MNIST DataModule initialized with batch size {self.batch_size}")
40
+
41
+ def prepare_data(self):
42
+ """
43
+ Downloads the MNIST dataset if not already downloaded.
44
+ """
45
+ datasets.MNIST(root=self.data_dir, train=True, download=True)
46
+ datasets.MNIST(root=self.data_dir, train=False, download=True)
47
+ logger.info("MNIST dataset downloaded.")
48
+
49
+ def setup(self, stage: Optional[str] = None):
50
+ """
51
+ Set up the dataset for different stages.
52
+
53
+ Args:
54
+ stage (str, optional): One of "fit", "validate", "test", or "predict".
55
+ """
56
+ logger.info(f"Setting up data for stage: {stage}")
57
+ if stage == "fit" or stage is None:
58
+ full_train_dataset = datasets.MNIST(
59
+ root=self.data_dir, train=True, transform=self.transform
60
+ )
61
+ train_indices, _ = train_test_split(
62
+ range(len(full_train_dataset)),
63
+ train_size=self.train_subset_fraction,
64
+ random_state=42,
65
+ )
66
+ self.mnist_train = Subset(full_train_dataset, train_indices)
67
+
68
+ self.mnist_val = datasets.MNIST(
69
+ root=self.data_dir, train=False, transform=self.transform
70
+ )
71
+ logger.info(f"Loaded training subset: {len(self.mnist_train)} samples.")
72
+ logger.info(f"Loaded validation data: {len(self.mnist_val)} samples.")
73
+ if stage == "test" or stage is None:
74
+ self.mnist_test = datasets.MNIST(
75
+ root=self.data_dir, train=False, transform=self.transform
76
+ )
77
+ logger.info(f"Loaded test data: {len(self.mnist_test)} samples.")
78
+
79
+ def train_dataloader(self) -> DataLoader:
80
+ """
81
+ Returns the training DataLoader.
82
+
83
+ Returns:
84
+ DataLoader: Training data loader.
85
+ """
86
+ logger.info("Creating training DataLoader...")
87
+ return DataLoader(
88
+ self.mnist_train,
89
+ batch_size=self.batch_size,
90
+ shuffle=True,
91
+ num_workers=self.num_workers,
92
+ )
93
+
94
+ def val_dataloader(self) -> DataLoader:
95
+ """
96
+ Returns the validation DataLoader.
97
+
98
+ Returns:
99
+ DataLoader: Validation data loader.
100
+ """
101
+ logger.info("Creating validation DataLoader...")
102
+ return DataLoader(
103
+ self.mnist_val,
104
+ batch_size=self.batch_size,
105
+ shuffle=False,
106
+ num_workers=self.num_workers,
107
+ )
108
+
109
+ def test_dataloader(self) -> DataLoader:
110
+ """
111
+ Returns the test DataLoader.
112
+
113
+ Returns:
114
+ DataLoader: Test data loader.
115
+ """
116
+ logger.info("Creating test DataLoader...")
117
+ return DataLoader(
118
+ self.mnist_test,
119
+ batch_size=self.batch_size,
120
+ shuffle=False,
121
+ num_workers=self.num_workers,
122
+ )
src/model.py ADDED
@@ -0,0 +1,86 @@
1
+ import lightning as pl
2
+ import torch.nn as nn
3
+ import torch
4
+ from timm import create_model
5
+ from torchmetrics.classification import Accuracy
6
+ from torch.optim.lr_scheduler import StepLR
7
+ import torch.optim as optim
8
+ from loguru import logger
9
+
10
+ logger.add("logs/model.log", rotation="1 MB", level="INFO")
11
+
12
+
13
+ class LitEfficientNet(pl.LightningModule):
14
+ def __init__(
15
+ self,
16
+ model_name="tf_efficientnet_lite0",
17
+ num_classes=10,
18
+ lr=1e-3,
19
+ custom_loss=None,
20
+ ):
21
+ """
22
+ Initializes a CNN model from TIMM and integrates TorchMetrics.
23
+
24
+ Args:
25
+ model_name (str): TIMM model name (e.g., "tf_efficientnet_lite0").
26
+ num_classes (int): Number of output classes (e.g., 0–9 for MNIST).
27
+ lr (float): Learning rate for the optimizer.
28
+ custom_loss (callable, optional): Custom loss function. Defaults to CrossEntropyLoss.
29
+ """
30
+ super().__init__()
31
+
32
+ self.lr = lr
33
+ self.model = create_model(
34
+ model_name,
35
+ pretrained=True,
36
+ num_classes=num_classes,
37
+ in_chans=1, # Set to 1 channel for grayscale input
38
+ )
39
+ self.loss_fn = custom_loss or nn.CrossEntropyLoss()
40
+ self.train_acc = Accuracy(num_classes=num_classes, task="multiclass")
41
+ self.val_acc = Accuracy(num_classes=num_classes, task="multiclass")
42
+ self.test_acc = Accuracy(num_classes=num_classes, task="multiclass")
43
+ logger.info(f"Model initialized with TIMM backbone: {model_name}")
44
+ logger.info(f"Number of output classes: {num_classes}")
45
+
46
+ def forward(self, x):
47
+ """
48
+ Forward pass of the model.
49
+
50
+ Args:
51
+ x (torch.Tensor): Input tensor.
52
+
53
+ Returns:
54
+ torch.Tensor: Model predictions.
55
+ """
56
+ return self.model(x)
57
+
58
+ def training_step(self, batch, batch_idx):
59
+ x, y = batch
60
+ y_hat = self(x)
61
+ loss = self.loss_fn(y_hat, y)
62
+ self.train_acc.update(y_hat, y)
63
+ self.log("train_loss", loss, prog_bar=True, logger=True)
64
+ self.log("train_acc", self.train_acc, prog_bar=True, logger=True)
65
+ return loss
66
+
67
+ def validation_step(self, batch, batch_idx):
68
+ x, y = batch
69
+ y_hat = self(x)
70
+ loss = self.loss_fn(y_hat, y)
71
+ self.val_acc.update(y_hat, y)
72
+ self.log("val_loss", loss, prog_bar=True, logger=True)
73
+ self.log("val_acc", self.val_acc, prog_bar=True, logger=True)
74
+
75
+ def test_step(self, batch, batch_idx):
76
+ x, y = batch
77
+ y_hat = self(x)
78
+ self.test_acc.update(y_hat, y)
79
+ self.log("test_acc", self.test_acc, prog_bar=True, logger=True)
80
+
81
+ def configure_optimizers(self):
82
+ optimizer = optim.Adam(self.parameters(), lr=self.lr)
83
+ scheduler = StepLR(optimizer, step_size=1, gamma=0.9)
84
+ logger.info(f"Optimizer: Adam, Learning Rate: {self.lr}")
85
+ logger.info("Scheduler: StepLR with step_size=1 and gamma=0.9")
86
+ return [optimizer], [scheduler]
src/test.py ADDED
@@ -0,0 +1,115 @@
1
+ import torch
2
+ from loguru import logger
3
+ from src.model import LitEfficientNet
4
+ from src.dataloader import MNISTDataModule
5
+ from torchmetrics.classification import Accuracy
6
+ from pathlib import Path
7
+ from src.utils.aws_s3_services import S3Handler
8
+
9
+ # Configure Loguru to save logs to the logs/ directory
10
+ logger.add("logs/test.log", rotation="1 MB", level="INFO")
11
+
12
+
13
+ def infer(checkpoint_path, image):
14
+ """
15
+ Perform inference on a single image using the model checkpoint.
16
+
17
+ Args:
18
+ checkpoint_path (str): Path to the model checkpoint.
19
+ image (torch.Tensor): Image tensor to predict (shape: [1, 28, 28] for MNIST).
20
+
21
+ Returns:
22
+ int: Predicted class (0-9).
23
+ """
24
+ logger.info(f"Loading model from checkpoint: {checkpoint_path} for inference...")
25
+ if not Path(checkpoint_path).exists():
26
+ logger.error(f"Checkpoint not found: {checkpoint_path}")
27
+ raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")
28
+
29
+ # Detect device
30
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
31
+ logger.info(f"Inference will run on device: {device}")
32
+
33
+ # Load the model
34
+ model = LitEfficientNet.load_from_checkpoint(checkpoint_path).to(device)
35
+ model.eval()
36
+
37
+ # Perform inference
38
+ with torch.no_grad():
39
+ if image.dim() == 3:
40
+ image = image.unsqueeze(0) # Add batch dimension if needed
41
+ image = image.to(device) # Ensure the image is on the same device as the model
42
+ prediction = model(image)
43
+ predicted_class = torch.argmax(prediction, dim=1).item()
44
+
45
+ logger.info(f"Predicted class: {predicted_class}")
46
+ return predicted_class
47
+
48
+
49
+ def test_model(checkpoint_path):
50
+ """
51
+ Test the model using the test dataset and log metrics.
52
+
53
+ Args:
54
+ checkpoint_path (str): Path to the model checkpoint.
55
+
56
+ Returns:
57
+ float: Final test accuracy.
58
+ """
59
+ logger.info(f"Loading model from checkpoint: {checkpoint_path} for testing...")
60
+ if not Path(checkpoint_path).exists():
61
+ logger.error(f"Checkpoint not found: {checkpoint_path}")
62
+ raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")
63
+
64
+ # Detect device
65
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
66
+ logger.info(f"Testing will run on device: {device}")
67
+
68
+ # Load the model
69
+ model = LitEfficientNet.load_from_checkpoint(checkpoint_path).to(device)
70
+ model.eval()
71
+
72
+ # Set up data module and load test data
73
+ data_module = MNISTDataModule()
74
+ data_module.setup(stage="test")
75
+ test_loader = data_module.test_dataloader()
76
+
77
+ # Initialize accuracy metric
78
+ test_acc = Accuracy(num_classes=10, task="multiclass").to(device)
79
+
80
+ # Evaluate model on test data
81
+ logger.info("Evaluating on test dataset...")
82
+ with torch.no_grad():
83
+ for images, labels in test_loader:
84
+ images, labels = images.to(device), labels.to(
85
+ device
86
+ ) # Move data to the same device
87
+ outputs = model(images)
88
+ test_acc.update(outputs, labels)
89
+
90
+ accuracy = test_acc.compute().item()
91
+ logger.info(f"Final Test Accuracy (TorchMetrics): {accuracy:.2%}")
92
+ return accuracy
93
+
94
+
95
+ if __name__ == "__main__":
96
+
97
+ # downloading from s3
98
+ s3_handler = S3Handler(bucket_name="deep-bucket-s3")
99
+ s3_handler.download_folder(
100
+ "checkpoints_test",
101
+ "checkpoints",
102
+ )
103
+ checkpoint_path = "./checkpoints/best_model.ckpt"
104
+ try:
105
+ # Perform testing
106
+ test_accuracy = test_model(checkpoint_path)
107
+ logger.info(f"Test completed successfully with accuracy: {test_accuracy:.2%}")
108
+
109
+ # Example inference
110
+ logger.info("Running inference on a single test image...")
111
+ dummy_image = torch.randn(1, 28, 28) # Replace with actual test image
112
+ predicted_class = infer(checkpoint_path, dummy_image)
113
+ logger.info(f"Inference result: Predicted class {predicted_class}")
114
+ except Exception as e:
115
+ logger.error(f"An error occurred: {e}")
src/train.py ADDED
@@ -0,0 +1,105 @@
1
+ import lightning as pl
2
+ from lightning.pytorch.callbacks import (
3
+ ModelCheckpoint,
4
+ EarlyStopping,
5
+ LearningRateMonitor,
6
+ RichProgressBar,
7
+ )
8
+ from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger
9
+ from lightning.pytorch.callbacks import ModelSummary
10
+ from src.dataloader import MNISTDataModule
11
+ from src.model import LitEfficientNet
12
+ from loguru import logger
13
+ import os
14
+ from src.utils.aws_s3_services import S3Handler
15
+
16
+ # Ensure the logs directory exists
17
+ os.makedirs("logs", exist_ok=True)
18
+
19
+ # Configure Loguru for logging
20
+ logger.add("logs/training.log", rotation="1 MB", level="INFO")
21
+
22
+
23
+ def main():
24
+ """
25
+ Main training loop for the model with advanced configuration (CPU training).
26
+ """
27
+ # Data Module
28
+ logger.info("Setting up data module...")
29
+ data_module = MNISTDataModule(batch_size=256)
30
+
31
+ # Model
32
+ logger.info("Setting up model...")
33
+ model = LitEfficientNet(model_name="tf_efficientnet_lite0", num_classes=10, lr=1e-3)
34
+ logger.info(model)
35
+
36
+ # Callbacks
37
+ logger.info("Setting up callbacks...")
38
+ checkpoint_callback = ModelCheckpoint(
39
+ monitor="val_acc",
40
+ dirpath="checkpoints/",
41
+ filename="best_model",
42
+ save_top_k=1,
43
+ mode="max",
44
+ auto_insert_metric_name=False,
45
+ verbose=True,
46
+ save_last=True,
47
+ enable_version_counter=False,
48
+ )
49
+ early_stopping_callback = EarlyStopping(
50
+ monitor="val_acc",
51
+ patience=5, # Extended patience for advanced models
52
+ mode="max",
53
+ verbose=True,
54
+ )
55
+ lr_monitor = LearningRateMonitor(logging_interval="epoch") # Log learning rate
56
+ rich_progress = RichProgressBar()
57
+ model_summary = ModelSummary(
58
+ max_depth=1
59
+ ) # Show only the first level of model layers
60
+
61
+ # Loggers
62
+ logger.info("Setting up loggers...")
63
+ csv_logger = CSVLogger("logs/", name="mnist_csv")
64
+ tb_logger = TensorBoardLogger("logs/", name="mnist_tb")
65
+
66
+ # Trainer Configuration for CPU
67
+ logger.info("Setting up trainer...")
68
+ trainer = pl.Trainer(
69
+ max_epochs=2,
70
+ callbacks=[
71
+ checkpoint_callback,
72
+ early_stopping_callback,
73
+ lr_monitor,
74
+ rich_progress,
75
+ model_summary,
76
+ ],
77
+ logger=[csv_logger, tb_logger],
78
+ deterministic=True,
79
+ accelerator="auto",
80
+ devices="auto",
81
+ )
82
+
83
+ # Train the model
84
+ logger.info("Training the model...")
85
+ trainer.fit(model, datamodule=data_module)
86
+
87
+ # Test the model
88
+ logger.info("Testing the model...")
89
+ data_module.setup(stage="test")
90
+ trainer.test(model, datamodule=data_module)
91
+
92
+ # write a checkpoints/train_done.flag
93
+ with open("checkpoints/train_done.flag", "w") as f:
94
+ f.write("Training done.")
95
+
96
+ # upload checkpoints to S3
97
+ s3_handler = S3Handler(bucket_name="deep-bucket-s3")
98
+ s3_handler.upload_folder(
99
+ "checkpoints",
100
+ "checkpoints_test",
101
+ )
102
+
103
+
104
+ if __name__ == "__main__":
105
+ main()
src/utils/aws_s3_services.py ADDED
@@ -0,0 +1,88 @@
1
+ import boto3
2
+ import os
3
+ from pathlib import Path
4
+ from dotenv import load_dotenv, find_dotenv
5
+
6
+ # Load environment variables from .env file
7
+ load_dotenv(find_dotenv(".env"))
8
+
9
+
10
+ class S3Handler:
11
+ def __init__(self, bucket_name):
12
+ self.bucket_name = bucket_name
13
+ self.s3 = boto3.client(
14
+ "s3",
15
+ aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
16
+ aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
17
+ region_name=os.getenv("AWS_REGION"),
18
+ )
19
+
20
+ def upload_folder(self, source_folder, dest_folder, filenames=None):
21
+ """
22
+ Upload specified files or all files from a local folder to an S3 folder.
23
+
24
+ Args:
25
+ source_folder (str): Local source folder path.
26
+ dest_folder (str): Destination folder path in S3.
27
+ filenames (list): List of filenames to upload (relative to source_folder). If None, uploads all files.
28
+ """
29
+ source_folder = Path(source_folder)
30
+
31
+ # Select files based on filenames list or all files if filenames is None
32
+ files_to_upload = (
33
+ [source_folder / file for file in filenames]
34
+ if filenames
35
+ else list(source_folder.rglob("*"))
36
+ )
37
+
38
+ for file_path in files_to_upload:
39
+ if file_path.is_file():
40
+ s3_path = f"{dest_folder}/{file_path.relative_to(source_folder)}"
41
+ self.s3.upload_file(str(file_path), self.bucket_name, s3_path)
42
+ print(f"Uploaded: {file_path} to {s3_path}")
43
+ else:
44
+ print(f"File not found: {file_path}")
45
+
46
+ def download_folder(self, s3_folder, dest_folder):
47
+ """
48
+ Download all files from an S3 folder to a local folder.
49
+
50
+ Args:
51
+ s3_folder (str): Source folder in S3.
52
+ dest_folder (str): Local destination folder path.
53
+ """
54
+ dest_folder = Path(dest_folder).resolve()
55
+ paginator = self.s3.get_paginator("list_objects_v2")
56
+
57
+ for page in paginator.paginate(Bucket=self.bucket_name, Prefix=s3_folder):
58
+ for obj in page.get("Contents", []):
59
+ s3_path = obj["Key"]
60
+ # Skip folder itself if returned by S3
61
+ if s3_path.endswith("/"):
62
+ continue
63
+
64
+ # Compute relative path and local destination
65
+ relative_path = Path(s3_path[len(s3_folder) :].lstrip("/"))
66
+ local_path = dest_folder / relative_path
67
+
68
+ # Create necessary local directories
69
+ local_path.parent.mkdir(parents=True, exist_ok=True)
70
+
71
+ # Download file
72
+ self.s3.download_file(self.bucket_name, s3_path, str(local_path))
73
+ print(f"Downloaded: {s3_path} to {local_path}")
74
+
75
+
76
+ # Usage Example
77
+ if __name__ == "__main__":
78
+ # Initialize with bucket name
79
+ s3_handler = S3Handler(bucket_name="deep-bucket-s3")
80
+
81
+ # Upload specific files
82
+ s3_handler.upload_folder(
83
+ "checkpoints_test",
84
+ "checkpoints_test",
85
+ )
86
+
87
+ # Download example
88
+ s3_handler.download_folder("checkpoints_test", "checkpoints_test")