soutrik committed · Commit c3d82b0 · Parent(s): orphan branch
Browse files
- .gitattributes +4 -0
- .github/workflows/deployment.yaml +113 -0
- .github/workflows/deployment_advanced.yaml +130 -0
- .github/workflows/do-the-job.yml +66 -0
- .github/workflows/hf_deploy.yaml +79 -0
- .github/workflows/main_cd.yml +131 -0
- .gitignore +128 -0
- .gradio/certificate.pem +31 -0
- .project-root +0 -0
- Dockerfile +54 -0
- README.md +286 -0
- app.py +95 -0
- docker-compose.yaml +63 -0
- main.py +3 -0
- poetry.lock +0 -0
- pyproject.toml +91 -0
- requirements.txt +28 -0
- src/__init__.py +0 -0
- src/dataloader.py +122 -0
- src/model.py +86 -0
- src/test.py +115 -0
- src/train.py +105 -0
- src/utils/aws_s3_services.py +88 -0
.gitattributes
ADDED
@@ -0,0 +1,4 @@
```text
*.ckpt filter=lfs diff=lfs merge=lfs -text
checkpoints/best_model.ckpt filter=lfs diff=lfs merge=lfs -text
checkpoints/last.ckpt filter=lfs diff=lfs merge=lfs -text
checkpoints/*.ckpt filter=lfs diff=lfs merge=lfs -text
```
.github/workflows/deployment.yaml
ADDED
@@ -0,0 +1,113 @@
```yaml
name: Deploy PyTorch Training with EC2 Runner and Docker Compose

on:
  push:
    branches:
      - master

  workflow_dispatch:

jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-044b0717aadbc9dfa
          ec2-instance-type: t2.xlarge
          subnet-id: subnet-024811dee81325f1c
          security-group-id: sg-0646c2a337a355a31

  deploy:
    name: Deploy PyTorch Training Pipeline
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Log in to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      - name: Create .env file
        run: |
          echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
          echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
          echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env

      - name: Run Docker Compose for train and eval service
        run: |
          docker-compose stop
          docker-compose up --build
          docker-compose logs --follow
          docker-compose down --remove-orphans

      - name: Build, tag, and push Docker image to Amazon ECR
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
          docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
          docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
          docker push $REGISTRY/$REPOSITORY:latest

      - name: Pull Docker image from ECR and verify
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
          docker images | grep "$REGISTRY/$REPOSITORY"

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - deploy
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
```
.github/workflows/deployment_advanced.yaml
ADDED
@@ -0,0 +1,130 @@
```yaml
name: Deploy PyTorch Training with EC2 Runner and Docker Compose with Advanced Deployment

on:
  push:
    branches:
      - master

jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-044b0717aadbc9dfa
          ec2-instance-type: t2.xlarge
          subnet-id: subnet-024811dee81325f1c
          security-group-id: sg-0646c2a337a355a31

  deploy:
    name: Deploy PyTorch Training Pipeline
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Cache Docker layers
        uses: actions/cache@v3
        with:
          path: /tmp/.buildx-cache
          key: ${{ runner.os }}-docker-${{ github.sha }}
          restore-keys: |
            ${{ runner.os }}-docker-

      - name: Log in to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      - name: Create .env file
        run: |
          echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
          echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
          echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
          echo "::add-mask::${{ secrets.AWS_ACCESS_KEY_ID }}"
          echo "::add-mask::${{ secrets.AWS_SECRET_ACCESS_KEY }}"

      - name: Run Docker Compose for all services
        run: |
          docker-compose build --no-cache
          docker-compose up -d
          docker-compose logs --follow train eval
          docker-compose down --remove-orphans

      - name: Build, tag, and push Docker image to Amazon ECR
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
          docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
          docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
          docker push $REGISTRY/$REPOSITORY:latest

      - name: Pull Docker image from ECR and verify
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
          docker images | grep "$REGISTRY/$REPOSITORY"

      - name: Clean up environment
        run: |
          rm -f .env
          docker system prune -af --volumes

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - deploy
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

      - name: Validate EC2 termination
        run: |
          aws ec2 describe-instances --instance-ids ${{ needs.start-runner.outputs.ec2-instance-id }} \
            --query "Reservations[].Instances[].State.Name" --output text | grep "terminated" || echo "Runner not terminated."
```
.github/workflows/do-the-job.yml
ADDED
@@ -0,0 +1,66 @@
```yaml
name: do-the-job
on:
  push:
    branches:
      - master

  workflow_dispatch:

jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-044b0717aadbc9dfa
          ec2-instance-type: t2.xlarge
          subnet-id: subnet-024811dee81325f1c
          security-group-id: sg-0646c2a337a355a31

  do-the-job:
    name: Do the job on the runner
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    steps:
      - name: Clone the repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Run custom command
        run: echo 'Hello World!'

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - do-the-job
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
```
.github/workflows/hf_deploy.yaml
ADDED
@@ -0,0 +1,79 @@
```yaml
name: Sync to Hugging Face Hub for Gradio App MNIST Classifier # this is not working due to lfs issue

on:
  push:
    branches:
      - master

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true

      - name: Install Git LFS
        run: |
          curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
          sudo apt-get install git-lfs
          git lfs install

      - name: Configure Git identity
        run: |
          git config --global user.name "soutrik"
          git config --global user.email "soutrik.chowdhury@ab-inbev.com"

      - name: Add remote
        run: |
          git remote add space https://$USER:$HF_TOKEN@huggingface.co/spaces/$USER/$SPACE
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          USER: soutrik
          SPACE: gradio_demo_MNIST_Classifier

      # # Track individual files with LFS
      # - name: Track last.ckpt with Git LFS
      #   run: |
      #     git lfs track "checkpoints/last.ckpt"
      #     git add .gitattributes
      #     git commit -m "Track last.ckpt with Git LFS" || echo "Skip commit if no changes"

      # - name: Track best_model.ckpt with Git LFS
      #   run: |
      #     git lfs track "checkpoints/best_model.ckpt"
      #     git add .gitattributes
      #     git commit -m "Track best_model.ckpt with Git LFS" || echo "Skip commit if no changes"

      # Ensure LFS objects are checked out
      - name: Ensure LFS objects are present
        run: git lfs checkout

      - name: Add README.md
        run: |
          cat <<EOF > README.md
          ---
          title: My Gradio App MNIST Classifier
          emoji: 🚀
          colorFrom: blue
          colorTo: green
          sdk: gradio
          sdk_version: "5.7.1"
          app_file: app.py
          pinned: false
          ---
          EOF
          git add README.md
          git commit -m "Add README.md" || echo "Skip commit if no changes"

      - name: Push to hub
        run: |
          git push --force https://$USER:$HF_TOKEN@huggingface.co/spaces/$USER/$SPACE main
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
          USER: soutrik
          SPACE: gradio_demo_MNIST_Classifier
```
.github/workflows/main_cd.yml
ADDED
@@ -0,0 +1,131 @@
```yaml
name: Deploy PyTorch Training with all advanced features like self-hosted EC2 runner, Docker Buildx, Amazon ECR, Hugging Face Spaces

on:
  push:
    branches:
      - master
  workflow_dispatch:

jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-044b0717aadbc9dfa
          ec2-instance-type: t2.xlarge
          subnet-id: subnet-024811dee81325f1c
          security-group-id: sg-0646c2a337a355a31

  deploy:
    name: Deploy PyTorch Training Pipeline
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Cache Docker layers
        uses: actions/cache@v3
        with:
          path: /tmp/.buildx-cache
          key: ${{ runner.os }}-docker-${{ github.sha }}
          restore-keys: |
            ${{ runner.os }}-docker-

      - name: Log in to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      - name: Create .env file
        run: |
          echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
          echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
          echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
          echo "::add-mask::${{ secrets.AWS_ACCESS_KEY_ID }}"
          echo "::add-mask::${{ secrets.AWS_SECRET_ACCESS_KEY }}"

      - name: Run Docker Compose for all services
        run: |
          docker-compose build --no-cache
          docker-compose up -d
          docker-compose logs --follow train eval
          docker-compose down --remove-orphans

      - name: Build, tag, and push Docker image to Amazon ECR
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
          docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
          docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
          docker push $REGISTRY/$REPOSITORY:latest

      - name: Pull Docker image from ECR and verify
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
          docker images | grep "$REGISTRY/$REPOSITORY"

      - name: Clean up environment
        run: |
          rm -f .env
          docker system prune -af --volumes

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - deploy
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

      - name: Validate EC2 termination
        run: |
          aws ec2 describe-instances --instance-ids ${{ needs.start-runner.outputs.ec2-instance-id }} \
            --query "Reservations[].Instances[].State.Name" --output text | grep "terminated" || echo "Runner not terminated."
```
.gitignore
ADDED
@@ -0,0 +1,128 @@
```text
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyderworkspace

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# PyTorch
*.pt
*.pth
logs/
data/
checkpoints/
checkpoints/*
checkpoints/best_model.ckpt
checkpoints/last.ckpt
```
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
```text
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
```
.project-root
ADDED
File without changes
Dockerfile
ADDED
@@ -0,0 +1,54 @@
```dockerfile
# Stage 1: Build environment with Poetry and dependencies
FROM python:3.10.15-slim as builder

LABEL maintainer="Soutrik soutrik1991@gmail.com" \
      description="Docker image for running a Python app with dependencies managed by Poetry."

# Install Poetry and necessary system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends curl && \
    curl -sSL https://install.python-poetry.org | python3 - && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Add Poetry to the PATH explicitly
ENV PATH="/root/.local/bin:$PATH"

# Set the working directory to /app
WORKDIR /app

# Copy pyproject.toml and poetry.lock to install dependencies
COPY pyproject.toml poetry.lock /app/

# Configure Poetry environment
ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

# Install dependencies without installing the package itself
RUN --mount=type=cache,target=/tmp/poetry_cache poetry install --only main --no-root

# Additional steps: Uninstall and re-add cryptography
RUN poetry run pip uninstall -y cryptography && \
    poetry add cryptography --lock

# Stage 2: Runtime environment
FROM python:3.10.15-slim as runner

# Install curl for health check script
RUN apt-get update && apt-get install -y --no-install-recommends curl && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Copy application source code and necessary files
COPY src /app/src
COPY main.py /app/main.py

# Copy virtual environment from the builder stage
COPY --from=builder /app/.venv /app/.venv

# Set the working directory to /app
WORKDIR /app

# Set the environment path to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Default command
CMD ["python", "-m", "main"]
```
README.md
ADDED
@@ -0,0 +1,286 @@
````markdown
---
title: My Gradio App MNIST Classifier
emoji: 🚀
colorFrom: blue
colorTo: green
sdk: gradio
sdk_version: "5.7.1"
app_file: app.py
pinned: false
---

# aws_ec2_automation
Here's a detailed explanation of the GitHub Actions (GHA) pipeline:

---

# GitHub Actions Pipeline Documentation

## Name: Deploy PyTorch Training with EC2 Runner and Docker Compose

This pipeline automates the following tasks:
1. Starts an EC2 instance as a self-hosted GitHub runner.
2. Deploys a PyTorch training pipeline using Docker Compose.
3. Builds, tags, and pushes Docker images to Amazon ECR.
4. Stops the EC2 instance after the job is completed.

---

### Workflow Triggers

```yaml
on:
  push:
    branches:
      - main
```

- **Trigger**: This workflow runs whenever a push is made to the `main` branch.

---

## Jobs Overview

### 1. **start-runner**
Starts a self-hosted EC2 runner using the GitHub Actions Runner.

#### Steps:
1. **Configure AWS Credentials**:
   ```yaml
   - name: Configure AWS credentials
     uses: aws-actions/configure-aws-credentials@v4
     with:
       aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
       aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
       aws-region: ${{ secrets.AWS_REGION }}
   ```
   - Authenticates with AWS using access keys and the region specified in the secrets.
   - Required for creating and managing the EC2 instance.

2. **Start EC2 Runner**:
   ```yaml
   - name: Start EC2 runner
     id: start-ec2-runner
     uses: machulav/ec2-github-runner@v2
     with:
       mode: start
       github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
       ec2-image-id: ami-044b0717aadbc9dfa
       ec2-instance-type: t2.xlarge
       subnet-id: subnet-024811dee81325f1c
       security-group-id: sg-0646c2a337a355a31
   ```
   - Starts an EC2 instance with the specified AMI, instance type, subnet, and security group.
   - Outputs:
     - `label`: A unique label for the EC2 runner.
     - `ec2-instance-id`: The ID of the created EC2 instance.

---

### 2. **deploy**
Deploys the PyTorch training pipeline using the EC2 runner started in the previous step.

#### Dependencies:
```yaml
needs: start-runner
runs-on: ${{ needs.start-runner.outputs.label }}
```
- **Depends on** the `start-runner` job and runs on the newly created EC2 instance.

#### Steps:
1. **Checkout Repository**:
   ```yaml
   - name: Checkout repository
     uses: actions/checkout@v4
   ```
   - Clones the current repository to the runner.

2. **Set Up Docker Buildx**:
   ```yaml
   - name: Set up Docker Buildx
     uses: docker/setup-buildx-action@v3
   ```
   - Configures Docker Buildx for building multi-platform Docker images.

3. **Configure AWS Credentials**:
   ```yaml
   - name: Configure AWS credentials
     uses: aws-actions/configure-aws-credentials@v4
     with:
       aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
       aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
       aws-region: ${{ secrets.AWS_REGION }}
   ```
   - Reconfigures AWS credentials for Docker ECR authentication and resource management.

4. **Log in to Amazon ECR**:
   ```yaml
   - name: Log in to Amazon ECR
     id: login-ecr
     uses: aws-actions/amazon-ecr-login@v2
   ```
   - Logs into Amazon ECR for pushing and pulling Docker images.

5. **Create `.env` File**:
   ```yaml
   - name: Create .env file
     run: |
       echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
       echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
       echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
   ```
   - Generates a `.env` file for the application with AWS credentials and region.

6. **Run Docker Compose for Train and Eval Services**:
   ```yaml
   - name: Run Docker Compose for train and eval service
     run: |
       docker-compose build
       docker-compose up --build
       docker-compose logs --follow
       docker-compose down --remove-orphans
   ```
   - **Build**: Builds all services defined in the `docker-compose.yml` file.
   - **Up**: Runs all services, including training and evaluation.
   - **Logs**: Outputs logs for debugging purposes.
   - **Down**: Stops all services and removes orphaned containers.

7. **Build, Tag, and Push Docker Image to Amazon ECR**:
   ```yaml
   - name: Build, tag, and push Docker image to Amazon ECR
     env:
       REGISTRY: ${{ steps.login-ecr.outputs.registry }}
       REPOSITORY: soutrik71/mnist
       IMAGE_TAG: ${{ github.sha }}
     run: |
       docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
       docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
       docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
       docker push $REGISTRY/$REPOSITORY:latest
   ```
   - **Build**: Creates a Docker image with the repository and tag.
   - **Push**: Pushes the image to Amazon ECR.
   - **Tag**: Updates the `latest` tag.

8. **Pull and Verify Docker Image from ECR**:
   ```yaml
   - name: Pull Docker image from ECR and verify
     env:
       REGISTRY: ${{ steps.login-ecr.outputs.registry }}
       REPOSITORY: soutrik71/mnist
       IMAGE_TAG: ${{ github.sha }}
     run: |
       docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
       docker images | grep "$REGISTRY/$REPOSITORY"
   ```
   - **Pull**: Pulls the built image from ECR.
   - **Verify**: Ensures the image exists locally.

9. **Clean Up Environment**:
   ```yaml
   - name: Clean up environment
     run: |
       rm -f .env
       docker system prune -af
   ```
   - Deletes the `.env` file and removes unused Docker resources.

---

### 3. **stop-runner**
Stops and terminates the EC2 runner created in the `start-runner` job.

#### Dependencies:
```yaml
needs:
  - start-runner
  - deploy
```

#### Steps:
1. **Configure AWS Credentials**:
   ```yaml
   - name: Configure AWS credentials
     uses: aws-actions/configure-aws-credentials@v4
     with:
       aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
       aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
       aws-region: ${{ secrets.AWS_REGION }}
   ```

2. **Stop EC2 Runner**:
   ```yaml
   - name: Stop EC2 runner
     uses: machulav/ec2-github-runner@v2
     with:
       mode: stop
       github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
       label: ${{ needs.start-runner.outputs.label }}
       ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
   ```
   - Stops the EC2 runner instance created in the first job.

3. **Validate EC2 Termination**:
   ```yaml
   - name: Validate EC2 termination
     run: aws ec2 describe-instances --instance-ids ${{ needs.start-runner.outputs.ec2-instance-id }}
   ```
   - Ensures the EC2 instance has been properly terminated.

---

### Key Highlights
1. **Sequential Execution**:
   - The `start-runner`, `deploy`, and `stop-runner` jobs are executed sequentially.

2. **Error Handling**:
   - The `stop-runner` job runs even if previous jobs fail (`if: ${{ always() }}`).

3. **Efficiency**:
   - Docker layer caching speeds up builds.
   - Cleanup steps maintain a clean environment.

4. **Security**:
   - Secrets are masked and removed after use.
   - Proper resource cleanup ensures cost efficiency.

---

This pipeline ensures robust deployment with error handling, logging, and cleanup mechanisms. So far we have covered the GitHub Actions pipeline, its basic structure, and the steps involved.
Next, we build an interdependent pipeline in which the output of one job is used as input for the next.

---
## Advanced Pipeline
* Sequential Flow: Each job has clear dependencies, ensuring no step runs out of order.
* Code Checkout: Explicit repository checkout in each job ensures consistent source code.
* Secure Credential Handling: Sensitive credentials are masked and stored securely.
* Resource Cleanup: Includes Docker clean-up and EC2 instance termination validation.
* Logging: Added detailed logs to improve debugging and monitoring.

### Step 1: Start EC2 Runner
- **Purpose**: Initializes a self-hosted EC2 runner for running subsequent jobs.
- **Key Actions**:
  - Configures AWS credentials.
  - Launches an EC2 instance using the specified AMI, instance type, and networking configuration.
  - Outputs the runner label and instance ID for downstream jobs.

### Step 2: Test PyTorch Code Using Docker Compose
- **Purpose**: Tests the PyTorch training and evaluation services.
- **Key Actions**:
  - Checks out the repository.
  - Sets up Docker Buildx for advanced build capabilities.
  - Configures AWS credentials and creates a masked `.env` file for secure credential sharing.
  - Runs all services (train, eval) using Docker Compose, monitors logs, and cleans up containers.

### Step 3: Build, Tag, and Push Docker Image
- **Purpose**: Builds a Docker image, tags it, and pushes it to Amazon ECR after successful tests.
- **Key Actions**:
  - Checks out the repository again to ensure consistency.
  - Logs into Amazon ECR using AWS credentials.
  - Builds and tags the Docker image with `latest` and SHA-based tags.
  - Pushes the image to Amazon ECR and verifies it by pulling it back.

### Step 4: Stop and Delete EC2 Runner
- **Purpose**: Stops and terminates the EC2 instance to ensure cost efficiency and cleanup.
- **Key Actions**:
  - Configures AWS credentials.
  - Stops the EC2 instance using the label and instance ID from `start-runner`.
  - Validates the termination state of the EC2 instance to ensure proper cleanup.
````
app.py
ADDED
@@ -0,0 +1,95 @@
```python
import gradio as gr
import torch
import torchvision.transforms as transforms
from PIL import Image
from pathlib import Path
from loguru import logger
from src.model import LitEfficientNet
from src.utils.aws_s3_services import S3Handler

# Configure Loguru for logging
logger.add("logs/inference.log", rotation="1 MB", level="INFO")


class MNISTClassifier:
    def __init__(self, checkpoint_path="./checkpoints/best_model.ckpt"):
        self.checkpoint_path = checkpoint_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Inference will run on device: {self.device}")

        # Load the model
        self.model = self.load_model()
        self.model.eval()

        # Define transforms
        self.transform = transforms.Compose(
            [
                transforms.Resize((28, 28)),
                transforms.ToTensor(),
                transforms.Normalize((0.5,), (0.5,)),
            ]
        )
        self.labels = [str(i) for i in range(10)]  # MNIST labels are 0-9

    def load_model(self):
        """
        Loads the model checkpoint for inference.
        """
        if not Path(self.checkpoint_path).exists():
            logger.error(f"Checkpoint not found: {self.checkpoint_path}")
            raise FileNotFoundError(f"Checkpoint not found: {self.checkpoint_path}")

        logger.info(f"Loading model from checkpoint: {self.checkpoint_path}")
        return LitEfficientNet.load_from_checkpoint(self.checkpoint_path).to(
            self.device
        )

    @torch.no_grad()
    def predict(self, image):
        """
        Perform inference on a single image.

        Args:
            image: Input image in PIL format.

        Returns:
            dict: Predicted class probabilities.
        """
        if image is None:
            logger.error("No image provided for prediction.")
            return None

        # Convert to tensor and preprocess
        img_tensor = self.transform(image).unsqueeze(0).to(self.device)

        # Perform inference
        output = self.model(img_tensor)
        probabilities = torch.nn.functional.softmax(output[0], dim=0)

        # Map probabilities to labels
        return {self.labels[idx]: float(prob) for idx, prob in enumerate(probabilities)}


# Instantiate the classifier
checkpoint_path = "./checkpoints/best_model.ckpt"

# Download checkpoint from S3 (if needed)
s3_handler = S3Handler(bucket_name="deep-bucket-s3")
s3_handler.download_folder(
    "checkpoints_test",
    "checkpoints",
)

classifier = MNISTClassifier(checkpoint_path=checkpoint_path)

# Define Gradio interface
demo = gr.Interface(
    fn=classifier.predict,
    inputs=gr.Image(height=160, width=160, image_mode="L", type="pil"),
    outputs=gr.Label(num_top_classes=1),
    title="MNIST Classifier",
    description="Upload a handwritten digit image to classify it (0-9).",
)

if __name__ == "__main__":
    demo.launch(share=True)
```
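As a rough illustration of how `MNISTClassifier.predict` can be exercised outside the Gradio UI, here is a minimal sketch. It assumes the checkpoint already exists at `./checkpoints/best_model.ckpt`; note that importing `app` also runs its module-level setup (S3 download and Gradio `Interface` construction).

```python
# Rough smoke test for MNISTClassifier.predict using one MNIST test digit.
# Importing `app` executes its module-level setup (S3 download, Gradio Interface).
from torchvision import datasets

from app import MNISTClassifier

classifier = MNISTClassifier(checkpoint_path="./checkpoints/best_model.ckpt")

# datasets.MNIST yields (PIL.Image, label) pairs, which matches predict()'s input.
sample_image, label = datasets.MNIST(root="./data", train=False, download=True)[0]
probs = classifier.predict(sample_image)  # dict mapping "0".."9" -> probability

top_digit = max(probs, key=probs.get)
print(f"true label: {label}  predicted: {top_digit}  p={probs[top_digit]:.3f}")
```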
docker-compose.yaml
ADDED
@@ -0,0 +1,63 @@
```yaml
services:
  train:
    build:
      context: .
    command: |
      python -m src.train && \
      touch ./checkpoints/train_done.flag
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./logs:/app/logs
    environment:
      - PYTHONUNBUFFERED=1
      - PYTHONPATH=/app
      - NUM_WORKERS=4 # Set the number of workers
    shm_size: '4g'
    deploy:
      resources:
        limits:
          memory: 8g # Limit to 8GB RAM
          cpus: '4.0' # Use up to 4 CPU cores
        reservations:
          memory: 6g # Reserve 6GB RAM
          cpus: '4.0' # Reserve 4 CPU cores
    networks:
      - default
    env_file:
      - .env

  eval:
    build:
      context: .
    command: |
      sh -c 'while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done && python -m src.test'
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./logs:/app/logs
    environment:
      - PYTHONUNBUFFERED=1
      - PYTHONPATH=/app
      - NUM_WORKERS=2 # Set the number of workers
    shm_size: '4g'
    deploy:
      resources:
        limits:
          memory: 4g # Limit to 4GB RAM
          cpus: '4.0' # Use up to 4 CPU cores
        reservations:
          memory: 2g # Reserve 2GB RAM
          cpus: '2' # Reserve 2 CPU cores
    networks:
      - default
    env_file:
      - .env

volumes:
  data:
  checkpoints:
  logs:

networks:
  default:
```
main.py
ADDED
@@ -0,0 +1,3 @@
```python
import os

print(os.getcwd())
```
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
pyproject.toml
ADDED
@@ -0,0 +1,91 @@
```toml
[tool.poetry]
name = "pytorch_fastapi_project"
version = "0.1.0"
description = "Consolidated PyTorch and FastAPI project for AWS deployment and GHA testing"
authors = ["soutrik71 <soutrik.chowdhury@ab-inbev.com>"]
license = "Apache-2.0"
readme = "README.md"

[tool.poetry.dependencies]
python = "3.10.15"
black = "24.8.0"
coverage = ">=7.6.1"
hydra-colorlog = "1.2.0"
hydra-core = "1.3.2"
lightning = {version = "2.4.0", extras = ["extra"]}
loguru = "0.7.2"
pytest = "^8.3.3"
rich = "13.8.1"
rootutils = "1.0.7"
tensorboard = "2.17.1"
timm = "1.0.9"
pandas = "^2.2.3"
numpy = "^1.26.0"
ruff = "*"
torch = {version = "^2.4.1+cpu", source = "pytorch_cpu"}
torchvision = {version = "^0.19.1+cpu", source = "pytorch_cpu"}
seaborn = "^0.13.2"
pydantic = "^2.9.2"
kaggle = "^1.6.17"
pytest-cov = "^5.0.0"
pytest-mock = "^3.14.0"
flake8 = "^7.1.1"
dvc-gdrive = "^3.0.1"
dvc-azure = "^3.1.0"
transformers = "^4.45.2"
fastapi = "^0.115.4"
pydantic-settings = "^2.6.1"
uvicorn = "^0.32.0"
tenacity = "^9.0.0"
gunicorn = "^23.0.0"
aim = "^3.25.0"
mlflow = "^2.17.1"
hydra-optuna-sweeper = "^1.2.0"
dvc = "^3.56.0"
platformdirs = "3.10"
fastapi-utils = "^0.7.0"
httpx = "^0.27.2"
typing-inspect = "^0.9.0"
requests = "^2.32.3"
fastapi-restful = {extras = ["all"], version = "^0.6.0"}
aioredis = "^2.0.1"
psycopg2-binary = "^2.9.10"
asyncpg = "^0.30.0"
confluent-kafka = "^2.6.0"
aiokafka = "^0.12.0"
azure-servicebus = "^7.12.3"
aiohttp = "^3.10.10"
aiofiles = "*"
aiologger = "^0.7.0"
pyyaml = "^6.0.2"
sqlalchemy-utils = "^0.41.2"
sqlalchemy = "^2.0.36"
alembic = "^1.13.3"
fastapi-limiter = "^0.1.6"
redis = "5.0.8"
redisearch = "2.0.0"
python-multipart = "*"
python-dotenv = "^1.0.1"
celery = "^5.4.0"
fastapi-cache2 = "^0.2.2"
aiocache = "^0.12.3"
dvc-s3 = "^3.2.0"
litserve = "^0.2.4"
gpustat = "^1.1.1"
nvitop = "^1.3.2"
pyopenssl = "^23.0.0"
cryptography = "^41.0.0"
accelerate = "^1.1.1"
gradio = "5.7.1"

[tool.poetry.dev-dependencies]
pytest-asyncio = "^0.20.3"

[[tool.poetry.source]]
name = "pytorch_cpu"
url = "https://download.pytorch.org/whl/cpu"
priority = "explicit"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
```
requirements.txt
ADDED
@@ -0,0 +1,28 @@
```text
torch==2.4.1
torchvision==0.19.1
hydra-colorlog==1.2.0
hydra-core==1.3.2
lightning[extra]==2.4.0
loguru==0.7.2
rich==13.8.1
rootutils==1.0.7
tensorboard==2.17.1
timm==1.0.9
pandas>=2.2.3
numpy>=1.26.0
transformers>=4.45.2
aim>=3.25.0
mlflow>=2.17.1
hydra-optuna-sweeper>=1.2.0
aiologger>=0.7.0
pyyaml>=6.0.2
dvc-s3>=3.2.0
litserve>=0.2.4
gpustat>=1.1.1
nvitop>=1.3.2
gradio==5.7.1
gradio-client>=1.5.0
accelerate>=1.1.1
cryptography>=44.0.0
boto3
pyopenssl>=24.3.0
```
src/__init__.py
ADDED
File without changes
src/dataloader.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from loguru import logger
import torch
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
import lightning as pl
from typing import Optional
from multiprocessing import cpu_count
from sklearn.model_selection import train_test_split

# Configure Loguru to save logs to the logs/ directory
logger.add("logs/dataloader.log", rotation="1 MB", level="INFO")


class MNISTDataModule(pl.LightningDataModule):
    def __init__(
        self,
        batch_size: int = 64,
        data_dir: str = "./data",
        num_workers: int = int(cpu_count()),
        train_subset_fraction: float = 0.25,  # Fraction of training data to use
    ):
        """
        Initializes the MNIST Data Module with configurations for dataloaders.

        Args:
            batch_size (int): Batch size for training, validation, and testing.
            data_dir (str): Directory to download and store the dataset.
            num_workers (int): Number of workers for data loading.
            train_subset_fraction (float): Fraction of training data to use (0.0 < fraction <= 1.0).
        """
        super().__init__()
        self.batch_size = batch_size
        self.data_dir = data_dir
        self.num_workers = num_workers
        self.train_subset_fraction = train_subset_fraction
        self.transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
        )
        logger.info(f"MNIST DataModule initialized with batch size {self.batch_size}")

    def prepare_data(self):
        """
        Downloads the MNIST dataset if not already downloaded.
        """
        datasets.MNIST(root=self.data_dir, train=True, download=True)
        datasets.MNIST(root=self.data_dir, train=False, download=True)
        logger.info("MNIST dataset downloaded.")

    def setup(self, stage: Optional[str] = None):
        """
        Set up the dataset for different stages.

        Args:
            stage (str, optional): One of "fit", "validate", "test", or "predict".
        """
        logger.info(f"Setting up data for stage: {stage}")
        if stage == "fit" or stage is None:
            full_train_dataset = datasets.MNIST(
                root=self.data_dir, train=True, transform=self.transform
            )
            train_indices, _ = train_test_split(
                range(len(full_train_dataset)),
                train_size=self.train_subset_fraction,
                random_state=42,
            )
            self.mnist_train = Subset(full_train_dataset, train_indices)

            self.mnist_val = datasets.MNIST(
                root=self.data_dir, train=False, transform=self.transform
            )
            logger.info(f"Loaded training subset: {len(self.mnist_train)} samples.")
            logger.info(f"Loaded validation data: {len(self.mnist_val)} samples.")
        if stage == "test" or stage is None:
            self.mnist_test = datasets.MNIST(
                root=self.data_dir, train=False, transform=self.transform
            )
            logger.info(f"Loaded test data: {len(self.mnist_test)} samples.")

    def train_dataloader(self) -> DataLoader:
        """
        Returns the training DataLoader.

        Returns:
            DataLoader: Training data loader.
        """
        logger.info("Creating training DataLoader...")
        return DataLoader(
            self.mnist_train,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self) -> DataLoader:
        """
        Returns the validation DataLoader.

        Returns:
            DataLoader: Validation data loader.
        """
        logger.info("Creating validation DataLoader...")
        return DataLoader(
            self.mnist_val,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
        )

    def test_dataloader(self) -> DataLoader:
        """
        Returns the test DataLoader.

        Returns:
            DataLoader: Test data loader.
        """
        logger.info("Creating test DataLoader...")
        return DataLoader(
            self.mnist_test,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
        )
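
Note: a minimal sketch of exercising this data module outside the Lightning Trainer, assuming the MNIST defaults defined above (the batch shape follows from the 28x28 grayscale inputs):

    # Quick sanity check for MNISTDataModule (assumes the defaults above)
    from src.dataloader import MNISTDataModule

    dm = MNISTDataModule(batch_size=32, train_subset_fraction=0.1)
    dm.prepare_data()      # downloads MNIST into ./data if missing
    dm.setup(stage="fit")  # builds the training subset and the validation set
    images, labels = next(iter(dm.train_dataloader()))
    print(images.shape, labels.shape)  # expected: torch.Size([32, 1, 28, 28]) torch.Size([32])
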
src/model.py
ADDED
@@ -0,0 +1,86 @@
import lightning as pl
import torch.nn as nn
import torch
from timm import create_model
from torchmetrics.classification import Accuracy
from torch.optim.lr_scheduler import StepLR
import torch.optim as optim
from loguru import logger

logger.add("logs/model.log", rotation="1 MB", level="INFO")


class LitEfficientNet(pl.LightningModule):
    def __init__(
        self,
        model_name="tf_efficientnet_lite0",
        num_classes=10,
        lr=1e-3,
        custom_loss=None,
    ):
        """
        Initializes a CNN model from TIMM and integrates TorchMetrics.

        Args:
            model_name (str): TIMM model name (e.g., "tf_efficientnet_lite0").
            num_classes (int): Number of output classes (e.g., 0–9 for MNIST).
            lr (float): Learning rate for the optimizer.
            custom_loss (callable, optional): Custom loss function. Defaults to CrossEntropyLoss.
        """
        super().__init__()

        self.lr = lr
        self.model = create_model(
            model_name,
            pretrained=True,
            num_classes=num_classes,
            in_chans=1,  # Set to 1 channel for grayscale input
        )
        self.loss_fn = custom_loss or nn.CrossEntropyLoss()
        self.train_acc = Accuracy(num_classes=num_classes, task="multiclass")
        self.val_acc = Accuracy(num_classes=num_classes, task="multiclass")
        self.test_acc = Accuracy(num_classes=num_classes, task="multiclass")
        logger.info(f"Model initialized with TIMM backbone: {model_name}")
        logger.info(f"Number of output classes: {num_classes}")

    def forward(self, x):
        """
        Forward pass of the model.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Model predictions.
        """
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        self.train_acc.update(y_hat, y)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        self.log("train_acc", self.train_acc, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        self.val_acc.update(y_hat, y)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        self.log("val_acc", self.val_acc, prog_bar=True, logger=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        self.test_acc.update(y_hat, y)
        self.log("test_acc", self.test_acc, prog_bar=True, logger=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        scheduler = StepLR(optimizer, step_size=1, gamma=0.9)
        logger.info(f"Optimizer: Adam, Learning Rate: {self.lr}")
        logger.info("Scheduler: StepLR with step_size=1 and gamma=0.9")
        return [optimizer], [scheduler]
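
Note: a minimal forward-pass sketch for LitEfficientNet, assuming the grayscale 28x28 MNIST inputs used elsewhere in this repo (downloading the pretrained timm weights requires network access):

    # Shape check: a batch of 4 grayscale 28x28 images -> 10 class logits
    import torch
    from src.model import LitEfficientNet

    model = LitEfficientNet(model_name="tf_efficientnet_lite0", num_classes=10, lr=1e-3)
    model.eval()
    with torch.no_grad():
        logits = model(torch.randn(4, 1, 28, 28))
    print(logits.shape)  # expected: torch.Size([4, 10])
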
src/test.py
ADDED
@@ -0,0 +1,115 @@
import torch
from loguru import logger
from src.model import LitEfficientNet
from src.dataloader import MNISTDataModule
from torchmetrics.classification import Accuracy
from pathlib import Path
from src.utils.aws_s3_services import S3Handler

# Configure Loguru to save logs to the logs/ directory
logger.add("logs/test.log", rotation="1 MB", level="INFO")


def infer(checkpoint_path, image):
    """
    Perform inference on a single image using the model checkpoint.

    Args:
        checkpoint_path (str): Path to the model checkpoint.
        image (torch.Tensor): Image tensor to predict (shape: [1, 28, 28] for MNIST).

    Returns:
        int: Predicted class (0-9).
    """
    logger.info(f"Loading model from checkpoint: {checkpoint_path} for inference...")
    if not Path(checkpoint_path).exists():
        logger.error(f"Checkpoint not found: {checkpoint_path}")
        raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

    # Detect device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Inference will run on device: {device}")

    # Load the model
    model = LitEfficientNet.load_from_checkpoint(checkpoint_path).to(device)
    model.eval()

    # Perform inference
    with torch.no_grad():
        if image.dim() == 3:
            image = image.unsqueeze(0)  # Add batch dimension if needed
        image = image.to(device)  # Ensure the image is on the same device as the model
        prediction = model(image)
        predicted_class = torch.argmax(prediction, dim=1).item()

    logger.info(f"Predicted class: {predicted_class}")
    return predicted_class


def test_model(checkpoint_path):
    """
    Test the model using the test dataset and log metrics.

    Args:
        checkpoint_path (str): Path to the model checkpoint.

    Returns:
        float: Final test accuracy.
    """
    logger.info(f"Loading model from checkpoint: {checkpoint_path} for testing...")
    if not Path(checkpoint_path).exists():
        logger.error(f"Checkpoint not found: {checkpoint_path}")
        raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

    # Detect device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Testing will run on device: {device}")

    # Load the model
    model = LitEfficientNet.load_from_checkpoint(checkpoint_path).to(device)
    model.eval()

    # Set up data module and load test data
    data_module = MNISTDataModule()
    data_module.setup(stage="test")
    test_loader = data_module.test_dataloader()

    # Initialize accuracy metric
    test_acc = Accuracy(num_classes=10, task="multiclass").to(device)

    # Evaluate model on test data
    logger.info("Evaluating on test dataset...")
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(
                device
            )  # Move data to the same device
            outputs = model(images)
            test_acc.update(outputs, labels)

    accuracy = test_acc.compute().item()
    logger.info(f"Final Test Accuracy (TorchMetrics): {accuracy:.2%}")
    return accuracy


if __name__ == "__main__":

    # downloading from s3
    s3_handler = S3Handler(bucket_name="deep-bucket-s3")
    s3_handler.download_folder(
        "checkpoints_test",
        "checkpoints",
    )
    checkpoint_path = "./checkpoints/best_model.ckpt"
    try:
        # Perform testing
        test_accuracy = test_model(checkpoint_path)
        logger.info(f"Test completed successfully with accuracy: {test_accuracy:.2%}")

        # Example inference
        logger.info("Running inference on a single test image...")
        dummy_image = torch.randn(1, 28, 28)  # Replace with actual test image
        predicted_class = infer(checkpoint_path, dummy_image)
        logger.info(f"Inference result: Predicted class {predicted_class}")
    except Exception as e:
        logger.error(f"An error occurred: {e}")
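
Note: the __main__ block above feeds random noise to infer(); a sketch that runs the same helper on a real MNIST test digit instead, assuming the data directory and checkpoint path already used in this repo:

    # Run infer() on an actual MNIST test digit instead of random noise
    from torchvision import datasets, transforms
    from src.test import infer

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
    )
    mnist_test = datasets.MNIST(root="./data", train=False, download=True, transform=transform)
    image, label = mnist_test[0]  # image shape: [1, 28, 28]
    pred = infer("./checkpoints/best_model.ckpt", image)
    print(f"label={label}, predicted={pred}")
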
src/train.py
ADDED
@@ -0,0 +1,105 @@
import lightning as pl
from lightning.pytorch.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
    LearningRateMonitor,
    RichProgressBar,
)
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger
from lightning.pytorch.callbacks import ModelSummary
from src.dataloader import MNISTDataModule
from src.model import LitEfficientNet
from loguru import logger
import os
from src.utils.aws_s3_services import S3Handler

# Ensure the logs directory exists
os.makedirs("logs", exist_ok=True)

# Configure Loguru for logging
logger.add("logs/training.log", rotation="1 MB", level="INFO")


def main():
    """
    Main training loop for the model with advanced configuration (CPU training).
    """
    # Data Module
    logger.info("Setting up data module...")
    data_module = MNISTDataModule(batch_size=256)

    # Model
    logger.info("Setting up model...")
    model = LitEfficientNet(model_name="tf_efficientnet_lite0", num_classes=10, lr=1e-3)
    logger.info(model)

    # Callbacks
    logger.info("Setting up callbacks...")
    checkpoint_callback = ModelCheckpoint(
        monitor="val_acc",
        dirpath="checkpoints/",
        filename="best_model",
        save_top_k=1,
        mode="max",
        auto_insert_metric_name=False,
        verbose=True,
        save_last=True,
        enable_version_counter=False,
    )
    early_stopping_callback = EarlyStopping(
        monitor="val_acc",
        patience=5,  # Extended patience for advanced models
        mode="max",
        verbose=True,
    )
    lr_monitor = LearningRateMonitor(logging_interval="epoch")  # Log learning rate
    rich_progress = RichProgressBar()
    model_summary = ModelSummary(
        max_depth=1
    )  # Show only the first level of model layers

    # Loggers
    logger.info("Setting up loggers...")
    csv_logger = CSVLogger("logs/", name="mnist_csv")
    tb_logger = TensorBoardLogger("logs/", name="mnist_tb")

    # Trainer Configuration for CPU
    logger.info("Setting up trainer...")
    trainer = pl.Trainer(
        max_epochs=2,
        callbacks=[
            checkpoint_callback,
            early_stopping_callback,
            lr_monitor,
            rich_progress,
            model_summary,
        ],
        logger=[csv_logger, tb_logger],
        deterministic=True,
        accelerator="auto",
        devices="auto",
    )

    # Train the model
    logger.info("Training the model...")
    trainer.fit(model, datamodule=data_module)

    # Test the model
    logger.info("Testing the model...")
    data_module.setup(stage="test")
    trainer.test(model, datamodule=data_module)

    # write a checkpoints/train_done.flag
    with open("checkpoints/train_done.flag", "w") as f:
        f.write("Training done.")

    # upload checkpoints to S3
    s3_handler = S3Handler(bucket_name="deep-bucket-s3")
    s3_handler.upload_folder(
        "checkpoints",
        "checkpoints_test",
    )


if __name__ == "__main__":
    main()
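
Note: with the ModelCheckpoint settings above (filename="best_model", save_last=True), a completed run leaves checkpoints/best_model.ckpt and checkpoints/last.ckpt; a sketch of reloading the best checkpoint afterwards, mirroring what src/test.py does:

    # Reload the best checkpoint written by the ModelCheckpoint callback above
    from src.model import LitEfficientNet

    model = LitEfficientNet.load_from_checkpoint("checkpoints/best_model.ckpt")
    model.eval()
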
src/utils/aws_s3_services.py
ADDED
@@ -0,0 +1,88 @@
import boto3
import os
from pathlib import Path
from dotenv import load_dotenv, find_dotenv

# Load environment variables from .env file
load_dotenv(find_dotenv(".env"))


class S3Handler:
    def __init__(self, bucket_name):
        self.bucket_name = bucket_name
        self.s3 = boto3.client(
            "s3",
            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name=os.getenv("AWS_REGION"),
        )

    def upload_folder(self, source_folder, dest_folder, filenames=None):
        """
        Upload specified files or all files from a local folder to an S3 folder.

        Args:
            source_folder (str): Local source folder path.
            dest_folder (str): Destination folder path in S3.
            filenames (list): List of filenames to upload (relative to source_folder). If None, uploads all files.
        """
        source_folder = Path(source_folder)

        # Select files based on filenames list or all files if filenames is None
        files_to_upload = (
            [source_folder / file for file in filenames]
            if filenames
            else list(source_folder.rglob("*"))
        )

        for file_path in files_to_upload:
            if file_path.is_file():
                s3_path = f"{dest_folder}/{file_path.relative_to(source_folder)}"
                self.s3.upload_file(str(file_path), self.bucket_name, s3_path)
                print(f"Uploaded: {file_path} to {s3_path}")
            else:
                print(f"File not found: {file_path}")

    def download_folder(self, s3_folder, dest_folder):
        """
        Download all files from an S3 folder to a local folder.

        Args:
            s3_folder (str): Source folder in S3.
            dest_folder (str): Local destination folder path.
        """
        dest_folder = Path(dest_folder).resolve()
        paginator = self.s3.get_paginator("list_objects_v2")

        for page in paginator.paginate(Bucket=self.bucket_name, Prefix=s3_folder):
            for obj in page.get("Contents", []):
                s3_path = obj["Key"]
                # Skip folder itself if returned by S3
                if s3_path.endswith("/"):
                    continue

                # Compute relative path and local destination
                relative_path = Path(s3_path[len(s3_folder) :].lstrip("/"))
                local_path = dest_folder / relative_path

                # Create necessary local directories
                local_path.parent.mkdir(parents=True, exist_ok=True)

                # Download file
                self.s3.download_file(self.bucket_name, s3_path, str(local_path))
                print(f"Downloaded: {s3_path} to {local_path}")


# Usage Example
if __name__ == "__main__":
    # Initialize with bucket name
    s3_handler = S3Handler(bucket_name="deep-bucket-s3")

    # Upload specific files
    s3_handler.upload_folder(
        "checkpoints_test",
        "checkpoints_test",
    )

    # Download example
    s3_handler.download_folder("checkpoints_test", "checkpoints_test")
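
Note: the usage example above pushes a whole folder; the filenames argument of upload_folder supports selective uploads, e.g. only the two checkpoint files produced by src/train.py (a sketch, file names assumed from the ModelCheckpoint configuration there):

    # Upload only specific checkpoint files via the `filenames` argument
    from src.utils.aws_s3_services import S3Handler

    s3_handler = S3Handler(bucket_name="deep-bucket-s3")
    s3_handler.upload_folder(
        "checkpoints",
        "checkpoints_test",
        filenames=["best_model.ckpt", "last.ckpt"],
    )
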