ardaatahan commited on
Commit
462cdb6
1 Parent(s): ad25137

add automations for new data generation

Browse files
.github/scripts/check_dataset_update.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ from huggingface_hub import HfApi, login
5
+
6
+
7
def check_dataset_updates(dataset_id):
    """Check whether *dataset_id* has a new revision on the Hugging Face Hub.

    Compares the dataset's current commit sha against the one cached in
    ``dashboard_data/version.json``. Writes ``has_updates=true`` or
    ``has_updates=false`` to the GitHub Actions step-output file named by the
    ``GITHUB_OUTPUT`` environment variable, and refreshes the cache file when
    the dataset has changed.

    :param dataset_id: Hub dataset repo id,
        e.g. ``"argmaxinc/whisperkit-evals-dataset"``.
    """
    api = HfApi()
    dataset_info = api.dataset_info(dataset_id)
    last_modified = dataset_info.lastModified.isoformat()
    current_sha = dataset_info.sha

    cache_dir = "dashboard_data"
    cache_file = os.path.join(cache_dir, "version.json")
    # The cache file is not tracked in the repo (this commit deletes it), so
    # make sure the directory exists before we try to write into it.
    os.makedirs(cache_dir, exist_ok=True)

    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            cached_data = json.load(f)
        # Same commit sha on the Hub -> nothing new to regenerate.
        if cached_data.get("sha") == current_sha:
            _append_github_output("has_updates", "false")
            return

    with open(cache_file, "w") as f:
        json.dump(
            {
                "last_modified": last_modified,
                "sha": current_sha,
                # NOTE(review): hard-coded release shas mirror the previous
                # version.json contents — confirm where these are consumed
                # before parameterizing.
                "releases": ["9f493bc", "a9b92c4"],
            },
            f,
        )

    _append_github_output("has_updates", "true")


def _append_github_output(key, value):
    """Append ``key=value`` to the GitHub Actions step-output file."""
    with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
        print(f"{key}={value}", file=fh)
37
+
38
+
39
if __name__ == "__main__":
    # Authenticate with the Hub first, then check the evals dataset for a
    # new revision and publish the result as a step output.
    hf_token = os.environ["HF_TOKEN"]
    login(token=hf_token)
    check_dataset_updates("argmaxinc/whisperkit-evals-dataset")
.github/workflows/dataset_update.yml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Hourly automation: detect new revisions of the WhisperKit evals dataset on
# the Hugging Face Hub, regenerate the dashboard data, and push the result to
# both GitHub and the internal HF Space.
name: WhisperKit Evals Dataset Update Workflow

on:
  schedule:
    # Check for dataset updates once an hour, on the hour.
    - cron: "0 * * * *"
  workflow_dispatch:

jobs:
  update-datasets:
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install huggingface_hub requests

      # Sets steps.check_updates.outputs.has_updates to 'true'/'false'.
      - name: Check dataset updates
        id: check_updates
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: python .github/scripts/check_dataset_update.py

      # The heavy requirements are only needed when regeneration will run.
      - name: Install full requirements
        if: steps.check_updates.outputs.has_updates == 'true'
        run: |
          pip install -r requirements.txt

      - name: Run generate script
        if: steps.check_updates.outputs.has_updates == 'true'
        run: |
          make update-performance-data

      - name: Commit and push if changed
        if: steps.check_updates.outputs.has_updates == 'true'
        env:
          GH_TOKEN: ${{ secrets.BOT_TOKEN }}
          GH_USERNAME: ${{ secrets.BOT_USERNAME }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          git config --global user.email "bots@takeargmax.com"
          # Quoted so a display name containing spaces survives word splitting.
          git config --global user.name "$GH_USERNAME"

          git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/${GITHUB_REPOSITORY}.git

          git add .
          git commit -m "update dataset files" || echo "No changes to commit"
          git push
          # Fix: the original URL used the literal string "HF_USERNAME" (missing
          # "$", and no HF_USERNAME env var is defined). Hub git auth is
          # token-based, so the username portion is informational; reuse the bot
          # username here. TODO(review): confirm the Space push accepts it.
          git push https://${GH_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/argmaxinc/whisperkit-benchmarks-internal
Makefile CHANGED
@@ -10,3 +10,6 @@ use-huggingface-data:
10
 
11
  use-local-data:
12
  @python performance_generate.py
 
 
 
 
10
 
11
  use-local-data:
12
  @python performance_generate.py
13
+
14
+ # Download the latest evals dataset, then regenerate the performance data.
+ update-performance-data:
15
+ @python performance_generate.py download
dashboard_data/version.json DELETED
@@ -1 +0,0 @@
1
- {"last_modified": "", "sha": "", "releases": ["9f493bc", "a9b92c4"]}