dvshah13 committed on
Commit 76835c1
1 Parent(s): aaa6c16

Upload 5 files

Files changed (5)
  1. README.md +47 -27
  2. config.json +15 -62
  3. preprocessor_config.json +8 -4
  4. pytorch_model.bin +2 -2
  5. training_args.bin +2 -2
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- license: apache-2.0
  tags:
  - generated_from_trainer
  datasets:
@@ -7,19 +7,32 @@ datasets:
  metrics:
  - accuracy
  model-index:
- - name: distilhubert-finetuned-gtzan-v3
- results: []
  ---

  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
  should probably proofread and complete it, then remove this comment. -->

- # distilhubert-finetuned-gtzan-v3

- This model is a fine-tuned version of [ntu-spml/distilhubert](https://huggingface.co/ntu-spml/distilhubert) on the GTZAN dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.5752
- - Accuracy: 0.83

  ## Model description

@@ -39,38 +52,45 @@ More information needed

  The following hyperparameters were used during training:
  - learning_rate: 5e-05
- - train_batch_size: 8
- - eval_batch_size: 8
  - seed: 42
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: linear
  - lr_scheduler_warmup_ratio: 0.1
- - num_epochs: 15

  ### Training results

  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
- | 1.9108 | 1.0 | 113 | 1.9472 | 0.43 |
- | 1.3286 | 2.0 | 226 | 1.4173 | 0.65 |
- | 1.032 | 3.0 | 339 | 0.9815 | 0.67 |
- | 0.726 | 4.0 | 452 | 0.7403 | 0.79 |
- | 0.4621 | 5.0 | 565 | 0.6390 | 0.8 |
- | 0.3439 | 6.0 | 678 | 0.5248 | 0.85 |
- | 0.1592 | 7.0 | 791 | 0.4861 | 0.86 |
- | 0.1283 | 8.0 | 904 | 0.4995 | 0.87 |
- | 0.1191 | 9.0 | 1017 | 0.4804 | 0.87 |
- | 0.0236 | 10.0 | 1130 | 0.6737 | 0.8 |
- | 0.0146 | 11.0 | 1243 | 0.6211 | 0.81 |
- | 0.0105 | 12.0 | 1356 | 0.5806 | 0.86 |
- | 0.008 | 13.0 | 1469 | 0.5645 | 0.84 |
- | 0.0082 | 14.0 | 1582 | 0.6033 | 0.83 |
- | 0.0072 | 15.0 | 1695 | 0.5752 | 0.83 |


  ### Framework versions

- - Transformers 4.30.0
  - Pytorch 2.0.1+cu118
- - Datasets 2.12.0
  - Tokenizers 0.13.3
 
  ---
+ license: bsd-3-clause
  tags:
  - generated_from_trainer
  datasets:

  metrics:
  - accuracy
  model-index:
+ - name: ast-finetuned-audioset-10-10-0.4593-finetuned-gtzan
+ results:
+ - task:
+ name: Audio Classification
+ type: audio-classification
+ dataset:
+ name: GTZAN
+ type: marsyas/gtzan
+ config: all
+ split: train
+ args: all
+ metrics:
+ - name: Accuracy
+ type: accuracy
+ value: 0.9
  ---

  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
  should probably proofread and complete it, then remove this comment. -->

+ # ast-finetuned-audioset-10-10-0.4593-finetuned-gtzan

+ This model is a fine-tuned version of [MIT/ast-finetuned-audioset-10-10-0.4593](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) on the GTZAN dataset.
  It achieves the following results on the evaluation set:
+ - Loss: 0.4717
+ - Accuracy: 0.9

  ## Model description


  The following hyperparameters were used during training:
  - learning_rate: 5e-05
+ - train_batch_size: 4
+ - eval_batch_size: 4
  - seed: 42
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 16
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: linear
  - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 20
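
For context, a minimal `TrainingArguments` sketch that mirrors the hyperparameters listed above; the output directory and evaluation settings are illustrative assumptions, not part of this commit (the Adam betas and epsilon match the library defaults):

```python
from transformers import TrainingArguments

# Sketch only: reproduces the hyperparameters listed above.
training_args = TrainingArguments(
    output_dir="ast-finetuned-audioset-10-10-0.4593-finetuned-gtzan",  # assumed name
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,   # 4 x 4 = effective train batch size of 16
    num_train_epochs=20,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    seed=42,
    evaluation_strategy="epoch",     # assumption: the table below reports per-epoch eval
)
```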

  ### Training results

  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
+ | 0.7581 | 1.0 | 56 | 0.7029 | 0.78 |
+ | 0.3942 | 1.99 | 112 | 0.4646 | 0.86 |
+ | 0.3298 | 2.99 | 168 | 0.3861 | 0.88 |
+ | 0.1227 | 4.0 | 225 | 0.4702 | 0.86 |
+ | 0.0774 | 5.0 | 281 | 0.4492 | 0.9 |
+ | 0.0039 | 5.99 | 337 | 0.4607 | 0.9 |
+ | 0.0014 | 6.99 | 393 | 0.5022 | 0.9 |
+ | 0.0022 | 8.0 | 450 | 0.4711 | 0.9 |
+ | 0.0193 | 9.0 | 506 | 0.5226 | 0.86 |
+ | 0.0004 | 9.99 | 562 | 0.6055 | 0.82 |
+ | 0.0003 | 10.99 | 618 | 0.4793 | 0.89 |
+ | 0.0002 | 12.0 | 675 | 0.5052 | 0.9 |
+ | 0.0002 | 13.0 | 731 | 0.4652 | 0.89 |
+ | 0.0001 | 13.99 | 787 | 0.4617 | 0.9 |
+ | 0.0001 | 14.99 | 843 | 0.4653 | 0.9 |
+ | 0.0001 | 16.0 | 900 | 0.4635 | 0.91 |
+ | 0.0001 | 17.0 | 956 | 0.4693 | 0.9 |
+ | 0.0001 | 17.99 | 1012 | 0.4697 | 0.9 |
+ | 0.0001 | 18.99 | 1068 | 0.4715 | 0.9 |
+ | 0.0025 | 19.91 | 1120 | 0.4717 | 0.9 |


  ### Framework versions

+ - Transformers 4.31.0.dev0
  - Pytorch 2.0.1+cu118
+ - Datasets 2.13.1
  - Tokenizers 0.13.3
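
For reference, a minimal inference sketch with the Transformers audio-classification pipeline; the repository id below is an assumption based on the committer and model name, so substitute the namespace where this checkpoint is actually hosted:

```python
from transformers import pipeline

# Assumed repo id (committer namespace + model name); adjust as needed.
classifier = pipeline(
    "audio-classification",
    model="dvshah13/ast-finetuned-audioset-10-10-0.4593-finetuned-gtzan",
)

# Any 16 kHz-compatible audio file works; "my_clip.wav" is a placeholder path.
for prediction in classifier("my_clip.wav", top_k=3):
    print(f"{prediction['label']}: {prediction['score']:.3f}")
```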
config.json CHANGED
@@ -1,52 +1,12 @@
  {
- "_name_or_path": "ntu-spml/distilhubert",
- "activation_dropout": 0.1,
- "apply_spec_augment": false,
  "architectures": [
- "HubertForSequenceClassification"
  ],
- "attention_dropout": 0.1,
- "bos_token_id": 1,
- "classifier_proj_size": 256,
- "conv_bias": false,
- "conv_dim": [
- 512,
- 512,
- 512,
- 512,
- 512,
- 512,
- 512
- ],
- "conv_kernel": [
- 10,
- 3,
- 3,
- 3,
- 3,
- 2,
- 2
- ],
- "conv_stride": [
- 5,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2
- ],
- "ctc_loss_reduction": "sum",
- "ctc_zero_infinity": false,
- "do_stable_layer_norm": false,
- "eos_token_id": 2,
- "feat_extract_activation": "gelu",
- "feat_extract_norm": "group",
- "feat_proj_dropout": 0.0,
- "feat_proj_layer_norm": false,
- "final_dropout": 0.0,
  "hidden_act": "gelu",
- "hidden_dropout": 0.1,
  "hidden_size": 768,
  "id2label": {
  "0": "blues",
@@ -74,23 +34,16 @@
  "reggae": "8",
  "rock": "9"
  },
- "layer_norm_eps": 1e-05,
- "layerdrop": 0.0,
- "mask_feature_length": 10,
- "mask_feature_min_masks": 0,
- "mask_feature_prob": 0.0,
- "mask_time_length": 10,
- "mask_time_min_masks": 2,
- "mask_time_prob": 0.05,
- "model_type": "hubert",
  "num_attention_heads": 12,
- "num_conv_pos_embedding_groups": 16,
- "num_conv_pos_embeddings": 128,
- "num_feat_extract_layers": 7,
- "num_hidden_layers": 2,
- "pad_token_id": 0,
  "torch_dtype": "float32",
- "transformers_version": "4.30.0",
- "use_weighted_layer_sum": false,
- "vocab_size": 32
  }
 
  {
+ "_name_or_path": "MIT/ast-finetuned-audioset-10-10-0.4593",
  "architectures": [
+ "ASTForAudioClassification"
  ],
+ "attention_probs_dropout_prob": 0.0,
+ "frequency_stride": 10,
  "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
  "0": "blues",

  "reggae": "8",
  "rock": "9"
  },
+ "layer_norm_eps": 1e-12,
+ "max_length": 1024,
+ "model_type": "audio-spectrogram-transformer",
  "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "num_mel_bins": 128,
+ "patch_size": 16,
+ "problem_type": "single_label_classification",
+ "qkv_bias": true,
+ "time_stride": 10,
  "torch_dtype": "float32",
+ "transformers_version": "4.31.0.dev0"
  }
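
The new config describes the 12-layer Audio Spectrogram Transformer with a 10-way GTZAN genre head (most of the id2label map is collapsed in the diff above). A sketch of how such a head is typically re-created from the base AudioSet checkpoint; the genre list assumes the standard GTZAN label set, since only blues/reggae/rock are visible here, and the variable names are illustrative:

```python
from transformers import ASTForAudioClassification

# Standard GTZAN genre set (assumed; only a few entries are visible in the diff).
genres = ["blues", "classical", "country", "disco", "hiphop",
          "jazz", "metal", "pop", "reggae", "rock"]
id2label = {i: genre for i, genre in enumerate(genres)}
label2id = {genre: i for i, genre in enumerate(genres)}

# Start from the AudioSet checkpoint and swap in a 10-label classification head;
# ignore_mismatched_sizes is required because the base head has 527 AudioSet labels.
model = ASTForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    num_labels=len(genres),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
```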
preprocessor_config.json CHANGED
@@ -1,9 +1,13 @@
  {
  "do_normalize": true,
- "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
- "padding_value": 0,
- "return_attention_mask": true,
- "sampling_rate": 16000
  }
 
  {
  "do_normalize": true,
+ "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
+ "max_length": 1024,
+ "mean": -4.2677393,
+ "num_mel_bins": 128,
  "padding_side": "right",
+ "padding_value": 0.0,
+ "return_attention_mask": false,
+ "sampling_rate": 16000,
+ "std": 4.5689974
  }
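
The updated preprocessor is the AST feature extractor: 16 kHz input, 128 mel bins, spectrograms padded or truncated to 1024 frames and normalized with the stored mean/std. A minimal usage sketch, loading the extractor from the base checkpoint (an assumption; it carries the same values recorded in this file) with a placeholder waveform:

```python
import numpy as np
from transformers import ASTFeatureExtractor

# Assumption: the base checkpoint ships the same extractor settings as this file.
feature_extractor = ASTFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)

# Placeholder input: 30 seconds of silence at 16 kHz standing in for a GTZAN clip.
waveform = np.zeros(16_000 * 30, dtype=np.float32)

inputs = feature_extractor(
    waveform,
    sampling_rate=feature_extractor.sampling_rate,  # 16000
    return_tensors="pt",
)
print(inputs["input_values"].shape)  # (1, 1024, 128): max_length frames x num_mel_bins
```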
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0c7ff2fa0a414f601870868e3a609aa033ce737f4453016a85f434bec089417d
- size 94783376

  version https://git-lfs.github.com/spec/v1
+ oid sha256:aaddab9baccf21fdd4b71c5748750967fccfbd7145c9f067636b0798b06c99d8
+ size 344860025
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dfb1e4cd9b526d63fc9658531b0fec58392615665647ca817e210356d56aba70
- size 3963

  version https://git-lfs.github.com/spec/v1
+ oid sha256:820d83430bf5ca24a07288b6880ffe6c418e766a8e4e37faf5d2821727ea8e2f
+ size 4027
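
The two .bin entries above are Git LFS pointer files (a version line, a sha256 oid, and a size in bytes) rather than the binaries themselves. Purely as an illustration, a small check of a downloaded file against the recorded digest and size; the local path is an assumption:

```python
import hashlib
from pathlib import Path

def matches_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    """Return True if the file's sha256 digest and byte size match an LFS pointer."""
    data = Path(path).read_bytes()  # fine for small files; stream for large ones
    return hashlib.sha256(data).hexdigest() == expected_oid and len(data) == expected_size

# Values copied from the training_args.bin pointer above.
print(matches_lfs_pointer(
    "training_args.bin",
    "820d83430bf5ca24a07288b6880ffe6c418e766a8e4e37faf5d2821727ea8e2f",
    4027,
))
```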