xezpeleta committed on
Commit
7417384
1 Parent(s): 7589e01

Training in progress, step 500

Browse files
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  checkpoint-*/
2
  nohup.out
3
  .venv/
 
 
1
  checkpoint-*/
2
  nohup.out
3
  .venv/
4
+ wandb/
config.json CHANGED
@@ -35,7 +35,7 @@
35
  "pad_token_id": 50257,
36
  "scale_embedding": false,
37
  "torch_dtype": "float32",
38
- "transformers_version": "4.26.0.dev0",
39
  "use_cache": false,
40
  "vocab_size": 51865
41
  }
 
35
  "pad_token_id": 50257,
36
  "scale_embedding": false,
37
  "torch_dtype": "float32",
38
+ "transformers_version": "4.26.0",
39
  "use_cache": false,
40
  "vocab_size": 51865
41
  }
merges.txt CHANGED
@@ -1,4 +1,5 @@
1
  #version: 0.2
 
2
  Ġ a
3
  Ġt h
4
  i n
 
1
  #version: 0.2
2
+ Ġ t
3
  Ġ a
4
  Ġt h
5
  i n
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b1ff010d6786cde899f8f8a3fca4dd5a11855df5bee8d7926f0a56ad6cfff49
3
- size 3055754841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:599c99286b519fadb5bbe0f6646b50209f593eb7a153a19220d8b3f89d46f23f
3
+ size 3055755286
run.sh CHANGED
@@ -1,11 +1,12 @@
1
- python run_speech_recognition_seq2seq_streaming.py \
 
2
  --model_name_or_path="openai/whisper-medium" \
3
- --dataset_name="mozilla-foundation/common_voice_16_0" \
4
  --dataset_config_name="eu" \
5
  --language="basque" \
6
  --train_split_name="train+validation" \
7
  --eval_split_name="test" \
8
- --model_index_name="Whisper Small Basque" \
9
  --max_steps="8000" \
10
  --output_dir="./" \
11
  --per_device_train_batch_size="4" \
@@ -35,5 +36,6 @@ python run_speech_recognition_seq2seq_streaming.py \
35
  --predict_with_generate \
36
  --do_normalize_eval \
37
  --streaming \
38
- --use_auth_token \
39
- --push_to_hub
 
 
1
+ WANDB_PROJECT=whisper-medium-eu \
2
+ python run_speech_recognition_seq2seq_streaming.py \
3
  --model_name_or_path="openai/whisper-medium" \
4
+ --dataset_name="mozilla-foundation/common_voice_17_0" \
5
  --dataset_config_name="eu" \
6
  --language="basque" \
7
  --train_split_name="train+validation" \
8
  --eval_split_name="test" \
9
+ --model_index_name="Whisper Medium Basque" \
10
  --max_steps="8000" \
11
  --output_dir="./" \
12
  --per_device_train_batch_size="4" \
 
36
  --predict_with_generate \
37
  --do_normalize_eval \
38
  --streaming \
39
+ --push_to_hub \
40
+ --report_to "wandb" \
41
+ --run_name "whisper-medium-eu"
special_tokens_map.json CHANGED
@@ -111,22 +111,28 @@
111
  "bos_token": {
112
  "content": "<|endoftext|>",
113
  "lstrip": false,
114
- "normalized": true,
115
  "rstrip": false,
116
  "single_word": false
117
  },
118
  "eos_token": {
119
  "content": "<|endoftext|>",
120
  "lstrip": false,
121
- "normalized": true,
 
 
 
 
 
 
 
122
  "rstrip": false,
123
  "single_word": false
124
  },
125
- "pad_token": "<|endoftext|>",
126
  "unk_token": {
127
  "content": "<|endoftext|>",
128
  "lstrip": false,
129
- "normalized": true,
130
  "rstrip": false,
131
  "single_word": false
132
  }
 
111
  "bos_token": {
112
  "content": "<|endoftext|>",
113
  "lstrip": false,
114
+ "normalized": false,
115
  "rstrip": false,
116
  "single_word": false
117
  },
118
  "eos_token": {
119
  "content": "<|endoftext|>",
120
  "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ },
125
+ "pad_token": {
126
+ "content": "<|endoftext|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
  "rstrip": false,
130
  "single_word": false
131
  },
 
132
  "unk_token": {
133
  "content": "<|endoftext|>",
134
  "lstrip": false,
135
+ "normalized": false,
136
  "rstrip": false,
137
  "single_word": false
138
  }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8e9e15ffccfaaf4d559de6137ba0f5621acb1fd54cf075e1a15b20e3ec814ed
3
- size 3643
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1673d568bf0b0080cbabe15292938f618760ad94bb79a88f657ec45344a354b5
3
+ size 4024