lucio committed on
Commit 031abfe
Parent: ba6e038

Training in progress, step 500

.ipynb_checkpoints/eval-checkpoint.py ADDED
@@ -0,0 +1,128 @@
+ #!/usr/bin/env python3
+ import argparse
+ import re
+ from typing import Dict
+
+ from datasets import Audio, Dataset, load_dataset, load_metric
+
+ from transformers import AutoFeatureExtractor, pipeline
+
+
+ def log_results(result: Dataset, args: Dict[str, str]):
+     """DO NOT CHANGE. This function computes and logs the result metrics."""
+
+     log_outputs = args.log_outputs
+     dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+
+     # load metric
+     wer = load_metric("wer")
+     cer = load_metric("cer")
+
+     # compute metrics
+     wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
+     cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
+
+     # print & log results
+     result_str = f"WER: {wer_result}\n" f"CER: {cer_result}"
+     print(result_str)
+
+     with open(f"{dataset_id}_eval_results.txt", "w") as f:
+         f.write(result_str)
+
+     # log all results in text file. Possibly interesting for analysis
+     if log_outputs is not None:
+         pred_file = f"log_{dataset_id}_predictions.txt"
+         target_file = f"log_{dataset_id}_targets.txt"
+
+         with open(pred_file, "w") as p, open(target_file, "w") as t:
+
+             # mapping function to write output
+             def write_to_file(batch, i):
+                 p.write(f"{i}" + "\n")
+                 p.write(batch["prediction"] + "\n")
+                 t.write(f"{i}" + "\n")
+                 t.write(batch["target"] + "\n")
+
+             result.map(write_to_file, with_indices=True)
+
+
+ def normalize_text(text: str) -> str:
+     """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+
+     chars_to_ignore_regex = '[!"%,.:;?\\_|©«¬»،؛؟‒–—’“”„…‹›−☺♂�\\\\-]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+
+     text = re.sub(chars_to_ignore_regex, "", text.lower())
+
+     # In addition, we can normalize the target text, e.g. removing new lines characters etc...
+     # note that order is important here!
+     token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]
+
+     for t in token_sequences_to_ignore:
+         text = " ".join(text.split(t))
+
+     return text
+
+
+ def main(args):
+     # load dataset
+     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+
+     # for testing: only process the first two examples as a test
+     # dataset = dataset.select(range(10))
+
+     # load processor
+     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
+     sampling_rate = feature_extractor.sampling_rate
+
+     # resample audio
+     dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
+
+     # load eval pipeline
+     asr = pipeline("automatic-speech-recognition", model=args.model_id)
+
+     # map function to decode audio
+     def map_to_pred(batch):
+         prediction = asr(
+             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
+         )
+
+         batch["prediction"] = prediction["text"]
+         batch["target"] = normalize_text(batch["sentence"])
+         return batch
+
+     # run inference on all examples
+     result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+
+     # compute and log_results
+     # do not change function below
+     log_results(result, args)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument(
+         "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
+     )
+     parser.add_argument(
+         "--dataset",
+         type=str,
+         required=True,
+         help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
+     )
+     parser.add_argument(
+         "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
+     )
+     parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
+     parser.add_argument(
+         "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
+     )
+     parser.add_argument(
+         "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to 1 second."
+     )
+     parser.add_argument(
+         "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
+     )
+     args = parser.parse_args()
+
+     main(args)
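
As a quick illustration of what the normalize_text step above does to a target sentence, here is a minimal sketch; the sample sentence is invented for illustration and is not taken from the dataset.

import re

# Same character class as in eval-checkpoint.py; it should mirror the
# --chars_to_ignore list used during training (see run-checkpoint.sh below).
chars_to_ignore_regex = '[!"%,.:;?\\_|©«¬»،؛؟‒–—’“”„…‹›−☺♂�\\\\-]'

sample = "Саламатсызбы, дүйнө!"  # hypothetical Kyrgyz sentence
print(re.sub(chars_to_ignore_regex, "", sample.lower()))
# prints: саламатсызбы дүйнө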
.ipynb_checkpoints/run-checkpoint.sh ADDED
@@ -0,0 +1,38 @@
+ python kyrgiz/run_speech_recognition_ctc.py \
+ --dataset_name="mozilla-foundation/common_voice_8_0" \
+ --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
+ --dataset_config_name="ky" \
+ --train_split_name="train+validation[:50%]" \
+ --eval_split_name="validation[50%:]" \
+ --output_dir="./xls-r-kyrgiz-cv8" \
+ --overwrite_output_dir \
+ --num_train_epochs="50" \
+ --per_device_train_batch_size="16" \
+ --per_device_eval_batch_size="8" \
+ --gradient_accumulation_steps="4" \
+ --learning_rate="1e-4" \
+ --warmup_steps="250" \
+ --length_column_name="input_length" \
+ --evaluation_strategy="steps" \
+ --text_column_name="sentence" \
+ --chars_to_ignore , ? . ! \- \; \: \\ _ \| ‒ ☺ ♂ © « ¬ » \" „ “ % ” � — ’ ، ؛ ؟ ‹ › − … – \
+ --eval_metrics="wer" \
+ --save_steps="500" \
+ --eval_steps="500" \
+ --logging_steps="100" \
+ --min_duration_in_seconds="0.2" \
+ --layerdrop="0.01" \
+ --activation_dropout="0.1" \
+ --save_total_limit="3" \
+ --freeze_feature_encoder \
+ --feat_proj_dropout="0.01" \
+ --mask_time_prob="0.50" \
+ --mask_time_length="10" \
+ --mask_feature_prob="0.25" \
+ --mask_feature_length="64" \
+ --gradient_checkpointing \
+ --use_auth_token \
+ --fp16 \
+ --group_by_length \
+ --do_train --do_eval \
+ --push_to_hub
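
A small sanity check on the batch-size arithmetic implied by the flags above, assuming a single GPU (the number of devices is not recorded in this commit):

# Effective examples per optimizer step = per-device batch size
# * gradient accumulation steps * number of devices (assumed 1 here).
per_device_train_batch_size = 16
gradient_accumulation_steps = 4
num_devices = 1  # assumption, not stated in the commit
print(per_device_train_batch_size * gradient_accumulation_steps * num_devices)  # 64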
config.json CHANGED
@@ -49,7 +49,7 @@
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
- "feat_proj_dropout": 0.0,
+ "feat_proj_dropout": 0.01,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "hidden_act": "gelu",
@@ -58,13 +58,13 @@
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
- "layerdrop": 0.0,
+ "layerdrop": 0.01,
  "mask_feature_length": 64,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.25,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
- "mask_time_prob": 0.75,
+ "mask_time_prob": 0.5,
  "model_type": "wav2vec2",
  "num_adapter_layers": 3,
  "num_attention_heads": 16,
log_mozilla-foundation_common_voice_8_0_ky_test_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_mozilla-foundation_common_voice_8_0_ky_test_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0011d45cb89b71516dfd84ec016513f83e261f45eb3b14519cb156fc9a56e796
+ oid sha256:6199b399576e56ccf0b75f137f2c3b014f6ed6fc8036caf25a21c670c49ffc76
  size 1262095857
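
Since only the LFS pointer changes here, a locally downloaded pytorch_model.bin can be checked against the new oid; a minimal sketch, assuming the file sits in the current directory:

import hashlib

# The Git LFS oid is the SHA-256 of the file contents; hash in 1 MiB chunks.
h = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print(h.hexdigest() == "6199b399576e56ccf0b75f137f2c3b014f6ed6fc8036caf25a21c670c49ffc76")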
run.sh CHANGED
@@ -6,12 +6,12 @@ python kyrgiz/run_speech_recognition_ctc.py \
  --eval_split_name="validation[50%:]" \
  --output_dir="./xls-r-kyrgiz-cv8" \
  --overwrite_output_dir \
- --num_train_epochs="200" \
+ --num_train_epochs="50" \
  --per_device_train_batch_size="16" \
  --per_device_eval_batch_size="8" \
  --gradient_accumulation_steps="4" \
  --learning_rate="1e-4" \
- --warmup_steps="500" \
+ --warmup_steps="250" \
  --length_column_name="input_length" \
  --evaluation_strategy="steps" \
  --text_column_name="sentence" \
@@ -21,12 +21,12 @@ python kyrgiz/run_speech_recognition_ctc.py \
  --eval_steps="500" \
  --logging_steps="100" \
  --min_duration_in_seconds="0.2" \
- --layerdrop="0.0" \
+ --layerdrop="0.01" \
  --activation_dropout="0.1" \
  --save_total_limit="3" \
  --freeze_feature_encoder \
- --feat_proj_dropout="0.0" \
- --mask_time_prob="0.75" \
+ --feat_proj_dropout="0.01" \
+ --mask_time_prob="0.50" \
  --mask_time_length="10" \
  --mask_feature_prob="0.25" \
  --mask_feature_length="64" \
runs/Feb04_19-31-13_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/1644003121.6455085/events.out.tfevents.1644003121.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1462870.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8684f706f1b6d98f1f2811df032c25d412d6d070d7b0c81d8687661090103c97
+ size 4802
runs/Feb04_19-31-13_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/events.out.tfevents.1644003121.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1462870.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e6cc2357c5c3b93d68aa44f852cec869b2128d20e9d741ec82a1789e6a3afa9
+ size 4756
runs/Feb04_19-35-31_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/1644003377.3563116/events.out.tfevents.1644003377.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1464599.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed147ab6a4afce4c3e6fb99e4d8f47da146b3db51412c4c65c6a8d03d50da309
+ size 4802
runs/Feb04_19-35-31_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/events.out.tfevents.1644003377.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1464599.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d99720c701214c1653a05e1396a33a187eb008fdc29e97e62c9a9607f1ad7823
+ size 5856
special_tokens_map.json CHANGED
@@ -1 +1 @@
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
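
The change above appends further duplicated <s>/</s> entries to additional_special_tokens (a side effect of re-saving the tokenizer). They can be inspected with a sketch like the following, under the same repo-id assumption as above:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("lucio/xls-r-kyrgiz-cv8")  # repo id assumed
print(tokenizer.additional_special_tokens)  # repeated "<s>" / "</s>" entries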
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cde060aa868bd66f6b2774ce16ec7877e122267f0549334cd1553b0d3ad50ad5
+ oid sha256:2b4de827673e3dfe665950f44fa7245a141f080f380ad5cea7b1833a16096357
  size 3055