AlexN committed on
Commit
5a444dc
1 Parent(s): a799336

working tokenizer

.ipynb_checkpoints/added_tokens-checkpoint.json CHANGED
@@ -1 +1 @@
-{}
+{"<s>": 216, "</s>": 217, "<pad>": 218}
.ipynb_checkpoints/eval-checkpoint.py CHANGED
@@ -85,7 +85,7 @@ def main(args):
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
 
     # for testing: only process the first two examples as a test
-    #dataset = dataset.select(range(20))
+    dataset = dataset.select(range(2))
 
     # load processor
     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
@@ -105,7 +105,7 @@ def main(args):
             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
         )
 
-        batch["prediction"] = "".join(prediction["text"].split("<s>"))
+        batch["prediction"] = prediction["text"]  # "".join(prediction["text"].split("<s>"))
         batch["target"] = normalize_text(batch["sentence"])
         return batch
 
.ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_fr_test_predictions-checkpoint.txt ADDED
The diff for this file is too large to render. See raw diff
 
.ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_fr_test_targets-checkpoint.txt ADDED
The diff for this file is too large to render. See raw diff
 
.ipynb_checkpoints/special_tokens_map-checkpoint.json CHANGED
@@ -1 +1 @@
-{"bos_token": null, "eos_token": null, "unk_token": "<unk>", "pad_token": "<pad>"}
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
.ipynb_checkpoints/tokenizer_config-checkpoint.json CHANGED
@@ -1 +1 @@
-{"unk_token": "<unk>", "bos_token": null, "eos_token": null, "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "padding": true, "truncation": true, "tokenizer_class": "Wav2Vec2CTCTokenizer"}
+{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
added_tokens.json CHANGED
@@ -1 +1 @@
-{}
+{"<s>": 216, "</s>": 217, "<pad>": 218}
eval.py CHANGED
@@ -85,7 +85,7 @@ def main(args):
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
 
     # for testing: only process the first two examples as a test
-    #dataset = dataset.select(range(20))
+    dataset = dataset.select(range(2))
 
     # load processor
     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
@@ -105,7 +105,7 @@ def main(args):
             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
         )
 
-        batch["prediction"] = "".join(prediction["text"].split("<s>"))
+        batch["prediction"] = prediction["text"]  # "".join(prediction["text"].split("<s>"))
         batch["target"] = normalize_text(batch["sentence"])
         return batch
 
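
With the tokenizer fixed, the manual stripping of "<s>" from the pipeline output is dropped in favour of using the decoded text directly. For reference, a small illustration of what the removed expression did; the prediction string below is hypothetical:

# Hypothetical pipeline output in which the BOS token leaked into the decoded text.
prediction = {"text": "<s>bonjour le monde<s>"}

old_post = "".join(prediction["text"].split("<s>"))  # -> "bonjour le monde"
new_post = prediction["text"]                        # -> "<s>bonjour le monde<s>"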
log_mozilla-foundation_common_voice_8_0_fr_test_predictions.txt CHANGED
The diff for this file is too large to render. See raw diff
 
log_mozilla-foundation_common_voice_8_0_fr_test_targets.txt CHANGED
The diff for this file is too large to render. See raw diff
 
mozilla-foundation_common_voice_8_0_fr_test_eval_results.txt CHANGED
@@ -1,2 +1,2 @@
-WER: 0.21587470509795875
-CER: 0.06356032070032196
+WER: 0.0625
+CER: 0.06382978723404255
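
The refreshed numbers appear to have been produced while the two-example subset (dataset.select(range(2))) was enabled in eval.py, so they reflect a smoke test rather than the full Common Voice 8.0 French test split. As a rough sketch of how such word and character error rates can be computed (using the jiwer package purely for illustration; the eval script may use a different metric backend):

import jiwer  # illustrative only; not necessarily what eval.py uses

targets = ["bonjour le monde", "ceci est un test"]
predictions = ["bonjour le monde", "ceci est un essai"]

print("WER:", jiwer.wer(targets, predictions))  # word error rate over the batch
print("CER:", jiwer.cer(targets, predictions))  # character error rate over the batch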
special_tokens_map.json CHANGED
@@ -1 +1 @@
-{"bos_token": null, "eos_token": null, "unk_token": "<unk>", "pad_token": "<pad>"}
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json CHANGED
@@ -1 +1 @@
-{"unk_token": "<unk>", "bos_token": null, "eos_token": null, "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "padding": true, "truncation": true, "tokenizer_class": "Wav2Vec2CTCTokenizer"}
+{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}