marinone94 committed
Commit ba980b2 • 1 Parent(s): bf11fb8

new training

- added_tokens.json +1 -1
- alphabet.json +0 -1
- config.json +4 -5
- run.sh +1 -2
- run_speech_recognition_ctc.py +48 -23
- special_tokens_map.json +1 -1
- vocab.json +1 -0
added_tokens.json
CHANGED
@@ -1 +1 @@
-{"<s>":
+{"<s>": 33, "</s>": 34}
alphabet.json
DELETED
@@ -1 +0,0 @@
-{"labels": [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e4", "\u00e5", "\u00e9", "\u00f4", "\u00f6", "\u00fc", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
config.json
CHANGED
@@ -6,7 +6,7 @@
   "add_adapter": false,
   "apply_spec_augment": true,
   "architectures": [
-    "
+    "Wav2Vec2ForPreTraining"
   ],
   "attention_dropout": 0.0,
   "bos_token_id": 1,
@@ -84,7 +84,7 @@
   "num_hidden_layers": 24,
   "num_negatives": 100,
   "output_hidden_size": 1024,
-  "pad_token_id":
+  "pad_token_id": 32,
   "proj_codevector_dim": 768,
   "tdnn_dilation": [
     1,
@@ -107,9 +107,8 @@
     1,
     1
   ],
-  "
-  "transformers_version": "4.16.0.dev0",
+  "transformers_version": "4.17.0.dev0",
   "use_weighted_layer_sum": false,
-  "vocab_size":
+  "vocab_size": 35,
   "xvector_output_dim": 512
 }
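Note: the new "pad_token_id": 32 and "vocab_size": 35 line up with the vocab.json (33 entries) and added_tokens.json (<s> = 33, </s> = 34) elsewhere in this commit. A minimal sanity check, assuming the two JSON files from this commit sit in the working directory (paths are illustrative):

import json

# Assumes vocab.json and added_tokens.json from this commit are in the current directory.
with open("vocab.json") as f:
    vocab = json.load(f)
with open("added_tokens.json") as f:
    added_tokens = json.load(f)

assert vocab["[PAD]"] == 32                  # config.json: "pad_token_id": 32
assert len(vocab) + len(added_tokens) == 35  # config.json: "vocab_size": 35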
run.sh
CHANGED
@@ -4,7 +4,6 @@ python run_speech_recognition_ctc.py \
     --dataset_config_name="sv-SE,distant_channel" \
     --train_split_name="train+validation,train" \
     --eval_split_name="test,None" \
-    --preprocessing_only \
     --output_dir="./" \
     --overwrite_output_dir \
     --num_train_epochs="5" \
@@ -17,7 +16,7 @@ python run_speech_recognition_ctc.py \
     --evaluation_strategy="epoch" \
     --save_strategy="epoch" \
     --text_column_name="sentence" \
-    --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … –
+    --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
     --logging_steps="100" \
     --layerdrop="0.0" \
     --activation_dropout="0.1" \
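Note: removing --preprocessing_only means this run no longer stops after preprocessing, and the trailing backslash added after --chars_to_ignore keeps --logging_steps and the later flags on the same command line instead of being parsed as a separate command. Inside the script these characters are joined into a character-class regex used by remove_special_characters; a rough sketch of that idea (not the script's verbatim construction; re.escape is added here for safety):

import re

# Hypothetical stand-in for data_args.chars_to_ignore parsed from the flag above.
chars_to_ignore = [",", "?", ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "—", "’", "…", "–"]
chars_to_ignore_regex = f'[{"".join(re.escape(c) for c in chars_to_ignore)}]'
print(re.sub(chars_to_ignore_regex, "", "Hej, Världen!").lower() + " ")  # -> "hej världen "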
run_speech_recognition_ctc.py
CHANGED
@@ -321,25 +321,20 @@ def create_vocabulary_from_data(
     pad_token: Optional[str] = None,
 ):
     # Given training and test labels create vocabulary
-    def extract_all_chars(batch):
-        all_text = " ".join(batch["target_text"])
-        vocab = list(set(all_text))
-        return {"vocab": [vocab], "all_text": [all_text]}
+    def extract_all_chars(batch, vocab):
+        all_text = " ".join(batch)
+        return list(set(list(set(all_text)) + vocab))
 
-    vocabs = datasets.map(
-        extract_all_chars,
-        batched=True,
-        batch_size=-1,
-        keep_in_memory=True,
-        remove_columns=datasets["train"].column_names,
-    )
-
-    # take union of all unique characters in each dataset
-    vocab_set = functools.reduce(
-        lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
-    )
+    batch_size = 10000
+    vocab = []
+    for i in range(0, datasets["train"].num_rows, 10000):
+        batch = datasets["train"].select(range(i, min(datasets["train"].num_rows, i+batch_size)))
+        vocab = extract_all_chars(batch["target_text"], vocab)
+    for i in range(0, datasets["eval"].num_rows, 10000):
+        batch = datasets["eval"].select(range(i, min(datasets["eval"].num_rows, i+batch_size)))
+        vocab = extract_all_chars(batch["target_text"], vocab)
 
-    vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))}
+    vocab_dict = {v: k for k, v in enumerate(sorted(vocab))}
 
     # replace white space with delimiter token
     if word_delimiter_token is not None:
@@ -458,7 +453,7 @@ def main():
             )
             min_columns_train = common_cols(min_columns_train, new_dataset.column_names)
         else:
-            logging.warning(f"{dataset_name} {dataset_config_name} as split is {train_split_name}")
+            logging.warning(f"{dataset_name} {dataset_config_name} train not loaded as split is {train_split_name}")
 
     if data_args.audio_column_name not in raw_datasets["train"].column_names:
         raise ValueError(
@@ -512,7 +507,7 @@ def main():
             )
             min_columns_eval = common_cols(min_columns_eval, new_dataset.column_names)
         else:
-            logging.warning(f"{dataset_name} {dataset_config_name} eval not loaded as split is {eval_split_name}")
+            logging.warning(f"{dataset_name} {dataset_config_name} eval not loaded as split is {eval_split_name}")
 
     if data_args.max_eval_samples is not None:
         raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
@@ -536,9 +531,32 @@ def main():
 
     def remove_special_characters(batch):
         if chars_to_ignore_regex is not None:
-            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
+            batch["target_text"] = \
+                re.sub(chars_to_ignore_regex, "", batch[text_column_name]) \
+                .replace("\\\\Punkt", "") \
+                .replace("\\\\Komma", "") \
+                .replace("è", "e") \
+                .replace("é", "e") \
+                .replace("î", "i") \
+                .replace("ü", "u") \
+                .replace("ÿ", "y") \
+                .replace("\\", "") \
+                .replace("/", "") \
+                .replace("|", "") \
+                .lower() + " "
         else:
-            batch["target_text"] = batch[text_column_name]
+            batch["target_text"] = batch[text_column_name] \
+                .replace("\\\\Punkt", "") \
+                .replace("\\\\Komma", "") \
+                .replace("è", "e") \
+                .replace("é", "e") \
+                .replace("î", "i") \
+                .replace("ü", "u") \
+                .replace("ÿ", "y") \
+                .replace("\\", "") \
+                .replace("/", "") \
+                .replace("|", "") \
+                .lower() + " "
         return batch
 
     num_workers = data_args.preprocessing_num_workers
@@ -694,9 +712,16 @@ def main():
         return batch
 
     with training_args.main_process_first(desc="dataset map preprocessing"):
-        vectorized_datasets = raw_datasets.map(
+        vectorized_datasets = DatasetDict()
+        vectorized_datasets["train"] = raw_datasets["train"].map(
+            prepare_dataset,
+            remove_columns=raw_datasets["train"].column_names,
+            num_proc=num_workers,
+            desc="preprocess datasets",
+        )
+        vectorized_datasets["eval"] = raw_datasets["eval"].map(
             prepare_dataset,
-            remove_columns=next(iter(raw_datasets.values())).column_names,
+            remove_columns=raw_datasets["eval"].column_names,
             num_proc=num_workers,
             desc="preprocess datasets",
         )
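Note: the vocabulary is now built by scanning each split in 10000-row chunks instead of one datasets.map over everything, and preprocessing maps "train" and "eval" separately through a DatasetDict so each split drops its own columns. A toy illustration of the new batched character extraction (dataset contents below are made up):

from datasets import Dataset, DatasetDict

# Tiny stand-in for the real train/eval splits passed to create_vocabulary_from_data.
datasets = DatasetDict({
    "train": Dataset.from_dict({"target_text": ["hej där ", "god morgon "]}),
    "eval": Dataset.from_dict({"target_text": ["tack så mycket "]}),
})

def extract_all_chars(texts, vocab):
    all_text = " ".join(texts)
    return list(set(list(set(all_text)) + vocab))

batch_size = 10000
vocab = []
for split in ("train", "eval"):
    ds = datasets[split]
    for i in range(0, ds.num_rows, batch_size):
        batch = ds.select(range(i, min(ds.num_rows, i + batch_size)))
        vocab = extract_all_chars(batch["target_text"], vocab)

vocab_dict = {v: k for k, v in enumerate(sorted(vocab))}
print(vocab_dict)  # every character seen in either split, mapped to a consecutive id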
special_tokens_map.json
CHANGED
@@ -1 +1 @@
-{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
vocab.json
ADDED
@@ -0,0 +1 @@
+{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "ä": 27, "å": 28, "ô": 29, "ö": 30, "|": 0, "[UNK]": 31, "[PAD]": 32}
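Note: in the new vocab "|" at id 0 is the word delimiter and "[PAD]" at id 32 doubles as the CTC blank (config.json points pad_token_id at it). A minimal sketch of loading the file directly, assuming this commit's vocab.json is on disk:

from transformers import Wav2Vec2CTCTokenizer

# Loads the vocab file by path rather than via from_pretrained.
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
)
print(tokenizer.convert_tokens_to_ids(["h", "e", "j", "|"]))  # [8, 5, 10, 0]
print(tokenizer.pad_token_id)                                 # 32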