Training in progress, step 5

Browse files

Files changed (11) hide show

.gitignore +1 -0
added_tokens.json +1 -0
config.json +1 -1
preprocessor_config.json +0 -1
pytorch_model.bin +3 -0
run.sh +1 -1
special_tokens_map.json +1 -0
test_install.py +18 -0
tokenizer_config.json +1 -0
training_args.bin +3 -0
vocab.json +1 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ checkpoint-*/

added_tokens.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"<s>": 51, "</s>": 52}

config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "mattchurgin/xls-r-eng",
   "activation_dropout": 0.0,
   "adapter_kernel_size": 3,
   "adapter_stride": 2,

 {
+  "_name_or_path": "hf-test/xls-r-dummy",
   "activation_dropout": 0.0,
   "adapter_kernel_size": 3,
   "adapter_stride": 2,

preprocessor_config.json CHANGED Viewed

@@ -1,5 +1,4 @@
 {
-  "_processor_class": null,
   "do_normalize": true,
   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
   "feature_size": 1,

 {
   "do_normalize": true,
   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
   "feature_size": 1,

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58eddd2cf0682445faa984d804066cdcb33d226b8255e030fa92403cfa2f100b
+size 143910

run.sh CHANGED Viewed

@@ -1,6 +1,6 @@
 python run_speech_recognition_ctc.py \
 	--dataset_name="mozilla-foundation/common_voice_7_0" \
-	--model_name_or_path="mattchurgin/xls-r-eng" \
 	--dataset_config_name="ab" \
 	--output_dir="./" \
 	--overwrite_output_dir \

 python run_speech_recognition_ctc.py \
 	--dataset_name="mozilla-foundation/common_voice_7_0" \
+	--model_name_or_path="hf-test/xls-r-dummy" \
 	--dataset_config_name="ab" \
 	--output_dir="./" \
 	--overwrite_output_dir \

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}

test_install.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from transformers import AutoModelForCTC, AutoProcessor
+from datasets import load_dataset
+import torch
+dummy_dataset = load_dataset("common_voice", "ab", split="test")
+model = AutoModelForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
+model.to("cuda")
+processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
+input_values = processor(dummy_dataset[0]["audio"]["array"], return_tensors="pt", sampling_rate=16_000).input_values
+input_values = input_values.to("cuda")
+logits = model(input_values).logits
+assert logits.shape[-1] == 32

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dbdf2d3958633fdb29fe0d2d7d5677f2ba26d588d895af42a9d045f43bd17f85
+size 2991

vocab.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"!": 1, ",": 2, "-": 3, ".": 4, ":": 5, ";": 6, "?": 7, "а": 8, "б": 9, "в": 10, "г": 11, "д": 12, "е": 13, "ж": 14, "з": 15, "и": 16, "к": 17, "л": 18, "м": 19, "н": 20, "о": 21, "п": 22, "р": 23, "с": 24, "т": 25, "у": 26, "ф": 27, "х": 28, "ц": 29, "ч": 30, "ш": 31, "ы": 32, "ь": 33, "џ": 34, "қ": 35, "ҟ": 36, "ҩ": 37, "ҭ": 38, "ҳ": 39, "ҵ": 40, "ҷ": 41, "ҽ": 42, "ҿ": 43, "ә": 44, "ӡ": 45, "ӷ": 46, "ԥ": 47, "–": 48, "|": 0, "[UNK]": 49, "[PAD]": 50}