diff --git "a/basaa_training_script.ipynb" "b/basaa_training_script.ipynb" new file mode 100644--- /dev/null +++ "b/basaa_training_script.ipynb" @@ -0,0 +1,1925 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HuggingFace challenge - Debugger notebook\n", + "Run this notebook to verify your libraries versions, check GPU config and run a quick training" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "T2utsYSKszvv" + }, + "outputs": [], + "source": [ + "import platform\n", + "import multiprocessing\n", + "\n", + "import torch\n", + "import transformers\n", + "import datasets\n", + "\n", + "import soundfile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print main infos" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5P6I-W9ts-kR", + "outputId": "939bd550-1486-46a6-8371-e82ada0f448c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10\n", + "CPU cores: 60\n", + "Python version: 3.8.8\n", + "PyTorch version: 1.10.1+cu102\n", + "GPU is visible: True\n", + "Transformers version: 4.16.0.dev0\n", + "Datasets version: 1.17.1.dev0\n", + "soundfile version: 0.10.3\n" + ] + } + ], + "source": [ + "print(f\"Platform: {platform.platform()}\")\n", + "print(f\"CPU cores: {multiprocessing.cpu_count()}\")\n", + "\n", + "print(f\"Python version: {platform.python_version()}\")\n", + "\n", + "print(f\"PyTorch version: {torch.__version__}\")\n", + "print(f\"GPU is visible: {torch.cuda.is_available()}\")\n", + "\n", + "print(f\"Transformers version: {transformers.__version__}\")\n", + "print(f\"Datasets version: {datasets.__version__}\")\n", + "\n", + "print(f\"soundfile version: {soundfile.__version__}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check your GPU informations (if any)\n", + "If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100s 32GB).\n", + "Driver and CUDA version " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YT7fRnKctggU", + "outputId": "f355a3e0-20da-489f-bd1f-5e508e792a68" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mon Jan 24 17:23:29 2022 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 470.57.02 Driver Version: 470.57.02 CUDA Version: 11.4 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|===============================+======================+======================|\n", + "| 0 Tesla V100S-PCI... 
Off | 00000000:00:06.0 Off | 0 |\n", + "| N/A 36C P0 26W / 250W | 4MiB / 32510MiB | 0% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n" + ] + } + ], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2fa897b4afc049229144599af9e3f807", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='
\\n] 29.64K --.-KB/s in 0.001s \n", + "\n", + "2022-01-22 15:01:09 (20.1 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30348/30348]\n", + "\n" + ] + } + ], + "source": [ + "!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# \t--learning_rate=\"7.5e-5\" \\\n", + "# 84.5" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Mz4bubhxxsad", + "outputId": "23398525-cc19-43c2-9fec-497e06214f29" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "01/25/2022 02:48:41 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: True\n", + "01/25/2022 02:48:41 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", + "_n_gpu=1,\n", + "adafactor=False,\n", + "adam_beta1=0.9,\n", + "adam_beta2=0.999,\n", + "adam_epsilon=1e-08,\n", + "bf16=False,\n", + "bf16_full_eval=False,\n", + "dataloader_drop_last=False,\n", + "dataloader_num_workers=0,\n", + "dataloader_pin_memory=True,\n", + "ddp_bucket_cap_mb=None,\n", + "ddp_find_unused_parameters=None,\n", + "debug=[],\n", + "deepspeed=None,\n", + "disable_tqdm=False,\n", + "do_eval=True,\n", + "do_predict=False,\n", + "do_train=True,\n", + "eval_accumulation_steps=None,\n", + "eval_steps=500,\n", + "evaluation_strategy=IntervalStrategy.STEPS,\n", + "fp16=True,\n", + "fp16_backend=auto,\n", + "fp16_full_eval=False,\n", + "fp16_opt_level=O1,\n", + "gradient_accumulation_steps=1,\n", + "gradient_checkpointing=True,\n", + "greater_is_better=None,\n", + "group_by_length=True,\n", + "half_precision_backend=auto,\n", + "hub_model_id=None,\n", + "hub_strategy=HubStrategy.EVERY_SAVE,\n", + "hub_token=,\n", + "ignore_data_skip=False,\n", + "label_names=None,\n", + "label_smoothing_factor=0.0,\n", + "learning_rate=7e-05,\n", + "length_column_name=input_length,\n", + "load_best_model_at_end=False,\n", + "local_rank=-1,\n", + "log_level=-1,\n", + "log_level_replica=-1,\n", + "log_on_each_node=True,\n", + "logging_dir=./wav2vec2-large-xls-r-300m-basaa/runs/Jan25_02-48-41_job-8be8b741-e32e-4579-bbec-1e00d9824b4f,\n", + "logging_first_step=False,\n", + "logging_nan_inf_filter=True,\n", + "logging_steps=100,\n", + "logging_strategy=IntervalStrategy.STEPS,\n", + "lr_scheduler_type=SchedulerType.LINEAR,\n", + "max_grad_norm=1.0,\n", + "max_steps=-1,\n", + "metric_for_best_model=None,\n", + "mp_parameters=,\n", + "no_cuda=False,\n", + "num_train_epochs=200.0,\n", + "optim=OptimizerNames.ADAMW_HF,\n", + "output_dir=./wav2vec2-large-xls-r-300m-basaa,\n", + "overwrite_output_dir=True,\n", + "past_index=-1,\n", + "per_device_eval_batch_size=32,\n", + "per_device_train_batch_size=32,\n", + "prediction_loss_only=False,\n", + "push_to_hub=True,\n", + "push_to_hub_model_id=None,\n", + "push_to_hub_organization=None,\n", + "push_to_hub_token=,\n", + "remove_unused_columns=True,\n", + "report_to=[],\n", + "resume_from_checkpoint=None,\n", + "run_name=./wav2vec2-large-xls-r-300m-basaa,\n", + "save_on_each_node=False,\n", + "save_steps=500,\n", + "save_strategy=IntervalStrategy.STEPS,\n", + "save_total_limit=2,\n", + "seed=42,\n", + "sharded_ddp=[],\n", + "skip_memory_metrics=True,\n", + "tf32=None,\n", + "tpu_metrics_debug=False,\n", + "tpu_num_cores=None,\n", + 
"use_legacy_prediction_loop=False,\n", + "warmup_ratio=0.0,\n", + "warmup_steps=500,\n", + "weight_decay=0.0,\n", + "xpu_backend=None,\n", + ")\n", + "01/25/2022 02:48:43 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bas/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n", + "01/25/2022 02:48:46 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bas/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n", + "01/25/2022 02:48:46 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bas/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba/cache-3a6331ffb89c104d.arrow\n", + "01/25/2022 02:48:46 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bas/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba/cache-5e7d861af8466195.arrow\n", + "loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n", + "Model config Wav2Vec2Config {\n", + " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n", + " \"activation_dropout\": 0.0,\n", + " \"adapter_kernel_size\": 3,\n", + " \"adapter_stride\": 2,\n", + " \"add_adapter\": false,\n", + " \"apply_spec_augment\": true,\n", + " \"architectures\": [\n", + " \"Wav2Vec2ForPreTraining\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"bos_token_id\": 1,\n", + " \"classifier_proj_size\": 256,\n", + " \"codevector_dim\": 768,\n", + " \"contrastive_logits_temperature\": 0.1,\n", + " \"conv_bias\": true,\n", + " \"conv_dim\": [\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512\n", + " ],\n", + " \"conv_kernel\": [\n", + " 10,\n", + " 3,\n", + " 3,\n", + " 3,\n", + " 3,\n", + " 2,\n", + " 2\n", + " ],\n", + " \"conv_stride\": [\n", + " 5,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2\n", + " ],\n", + " \"ctc_loss_reduction\": \"sum\",\n", + " \"ctc_zero_infinity\": false,\n", + " \"diversity_loss_weight\": 0.1,\n", + " \"do_stable_layer_norm\": true,\n", + " \"eos_token_id\": 2,\n", + " \"feat_extract_activation\": \"gelu\",\n", + " \"feat_extract_dropout\": 0.0,\n", + " \"feat_extract_norm\": \"layer\",\n", + " \"feat_proj_dropout\": 0.1,\n", + " \"feat_quantizer_dropout\": 0.0,\n", + " \"final_dropout\": 0.0,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout\": 0.1,\n", + " \"hidden_size\": 1024,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 4096,\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"layerdrop\": 0.1,\n", + " \"mask_feature_length\": 10,\n", + " \"mask_feature_min_masks\": 0,\n", + " \"mask_feature_prob\": 0.0,\n", + " \"mask_time_length\": 10,\n", + " \"mask_time_min_masks\": 2,\n", + " \"mask_time_prob\": 0.075,\n", + " \"model_type\": \"wav2vec2\",\n", + " \"num_adapter_layers\": 3,\n", + " \"num_attention_heads\": 16,\n", + " \"num_codevector_groups\": 2,\n", + " \"num_codevectors_per_group\": 320,\n", + " \"num_conv_pos_embedding_groups\": 16,\n", + " 
\"num_conv_pos_embeddings\": 128,\n", + " \"num_feat_extract_layers\": 7,\n", + " \"num_hidden_layers\": 24,\n", + " \"num_negatives\": 100,\n", + " \"output_hidden_size\": 1024,\n", + " \"pad_token_id\": 0,\n", + " \"proj_codevector_dim\": 768,\n", + " \"tdnn_dilation\": [\n", + " 1,\n", + " 2,\n", + " 3,\n", + " 1,\n", + " 1\n", + " ],\n", + " \"tdnn_dim\": [\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 1500\n", + " ],\n", + " \"tdnn_kernel\": [\n", + " 5,\n", + " 3,\n", + " 3,\n", + " 1,\n", + " 1\n", + " ],\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.16.0.dev0\",\n", + " \"use_weighted_layer_sum\": false,\n", + " \"vocab_size\": 32,\n", + " \"xvector_output_dim\": 512\n", + "}\n", + "\n", + "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 19.86ba/s]\n", + "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 38.36ba/s]\n", + "Didn't find file ./wav2vec2-large-xls-r-300m-basaa/tokenizer_config.json. We won't load it.\n", + "Didn't find file ./wav2vec2-large-xls-r-300m-basaa/added_tokens.json. We won't load it.\n", + "Didn't find file ./wav2vec2-large-xls-r-300m-basaa/special_tokens_map.json. We won't load it.\n", + "Didn't find file ./wav2vec2-large-xls-r-300m-basaa/tokenizer.json. We won't load it.\n", + "loading file ./wav2vec2-large-xls-r-300m-basaa/vocab.json\n", + "loading file None\n", + "loading file None\n", + "loading file None\n", + "loading file None\n", + "file ./wav2vec2-large-xls-r-300m-basaa/config.json not found\n", + "Adding to the vocabulary\n", + "Adding to the vocabulary\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n", + "Model config Wav2Vec2Config {\n", + " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n", + " \"activation_dropout\": 0.0,\n", + " \"adapter_kernel_size\": 3,\n", + " \"adapter_stride\": 2,\n", + " \"add_adapter\": false,\n", + " \"apply_spec_augment\": true,\n", + " \"architectures\": [\n", + " \"Wav2Vec2ForPreTraining\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"bos_token_id\": 1,\n", + " \"classifier_proj_size\": 256,\n", + " \"codevector_dim\": 768,\n", + " \"contrastive_logits_temperature\": 0.1,\n", + " \"conv_bias\": true,\n", + " \"conv_dim\": [\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512\n", + " ],\n", + " \"conv_kernel\": [\n", + " 10,\n", + " 3,\n", + " 3,\n", + " 3,\n", + " 3,\n", + " 2,\n", + " 2\n", + " ],\n", + " \"conv_stride\": [\n", + " 5,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2\n", + " ],\n", + " \"ctc_loss_reduction\": \"sum\",\n", + " \"ctc_zero_infinity\": false,\n", + " \"diversity_loss_weight\": 0.1,\n", + " \"do_stable_layer_norm\": true,\n", + " \"eos_token_id\": 2,\n", + " \"feat_extract_activation\": \"gelu\",\n", + " \"feat_extract_dropout\": 0.0,\n", + " \"feat_extract_norm\": \"layer\",\n", + " \"feat_proj_dropout\": 0.1,\n", + " \"feat_quantizer_dropout\": 0.0,\n", + " \"final_dropout\": 0.0,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout\": 0.1,\n", + " \"hidden_size\": 1024,\n", + " \"initializer_range\": 0.02,\n", + " 
\"intermediate_size\": 4096,\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"layerdrop\": 0.1,\n", + " \"mask_feature_length\": 10,\n", + " \"mask_feature_min_masks\": 0,\n", + " \"mask_feature_prob\": 0.0,\n", + " \"mask_time_length\": 10,\n", + " \"mask_time_min_masks\": 2,\n", + " \"mask_time_prob\": 0.075,\n", + " \"model_type\": \"wav2vec2\",\n", + " \"num_adapter_layers\": 3,\n", + " \"num_attention_heads\": 16,\n", + " \"num_codevector_groups\": 2,\n", + " \"num_codevectors_per_group\": 320,\n", + " \"num_conv_pos_embedding_groups\": 16,\n", + " \"num_conv_pos_embeddings\": 128,\n", + " \"num_feat_extract_layers\": 7,\n", + " \"num_hidden_layers\": 24,\n", + " \"num_negatives\": 100,\n", + " \"output_hidden_size\": 1024,\n", + " \"pad_token_id\": 0,\n", + " \"proj_codevector_dim\": 768,\n", + " \"tdnn_dilation\": [\n", + " 1,\n", + " 2,\n", + " 3,\n", + " 1,\n", + " 1\n", + " ],\n", + " \"tdnn_dim\": [\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 1500\n", + " ],\n", + " \"tdnn_kernel\": [\n", + " 5,\n", + " 3,\n", + " 3,\n", + " 1,\n", + " 1\n", + " ],\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.16.0.dev0\",\n", + " \"use_weighted_layer_sum\": false,\n", + " \"vocab_size\": 32,\n", + " \"xvector_output_dim\": 512\n", + "}\n", + "\n", + "loading feature extractor configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/preprocessor_config.json from cache at /workspace/.cache/huggingface/transformers/6fb028b95b394059e7d3b367bbca2382b576c66aebe896f04d2cd34e1b575f5b.d4484dc1c81456a2461485e7168b04347a7b9a4e3b1ef3aba723323b33e12326\n", + "Feature extractor Wav2Vec2FeatureExtractor {\n", + " \"do_normalize\": true,\n", + " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n", + " \"feature_size\": 1,\n", + " \"padding_side\": \"right\",\n", + " \"padding_value\": 0,\n", + " \"return_attention_mask\": true,\n", + " \"sampling_rate\": 16000\n", + "}\n", + "\n", + "loading weights file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/pytorch_model.bin from cache at /workspace/.cache/huggingface/transformers/1e6a6507f3b689035cd4b247e2a37c154e27f39143f31357a49b4e38baeccc36.1edb32803799e27ed554eb7dd935f6745b1a0b17b0ea256442fe24db6eb546cd\n", + "Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'project_q.bias', 'project_hid.bias', 'quantizer.weight_proj.weight', 'project_q.weight', 'quantizer.weight_proj.bias', 'project_hid.weight']\n", + "- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "preprocess datasets: 100%|█████████████████| 1008/1008 [00:07<00:00, 138.56ex/s]\n", + "preprocess datasets: 100%|███████████████████| 375/375 [00:02<00:00, 153.03ex/s]\n", + "100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 995.09ba/s]\n", + "100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 970.45ba/s]\n", + "Configuration saved in ./wav2vec2-large-xls-r-300m-basaa/preprocessor_config.json\n", + "tokenizer config file saved in ./wav2vec2-large-xls-r-300m-basaa/tokenizer_config.json\n", + "Special tokens file saved in ./wav2vec2-large-xls-r-300m-basaa/special_tokens_map.json\n", + "added tokens file saved in ./wav2vec2-large-xls-r-300m-basaa/added_tokens.json\n", + "Configuration saved in ./wav2vec2-large-xls-r-300m-basaa/config.json\n", + "loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-basaa/preprocessor_config.json\n", + "loading configuration file ./wav2vec2-large-xls-r-300m-basaa/config.json\n", + "Model config Wav2Vec2Config {\n", + " \"_name_or_path\": \"./wav2vec2-large-xls-r-300m-basaa\",\n", + " \"activation_dropout\": 0.1,\n", + " \"adapter_kernel_size\": 3,\n", + " \"adapter_stride\": 2,\n", + " \"add_adapter\": false,\n", + " \"apply_spec_augment\": true,\n", + " \"architectures\": [\n", + " \"Wav2Vec2ForPreTraining\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"classifier_proj_size\": 256,\n", + " \"codevector_dim\": 768,\n", + " \"contrastive_logits_temperature\": 0.1,\n", + " \"conv_bias\": true,\n", + " \"conv_dim\": [\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512\n", + " ],\n", + " \"conv_kernel\": [\n", + " 10,\n", + " 3,\n", + " 3,\n", + " 3,\n", + " 3,\n", + " 2,\n", + " 2\n", + " ],\n", + " \"conv_stride\": [\n", + " 5,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2\n", + " ],\n", + " \"ctc_loss_reduction\": \"mean\",\n", + " \"ctc_zero_infinity\": false,\n", + " \"diversity_loss_weight\": 0.1,\n", + " \"do_stable_layer_norm\": true,\n", + " \"eos_token_id\": 2,\n", + " \"feat_extract_activation\": \"gelu\",\n", + " \"feat_extract_dropout\": 0.0,\n", + " \"feat_extract_norm\": \"layer\",\n", + " \"feat_proj_dropout\": 0.0,\n", + " \"feat_quantizer_dropout\": 0.0,\n", + " \"final_dropout\": 0.0,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout\": 0.0,\n", + " \"hidden_size\": 1024,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 4096,\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"layerdrop\": 0.0,\n", + " \"mask_feature_length\": 64,\n", + " \"mask_feature_min_masks\": 0,\n", + " \"mask_feature_prob\": 0.25,\n", + " \"mask_time_length\": 10,\n", + " \"mask_time_min_masks\": 2,\n", + " \"mask_time_prob\": 0.75,\n", + " \"model_type\": \"wav2vec2\",\n", + " \"num_adapter_layers\": 3,\n", + " \"num_attention_heads\": 16,\n", + " \"num_codevector_groups\": 2,\n", + 
" \"num_codevectors_per_group\": 320,\n", + " \"num_conv_pos_embedding_groups\": 16,\n", + " \"num_conv_pos_embeddings\": 128,\n", + " \"num_feat_extract_layers\": 7,\n", + " \"num_hidden_layers\": 24,\n", + " \"num_negatives\": 100,\n", + " \"output_hidden_size\": 1024,\n", + " \"pad_token_id\": 30,\n", + " \"proj_codevector_dim\": 768,\n", + " \"tdnn_dilation\": [\n", + " 1,\n", + " 2,\n", + " 3,\n", + " 1,\n", + " 1\n", + " ],\n", + " \"tdnn_dim\": [\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 512,\n", + " 1500\n", + " ],\n", + " \"tdnn_kernel\": [\n", + " 5,\n", + " 3,\n", + " 3,\n", + " 1,\n", + " 1\n", + " ],\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.16.0.dev0\",\n", + " \"use_weighted_layer_sum\": false,\n", + " \"vocab_size\": 33,\n", + " \"xvector_output_dim\": 512\n", + "}\n", + "\n", + "loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-basaa/preprocessor_config.json\n", + "Feature extractor Wav2Vec2FeatureExtractor {\n", + " \"do_normalize\": true,\n", + " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n", + " \"feature_size\": 1,\n", + " \"padding_side\": \"right\",\n", + " \"padding_value\": 0,\n", + " \"return_attention_mask\": true,\n", + " \"sampling_rate\": 16000\n", + "}\n", + "\n", + "Didn't find file ./wav2vec2-large-xls-r-300m-basaa/tokenizer.json. We won't load it.\n", + "loading file ./wav2vec2-large-xls-r-300m-basaa/vocab.json\n", + "loading file ./wav2vec2-large-xls-r-300m-basaa/tokenizer_config.json\n", + "loading file ./wav2vec2-large-xls-r-300m-basaa/added_tokens.json\n", + "loading file ./wav2vec2-large-xls-r-300m-basaa/special_tokens_map.json\n", + "loading file None\n", + "Adding to the vocabulary\n", + "Adding to the vocabulary\n", + "Cloning https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-basaa into local empty directory.\n", + "01/25/2022 02:49:13 - WARNING - huggingface_hub.repository - Cloning https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-basaa into local empty directory.\n", + "Using amp half precision backend\n", + "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n", + "/opt/conda/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use thePyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n", + "***** Running training *****\n", + " Num examples = 1008\n", + " Num Epochs = 200\n", + " Instantaneous batch size per device = 32\n", + " Total train batch size (w. 
parallel, distributed & accumulation) = 32\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 6400\n", + "{'loss': 22.3442, 'learning_rate': 1.3439999999999998e-05, 'epoch': 3.12} \n", + "{'loss': 5.9373, 'learning_rate': 2.7439999999999998e-05, 'epoch': 6.25} \n", + "{'loss': 3.7179, 'learning_rate': 4.1439999999999996e-05, 'epoch': 9.38} \n", + "{'loss': 3.0789, 'learning_rate': 5.544e-05, 'epoch': 12.5} \n", + "{'loss': 2.9287, 'learning_rate': 6.944e-05, 'epoch': 15.62} \n", + " 8%|██▉ | 500/6400 [12:29<2:18:06, 1.40s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n", + "***** Running Evaluation *****\n", + " Num examples = 375\n", + " Batch size = 32\n", + "\n", + " 0%| | 0/12 [00:00 main\n", + "\n", + "01/25/2022 05:49:03 - WARNING - huggingface_hub.repository - To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-basaa\n", + " 4004a5d..cba4cc3 main -> main\n", + "\n", + "Upload file pytorch_model.bin: 100%|███████| 1.18G/1.18G [01:03<00:00, 19.8MB/s]\n", + "Dropping the following result as it does not have all the necessary fields:\n", + "{'dataset': {'name': 'MOZILLA-FOUNDATION/COMMON_VOICE_7_0 - BAS', 'type': 'common_voice', 'args': 'Config: bas, Training split: train+validation, Eval split: test'}}\n", + "To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-basaa\n", + " cba4cc3..5a62cc8 main -> main\n", + "\n", + "01/25/2022 05:49:12 - WARNING - huggingface_hub.repository - To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-basaa\n", + " cba4cc3..5a62cc8 main -> main\n", + "\n" + ] + } + ], + "source": [ + "!python run_speech_recognition_ctc.py \\\n", + "\t--dataset_name=\"mozilla-foundation/common_voice_7_0\" \\\n", + "\t--model_name_or_path=\"facebook/wav2vec2-xls-r-300m\" \\\n", + "\t--dataset_config_name=\"bas\" \\\n", + "\t--output_dir=\"./wav2vec2-large-xls-r-300m-basaa\" \\\n", + "\t--overwrite_output_dir \\\n", + "\t--num_train_epochs=\"200\" \\\n", + "\t--per_device_train_batch_size=\"32\" \\\n", + "\t--per_device_eval_batch_size=\"32\" \\\n", + "\t--gradient_accumulation_steps=\"1\" \\\n", + "\t--learning_rate=\"7e-5\" \\\n", + "\t--warmup_steps=\"500\" \\\n", + "\t--length_column_name=\"input_length\" \\\n", + "\t--evaluation_strategy=\"steps\" \\\n", + "\t--text_column_name=\"sentence\" \\\n", + "\t--chars_to_ignore , ? . ! 
\\- \\; \\: \\\" “ % ‘ ” � — ’ … – \\\n", + "\t--save_steps=\"500\" \\\n", + "\t--eval_steps=\"500\" \\\n", + "\t--logging_steps=\"100\" \\\n", + "\t--layerdrop=\"0.0\" \\\n", + "\t--activation_dropout=\"0.1\" \\\n", + "\t--save_total_limit=\"2\" \\\n", + "\t--freeze_feature_encoder \\\n", + "\t--feat_proj_dropout=\"0.0\" \\\n", + "\t--mask_time_prob=\"0.75\" \\\n", + "\t--mask_time_length=\"10\" \\\n", + "\t--mask_feature_prob=\"0.25\" \\\n", + "\t--mask_feature_length=\"64\" \\\n", + "\t--gradient_checkpointing \\\n", + "\t--use_auth_token \\\n", + "\t--fp16 \\\n", + "\t--group_by_length \\\n", + "\t--do_train --do_eval \\\n", + " --push_to_hub" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# !rm -rf wav2vec2-large-xls-r-300m-basaa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!ls -ltr" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filesystem Size Used Avail Use% Mounted on\n", + "overlay 3.5T 963G 2.4T 29% /\n", + "tmpfs 64M 0 64M 0% /dev\n", + "tmpfs 87G 0 87G 0% /sys/fs/cgroup\n", + "tmpfs 87G 8.0K 87G 1% /dev/shm\n", + "/dev/md0 3.5T 963G 2.4T 29% /etc/group\n", + "tmpfs 87G 12K 87G 1% /proc/driver/nvidia\n", + "/dev/vda1 49G 6.4G 42G 14% /usr/bin/nvidia-smi\n", + "udev 87G 0 87G 0% /dev/nvidia0\n", + "tmpfs 87G 0 87G 0% /proc/acpi\n", + "tmpfs 87G 0 87G 0% /proc/scsi\n", + "tmpfs 87G 0 87G 0% /sys/firmware\n" + ] + } + ], + "source": [ + "!df -h" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading and preparing dataset common_voice/bas to /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bas/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "65173ad791c049b582dc4533c3555696", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/52.1M [00:00\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcommon_voice_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcommon_voice_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremove_columns\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"accent\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"age\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"client_id\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"down_votes\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"gender\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"locale\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"segment\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"up_votes\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mcommon_voice_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcommon_voice_test\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremove_columns\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"accent\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"age\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"client_id\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"down_votes\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"gender\"\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m\"locale\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"segment\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"up_votes\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"Dataset\"\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"self\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 521\u001b[0m \u001b[0;31m# apply actual function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 522\u001b[0;31m \u001b[0mout\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Dataset\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"DatasetDict\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 523\u001b[0m \u001b[0mdatasets\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Dataset\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdataset\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdatasets\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/fingerprint.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[0;31m# Call actual function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 410\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 411\u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 412\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 413\u001b[0m \u001b[0;31m# Update fingerprint of in-place transforms + update in-place history of transforms\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py\u001b[0m in \u001b[0;36mremove_columns\u001b[0;34m(self, column_names, new_fingerprint)\u001b[0m\n\u001b[1;32m 1498\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mcolumn_name\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mcolumn_names\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1499\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolumn_name\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1500\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 1501\u001b[0m \u001b[0;34mf\"Column name {column_name} not in the dataset. \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1502\u001b[0m \u001b[0;34mf\"Current columns in the dataset: {dataset._data.column_names}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Column name accent not in the dataset. Current columns in the dataset: ['path', 'audio', 'sentence']" + ] + } + ], + "source": [ + "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n", + "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import ClassLabel\n", + "import random\n", + "import pandas as pd\n", + "from IPython.display import display, HTML\n", + "\n", + "def show_random_elements(dataset, num_examples=10):\n", + " assert num_examples <= len(dataset), \"Can't pick more elements than there are in the dataset.\"\n", + " picks = []\n", + " for _ in range(num_examples):\n", + " pick = random.randint(0, len(dataset)-1)\n", + " while pick in picks:\n", + " pick = random.randint(0, len(dataset)-1)\n", + " picks.append(pick)\n", + " \n", + " df = pd.DataFrame(dataset[picks])\n", + " display(HTML(df.to_html()))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
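The `remove_columns` call above raises a ValueError because this cached "bas" split only exposes `path`, `audio`, and `sentence`; the usual Common Voice metadata columns are not present. A minimal, hedged sketch of a defensive variant that drops only the columns that actually exist (it reuses the `common_voice_train`/`common_voice_test` datasets from above; `drop_existing` is an illustrative helper, not part of the original notebook):

```python
# Drop only those metadata columns that actually exist in the loaded split.
metadata_cols = ["accent", "age", "client_id", "down_votes",
                 "gender", "locale", "segment", "up_votes"]

def drop_existing(ds, cols):
    present = [c for c in cols if c in ds.column_names]
    return ds.remove_columns(present) if present else ds

common_voice_train = drop_existing(common_voice_train, metadata_cols)
common_voice_test = drop_existing(common_voice_test, metadata_cols)
```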
     sentence
0    Hiônde hi mbémba.
1    Me mbagbana le kiñe a tagbe.
2    Péé i sômi, i mbem kôsi.
3    Yokel yaga nyen u bike i mok.
4    Me mbada kômôl i wom.
5    Hibee hi mbudla.
6    Mudaa nunu a nyi téñ añgis yéé!
7    Me ñkon i jôl.
8    A nsugul bijek.
9    Di bak i ntoa pôôga i kôdôk.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "show_random_elements(common_voice_train.remove_columns([\"path\", \"audio\"]), num_examples=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "chars_to_remove_regex = '[\\,\\?\\.\\!\\-\\;\\:\\\"\\“\\%\\‘\\”\\�\\—\\’\\…\\–]'\n", + "\n", + "def remove_special_characters(batch):\n", + " batch[\"sentence\"] = re.sub(chars_to_remove_regex, '', batch[\"sentence\"]).lower()\n", + " return batch" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3ef8fa5c4a82429da47176ab713a7b66", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1008 [00:00 main\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "'https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-basaa/commit/1d27e6b41659e85ed1b87d2bb3db58e96472fe1f'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vocab_dict[\"|\"] = vocab_dict[\" \"]\n", + "del vocab_dict[\" \"]\n", + "\n", + "vocab_dict[\"[UNK]\"] = len(vocab_dict)\n", + "vocab_dict[\"[PAD]\"] = len(vocab_dict)\n", + "print(len(vocab_dict))\n", + "\n", + "import json\n", + "with open('./vocab.json', 'w') as vocab_file:\n", + " json.dump(vocab_dict, vocab_file)\n", + " \n", + "from transformers import Wav2Vec2CTCTokenizer\n", + "\n", + "tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(\"./\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n", + "\n", + "repo_name = \"wav2vec2-large-xls-r-300m-basaa\"\n", + "\n", + "tokenizer.push_to_hub(repo_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2022-01-25 05:51:53-- https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 4421 (4.3K) [text/plain]\n", + "Saving to: ‘eval.py’\n", + "\n", + "eval.py 100%[===================>] 4.32K --.-KB/s in 0s \n", + "\n", + "2022-01-25 05:51:53 (11.6 MB/s) - ‘eval.py’ saved [4421/4421]\n", + "\n", + "total 1232556\n", + "-rw-r--r-- 1 ovh ovh 272 Jan 25 02:49 vocab.json\n", + "-rw-r--r-- 1 ovh ovh 260 Jan 25 02:49 tokenizer_config.json\n", + "-rw-r--r-- 1 ovh ovh 309 Jan 25 02:49 special_tokens_map.json\n", + "-rw-r--r-- 1 ovh ovh 23 Jan 25 02:49 added_tokens.json\n", + "drwxr-xr-x 2 ovh ovh 4096 Jan 25 05:21 checkpoint-5500\n", + "drwxr-xr-x 2 ovh ovh 4096 Jan 25 05:35 checkpoint-6000\n", + "-rw-r--r-- 1 ovh ovh 197 Jan 25 05:46 train_results.json\n", + "-rw-r--r-- 1 ovh ovh 11278 Jan 25 05:46 trainer_state.json\n", + "-rw-r--r-- 1 ovh ovh 224 Jan 25 05:46 eval_results.json\n", + "-rw-r--r-- 1 ovh ovh 2033 Jan 25 05:46 config.json\n", + "-rw-r--r-- 1 ovh ovh 399 Jan 25 05:46 all_results.json\n", + "-rw-r--r-- 1 ovh ovh 1262058993 Jan 25 05:46 pytorch_model.bin\n", + "-rw-r--r-- 1 ovh ovh 3055 Jan 25 05:46 training_args.bin\n", + "-rw-r--r-- 1 ovh ovh 212 Jan 25 05:46 preprocessor_config.json\n", + "-rw-r--r-- 1 ovh ovh 2253 Jan 25 05:49 README.md\n", + "-rw-r--r-- 1 ovh ovh 4421 Jan 25 05:51 eval.py\n" + ] + } + ], + "source": [ + "!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py\n", + "!cp eval.py wav2vec2-large-xls-r-300m-basaa\n", + "!ls -ltr wav2vec2-large-xls-r-300m-basaa" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bas/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n", + "100%|█████████████████████████████████████████| 375/375 [03:03<00:00, 2.04ex/s]\n", + "WER: 1.0408274360370169\n", + "CER: 2.2848350566223536\n", + "100%|██████████████████████████████████████| 375/375 [00:00<00:00, 20474.93ex/s]\n" + ] + } + ], + "source": [ + "!cd wav2vec2-large-xls-r-300m-basaa; python eval.py --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config bas --split test --log_outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "24592b0be30e4eafb1949cf09d1c4fb4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/260 [00:00\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mlogits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogits\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m32\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m: 33" + ] + } + ], + "source": 
[ + "from transformers import AutoModelForCTC, AutoProcessor\n", + "from datasets import load_dataset\n", + "\n", + "model = AutoModelForCTC.from_pretrained(\"infinitejoy/wav2vec2-large-xls-r-300m-basaa\")\n", + "processor = AutoProcessor.from_pretrained(\"infinitejoy/wav2vec2-large-xls-r-300m-basaa\")\n", + "\n", + "input_values = processor(common_voice_test[0][\"audio\"][\"array\"], return_tensors=\"pt\", sampling_rate=16_000).input_values\n", + "# input_values = input_values.to(\"cuda\")\n", + "\n", + "logits = model(input_values).logits\n", + "\n", + "assert logits.shape[-1] == 32, logits.shape[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "authorship_tag": "ABX9TyM3OaMlm9YQtKpl28c8gBBd", + "include_colab_link": true, + "name": "DebugOVHTransformers.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}