{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HuggingFace challenge - Debugger notebook\n", "Run this notebook to verify your libraries versions, check GPU config and run a quick training" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "T2utsYSKszvv" }, "outputs": [], "source": [ "import platform\n", "import multiprocessing\n", "\n", "import torch\n", "import transformers\n", "import datasets\n", "\n", "import soundfile" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Print main infos" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5P6I-W9ts-kR", "outputId": "939bd550-1486-46a6-8371-e82ada0f448c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10\n", "CPU cores: 60\n", "Python version: 3.8.8\n", "PyTorch version: 1.10.1+cu102\n", "GPU is visible: True\n", "Transformers version: 4.16.0.dev0\n", "Datasets version: 1.17.1.dev0\n", "soundfile version: 0.10.3\n" ] } ], "source": [ "print(f\"Platform: {platform.platform()}\")\n", "print(f\"CPU cores: {multiprocessing.cpu_count()}\")\n", "\n", "print(f\"Python version: {platform.python_version()}\")\n", "\n", "print(f\"PyTorch version: {torch.__version__}\")\n", "print(f\"GPU is visible: {torch.cuda.is_available()}\")\n", "\n", "print(f\"Transformers version: {transformers.__version__}\")\n", "print(f\"Datasets version: {datasets.__version__}\")\n", "\n", "print(f\"soundfile version: {soundfile.__version__}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check your GPU informations (if any)\n", "If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100s 32GB).\n", "Driver and CUDA version " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YT7fRnKctggU", "outputId": 
"f355a3e0-20da-489f-bd1f-5e508e792a68" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Thu Jan 27 02:55:35 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 470.57.02 Driver Version: 470.57.02 CUDA Version: 11.4 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", "| 0 Tesla V100S-PCI... Off | 00000000:00:06.0 Off | 0 |\n", "| N/A 34C P0 25W / 250W | 4MiB / 32510MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } ], "source": [ "!nvidia-smi" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2fa897b4afc049229144599af9e3f807", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
\\n] 29.64K --.-KB/s in 0.001s \n", "\n", "2022-01-22 15:01:09 (20.1 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30348/30348]\n", "\n" ] } ], "source": [ "!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# \t--learning_rate=\"7.5e-5\" \\\n", "# 84.5" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Mz4bubhxxsad", "outputId": "23398525-cc19-43c2-9fec-497e06214f29" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "01/27/2022 03:05:04 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: True\n", "01/27/2022 03:05:04 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", "_n_gpu=1,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "do_eval=True,\n", "do_predict=False,\n", "do_train=True,\n", "eval_accumulation_steps=None,\n", "eval_steps=500,\n", "evaluation_strategy=IntervalStrategy.STEPS,\n", "fp16=True,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "gradient_accumulation_steps=1,\n", "gradient_checkpointing=True,\n", "greater_is_better=None,\n", "group_by_length=True,\n", "half_precision_backend=auto,\n", "hub_model_id=None,\n", "hub_strategy=HubStrategy.EVERY_SAVE,\n", "hub_token=,\n", "ignore_data_skip=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=7e-05,\n", "length_column_name=input_length,\n", 
"load_best_model_at_end=False,\n", "local_rank=-1,\n", "log_level=-1,\n", "log_level_replica=-1,\n", "log_on_each_node=True,\n", "logging_dir=./wav2vec2-large-xls-r-300m-finnish/runs/Jan27_03-05-04_job-8be8b741-e32e-4579-bbec-1e00d9824b4f,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=100,\n", "logging_strategy=IntervalStrategy.STEPS,\n", "lr_scheduler_type=SchedulerType.LINEAR,\n", "max_grad_norm=1.0,\n", "max_steps=-1,\n", "metric_for_best_model=None,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=70.0,\n", "optim=OptimizerNames.ADAMW_HF,\n", "output_dir=./wav2vec2-large-xls-r-300m-finnish,\n", "overwrite_output_dir=True,\n", "past_index=-1,\n", "per_device_eval_batch_size=32,\n", "per_device_train_batch_size=32,\n", "prediction_loss_only=False,\n", "push_to_hub=True,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=./wav2vec2-large-xls-r-300m-finnish,\n", "save_on_each_node=False,\n", "save_steps=500,\n", "save_strategy=IntervalStrategy.STEPS,\n", "save_total_limit=2,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_legacy_prediction_loop=False,\n", "warmup_ratio=0.0,\n", "warmup_steps=500,\n", "weight_decay=0.0,\n", "xpu_backend=None,\n", ")\n", "01/27/2022 03:05:06 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n", "01/27/2022 03:05:09 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n", "01/27/2022 03:05:09 - WARNING - datasets.arrow_dataset - Loading 
cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba/cache-02a235731bb40486.arrow\n", "01/27/2022 03:05:09 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba/cache-4a027b259934c0ca.arrow\n", "loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n", " \"activation_dropout\": 0.0,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.1,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"sum\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.1,\n", " \"feat_quantizer_dropout\": 0.0,\n", " 
\"final_dropout\": 0.0,\n", " \"gradient_checkpointing\": false,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.1,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.1,\n", " \"mask_feature_length\": 10,\n", " \"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.0,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.075,\n", " \"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 0,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.0.dev0\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 32,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 4.60ba/s]\n", "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 22.77ba/s]\n", "Didn't find file ./wav2vec2-large-xls-r-300m-finnish/tokenizer_config.json. We won't load it.\n", "Didn't find file ./wav2vec2-large-xls-r-300m-finnish/added_tokens.json. We won't load it.\n", "Didn't find file ./wav2vec2-large-xls-r-300m-finnish/special_tokens_map.json. We won't load it.\n", "Didn't find file ./wav2vec2-large-xls-r-300m-finnish/tokenizer.json. 
We won't load it.\n", "loading file ./wav2vec2-large-xls-r-300m-finnish/vocab.json\n", "loading file None\n", "loading file None\n", "loading file None\n", "loading file None\n", "file ./wav2vec2-large-xls-r-300m-finnish/config.json not found\n", "Adding to the vocabulary\n", "Adding to the vocabulary\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n", " \"activation_dropout\": 0.0,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.1,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"sum\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.1,\n", " \"feat_quantizer_dropout\": 0.0,\n", " \"final_dropout\": 0.0,\n", " \"gradient_checkpointing\": false,\n", 
" \"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.1,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.1,\n", " \"mask_feature_length\": 10,\n", " \"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.0,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.075,\n", " \"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 0,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.0.dev0\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 32,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "loading feature extractor configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/preprocessor_config.json from cache at /workspace/.cache/huggingface/transformers/6fb028b95b394059e7d3b367bbca2382b576c66aebe896f04d2cd34e1b575f5b.d4484dc1c81456a2461485e7168b04347a7b9a4e3b1ef3aba723323b33e12326\n", "Feature extractor Wav2Vec2FeatureExtractor {\n", " \"do_normalize\": true,\n", " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n", " \"feature_size\": 1,\n", " \"padding_side\": \"right\",\n", " \"padding_value\": 0,\n", " \"return_attention_mask\": true,\n", " \"sampling_rate\": 16000\n", "}\n", "\n", "loading weights 
file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/pytorch_model.bin from cache at /workspace/.cache/huggingface/transformers/1e6a6507f3b689035cd4b247e2a37c154e27f39143f31357a49b4e38baeccc36.1edb32803799e27ed554eb7dd935f6745b1a0b17b0ea256442fe24db6eb546cd\n", "Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'project_hid.bias', 'project_q.weight', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.bias', 'project_hid.weight']\n", "- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "preprocess datasets: 100%|█████████████████| 3627/3627 [00:26<00:00, 138.44ex/s]\n", "preprocess datasets: 100%|█████████████████| 1599/1599 [00:11<00:00, 133.52ex/s]\n", "100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 343.45ba/s]\n", "100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 491.54ba/s]\n", "Configuration saved in ./wav2vec2-large-xls-r-300m-finnish/preprocessor_config.json\n", "tokenizer config file saved in ./wav2vec2-large-xls-r-300m-finnish/tokenizer_config.json\n", "Special tokens file saved in ./wav2vec2-large-xls-r-300m-finnish/special_tokens_map.json\n", "added tokens file saved in 
./wav2vec2-large-xls-r-300m-finnish/added_tokens.json\n", "Configuration saved in ./wav2vec2-large-xls-r-300m-finnish/config.json\n", "loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-finnish/preprocessor_config.json\n", "loading configuration file ./wav2vec2-large-xls-r-300m-finnish/config.json\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"./wav2vec2-large-xls-r-300m-finnish\",\n", " \"activation_dropout\": 0.1,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"mean\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.0,\n", " \"feat_quantizer_dropout\": 0.0,\n", " \"final_dropout\": 0.0,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.0,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.0,\n", " \"mask_feature_length\": 64,\n", " \"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.25,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.75,\n", " 
\"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 32,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.0.dev0\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 35,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-finnish/preprocessor_config.json\n", "Feature extractor Wav2Vec2FeatureExtractor {\n", " \"do_normalize\": true,\n", " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n", " \"feature_size\": 1,\n", " \"padding_side\": \"right\",\n", " \"padding_value\": 0,\n", " \"return_attention_mask\": true,\n", " \"sampling_rate\": 16000\n", "}\n", "\n", "Didn't find file ./wav2vec2-large-xls-r-300m-finnish/tokenizer.json. 
We won't load it.\n", "loading file ./wav2vec2-large-xls-r-300m-finnish/vocab.json\n", "loading file ./wav2vec2-large-xls-r-300m-finnish/tokenizer_config.json\n", "loading file ./wav2vec2-large-xls-r-300m-finnish/added_tokens.json\n", "loading file ./wav2vec2-large-xls-r-300m-finnish/special_tokens_map.json\n", "loading file None\n", "Adding to the vocabulary\n", "Adding to the vocabulary\n", "Cloning https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-finnish into local empty directory.\n", "01/27/2022 03:06:06 - WARNING - huggingface_hub.repository - Cloning https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-finnish into local empty directory.\n", "Using amp half precision backend\n", "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n", "/opt/conda/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use thePyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n", "***** Running training *****\n", " Num examples = 3627\n", " Num Epochs = 70\n", " Instantaneous batch size per device = 32\n", " Total train batch size (w. 
parallel, distributed & accumulation) = 32\n", " Gradient Accumulation steps = 1\n", " Total optimization steps = 7980\n", "{'loss': 8.2074, 'learning_rate': 1.3719999999999999e-05, 'epoch': 0.88} \n", "{'loss': 3.7205, 'learning_rate': 2.772e-05, 'epoch': 1.75} \n", "{'loss': 3.1583, 'learning_rate': 4.1719999999999994e-05, 'epoch': 2.63} \n", "{'loss': 2.9766, 'learning_rate': 5.5719999999999995e-05, 'epoch': 3.51} \n", "{'loss': 2.9032, 'learning_rate': 6.971999999999999e-05, 'epoch': 4.39} \n", " 6%|██▍ | 500/7980 [10:07<2:27:07, 1.18s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n", "***** Running Evaluation *****\n", " Num examples = 1599\n", " Batch size = 32\n", "\n", " 0%| | 0/50 [00:00 main\n", "\n", "01/27/2022 06:21:58 - WARNING - huggingface_hub.repository - To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-finnish\n", " 423d9bc..236f82b main -> main\n", "\n", "Upload file pytorch_model.bin: 100%|███████| 1.18G/1.18G [01:25<00:00, 14.7MB/s]\n", "Dropping the following result as it does not have all the necessary fields:\n", "{'dataset': {'name': 'MOZILLA-FOUNDATION/COMMON_VOICE_7_0 - FI', 'type': 'common_voice', 'args': 'Config: fi, Training split: train+validation, Eval split: test'}}\n", "To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-finnish\n", " 236f82b..310dc81 main -> main\n", "\n", "01/27/2022 06:22:07 - WARNING - huggingface_hub.repository - To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-finnish\n", " 236f82b..310dc81 main -> main\n", "\n" ] } ], "source": [ "!python run_speech_recognition_ctc.py \\\n", "\t--dataset_name=\"mozilla-foundation/common_voice_7_0\" \\\n", "\t--model_name_or_path=\"facebook/wav2vec2-xls-r-300m\" \\\n", "\t--dataset_config_name=\"fi\" \\\n", "\t--output_dir=\"./wav2vec2-large-xls-r-300m-finnish\" \\\n", "\t--overwrite_output_dir \\\n", "\t--num_train_epochs=\"70\" 
\\\n", "\t--per_device_train_batch_size=\"32\" \\\n", "\t--per_device_eval_batch_size=\"32\" \\\n", "\t--gradient_accumulation_steps=\"1\" \\\n", "\t--learning_rate=\"7e-5\" \\\n", "\t--warmup_steps=\"500\" \\\n", "\t--length_column_name=\"input_length\" \\\n", "\t--evaluation_strategy=\"steps\" \\\n", "\t--text_column_name=\"sentence\" \\\n", "\t--chars_to_ignore , ? . ! \\- \\; \\: \\\" “ % ‘ ” � — ’ … – \\\n", "\t--save_steps=\"500\" \\\n", "\t--eval_steps=\"500\" \\\n", "\t--logging_steps=\"100\" \\\n", "\t--layerdrop=\"0.0\" \\\n", "\t--activation_dropout=\"0.1\" \\\n", "\t--save_total_limit=\"2\" \\\n", "\t--freeze_feature_encoder \\\n", "\t--feat_proj_dropout=\"0.0\" \\\n", "\t--mask_time_prob=\"0.75\" \\\n", "\t--mask_time_length=\"10\" \\\n", "\t--mask_feature_prob=\"0.25\" \\\n", "\t--mask_feature_length=\"64\" \\\n", "\t--gradient_checkpointing \\\n", "\t--use_auth_token \\\n", "\t--fp16 \\\n", "\t--group_by_length \\\n", "\t--do_train --do_eval \\\n", " --push_to_hub" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# !rm -rf wav2vec2-large-xls-r-300m-bashkir" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!ls -ltr" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Filesystem Size Used Avail Use% Mounted on\n", "overlay 3.5T 1.2T 2.2T 34% /\n", "tmpfs 64M 0 64M 0% /dev\n", "tmpfs 87G 0 87G 0% /sys/fs/cgroup\n", "tmpfs 87G 0 87G 0% /dev/shm\n", "/dev/md0 3.5T 1.2T 2.2T 34% /etc/group\n", "tmpfs 87G 12K 87G 1% /proc/driver/nvidia\n", "/dev/vda1 49G 6.5G 42G 14% /usr/bin/nvidia-smi\n", "udev 87G 0 87G 0% /dev/nvidia0\n", "tmpfs 87G 0 87G 0% /proc/acpi\n", "tmpfs 87G 0 87G 0% /proc/scsi\n", "tmpfs 87G 0 87G 0% /sys/firmware\n" ] } ], "source": [ "!df -h" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", 
"text": [ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n", "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3627\n" ] } ], "source": [ "from datasets import load_dataset, load_metric, Audio\n", "\n", "common_voice_train = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"fi\", use_auth_token=True, split=\"train+validation\")\n", "common_voice_test = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"fi\", use_auth_token=True, split=\"test\")\n", "\n", "print(len(common_voice_train))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7934.0625" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(common_voice_train) * 70 / 32" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n", "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "from datasets import ClassLabel\n", "import random\n", "import pandas as pd\n", "from IPython.display import display, HTML\n", "\n", "def show_random_elements(dataset, num_examples=10):\n", " assert num_examples <= len(dataset), \"Can't pick more elements than there are in the dataset.\"\n", " picks = []\n", " for _ in range(num_examples):\n", " pick = random.randint(0, len(dataset)-1)\n", " while pick in 
picks:\n", " pick = random.randint(0, len(dataset)-1)\n", " picks.append(pick)\n", " \n", " df = pd.DataFrame(dataset[picks])\n", " display(HTML(df.to_html()))" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentence
0Äänestimme mietintöä vastaan edellä esitettyjen voimakkaiden mielipiteidemme vuoksi.
1\"Aikaa lähtövalmisteluihi on tunti\"\".\"
2Ja sen jälkeen lähtee jono rannalle.
3Huokaisin helpotuksesta.
4Lämpö, joka jatkui ja jatkui.
5Hän varmasti tiesi, mitä olin aikeissa tehdä.
6Ei näy, mistä siihen pääsisi ylös.
7Äänestän sen vuoksi tämän tärkeän mietinnön puolesta.
8Porsaasta johtuivat ajatukset Kaisan taloon.
9Aivan oikein.
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_random_elements(common_voice_train.remove_columns([\"path\", \"audio\"]), num_examples=10)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "import re\n", "chars_to_remove_regex = '[\\,\\?\\.\\!\\-\\;\\:\\\"\\“\\%\\‘\\”\\�\\—\\’\\…\\–]'\n", "\n", "def remove_special_characters(batch):\n", " batch[\"sentence\"] = re.sub(chars_to_remove_regex, '', batch[\"sentence\"]).lower()\n", " return batch" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba/cache-079dfa6e4746ae78.arrow\n", "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba/cache-474e82849d2d9c95.arrow\n" ] } ], "source": [ "common_voice_train = common_voice_train.map(remove_special_characters)\n", "common_voice_test = common_voice_test.map(remove_special_characters)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# start_with_ar = common_voice_train.filter(lambda example: \"'\" in example['sentence'])\n", "# start_with_ar[0]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# start_with_ar[1]" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "def replace_hatted_characters(batch):\n", "# batch[\"sentence\"] = re.sub('[â]', 'a', batch[\"sentence\"])\n", "# batch[\"sentence\"] = re.sub('[î]', 'i', batch[\"sentence\"])\n", "# batch[\"sentence\"] = re.sub('[ô]', 'o', batch[\"sentence\"])\n", "# batch[\"sentence\"] = re.sub('[û]', 'u', 
batch[\"sentence\"])\n", " return batch" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba/cache-7b7878ba6acb4302.arrow\n", "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba/cache-93e7f03d125d9e56.arrow\n" ] } ], "source": [ "common_voice_train = common_voice_train.map(replace_hatted_characters)\n", "common_voice_test = common_voice_test.map(replace_hatted_characters)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "def extract_all_chars(batch):\n", " all_text = \" \".join(batch[\"sentence\"])\n", " vocab = list(set(all_text))\n", " return {\"vocab\": [vocab], \"all_text\": [all_text]}" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "705e09df2c5644d9b4ddab3d367b44d6", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1 [00:00 main\n", "\n" ] }, { "data": { "text/plain": [ "'https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-finnish/commit/212cf89ca0491548a79a7ba213ca8a9e91b5303e'" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab_dict[\"|\"] = vocab_dict[\" \"]\n", "del vocab_dict[\" \"]\n", "\n", "vocab_dict[\"[UNK]\"] = len(vocab_dict)\n", "vocab_dict[\"[PAD]\"] = len(vocab_dict)\n", "print(len(vocab_dict))\n", "\n", "import json\n", "with open('./vocab.json', 'w') as vocab_file:\n", " json.dump(vocab_dict, vocab_file)\n", " \n", "from transformers import Wav2Vec2CTCTokenizer\n", "\n", "tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(\"./\", 
unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n", "\n", "repo_name = \"wav2vec2-large-xls-r-300m-finnish\"\n", "\n", "tokenizer.push_to_hub(repo_name)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2022-01-27 02:33:09-- https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 4421 (4.3K) [text/plain]\n", "Saving to: ‘eval.py’\n", "\n", "eval.py 100%[===================>] 4.32K --.-KB/s in 0s \n", "\n", "2022-01-27 02:33:09 (13.9 MB/s) - ‘eval.py’ saved [4421/4421]\n", "\n", "total 1232608\n", "-rw-r--r-- 1 ovh ovh 435 Jan 26 15:14 vocab.json\n", "-rw-r--r-- 1 ovh ovh 293 Jan 26 15:14 tokenizer_config.json\n", "-rw-r--r-- 1 ovh ovh 502 Jan 26 15:14 special_tokens_map.json\n", "-rw-r--r-- 1 ovh ovh 23 Jan 26 15:14 added_tokens.json\n", "drwxr-xr-x 2 ovh ovh 4096 Jan 26 17:34 checkpoint-5000\n", "drwxr-xr-x 2 ovh ovh 4096 Jan 26 17:49 checkpoint-5500\n", "-rw-r--r-- 1 ovh ovh 195 Jan 26 17:55 train_results.json\n", "-rw-r--r-- 1 ovh ovh 10133 Jan 26 17:55 trainer_state.json\n", "-rw-r--r-- 1 ovh ovh 222 Jan 26 17:55 eval_results.json\n", "-rw-r--r-- 1 ovh ovh 395 Jan 26 17:55 all_results.json\n", "-rw-r--r-- 1 ovh ovh 2033 Jan 26 17:55 config.json\n", "-rw-r--r-- 1 ovh ovh 1262112241 Jan 26 17:55 pytorch_model.bin\n", "-rw-r--r-- 1 ovh ovh 3055 Jan 26 17:55 training_args.bin\n", "-rw-r--r-- 1 ovh ovh 212 Jan 26 17:55 preprocessor_config.json\n", "-rw-r--r-- 1 ovh ovh 2182 Jan 26 17:57 README.md\n", "-rw-r--r-- 1 ovh ovh 4421 Jan 27 02:33 eval.py\n" ] } ], "source": [ "!wget -O eval.py 
https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py\n", "!cp eval.py wav2vec2-large-xls-r-300m-chuvash\n", "!ls -ltr wav2vec2-large-xls-r-300m-chuvash" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/cv/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n", " 74%|██████████████████████████████▍ | 601/810 [05:33<01:52, 1.85ex/s]" ] } ], "source": [ "# NOTE(review): this evaluates the chuvash checkpoint with the `cv` config, while the rest of the\n", "# notebook prepares Finnish (`fi`) data - confirm this is the intended model/config.\n", "# `&&` (not `;`) so eval.py is not run from the wrong directory when the `cd` fails.\n", "!cd wav2vec2-large-xls-r-300m-chuvash && python eval.py \\\n", " --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config cv --split test --log_outputs" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "24592b0be30e4eafb1949cf09d1c4fb4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/260 [00:00\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mlogits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogits\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m32\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m: 
55" ] } ], "source": [ "# from transformers import AutoModelForCTC, AutoProcessor\n", "# from datasets import load_dataset\n", "\n", "# model = AutoModelForCTC.from_pretrained(\"infinitejoy/wav2vec2-large-xls-r-300m-bashkir\")\n", "# processor = AutoProcessor.from_pretrained(\"infinitejoy/wav2vec2-large-xls-r-300m-bashkir\")\n", "\n", "# input_values = processor(common_voice_test[0][\"audio\"][\"array\"], return_tensors=\"pt\", sampling_rate=16_000).input_values\n", "# # input_values = input_values.to(\"cuda\")\n", "\n", "# logits = model(input_values).logits\n", "\n", "# assert logits.shape[-1] == 32, logits.shape[-1]" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "76613aaa9bd3471f9cdc2a3771250713", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/10 [00:00