{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HuggingFace challenge - Debugger notebook\n", "Run this notebook to verify your library versions, check the GPU configuration, and run a quick training job." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "T2utsYSKszvv" }, "outputs": [], "source": [ "import platform\n", "import multiprocessing\n", "\n", "import torch\n", "import transformers\n", "import datasets\n", "\n", "import soundfile" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Print main information" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5P6I-W9ts-kR", "outputId": "939bd550-1486-46a6-8371-e82ada0f448c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10\n", "CPU cores: 60\n", "Python version: 3.8.8\n", "PyTorch version: 1.10.1+cu102\n", "GPU is visible: True\n", "Transformers version: 4.16.0.dev0\n", "Datasets version: 1.17.1.dev0\n", "soundfile version: 0.10.3\n" ] } ], "source": [ "print(f\"Platform: {platform.platform()}\")\n", "print(f\"CPU cores: {multiprocessing.cpu_count()}\")\n", "\n", "print(f\"Python version: {platform.python_version()}\")\n", "\n", "print(f\"PyTorch version: {torch.__version__}\")\n", "print(f\"GPU is visible: {torch.cuda.is_available()}\")\n", "\n", "print(f\"Transformers version: {transformers.__version__}\")\n", "print(f\"Datasets version: {datasets.__version__}\")\n", "\n", "print(f\"soundfile version: {soundfile.__version__}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check your GPU information (if any)\n", "If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100s 32GB).\n", "The driver and CUDA versions are shown in the first row of the table." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YT7fRnKctggU", "outputId":
"f355a3e0-20da-489f-bd1f-5e508e792a68" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Jan 26 07:06:25 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 470.57.02 Driver Version: 470.57.02 CUDA Version: 11.4 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", "| 0 Tesla V100S-PCI... Off | 00000000:00:06.0 Off | 0 |\n", "| N/A 35C P0 26W / 250W | 4MiB / 32510MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } ], "source": [ "!nvidia-smi" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2fa897b4afc049229144599af9e3f807", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
\\n] 29.64K --.-KB/s in 0.001s \n", "\n", "2022-01-22 15:01:09 (20.1 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30348/30348]\n", "\n" ] } ], "source": [ "!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# \t--learning_rate=\"7.5e-5\" \\\n", "# 84.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "collapsed": true, "id": "Mz4bubhxxsad", "jupyter": { "outputs_hidden": true }, "outputId": "23398525-cc19-43c2-9fec-497e06214f29" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "01/26/2022 07:14:04 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: True\n", "01/26/2022 07:14:04 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", "_n_gpu=1,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "do_eval=True,\n", "do_predict=False,\n", "do_train=True,\n", "eval_accumulation_steps=None,\n", "eval_steps=500,\n", "evaluation_strategy=IntervalStrategy.STEPS,\n", "fp16=True,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "gradient_accumulation_steps=1,\n", "gradient_checkpointing=True,\n", "greater_is_better=None,\n", "group_by_length=True,\n", "half_precision_backend=auto,\n", "hub_model_id=None,\n", "hub_strategy=HubStrategy.EVERY_SAVE,\n", "hub_token=,\n", "ignore_data_skip=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", 
"learning_rate=7e-05,\n", "length_column_name=input_length,\n", "load_best_model_at_end=False,\n", "local_rank=-1,\n", "log_level=-1,\n", "log_level_replica=-1,\n", "log_on_each_node=True,\n", "logging_dir=./wav2vec2-large-xls-r-300m-breton/runs/Jan26_07-14-04_job-8be8b741-e32e-4579-bbec-1e00d9824b4f,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=100,\n", "logging_strategy=IntervalStrategy.STEPS,\n", "lr_scheduler_type=SchedulerType.LINEAR,\n", "max_grad_norm=1.0,\n", "max_steps=-1,\n", "metric_for_best_model=None,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=50.0,\n", "optim=OptimizerNames.ADAMW_HF,\n", "output_dir=./wav2vec2-large-xls-r-300m-breton,\n", "overwrite_output_dir=True,\n", "past_index=-1,\n", "per_device_eval_batch_size=32,\n", "per_device_train_batch_size=32,\n", "prediction_loss_only=False,\n", "push_to_hub=True,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=./wav2vec2-large-xls-r-300m-breton,\n", "save_on_each_node=False,\n", "save_steps=500,\n", "save_strategy=IntervalStrategy.STEPS,\n", "save_total_limit=2,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_legacy_prediction_loop=False,\n", "warmup_ratio=0.0,\n", "warmup_steps=500,\n", "weight_decay=0.0,\n", "xpu_backend=None,\n", ")\n", "01/26/2022 07:14:06 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/br/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n", "01/26/2022 07:14:09 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/br/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n", "remove 
special characters from datasets: 100%|█| 4790/4790 [00:00<00:00, 5636.80\n", "remove special characters from datasets: 100%|█| 2079/2079 [00:00<00:00, 5623.18\n", "loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n", " \"activation_dropout\": 0.0,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.1,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"sum\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.1,\n", " \"feat_quantizer_dropout\": 0.0,\n", " \"final_dropout\": 0.0,\n", " \"gradient_checkpointing\": false,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.1,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.1,\n", " \"mask_feature_length\": 10,\n", " 
\"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.0,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.075,\n", " \"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 0,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.0.dev0\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 32,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 5.43ba/s]\n", "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 20.51ba/s]\n", "Didn't find file ./wav2vec2-large-xls-r-300m-breton/tokenizer_config.json. We won't load it.\n", "Didn't find file ./wav2vec2-large-xls-r-300m-breton/added_tokens.json. We won't load it.\n", "Didn't find file ./wav2vec2-large-xls-r-300m-breton/special_tokens_map.json. We won't load it.\n", "Didn't find file ./wav2vec2-large-xls-r-300m-breton/tokenizer.json. 
We won't load it.\n", "loading file ./wav2vec2-large-xls-r-300m-breton/vocab.json\n", "loading file None\n", "loading file None\n", "loading file None\n", "loading file None\n", "file ./wav2vec2-large-xls-r-300m-breton/config.json not found\n", "Adding to the vocabulary\n", "Adding to the vocabulary\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n", " \"activation_dropout\": 0.0,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.1,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"sum\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.1,\n", " \"feat_quantizer_dropout\": 0.0,\n", " \"final_dropout\": 0.0,\n", " \"gradient_checkpointing\": false,\n", " 
\"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.1,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.1,\n", " \"mask_feature_length\": 10,\n", " \"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.0,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.075,\n", " \"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 0,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.0.dev0\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 32,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "loading feature extractor configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/preprocessor_config.json from cache at /workspace/.cache/huggingface/transformers/6fb028b95b394059e7d3b367bbca2382b576c66aebe896f04d2cd34e1b575f5b.d4484dc1c81456a2461485e7168b04347a7b9a4e3b1ef3aba723323b33e12326\n", "Feature extractor Wav2Vec2FeatureExtractor {\n", " \"do_normalize\": true,\n", " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n", " \"feature_size\": 1,\n", " \"padding_side\": \"right\",\n", " \"padding_value\": 0,\n", " \"return_attention_mask\": true,\n", " \"sampling_rate\": 16000\n", "}\n", "\n", "loading weights file 
https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/pytorch_model.bin from cache at /workspace/.cache/huggingface/transformers/1e6a6507f3b689035cd4b247e2a37c154e27f39143f31357a49b4e38baeccc36.1edb32803799e27ed554eb7dd935f6745b1a0b17b0ea256442fe24db6eb546cd\n", "Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_hid.bias', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_q.bias', 'project_hid.weight']\n", "- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "preprocess datasets: 100%|█████████████████| 4790/4790 [00:30<00:00, 158.18ex/s]\n", "preprocess datasets: 100%|█████████████████| 2079/2079 [00:13<00:00, 149.33ex/s]\n", "100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 486.87ba/s]\n", "100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 608.84ba/s]\n", "Configuration saved in ./wav2vec2-large-xls-r-300m-breton/preprocessor_config.json\n", "tokenizer config file saved in ./wav2vec2-large-xls-r-300m-breton/tokenizer_config.json\n", "Special tokens file saved in ./wav2vec2-large-xls-r-300m-breton/special_tokens_map.json\n", "added tokens file saved in 
./wav2vec2-large-xls-r-300m-breton/added_tokens.json\n", "Configuration saved in ./wav2vec2-large-xls-r-300m-breton/config.json\n", "loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-breton/preprocessor_config.json\n", "loading configuration file ./wav2vec2-large-xls-r-300m-breton/config.json\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"./wav2vec2-large-xls-r-300m-breton\",\n", " \"activation_dropout\": 0.1,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"mean\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.0,\n", " \"feat_quantizer_dropout\": 0.0,\n", " \"final_dropout\": 0.0,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.0,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.0,\n", " \"mask_feature_length\": 64,\n", " \"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.25,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.75,\n", " 
\"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 43,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.0.dev0\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 46,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-breton/preprocessor_config.json\n", "Feature extractor Wav2Vec2FeatureExtractor {\n", " \"do_normalize\": true,\n", " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n", " \"feature_size\": 1,\n", " \"padding_side\": \"right\",\n", " \"padding_value\": 0,\n", " \"return_attention_mask\": true,\n", " \"sampling_rate\": 16000\n", "}\n", "\n", "Didn't find file ./wav2vec2-large-xls-r-300m-breton/tokenizer.json. 
We won't load it.\n", "loading file ./wav2vec2-large-xls-r-300m-breton/vocab.json\n", "loading file ./wav2vec2-large-xls-r-300m-breton/tokenizer_config.json\n", "loading file ./wav2vec2-large-xls-r-300m-breton/added_tokens.json\n", "loading file ./wav2vec2-large-xls-r-300m-breton/special_tokens_map.json\n", "loading file None\n", "Adding to the vocabulary\n", "Adding to the vocabulary\n", "Cloning https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-breton into local empty directory.\n", "01/26/2022 07:15:12 - WARNING - huggingface_hub.repository - Cloning https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-breton into local empty directory.\n", "Using amp half precision backend\n", "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n", "/opt/conda/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use thePyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n", "***** Running training *****\n", " Num examples = 4790\n", " Num Epochs = 50\n", " Instantaneous batch size per device = 32\n", " Total train batch size (w. 
parallel, distributed & accumulation) = 32\n", " Gradient Accumulation steps = 1\n", " Total optimization steps = 7500\n", "{'loss': 16.4709, 'learning_rate': 1.3579999999999999e-05, 'epoch': 0.67} \n", "{'loss': 5.6159, 'learning_rate': 2.758e-05, 'epoch': 1.33} \n", "{'loss': 3.8197, 'learning_rate': 4.157999999999999e-05, 'epoch': 2.0} \n", "{'loss': 3.1231, 'learning_rate': 5.558e-05, 'epoch': 2.67} \n", "{'loss': 2.9205, 'learning_rate': 6.958e-05, 'epoch': 3.33} \n", " 7%|██▌ | 500/7500 [08:36<2:12:15, 1.13s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n", "***** Running Evaluation *****\n", " Num examples = 2079\n", " Batch size = 32\n", "\n", " 0%| | 0/65 [00:00\n", " \n", " \n", " \n", " sentence\n", " \n", " \n", " \n", " \n", " 0\n", " Ar sivienn freskañ.\n", " \n", " \n", " 1\n", " Evidout e vefe ur skoazell.\n", " \n", " \n", " 2\n", " Setu ar soñj, pep tra a chome d’ober avat !\n", " \n", " \n", " 3\n", " \"N'eo ket stanket an hent.\"\n", " \n", " \n", " 4\n", " Studiañ a ra e skol-veur Boston.\n", " \n", " \n", " 5\n", " Ne ouien ket e vije bet resevet.\n", " \n", " \n", " 6\n", " da damall emañ.\n", " \n", " \n", " 7\n", " Mat eo an holl.\n", " \n", " \n", " 8\n", " Renkit al levrioù en urzh.\n", " \n", " \n", " 9\n", " Daoust ha ne ra ket glav ?\n", " \n", " \n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_random_elements(common_voice_train.remove_columns([\"path\", \"audio\"]), num_examples=10)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "# Raw string: the backslashes are regex escapes and must reach `re` untouched.\n", "# Without the r-prefix Python warns about invalid string escape sequences.\n", "chars_to_remove_regex = r'[\\,\\?\\.\\!\\-\\;\\:\\\"\\“\\%\\‘\\”\\�\\—\\’\\…\\–\\(\\)\\/]'\n", "\n", "\n", "def remove_special_characters(batch):\n", "    \"\"\"Remove every character matched by `chars_to_remove_regex` from batch[\"sentence\"], then lowercase it.\n", "\n", "    The example dict is updated in place and returned.\n", "    \"\"\"\n", "    batch[\"sentence\"] = re.sub(chars_to_remove_regex, '', batch[\"sentence\"]).lower()\n", "    return batch" ] }, { "cell_type": "code",
"execution_count": 19, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c9ca7320466f403aa57582f4821d8862", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/4790 [00:00 main\n", "\n" ] }, { "data": { "text/plain": [ "'https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-breton/commit/2ab120eb1c15adb268746f2e79396b7fc91986cd'" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab_dict[\"|\"] = vocab_dict[\" \"]\n", "del vocab_dict[\" \"]\n", "\n", "vocab_dict[\"[UNK]\"] = len(vocab_dict)\n", "vocab_dict[\"[PAD]\"] = len(vocab_dict)\n", "print(len(vocab_dict))\n", "\n", "import json\n", "with open('./vocab.json', 'w') as vocab_file:\n", " json.dump(vocab_dict, vocab_file)\n", " \n", "from transformers import Wav2Vec2CTCTokenizer\n", "\n", "tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(\"./\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n", "\n", "repo_name = \"wav2vec2-large-xls-r-300m-breton\"\n", "\n", "tokenizer.push_to_hub(repo_name)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2022-01-26 10:15:59-- https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 4421 (4.3K) [text/plain]\n", "Saving to: ‘eval.py’\n", "\n", "eval.py 100%[===================>] 4.32K --.-KB/s in 0s \n", "\n", "2022-01-26 10:15:59 (15.2 MB/s) - ‘eval.py’ saved [4421/4421]\n", "\n", "total 1232612\n", "-rw-r--r-- 1 ovh ovh 388 Jan 26 07:15 vocab.json\n", "-rw-r--r-- 1 ovh ovh 260 Jan 26 07:15 tokenizer_config.json\n", "-rw-r--r-- 1 ovh ovh 309 Jan 26 07:15 special_tokens_map.json\n", "-rw-r--r-- 1 ovh ovh 23 Jan 26 07:15 added_tokens.json\n", "drwxr-xr-x 2 ovh ovh 4096 Jan 26 09:42 checkpoint-7000\n", "drwxr-xr-x 2 ovh ovh 4096 Jan 26 09:53 checkpoint-7500\n", "-rw-r--r-- 1 ovh ovh 195 Jan 26 09:54 train_results.json\n", "-rw-r--r-- 1 ovh ovh 12925 Jan 26 09:54 trainer_state.json\n", "-rw-r--r-- 1 ovh ovh 225 Jan 26 09:55 eval_results.json\n", "-rw-r--r-- 1 ovh ovh 2033 Jan 26 09:55 config.json\n", "-rw-r--r-- 1 ovh ovh 399 Jan 26 09:55 all_results.json\n", "-rw-r--r-- 1 ovh ovh 1262112241 Jan 26 09:55 pytorch_model.bin\n", "-rw-r--r-- 1 ovh ovh 3055 Jan 26 09:55 training_args.bin\n", "-rw-r--r-- 1 ovh ovh 212 Jan 26 09:55 preprocessor_config.json\n", "-rw-r--r-- 1 ovh ovh 2418 Jan 26 09:56 README.md\n", "-rw-r--r-- 1 ovh ovh 4421 Jan 26 10:16 eval.py\n" ] } ], "source": [ "!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py\n", "!cp eval.py wav2vec2-large-xls-r-300m-breton\n", "!ls -ltr wav2vec2-large-xls-r-300m-breton" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/br/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n", "100%|███████████████████████████████████████| 2079/2079 [16:39<00:00, 2.08ex/s]\n", "WER: 1.0795533155450143\n", "CER: 3.7933085055261895\n", "100%|████████████████████████████████████| 2079/2079 [00:00<00:00, 
20347.25ex/s]\n" ] } ], "source": [ "!cd wav2vec2-large-xls-r-300m-breton; python eval.py --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config br --split test --log_outputs" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "24592b0be30e4eafb1949cf09d1c4fb4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/260 [00:00\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mlogits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogits\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m32\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m: 55" ] } ], "source": [ "# from transformers import AutoModelForCTC, AutoProcessor\n", "# from datasets import load_dataset\n", "\n", "# model = AutoModelForCTC.from_pretrained(\"infinitejoy/wav2vec2-large-xls-r-300m-bashkir\")\n", "# processor = AutoProcessor.from_pretrained(\"infinitejoy/wav2vec2-large-xls-r-300m-bashkir\")\n", "\n", "# input_values = processor(common_voice_test[0][\"audio\"][\"array\"], return_tensors=\"pt\", sampling_rate=16_000).input_values\n", "# # input_values = input_values.to(\"cuda\")\n", "\n", "# logits = model(input_values).logits\n", "\n", "# assert logits.shape[-1] == 32, 
logits.shape[-1]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/br/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "486109521f8c4c6186f88f8318ffd375", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/1.99k [00:00hrjn'yg jdldmc'kk\", \"md udunlo jds chv'q nq bg'q'msdy duhs 'q xdyg\", \"dldm dl'ñ 'q lhkkjkdtay 'q edrsmny 'q rs'c \", \"nadq ' q'm tm cdydmm v'q 'q udydfhdyg\", \"tq a'mmd chfnq j'knm gn on \", \"'m s'clnt'kbg\", \"tm s'll\", \"fntynts ' n'q aqdygnmdf \", \"mh ' f'qn\", \"'q odmm'c chvdyg'ñl'ñ\"]\n" ] }, { "data": { "text/plain": [ "'Biskoazh kemend-all !'" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import Audio, Dataset, load_dataset, load_metric\n", "from transformers import AutoFeatureExtractor, pipeline\n", "\n", "# Sanity check only: transcribe a handful of examples, not the whole split.\n", "NUM_DEBUG_EXAMPLES = 10\n", "\n", "dataset = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"br\", use_auth_token=True, split=\"train+validation\")\n", "dataset = dataset.select(range(NUM_DEBUG_EXAMPLES))\n", "\n", "repo_name = 'infinitejoy/wav2vec2-large-xls-r-300m-breton'\n", "\n", "# load the feature extractor published with the model\n", "feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)\n", "sampling_rate = feature_extractor.sampling_rate\n", "\n", "# resample audio to the rate reported by the feature extractor\n", "dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=sampling_rate))\n", "\n", "# load eval pipeline\n", "asr = pipeline(\"automatic-speech-recognition\", model=repo_name, feature_extractor=feature_extractor)\n", "\n", "\n", "def map_to_pred(batch):\n", "    \"\"\"Transcribe one example with the `asr` pipeline.\n", "\n", "    Stores the transcription under \"prediction\" and copies the reference\n", "    sentence to \"target\"; returns the updated example.\n", "    \"\"\"\n", "    prediction = asr(batch[\"audio\"][\"array\"])\n", "\n", "    batch[\"prediction\"] = prediction[\"text\"]\n", "    batch[\"target\"] = batch[\"sentence\"]\n", "    return batch\n", "\n", "\n", "# run inference on the selected examples\n", "result = dataset.map(map_to_pred, remove_columns=dataset.column_names)\n", "print(result[\"prediction\"])\n", "\n", "result[0]['target']" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[\"ahrjn'yg jdldmc'kk\",\n", " \"md udunlo jds chv'q nq bg'q'msdy duhs 'q xdyg\",\n", " \"dldm dl'ñ 'q lhkkjkdtay 'q edrsmny 'q rs'c \",\n", " \"nadq ' q'm tm cdydmm v'q 'q udydfhdyg\",\n", " \"tq a'mmd chfnq j'knm gn on \",\n", " \"'m s'clnt'kbg\",\n", " \"tm s'll\",\n", " \"fntynts ' n'q aqdygnmdf \",\n", " \"mh ' f'qn\",\n", " \"'q odmm'c chvdyg'ñl'ñ\"]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[x.replace('', '') for x in result[\"prediction\"]]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Biskoazh kemend-all !',\n", " 'Ne vevomp ket diwar hor c’harantez evit ar yezh.',\n", " 'E-men emañ ar Mille-Clubs, ar fest-noz, ar stad ?',\n", " 'Ober a ran un dezenn war ar vezegiezh',\n", " 'Ur banne digor kalon ho po ?',\n", " '\"An tad-moualc\\'h.\"',\n", " 'un tamm.',\n", " 'Gouzout a oar brezhoneg ?',\n", " 'ni a garo.',\n", " 'Ar pennad diwezhañ-mañ.']" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[x.replace('', '') for x in result[\"target\"]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { "authorship_tag": "ABX9TyM3OaMlm9YQtKpl28c8gBBd", "include_colab_link": true, "name": "DebugOVHTransformers.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": {
"name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 4 }