# HuggingFace challenge - Debugger notebook
Run this notebook to verify your library versions, check the GPU configuration, and run a quick training job.

In [1]:
import platform
import multiprocessing

import torch
import transformers
import datasets

import soundfile

## Print main information

In [2]:
# Environment report: one "label: value" line per fact, emitted in a single
# print call (sep="\n" puts each argument on its own line).
print(
    # Host: OS/platform string and number of CPU cores.
    f"Platform: {platform.platform()}",
    f"CPU cores: {multiprocessing.cpu_count()}",
    # Interpreter.
    f"Python version: {platform.python_version()}",
    # PyTorch build and whether CUDA reports a usable GPU.
    f"PyTorch version: {torch.__version__}",
    f"GPU is visible: {torch.cuda.is_available()}",
    # Hugging Face libraries.
    f"Transformers version: {transformers.__version__}",
    f"Datasets version: {datasets.__version__}",
    # Audio I/O backend.
    f"soundfile version: {soundfile.__version__}",
    sep="\n",
)

Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10
CPU cores: 60
Python version: 3.8.8
PyTorch version: 1.10.1+cu102
GPU is visible: True
Transformers version: 4.16.0.dev0
Datasets version: 1.17.1.dev0
soundfile version: 0.10.3


## Check your GPU information (if any)
If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100S 32GB).
The driver and CUDA versions are shown in the header of the `nvidia-smi` output.

In [3]:
# Shell out to nvidia-smi to list the visible GPUs, their memory usage and the
# driver / CUDA versions.
!nvidia-smi

Fri Jan 21 03:07:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100S-PCI...  Off  | 00000000:00:06.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      4MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
from huggingface_hub import notebook_login

# Render the Hugging Face Hub login widget inside the notebook.
# NOTE(review): presumably this stores the token that the later
# --use_auth_token / --push_to_hub flags and use_auth_token=True rely on — confirm.
notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Quick training run with a dummy model and data
More information: https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition

In [5]:
# Download the CTC speech-recognition example script from the `master` branch of
# huggingface/transformers and save it as run_speech_recognition_ctc.py
# (-O writes to that name, replacing any existing copy).
# NOTE(review): the URL is not pinned to a tag or commit, so the script fetched on a
# re-run may differ from the one that produced the outputs recorded below.
!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py

--2022-01-21 03:07:52--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30348 (30K) [text/plain]
Saving to: ‘run_speech_recognition_ctc.py’


2022-01-21 03:07:52 (21.5 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30348/30348]



In [46]:
# Fine-tune facebook/wav2vec2-xls-r-300m on the "or" configuration of Common Voice 7.0
# with the downloaded CTC example script, evaluating and checkpointing every 500 steps
# and pushing the result to the Hub.
# Each value given to --chars_to_ignore is a separate shell word. `?` is escaped
# because an unquoted `?` is a glob pattern that the shell would replace with any
# single-character file name present in the working directory.
!python run_speech_recognition_ctc.py \
	--dataset_name="mozilla-foundation/common_voice_7_0" \
	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
	--dataset_config_name="or" \
	--output_dir="./wav2vec2-large-xls-r-300m-odia" \
	--overwrite_output_dir \
	--num_train_epochs="120" \
	--per_device_train_batch_size="16" \
	--per_device_eval_batch_size="16" \
	--gradient_accumulation_steps="2" \
	--learning_rate="7.5e-5" \
	--warmup_steps="500" \
	--length_column_name="input_length" \
	--evaluation_strategy="steps" \
	--text_column_name="sentence" \
	--chars_to_ignore , \? . ! \- \; \: \" “ % ‘ ” � — \’ … \– \' \’ \– \
	--save_steps="500" \
	--eval_steps="500" \
	--logging_steps="100" \
	--layerdrop="0.0" \
	--activation_dropout="0.1" \
	--save_total_limit="3" \
	--freeze_feature_encoder \
	--feat_proj_dropout="0.0" \
	--mask_time_prob="0.75" \
	--mask_time_length="10" \
	--mask_feature_prob="0.25" \
	--mask_feature_length="64" \
	--gradient_checkpointing \
	--use_auth_token \
	--fp16 \
	--group_by_length \
	--do_train --do_eval \
	--push_to_hub

01/21/2022 06:29:10 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=2,
gradient_checkpointing=True,
greater_is_better=None,
group_by_length=True,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=7.5e-05,
length_column_name=input_length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log

In [None]:
import pandas as pd

# Scratch frame built from a single record that has no fields
# (one row, zero columns); this cell was never executed.
df = pd.DataFrame(data=[dict()])

In [13]:
# !zip -r wav2vec2-large-xls-r-300m-odia.zip wav2vec2-large-xls-r-300m-odia/
# !rm wav2vec2-large-xls-r-300m-odia.zip

In [10]:
# Report used / available space for every mounted filesystem in human-readable units (-h).
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         3.5T  557G  2.8T  17% /
tmpfs            64M     0   64M   0% /dev
tmpfs            87G     0   87G   0% /sys/fs/cgroup
tmpfs            87G     0   87G   0% /dev/shm
/dev/md0        3.5T  557G  2.8T  17% /etc/group
tmpfs            87G   12K   87G   1% /proc/driver/nvidia
/dev/vda1        49G  6.6G   42G  14% /usr/bin/nvidia-smi
udev             87G     0   87G   0% /dev/nvidia0
tmpfs            87G     0   87G   0% /proc/acpi
tmpfs            87G     0   87G   0% /proc/scsi
tmpfs            87G     0   87G   0% /sys/firmware


In [16]:
from datasets import load_dataset, load_metric, Audio

# Load the "or" configuration of Common Voice 7.0 twice with identical settings:
# first the merged train+validation data, then the held-out test split.
common_voice_train, common_voice_test = (
    load_dataset("mozilla-foundation/common_voice_7_0", "or", use_auth_token=True, split=split_name)
    for split_name in ("train+validation", "test")
)

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/or/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)
Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/or/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)


In [54]:
# Rough count of optimizer steps for the whole run: examples × epochs / effective batch.
# NOTE(review): 120 and 32 presumably mirror --num_train_epochs=120 and a batch of
# 16 × 2 gradient-accumulation steps from the training command — confirm and keep in sync.
120 * len(common_voice_train) / 32

2013.75

In [17]:
# Drop the same speaker/vote metadata columns from both splits; the columns that are
# not listed here are kept.
common_voice_train, common_voice_test = (
    split.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
    for split in (common_voice_train, common_voice_test)
)

In [18]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()`` and indexing with a list of integer
            positions; the indexed result is handed to ``pd.DataFrame``.
        num_examples: Number of distinct rows to show (default 10).

    Raises:
        AssertionError: If ``num_examples`` exceeds ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are unique by
    # construction; no retry loop is needed even when num_examples is close to
    # len(dataset).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [19]:
# Preview ten randomly chosen training rows. The "path" and "audio" columns are
# dropped from a temporary copy first, so only the remaining columns are rendered.
show_random_elements(common_voice_train.remove_columns(["path", "audio"]), num_examples=10)

Unnamed: 0,sentence
0,"ସେ କଥା ଯାଉ, ଆମ୍ଭମାନଙ୍କୁ ଆଉ କଥା ଲେଖିବାକୁ ହେବ ।"
1,"ଯାହା ଦରମା ଗଣ୍ଡାକ ପାଉଥିଲେ, ପେଟ ପିଠିକୁ ନିଅଣ୍ଟ, ବିଧବା ଲାଗି ସାଇତି ଯିବେ କଣ?"
2,"ଯେ ଯେଡ଼େ ହୁସିଆର ହେବ, ଆପଦ ବିପଦ କାହାରିକୁ ଛାଡ଼ିନାହିଁ ।"
3,"ମୁଁ ପୂଜା ସାରି ସେମାନଙ୍କୁ କିଛି ଭୋଗ ଦେଇ ଘରେ ଛାଡ଼ିଆସିଲି, ବାକି ଭୋଗକୁ ବାନ୍ଧିଲି ।"
4,"ବାସୁ ଦୁଇ ଟଙ୍କାର ନଡ଼ା କିଣି ବାଡ଼ିରେ ଗଦେଇଅଛି, ଶରଣ ଦେବାରୁ ଛପରବନ୍ଦି ହୋଇପାରି ନାହିଁ ।"
5,"ଦେଖି ଦେଖି ମନରେ କଲା, ଆଜି ଏ କଣ ହେଉଛି ।"
6,"ଶାଶୁ ମୁହଁକୁ ଚାହିଁ ଗାଳି ଦିଏ ନାହିଁ; ଓଢ଼ଣା ପଡ଼ିଥାଏ, ପଛ କରି ବରବର କରି ବକିଯାଏ ।"
7,ଆଜି ମହାପ୍ରସାଦ ଉଠା ପରା ।
8,"""""""ଯାହାର ବାହା ସେ ଖେଳୁଛି ପଶା ଧାଇଁ ବୁଲୁଛନ୍ତି ସାଇ ପଡିଶା ।"""""""
9,ଅଶୀ ବର୍ଷର ପୁରୁଷ ବି ବିଭା ହୋଇ ପାରେ ।


In [36]:
import re

# Character class of punctuation and symbols stripped from transcripts.
# Written as a raw string so the backslashes reach the regex engine untouched; a plain
# string literal with escapes such as "\," or "\?" triggers an invalid-escape warning.
# Only "-" needs escaping inside a character class ('\"' merely keeps the literal
# quotable). The em dash and ellipsis are included so the set agrees with the
# --chars_to_ignore list passed to run_speech_recognition_ctc.py in this notebook.
chars_to_remove_regex = r"[,?.!\-;:\"“%‘”�'’–—…]"

def remove_special_characters(batch):
    """Strip ignored punctuation from one example's transcript and lower-case it.

    Args:
        batch: Mapping with a string under the "sentence" key. It is modified in place.

    Returns:
        The same ``batch`` object, with "sentence" cleaned.
    """
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

In [37]:
# Run the punctuation clean-up over every example of both splits, rebinding each
# name to the mapped result (train first, then test).
common_voice_train, common_voice_test = (
    split.map(remove_special_characters)
    for split in (common_voice_train, common_voice_test)
)

  0%|          | 0/537 [00:00<?, ?ex/s]

  0%|          | 0/112 [00:00<?, ?ex/s]

In [38]:
def replace_hatted_characters(batch):
    """Swap circumflexed lowercase vowels in a transcript for their plain letters.

    Args:
        batch: Mapping with a string under the "sentence" key. It is modified in place.

    Returns:
        The same ``batch`` object, with â, î, ô and û replaced by a, i, o and u.
    """
    sentence = batch["sentence"]
    # Apply the four substitutions one after another, in a fixed order.
    for pattern, plain in (('[â]', 'a'), ('[î]', 'i'), ('[ô]', 'o'), ('[û]', 'u')):
        sentence = re.sub(pattern, plain, sentence)
    batch["sentence"] = sentence
    return batch

In [39]:
# Apply the circumflex replacement to every example of both splits, rebinding each
# name to the mapped result (train first, then test).
common_voice_train, common_voice_test = (
    split.map(replace_hatted_characters)
    for split in (common_voice_train, common_voice_test)
)

  0%|          | 0/537 [00:00<?, ?ex/s]

  0%|          | 0/112 [00:00<?, ?ex/s]

In [40]:
def extract_all_chars(batch):
    """Collect the distinct characters used by a batch of transcripts.

    Args:
        batch: Mapping whose "sentence" entry is an iterable of strings.

    Returns:
        A dict with two single-element lists: "vocab" holds the list of unique
        characters (in arbitrary order) and "all_text" holds all sentences joined
        by single spaces.
    """
    joined_text = " ".join(batch["sentence"])
    unique_chars = list(set(joined_text))
    return dict(vocab=[unique_chars], all_text=[joined_text])

In [41]:
# Compute the character inventory of each split. batch_size=-1 hands the whole split
# to extract_all_chars as a single batch, and the original columns are removed so
# only "vocab" and "all_text" remain in the result.
vocab_train, vocab_test = (
    split.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=split.column_names,
    )
    for split in (common_voice_train, common_voice_test)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [42]:
# Union of the characters seen in either split, without duplicates (order is arbitrary).
vocab_list = [*(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))]

In [43]:
# Give every character an integer id following sorted (code point) order, then echo
# the mapping as the cell output.
vocab_dict = dict(zip(sorted(vocab_list), range(len(vocab_list))))
vocab_dict

{' ': 0,
 '|': 1,
 '।': 2,
 'ଁ': 3,
 'ଂ': 4,
 'ଃ': 5,
 'ଅ': 6,
 'ଆ': 7,
 'ଇ': 8,
 'ଈ': 9,
 'ଉ': 10,
 'ଊ': 11,
 'ଏ': 12,
 'ଓ': 13,
 'କ': 14,
 'ଖ': 15,
 'ଗ': 16,
 'ଘ': 17,
 'ଙ': 18,
 'ଚ': 19,
 'ଛ': 20,
 'ଜ': 21,
 'ଝ': 22,
 'ଞ': 23,
 'ଟ': 24,
 'ଠ': 25,
 'ଡ': 26,
 'ଢ': 27,
 'ଣ': 28,
 'ତ': 29,
 'ଥ': 30,
 'ଦ': 31,
 'ଧ': 32,
 'ନ': 33,
 'ପ': 34,
 'ଫ': 35,
 'ବ': 36,
 'ଭ': 37,
 'ମ': 38,
 'ଯ': 39,
 'ର': 40,
 'ଲ': 41,
 'ଳ': 42,
 'ଵ': 43,
 'ଶ': 44,
 'ଷ': 45,
 'ସ': 46,
 'ହ': 47,
 '଼': 48,
 'ା': 49,
 'ି': 50,
 'ୀ': 51,
 'ୁ': 52,
 'ୂ': 53,
 'ୃ': 54,
 'େ': 55,
 'ୈ': 56,
 'ୋ': 57,
 'ୌ': 58,
 '୍': 59,
 'ୟ': 60,
 'ୱ': 61}

In [48]:
# Download eval.py from the `master` branch of huggingface/transformers
# (research_projects/robust-speech-event), replacing any local copy.
# NOTE(review): the URL is not pinned to a tag or commit, so a re-run may fetch a
# different script.
!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
# Place a copy inside the training output directory.
!cp eval.py wav2vec2-large-xls-r-300m-odia
# Long listing of that directory, oldest entries first (-t sorts by time, -r reverses).
!ls -ltr wav2vec2-large-xls-r-300m-odia

--2022-01-21 08:33:50--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4419 (4.3K) [text/plain]
Saving to: ‘eval.py’


2022-01-21 08:33:50 (14.9 MB/s) - ‘eval.py’ saved [4419/4419]

total 1232676
-rw-r--r-- 1 ovh ovh        686 Jan 21 06:29 vocab.json
-rw-r--r-- 1 ovh ovh        290 Jan 21 06:29 tokenizer_config.json
-rw-r--r-- 1 ovh ovh        502 Jan 21 06:29 special_tokens_map.json
-rw-r--r-- 1 ovh ovh         23 Jan 21 06:29 added_tokens.json
drwxr-xr-x 2 ovh ovh       4096 Jan 21 07:02 checkpoint-1000
drwxr-xr-x 2 ovh ovh       4096 Jan 21 07:19 checkpoint-1500
drwxr-xr-x 2 ovh ovh       4096 Jan 21 07:37 checkpoint-2000
-rw-r--r-- 

In [50]:
# Evaluate the model stored in the output directory on the Common Voice 7.0 "or" test
# split. `&&` makes the evaluation run only if the directory change succeeded; with
# `;` a failed `cd` would still launch `python eval.py` from the wrong directory.
!cd wav2vec2-large-xls-r-300m-odia && python eval.py --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config or --split test --log_outputs

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/or/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)
100%|███████████████████████████████████████████| 10/10 [00:06<00:00,  1.55ex/s]
Downloading: 5.61kB [00:00, 2.23MB/s]                                           
WER: 1.0921052631578947
CER: 2.5547945205479454
100%|████████████████████████████████████████| 10/10 [00:00<00:00, 13001.56ex/s]
