# HuggingFace challenge - Debugger notebook
Run this notebook to verify your library versions, check the GPU configuration and run a quick training job.

In [2]:
import platform
import multiprocessing

import torch
import transformers
import datasets

import soundfile

## Print main information

In [3]:
# Host details: platform identifier and CPU core count.
print(f"Platform: {platform.platform()}")
print(f"CPU cores: {multiprocessing.cpu_count()}")

# Version of the Python interpreter running this kernel.
print(f"Python version: {platform.python_version()}")

# PyTorch build, and whether PyTorch reports CUDA as available.
print(f"PyTorch version: {torch.__version__}")
print(f"GPU is visible: {torch.cuda.is_available()}")

# Versions of the transformers and datasets packages.
print(f"Transformers version: {transformers.__version__}")
print(f"Datasets version: {datasets.__version__}")

# Version of the soundfile package.
print(f"soundfile version: {soundfile.__version__}")

Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10
CPU cores: 60
Python version: 3.8.8
PyTorch version: 1.10.1+cu102
GPU is visible: True
Transformers version: 4.16.0.dev0
Datasets version: 1.17.1.dev0
soundfile version: 0.10.3


## Check your GPU information (if any)
If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100S 32GB).
The driver and CUDA versions are shown in the header of the `nvidia-smi` output.

In [4]:
!nvidia-smi

Mon Jan 24 17:23:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100S-PCI...  Off  | 00000000:00:06.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      4MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget for this notebook session. The training
# command further down passes --use_auth_token and --push_to_hub.
notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Quick training run with a dummy model and data
More information is available at https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition

In [5]:
!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py

--2022-01-22 15:01:09--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30348 (30K) [text/plain]
Saving to: ‘run_speech_recognition_ctc.py’


2022-01-22 15:01:09 (20.1 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30348/30348]



In [None]:
# 	--learning_rate="7.5e-5" \
# 84.5

In [None]:
!python run_speech_recognition_ctc.py \
	--dataset_name="mozilla-foundation/common_voice_7_0" \
	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
	--dataset_config_name="hy-AM" \
	--output_dir="./wav2vec2-large-xls-r-300m-armenian" \
	--overwrite_output_dir \
	--num_train_epochs="200" \
	--per_device_train_batch_size="32" \
	--per_device_eval_batch_size="32" \
	--gradient_accumulation_steps="1" \
	--learning_rate="3e-4" \
	--warmup_steps="500" \
	--length_column_name="input_length" \
	--evaluation_strategy="steps" \
	--text_column_name="sentence" \
	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � \' \’ \– \
	--save_steps="500" \
	--eval_steps="500" \
	--logging_steps="100" \
	--layerdrop="0.0" \
	--activation_dropout="0.1" \
	--save_total_limit="2" \
	--freeze_feature_encoder \
	--feat_proj_dropout="0.0" \
	--mask_time_prob="0.75" \
	--mask_time_length="10" \
	--mask_feature_prob="0.25" \
	--mask_feature_length="64" \
	--gradient_checkpointing \
	--use_auth_token \
	--fp16 \
	--group_by_length \
	--do_train --do_eval \
    --push_to_hub

01/24/2022 17:28:58 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
greater_is_better=None,
group_by_length=True,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0003,
length_column_name=input_length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_

In [None]:
!ls -ltr

In [None]:
import pandas as pd

# Evaluation snapshots, one row per logged evaluation, in epoch order.
# NOTE(review): presumably transcribed from the trainer's log output — confirm the source.
df = pd.DataFrame(
    data=[
        [1.4175914525985718, 0.8282476024411508, 5.6701, 25.044, 0.882, 41.67],
        [1.791098952293396, 0.7733217088055798, 5.4161, 26.218, 0.923, 125.0],
        [1.761537790298462, 0.8169136878814298, 5.7426, 24.728, 0.871, 166.67],
        [1.9240303039550781, 0.8456843940714909, 5.3949, 26.321, 0.927, 208.33],
    ],
    columns=[
        'eval_loss',
        'eval_wer',
        'eval_runtime',
        'eval_samples_per_second',
        'eval_steps_per_second',
        'epoch',
    ],
)

In [13]:
# !zip -r wav2vec2-large-xls-r-300m-odia.zip wav2vec2-large-xls-r-300m-odia/
# !rm wav2vec2-large-xls-r-300m-odia.zip

In [10]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         3.5T  557G  2.8T  17% /
tmpfs            64M     0   64M   0% /dev
tmpfs            87G     0   87G   0% /sys/fs/cgroup
tmpfs            87G     0   87G   0% /dev/shm
/dev/md0        3.5T  557G  2.8T  17% /etc/group
tmpfs            87G   12K   87G   1% /proc/driver/nvidia
/dev/vda1        49G  6.6G   42G  14% /usr/bin/nvidia-smi
udev             87G     0   87G   0% /dev/nvidia0
tmpfs            87G     0   87G   0% /proc/acpi
tmpfs            87G     0   87G   0% /proc/scsi
tmpfs            87G     0   87G   0% /sys/firmware


In [6]:
from datasets import load_dataset, load_metric, Audio

# Load the "hy-AM" configuration of Common Voice 7.0 twice: once with the train
# and validation splits merged for training, once with the held-out test split.
common_voice_train, common_voice_test = (
    load_dataset("mozilla-foundation/common_voice_7_0", "hy-AM", use_auth_token=True, split=split_spec)
    for split_spec in ("train+validation", "test")
)

# Number of examples available for training.
print(len(common_voice_train))

Downloading and preparing dataset common_voice/hy-AM to /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/hy-AM/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba...


Downloading:   0%|          | 0.00/59.0M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset common_voice downloaded and prepared to /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/hy-AM/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba. Subsequent calls will reuse this data.


Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/hy-AM/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)


554


In [10]:
# Number of training examples x 200 / 32. The constants match --num_train_epochs="200"
# and --per_device_train_batch_size="32" in the training command above.
# NOTE(review): presumably a rough estimate of total optimizer steps; it ignores
# the rounding of the last partial batch in each epoch — confirm intent.
len(common_voice_train) * 200 / 32

3462.5

In [11]:
# Drop the speaker/metadata columns from both splits; the column list is written
# once and applied to the train split first, then the test split.
common_voice_train, common_voice_test = (
    split.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
    for split in (common_voice_train, common_voice_test)
)

In [12]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random selection of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Sized collection that can be indexed with a list of integer
            positions; the result of that indexing is handed to ``pd.DataFrame``.
        num_examples: How many distinct rows to show (default 10).

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
        ValueError: If ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws unique positions in a single call, replacing the manual
    # "draw, check membership, retry" loop and its repeated list scans.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [13]:
# Preview ten random rows; the "path" and "audio" columns are removed from the
# displayed copy only, so the table shows just the remaining text column.
show_random_elements(common_voice_train.remove_columns(["path", "audio"]), num_examples=10)

Unnamed: 0,sentence
0,Ռեմիի մանկությունն անցնում է աղքատության և չքավորության մեջ։
1,Տղան դուրս չի գալիս կոմայից և մահանում է։
2,Հին հունական ողբերգության խորոսի տեքստերը հնչել են ասերգությամբ։
3,"Այս դեպքում բուժումը վիրահատական է, իսկ դեղանյութն քիչ է արդյունավետ։"
4,"Կինը շատ զարմացավ, բայց կարեց այն։"
5,Նախանձի և կատաղության մարմնավորում է։
6,Սովորել է տեղի միջնակարգ դպրոցում։
7,"Կարելի է տեսնել, որ լուծումը կատարվել է տարբեր չափերով։"
8,Մարմնի փոխադարձ խնամքը կարող է զուգակցվել յուրահատուկ ձայներով։
9,Նրա ծնողները ամենահայտնի և շատ սիրելի հերոսներ են։


In [14]:
import re

# Character class of punctuation stripped from every transcript.
# Raw string: the previous non-raw literal relied on invalid escape sequences
# ("\,", "\?", "\.", ...), which emit DeprecationWarning/SyntaxWarning. The set
# of matched characters is unchanged.
# NOTE(review): Armenian punctuation (։ ՝ ՛ ՞ « ») and parentheses are not in this
# set and still appear in the vocabulary printed later in this notebook — confirm
# that is intended.
chars_to_remove_regex = r'[,?.!\-;:"“%‘”�\'’–]'

def remove_special_characters(batch):
    """Strip ignored punctuation from ``batch["sentence"]`` and lower-case it.

    Args:
        batch: Mapping with a string under the ``"sentence"`` key (a single
            example, since the text is passed straight to ``re.sub``).

    Returns:
        The same mapping, with ``"sentence"`` replaced by the cleaned text.
    """
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

In [15]:
# Clean the transcripts of both splits (train first, then test) with
# remove_special_characters.
common_voice_train, common_voice_test = (
    common_voice_train.map(remove_special_characters),
    common_voice_test.map(remove_special_characters),
)

  0%|          | 0/554 [00:00<?, ?ex/s]

  0%|          | 0/212 [00:00<?, ?ex/s]

In [16]:
def replace_hatted_characters(batch):
    """Swap circumflexed vowels in ``batch["sentence"]`` for their plain forms.

    The substitutions â→a, î→i, ô→o and û→u are applied in that order; the
    result is written back onto ``batch``, which is also returned.
    """
    substitutions = (('[â]', 'a'), ('[î]', 'i'), ('[ô]', 'o'), ('[û]', 'u'))
    for pattern, plain in substitutions:
        batch["sentence"] = re.sub(pattern, plain, batch["sentence"])
    return batch

In [17]:
# Normalise circumflexed vowels in both splits (train first, then test) with
# replace_hatted_characters.
common_voice_train, common_voice_test = (
    common_voice_train.map(replace_hatted_characters),
    common_voice_test.map(replace_hatted_characters),
)

  0%|          | 0/554 [00:00<?, ?ex/s]

  0%|          | 0/212 [00:00<?, ?ex/s]

In [18]:
def extract_all_chars(batch):
    """Collect the distinct characters used across a batch of transcripts.

    Every entry of ``batch["sentence"]`` is joined with single spaces. The
    result maps ``"vocab"`` to a one-element list holding the unique characters
    of that text and ``"all_text"`` to a one-element list holding the text itself.
    """
    joined = " ".join(batch["sentence"])
    unique_chars = [*set(joined)]
    return {"vocab": [unique_chars], "all_text": [joined]}

In [19]:
# Run extract_all_chars over each split as a single batch (batch_size=-1),
# keeping only the function's output columns. Train is processed before test.
vocab_train, vocab_test = (
    split.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=split.column_names,
    )
    for split in (common_voice_train, common_voice_test)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
# Merge the character sets of the train and test splits into one list.
vocab_list = list(
    set(vocab_train["vocab"][0]).union(set(vocab_test["vocab"][0]))
)

In [21]:
# Map each character to its position in the sorted vocabulary, then display it.
vocab_dict = dict(zip(sorted(vocab_list), range(len(vocab_list))))
vocab_dict

{' ': 0,
 '(': 1,
 ')': 2,
 '«': 3,
 '»': 4,
 '՛': 5,
 '՝': 6,
 '՞': 7,
 'ա': 8,
 'բ': 9,
 'գ': 10,
 'դ': 11,
 'ե': 12,
 'զ': 13,
 'է': 14,
 'ը': 15,
 'թ': 16,
 'ժ': 17,
 'ի': 18,
 'լ': 19,
 'խ': 20,
 'ծ': 21,
 'կ': 22,
 'հ': 23,
 'ձ': 24,
 'ղ': 25,
 'ճ': 26,
 'մ': 27,
 'յ': 28,
 'ն': 29,
 'շ': 30,
 'ո': 31,
 'չ': 32,
 'պ': 33,
 'ջ': 34,
 'ռ': 35,
 'ս': 36,
 'վ': 37,
 'տ': 38,
 'ր': 39,
 'ց': 40,
 'ւ': 41,
 'փ': 42,
 'ք': 43,
 'օ': 44,
 'ֆ': 45,
 'և': 46,
 '։': 47}

In [31]:
!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
!cp eval.py wav2vec2-large-xls-r-300m-urdu
!ls -ltr wav2vec2-large-xls-r-300m-urdu

--2022-01-23 02:32:51--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4419 (4.3K) [text/plain]
Saving to: ‘eval.py’


2022-01-23 02:32:51 (18.3 MB/s) - ‘eval.py’ saved [4419/4419]

total 1232640
drwxr-xr-x 2 ovh ovh       4096 Jan 22 18:04 checkpoint-5500
drwxr-xr-x 2 ovh ovh       4096 Jan 22 18:20 checkpoint-6000
-rw-r--r-- 1 ovh ovh        195 Jan 22 18:22 train_results.json
-rw-r--r-- 1 ovh ovh      10758 Jan 22 18:22 trainer_state.json
-rw-r--r-- 1 ovh ovh        222 Jan 22 18:22 eval_results.json
-rw-r--r-- 1 ovh ovh       2033 Jan 22 18:22 config.json
-rw-r--r-- 1 ovh ovh        395 Jan 22 18:22 all_results.json
-rw-r--r-- 1 ovh 

In [32]:
!cd wav2vec2-large-xls-r-300m-urdu; python eval.py --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config ur --split test --log_outputs

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ur/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)
Traceback (most recent call last):
  File "eval.py", line 128, in <module>
    main(args)
  File "eval.py", line 81, in main
    asr = pipeline("automatic-speech-recognition", model=args.model_id)
  File "/opt/conda/lib/python3.8/site-packages/transformers/pipelines/__init__.py", line 590, in pipeline
    tokenizer = AutoTokenizer.from_pretrained(
  File "/opt/conda/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py", line 566, in from_pretrained
    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 1731, in from_pretrained
    raise EnvironmentError(msg)
OSError: Can't load tokenizer for './'. Make sure that:

- './' is a correct model identifier listed 

In [1]:
from transformers import AutoModelForCTC, Wav2Vec2Processor

# Load the model first and then the processor, both from the same Hub
# repository, via their respective from_pretrained constructors.
model, processor = (
    loader.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-urdu")
    for loader in (AutoModelForCTC, Wav2Vec2Processor)
)



Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/574 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]