# HuggingFace challenge - Debugger notebook
Run this notebook to verify your library versions, check the GPU configuration, and run a quick training job.

In [1]:
import platform
import multiprocessing

import torch
import transformers
import datasets

import soundfile

## Print main information

In [2]:
# Host details: platform string and CPU count as reported by the standard library.
print(f"Platform: {platform.platform()}")
print(f"CPU cores: {multiprocessing.cpu_count()}")

# Interpreter version.
print(f"Python version: {platform.python_version()}")

# PyTorch build and whether it can see a CUDA device.
print(f"PyTorch version: {torch.__version__}")
print(f"GPU is visible: {torch.cuda.is_available()}")

# Hugging Face library versions.
print(f"Transformers version: {transformers.__version__}")
print(f"Datasets version: {datasets.__version__}")

# soundfile library version.
print(f"soundfile version: {soundfile.__version__}")

Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10
CPU cores: 60
Python version: 3.8.8
PyTorch version: 1.10.1+cu102
GPU is visible: True
Transformers version: 4.16.0.dev0
Datasets version: 1.17.1.dev0
soundfile version: 0.10.3


## Check your GPU information (if any)
If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100S 32GB).
The driver and CUDA versions are shown in the header of the `nvidia-smi` output.

In [3]:
!nvidia-smi

Thu Jan 27 07:23:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100S-PCI...  Off  | 00000000:00:06.0 Off |                    0 |
| N/A   34C    P0    25W / 250W |      4MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Quick training run with a dummy model and data
More information: https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition

In [5]:
!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py

--2022-01-22 15:01:09--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30348 (30K) [text/plain]
Saving to: ‘run_speech_recognition_ctc.py’


2022-01-22 15:01:09 (20.1 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30348/30348]



In [None]:
# 	--learning_rate="7.5e-5" \
# 84.5

In [51]:
# Fine-tune facebook/wav2vec2-xls-r-300m with the CTC example script on the "gl"
# configuration of mozilla-foundation/common_voice_7_0. Output is written to
# ./wav2vec2-large-xls-r-300m-galician (overwritten on each run) and --push_to_hub is set.
# NOTE(review): the --chars_to_ignore values are unquoted, so the shell receives a bare `?`
# (a glob wildcard) — presumably harmless unless a one-character filename exists in the
# working directory; confirm.
!python run_speech_recognition_ctc.py \
	--dataset_name="mozilla-foundation/common_voice_7_0" \
	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
	--dataset_config_name="gl" \
	--output_dir="./wav2vec2-large-xls-r-300m-galician" \
	--overwrite_output_dir \
	--num_train_epochs="20" \
	--per_device_train_batch_size="32" \
	--per_device_eval_batch_size="32" \
	--gradient_accumulation_steps="1" \
	--learning_rate="7e-5" \
	--warmup_steps="500" \
	--length_column_name="input_length" \
	--evaluation_strategy="steps" \
	--text_column_name="sentence" \
	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
	--save_steps="500" \
	--eval_steps="500" \
	--logging_steps="100" \
	--layerdrop="0.0" \
	--activation_dropout="0.1" \
	--save_total_limit="2" \
	--freeze_feature_encoder \
	--feat_proj_dropout="0.0" \
	--mask_time_prob="0.75" \
	--mask_time_length="10" \
	--mask_feature_prob="0.25" \
	--mask_feature_length="64" \
	--gradient_checkpointing \
	--use_auth_token \
	--fp16 \
	--group_by_length \
	--do_train --do_eval \
    --push_to_hub

01/27/2022 09:23:57 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
greater_is_better=None,
group_by_length=True,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=7e-05,
length_column_name=input_length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_l

In [25]:
# !rm -rf wav2vec2-large-xls-r-300m-bashkir

In [None]:
!ls -ltr

In [4]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         3.5T  1.2T  2.2T  34% /
tmpfs            64M     0   64M   0% /dev
tmpfs            87G     0   87G   0% /sys/fs/cgroup
tmpfs            87G     0   87G   0% /dev/shm
/dev/md0        3.5T  1.2T  2.2T  34% /etc/group
tmpfs            87G   12K   87G   1% /proc/driver/nvidia
/dev/vda1        49G  6.5G   42G  14% /usr/bin/nvidia-smi
udev             87G     0   87G   0% /dev/nvidia0
tmpfs            87G     0   87G   0% /proc/acpi
tmpfs            87G     0   87G   0% /proc/scsi
tmpfs            87G     0   87G   0% /sys/firmware


In [52]:
from datasets import load_dataset, load_metric, Audio

# Load the "gl" configuration of Common Voice 7.0: train and validation are
# concatenated into one training set, and the test split is kept separate.
# use_auth_token=True presumably relies on the Hugging Face login done earlier — confirm.
common_voice_train = load_dataset("mozilla-foundation/common_voice_7_0", "gl", use_auth_token=True, split="train+validation")
common_voice_test = load_dataset("mozilla-foundation/common_voice_7_0", "gl", use_auth_token=True, split="test")

# Number of examples in the combined train+validation set.
print(len(common_voice_train))

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/gl/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)
Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/gl/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)


3670


In [53]:
# Back-of-the-envelope figure: (number of training examples) * 50 / 32.
# NOTE(review): presumably the optimizer-step count for 50 epochs at batch size 32;
# the training command in this notebook passes --num_train_epochs="20" — confirm which is intended.
len(common_voice_train) * 50 / 32

5734.375

In [54]:
# Drop the same set of metadata columns from both splits.
# Not idempotent: re-running this cell on the already-trimmed datasets asks
# remove_columns for columns that no longer exist.
common_voice_train = common_voice_train.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
common_voice_test = common_voice_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])

In [55]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random selection of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()`` and indexing with a list of row
            positions; the indexed result is handed to ``pd.DataFrame``.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so every pick is unique without a
    # retry-on-collision loop (which slows down as num_examples nears len(dataset)).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [56]:
show_random_elements(common_voice_train.remove_columns(["path", "audio"]), num_examples=10)

Unnamed: 0,sentence
0,A torre do campanario amosa tres corpos.
1,Viaxou por Inglaterra e Europa.
2,Foi veciño do Concello de Ordes
3,Butch é outro personaxe.
4,Na colmea máis vella había unha alza sen mel
5,Tiña carácter humorístico.
6,No soto están as salas Thorne.
7,Alfredo Guisado tamén escribiu poesía usando o pseudónimo de Pedro de Meneses.
8,Tamén aparece nun dos laterais unha fornela coa imaxe do San Miguel.
9,A letra provén das coplas típicas.


In [57]:
import re

# Character class of punctuation stripped from every transcript.
# Written as a raw string: in a normal string literal, sequences such as "\," or
# "\?" are invalid escapes that Python flags with a DeprecationWarning/SyntaxWarning.
# Inside [...] only "-" needs escaping; the matched character set is unchanged.
chars_to_remove_regex = r'[,?.!\-;:"“%‘”�—’…–]'

def remove_special_characters(batch):
    """Strip the punctuation in ``chars_to_remove_regex`` and lower-case the text.

    Args:
        batch: Mapping with a string under the ``"sentence"`` key.

    Returns:
        The same ``batch`` object with ``batch["sentence"]`` replaced in place.
    """
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

In [58]:
# Apply the punctuation stripping and lower-casing to every example of both splits.
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

  0%|          | 0/3670 [00:00<?, ?ex/s]

  0%|          | 0/1716 [00:00<?, ?ex/s]

In [59]:
# start_with_ar = common_voice_train.filter(lambda example: "−" in example['sentence'])
# start_with_ar[0]

In [36]:
# start_with_ar

In [60]:
def replace_hatted_characters(batch):
    """Placeholder normalisation step; currently returns ``batch`` unchanged.

    The circumflex-to-plain-vowel substitutions below are all commented out,
    so mapping this function over a dataset leaves every sentence as it was.
    """
#     batch["sentence"] = re.sub('[â]', 'a', batch["sentence"])
#     batch["sentence"] = re.sub('[î]', 'i', batch["sentence"])
#     batch["sentence"] = re.sub('[ô]', 'o', batch["sentence"])
#     batch["sentence"] = re.sub('[û]', 'u', batch["sentence"])
    return batch

In [61]:
# Currently a pass-through: replace_hatted_characters has its substitutions commented out.
common_voice_train = common_voice_train.map(replace_hatted_characters)
common_voice_test = common_voice_test.map(replace_hatted_characters)

  0%|          | 0/3670 [00:00<?, ?ex/s]

  0%|          | 0/1716 [00:00<?, ?ex/s]

In [62]:
def extract_all_chars(batch):
    """Gather the distinct characters appearing in a batch of sentences.

    Args:
        batch: Mapping whose ``"sentence"`` entry is an iterable of strings.

    Returns:
        A dict with two single-element lists: ``"vocab"`` holds the list of
        unique characters (in arbitrary order) and ``"all_text"`` holds the
        sentences joined with single spaces.
    """
    joined_sentences = " ".join(batch["sentence"])
    unique_chars = [*set(joined_sentences)]
    return dict(vocab=[unique_chars], all_text=[joined_sentences])

In [63]:
# Run extract_all_chars once per split: with batched=True and batch_size=-1 the whole
# split is handed over as a single batch (the progress bar shows one batch), so each
# result has one row carrying that split's character list. remove_columns drops the
# original columns so that only "vocab" and "all_text" remain.
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [64]:
# Union of the characters seen in the train and test splits (order is arbitrary here).
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [65]:
# Map each character to its position in the sorted character list.
vocab_dict = dict(zip(sorted(vocab_list), range(len(vocab_list))))
vocab_dict

{' ': 0,
 "'": 1,
 '_': 2,
 'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28,
 '¡': 29,
 '«': 30,
 '»': 31,
 '¿': 32,
 'á': 33,
 'é': 34,
 'í': 35,
 'ñ': 36,
 'ó': 37,
 'ú': 38,
 'ü': 39,
 '−': 40}

In [66]:
import json

from transformers import Wav2Vec2CTCTokenizer

# The tokenizer below is configured with "|" as its word delimiter, so the entry
# for " " is re-keyed to "|" (keeping the same id). The membership guard makes the
# cell safe to re-run: without it a second execution raises KeyError on " ".
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")

# Append the special tokens after the characters. setdefault leaves existing ids
# untouched, so re-running the cell does not renumber them.
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
print(len(vocab_dict))

# Persist the vocabulary where from_pretrained("./") below looks for it.
with open('./vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

repo_name = "wav2vec2-large-xls-r-300m-galician"

# Side effect: uploads the tokenizer files to the Hub repository named above.
tokenizer.push_to_hub(repo_name)

file ./config.json not found
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


43


In [47]:
!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
!cp eval.py wav2vec2-large-xls-r-300m-galician
!ls -ltr wav2vec2-large-xls-r-300m-galician

--2022-01-27 08:52:26--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4421 (4.3K) [text/plain]
Saving to: ‘eval.py’


2022-01-27 08:52:27 (15.4 MB/s) - ‘eval.py’ saved [4421/4421]

total 1232592
-rw-r--r-- 1 ovh ovh        398 Jan 27 08:09 vocab.json
-rw-r--r-- 1 ovh ovh        260 Jan 27 08:09 tokenizer_config.json
-rw-r--r-- 1 ovh ovh        309 Jan 27 08:09 special_tokens_map.json
-rw-r--r-- 1 ovh ovh         23 Jan 27 08:09 added_tokens.json
drwxr-xr-x 2 ovh ovh       4096 Jan 27 08:22 checkpoint-400
drwxr-xr-x 2 ovh ovh       4096 Jan 27 08:35 checkpoint-800
-rw-r--r-- 1 ovh ovh       2369 Jan 27 08:46 trainer_state.json
-rw-r--r--

In [48]:
!cd wav2vec2-large-xls-r-300m-galician; python eval.py \
    --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config gl --split test --log_outputs

Downloading: 100%|█████████████████████████| 9.88k/9.88k [00:00<00:00, 4.62MB/s]
Downloading: 100%|█████████████████████████| 2.98k/2.98k [00:00<00:00, 1.68MB/s]
Downloading and preparing dataset common_voice/gl to /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/gl/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b...
Dataset common_voice downloaded and prepared to /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/gl/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b. Subsequent calls will reuse this data.
100%|███████████████████████████████████████| 1716/1716 [15:28<00:00,  1.85ex/s]
WER: 1.0185525232698032
CER: 3.5303200243189132
100%|████████████████████████████████████| 1716/1716 [00:00<00:00, 20749.40ex/s]


In [1]:
# from transformers import AutoModelForCTC, Wav2Vec2Processor

# model = AutoModelForCTC.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")
# processor = Wav2Vec2Processor.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")



Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/574 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

In [23]:
# from transformers import AutoModelForCTC, AutoProcessor
# from datasets import load_dataset

# model = AutoModelForCTC.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")
# processor = AutoProcessor.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")

# input_values = processor(common_voice_test[0]["audio"]["array"], return_tensors="pt", sampling_rate=16_000).input_values
# # input_values = input_values.to("cuda")

# logits = model(input_values).logits

# assert logits.shape[-1] == 32, logits.shape[-1]

Downloading:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/520 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

AssertionError: 55

In [67]:
from datasets import Audio, Dataset, load_dataset, load_metric
from transformers import AutoFeatureExtractor, pipeline

# NOTE(review): this smoke test uses the train+validation split rather than the test split — confirm that is intended.
dataset = load_dataset("mozilla-foundation/common_voice_7_0", "gl", use_auth_token=True, split="train+validation")

# for testing: only process the first ten examples
dataset = dataset.select(range(10))

repo_name = 'infinitejoy/wav2vec2-large-xls-r-300m-galician'

# load the feature extractor published with the model
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)
# feature_extractor = processor_with_lm.feature_extractor
sampling_rate = feature_extractor.sampling_rate

# resample audio to the sampling rate the feature extractor reports
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

# load eval pipeline
asr = pipeline("automatic-speech-recognition", model=repo_name, feature_extractor=feature_extractor)

# map function to decode audio
def map_to_pred(batch):
    """Transcribe one example with the ``asr`` pipeline and pair it with its reference.

    Args:
        batch: Mapping with the waveform under ``batch["audio"]["array"]`` and
            the reference transcript under ``batch["sentence"]``.

    Returns:
        The same ``batch`` with ``"prediction"`` (the pipeline's ``"text"``
        output) and ``"target"`` (a copy of ``"sentence"``) added.
    """
    waveform = batch["audio"]["array"]
    batch["prediction"] = asr(waveform)["text"]
    batch["target"] = batch["sentence"]
    return batch

# run inference on all examples
result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
print(result["prediction"])

result[0]['target']

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/gl/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)


Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

  0%|          | 0/10 [00:00<?, ?ex/s]

['[UNK]_[UNK] [UNK]sh[UNK] c[UNK]d ptd[UNK] [UNK]ld[UNK]á[UNK]r [UNK]rd[UNK]m[UNK]cn[UNK]', '[UNK]h[UNK]ln[UNK]r r[UNK]h[UNK]k_[UNK]m[UNK]cd[UNK]h[UNK]q[UNK]n[UNK]r[UNK] [UNK]ad[UNK]h[UNK]ch[UNK]í_[UNK] cn[UNK] [UNK]_[UNK]q[UNK] _[UNK]m[UNK]s[UNK]d[UNK]r [UNK]ptd[UNK] [UNK]d[UNK]rd[UNK] [UNK]qé[UNK]n[UNK] [UNK]n[UNK] [UNK]kd[UNK]u[UNK]d[UNK] [UNK]o[UNK]_[UNK]q_ n[UNK] [UNK]l_[UNK]q[UNK]', '[UNK]r[UNK]d[UNK] [UNK]u_[UNK]r[UNK] [UNK]on[UNK]k_[UNK]r [UNK]ok[UNK]_[UNK]sd[UNK]qé[UNK]_[UNK]r[UNK] [UNK]q[UNK]d[UNK]y_[UNK]q[UNK] [UNK]m_[UNK] [UNK]bn[UNK]q[UNK]s[UNK]h[UNK]bd[UNK]k_[UNK] [UNK]sn[UNK]l_[UNK]q¿[UNK]r _[UNK]t[UNK]fn [UNK]ud[UNK]m[UNK]c[UNK]h[UNK]s_[UNK] [UNK]cn[UNK]r [UNK]b[UNK]_[UNK]a_[UNK]kh[UNK]í[UNK]n[UNK]r[UNK] [UNK]cd[UNK] [UNK]od[UNK]c[UNK]q_[UNK]', '[UNK]t[UNK]mg_ [UNK]ud[UNK]q[UNK]rhñ[UNK]m [UNK]mn[UNK]q[UNK]l_[UNK]k cn[UNK] [UNK]o[UNK]k_[UNK]md[UNK]s_[UNK]', '[UNK]d[UNK]t[UNK] [UNK]o[UNK]d[UNK]q[UNK]c[UNK]é[UNK]m[UNK] [UNK] [UNK]sn[UNK]c_[UNK] [UNK]¿[UNK] [UNK]e_[UNK]lh[U

'E ti de quen ves sendo?'

In [49]:
result[2]['prediction'].replace('[UNK]', '')

"qtod'sjn qdllhhm u'h ds"

In [50]:
result[2]['target'].replace('[UNK]', '')

'Rupeatko remmiin, vai et?'