# HuggingFace challenge - Debugger notebook
Run this notebook to verify your library versions, check the GPU configuration, and run a quick training job.

In [1]:
# %%capture
# !pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode
# !pip install datasets==1.18.1
# !pip install git+https://github.com/huggingface/transformers.git
# !pip install huggingface_hub==0.1
# !pip install torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# !pip install jiwer
# !pip install -U git+https://github.com/huggingface/transformers.git

In [2]:
# !pip install -U git+https://github.com/huggingface/transformers.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-i45amciw
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-i45amciw
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


In [1]:
import platform
import multiprocessing

import torch
import transformers
import datasets

import soundfile

## Print main information

In [2]:
# Environment report: operating system and CPU count first, then the
# interpreter and the versions of the libraries the training run relies on.
print("Platform:", platform.platform())
print("CPU cores:", multiprocessing.cpu_count())
print("Python version:", platform.python_version())
print("PyTorch version:", torch.__version__)
print("GPU is visible:", torch.cuda.is_available())
print("Transformers version:", transformers.__version__)
print("Datasets version:", datasets.__version__)
print("soundfile version:", soundfile.__version__)

Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10
CPU cores: 60
Python version: 3.8.8
PyTorch version: 1.10.1+cu102
GPU is visible: True
Transformers version: 4.16.0.dev0
Datasets version: 1.17.1.dev0
soundfile version: 0.10.3


## Check your GPU information (if any)
If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100S 32GB).
The driver and CUDA versions are shown in the header of the `nvidia-smi` output.

In [3]:
# Show the NVIDIA driver's view of the attached GPUs via the nvidia-smi tool.
!nvidia-smi

Sat Jan 29 03:27:00 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100S-PCI...  Off  | 00000000:00:06.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      4MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
# Authenticate against the Hugging Face Hub from inside the notebook.
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-crendential store but this isn't the helper defined on your machine.
You will have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal to set it as the default

git config --global credential.helper store[0m


In [8]:
%%capture
# Install the git-lfs package via apt; %%capture hides the command's output.
!apt install git-lfs

## Quick training run with a dummy model and data
More information: https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition

In [12]:
# Fetch the CTC speech-recognition example script from the transformers master branch.
!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
# Disabled alternative: the bitsandbytes ("bnb") variant, saved under the same local file name.
# !wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py

--2022-01-28 09:12:30--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31209 (30K) [text/plain]
Saving to: ‘run_speech_recognition_ctc.py’


2022-01-28 09:12:30 (21.4 MB/s) - ‘run_speech_recognition_ctc.py’ saved [31209/31209]



In [9]:
# 	--learning_rate="7.5e-5" \
# 84.5

In [9]:
# Print the version of the installed CUDA compiler (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0


In [10]:
!pip install bitsandbytes-cuda111

Collecting bitsandbytes-cuda111
  Downloading bitsandbytes_cuda111-0.26.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.3 MB/s 
[?25hInstalling collected packages: bitsandbytes-cuda111
Successfully installed bitsandbytes-cuda111-0.26.0


In [None]:
# Fine-tune facebook/wav2vec2-xls-r-300m with the downloaded CTC example script
# on the "lv" configuration of mozilla-foundation/common_voice_7_0.
# Checkpoints are saved and evaluation runs every 2000 steps, at most two
# checkpoints are kept, and --push_to_hub is enabled.
# Only stdout is redirected to out.log; anything the script writes to stderr
# still appears in the cell output.
!python run_speech_recognition_ctc.py \
	--dataset_name="mozilla-foundation/common_voice_7_0" \
	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
	--dataset_config_name="lv" \
	--output_dir="./wav2vec2-large-xls-r-300m-latvian" \
	--overwrite_output_dir \
	--num_train_epochs="100" \
	--per_device_train_batch_size="32" \
	--per_device_eval_batch_size="1" \
	--gradient_accumulation_steps="1" \
	--learning_rate="7e-5" \
	--warmup_steps="2000" \
	--length_column_name="input_length" \
	--evaluation_strategy="steps" \
	--text_column_name="sentence" \
	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \' \
	--save_steps="2000" \
	--eval_steps="2000" \
	--logging_steps="100" \
	--layerdrop="0.0" \
	--activation_dropout="0.1" \
	--save_total_limit="2" \
	--freeze_feature_encoder \
	--feat_proj_dropout="0.0" \
	--mask_time_prob="0.75" \
	--mask_time_length="10" \
	--mask_feature_prob="0.25" \
	--mask_feature_length="64" \
	--gradient_checkpointing \
	--use_auth_token \
	--fp16 \
	--group_by_length \
	--do_train --do_eval \
    --push_to_hub > out.log

remove special characters from datasets: 100%|█| 4963/4963 [00:00<00:00, 5401.29
remove special characters from datasets: 100%|█| 2084/2084 [00:00<00:00, 5657.57
loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-xls-r-300m",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 51%|█████████████████▍                | 8000/15600 [2:36:45<3:02:56,  1.44s/it]The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.
***** Running Evaluation *****
  Num examples = 2084
  Batch size = 1

  0%|                                                  | 0/2084 [00:00<?, ?it/s][A
  0%|                                          | 3/2084 [00:00<01:24, 24.48it/s][A
  0%|                                          | 6/2084 [00:00<01:24, 24.63it/s][A
  0%|▏                                         | 9/2084 [00:00<01:29, 23.17it/s][A
  1%|▏                                        | 12/2084 [00:00<01:30, 22.85it/s][A
  1%|▎                                        | 15/2084 [00:00<01:27, 23.63it/s][A
  1%|▎                                        | 18/2084 [00:00<01:25, 24.05it/s][A
  1%|▍                                        | 21/2084 [00:00<01:28, 23.29it/s][A
  1%|▍                         

In [14]:
# !python run_speech_recognition_ctc.py \
# 	--dataset_name="mozilla-foundation/common_voice_7_0" \
# 	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
# 	--dataset_config_name="ha" \
# 	--max_duration_in_seconds="10" \
# 	--output_dir="./wav2vec2-large-xls-r-300m-hausa" \
# 	--overwrite_output_dir \
# 	--num_train_epochs="100" \
# 	--per_device_train_batch_size="32" \
# 	--per_device_eval_batch_size="32" \
# 	--gradient_accumulation_steps="1" \
# 	--learning_rate="7e-5" \
# 	--warmup_steps="500" \
# 	--length_column_name="input_length" \
# 	--evaluation_strategy="steps" \
# 	--text_column_name="sentence" \
# 	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
# 	--save_steps="500" \
# 	--eval_steps="500" \
# 	--logging_steps="100" \
# 	--layerdrop="0.0" \
# 	--activation_dropout="0.1" \
# 	--save_total_limit="2" \
# 	--freeze_feature_encoder \
# 	--feat_proj_dropout="0.0" \
# 	--mask_time_prob="0.75" \
# 	--mask_time_length="10" \
# 	--mask_feature_prob="0.25" \
# 	--mask_feature_length="64" \
# 	--gradient_checkpointing \
# 	--use_auth_token \
# 	--fp16 \
# 	--group_by_length \
# 	--do_train --do_eval \
#     --push_to_hub

01/28/2022 09:21:58 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
greater_is_better=None,
group_by_length=True,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=7e-05,
length_column_name=input_length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_l

In [None]:
# !rm -rf wav2vec2-large-xls-r-300m-bashkir

In [None]:
# Long listing of the working directory, oldest entries first (-t sorts by time, -r reverses).
!ls -ltr

In [None]:
# Report disk usage per mounted filesystem in human-readable units.
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         3.5T  1.2T  2.2T  34% /
tmpfs            64M     0   64M   0% /dev
tmpfs            87G     0   87G   0% /sys/fs/cgroup
tmpfs            87G     0   87G   0% /dev/shm
/dev/md0        3.5T  1.2T  2.2T  34% /etc/group
tmpfs            87G   12K   87G   1% /proc/driver/nvidia
/dev/vda1        49G  6.5G   42G  14% /usr/bin/nvidia-smi
udev             87G     0   87G   0% /dev/nvidia0
tmpfs            87G     0   87G   0% /proc/acpi
tmpfs            87G     0   87G   0% /proc/scsi
tmpfs            87G     0   87G   0% /sys/firmware


In [None]:
# !pip install -U datasets

In [16]:
from datasets import load_dataset, load_metric, Audio

# Common Voice 7.0, Latvian ("lv") configuration. The train and validation
# splits are concatenated for training; the test split is loaded separately.
cv_dataset_name, cv_config_name = "mozilla-foundation/common_voice_7_0", "lv"
common_voice_train = load_dataset(cv_dataset_name, cv_config_name, split="train+validation", use_auth_token=True)
common_voice_test = load_dataset(cv_dataset_name, cv_config_name, split="test", use_auth_token=True)

# Number of training examples.
print(len(common_voice_train))

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/lv/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)
Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/lv/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)


4963


In [17]:
# Display the dataset summary (feature names and row count).
common_voice_train

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 4963
})

In [18]:
# Training examples * 100 / 32.
# NOTE(review): presumably a step estimate using the training command's
# num_train_epochs=100 and per_device_train_batch_size=32 - confirm.
len(common_voice_train) * 100 / 32

15509.375

In [19]:
# Drop the metadata columns from both splits; the same column list is applied
# to each so the two datasets keep an identical schema.
metadata_columns = ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]
common_voice_train = common_voice_train.remove_columns(metadata_columns)
common_voice_test = common_voice_test.remove_columns(metadata_columns)

In [20]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()`` and indexing with a list of integer
            positions; the indexed result is passed to ``pandas.DataFrame``.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are distinct
    # without a retry loop or repeated list-membership checks.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [21]:
# Preview ten random rows, with the "path" and "audio" columns removed from the displayed copy.
show_random_elements(common_voice_train.remove_columns(["path", "audio"]), num_examples=10)

Unnamed: 0,sentence
0,Nav ļaunākā vieta.
1,Jā. Jā. Vajag tik uzlādēt.
2,"Labi, bet tu šo visu gribi?"
3,Tas ir viss?
4,Paldies. Labi.
5,Kurš būtu tas pazīstamais?
6,Pasveicini savus putnus no manis.
7,"Ja tavs tētis tā teica, tad viņš kaut ko ir aizmirsis."
8,Tas ir pārāk bīstami.
9,"Jā, tātad Džons pamazām zaudē sajēgu."


In [22]:
import re
# Character class of the punctuation/symbols stripped from transcriptions.
# A raw string is used so the backslashes reach the regex engine unchanged
# instead of being parsed as (invalid) Python string escape sequences.
chars_to_remove_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\']'

def remove_special_characters(batch):
    """Strip the ignored characters from ``batch["sentence"]`` and lowercase it.

    Args:
        batch: Mapping with a string under the ``"sentence"`` key.

    Returns:
        The same ``batch`` object, with ``"sentence"`` replaced in place.
    """
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

In [23]:
# Apply remove_special_characters to every example of both splits.
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

  0%|          | 0/4963 [00:00<?, ?ex/s]

  0%|          | 0/2084 [00:00<?, ?ex/s]

In [24]:
# start_with_ar = common_voice_train.filter(lambda example: "⅛" in example['sentence'])
# start_with_ar[0]

In [25]:
# start_with_ar

In [26]:
def replace_hatted_characters(batch):
    """Return ``batch`` unchanged.

    Every substitution below is commented out, so this function is currently
    a no-op placeholder.
    """
#     batch["sentence"] = re.sub('[â]', 'a', batch["sentence"])
#     batch["sentence"] = re.sub('[î]', 'i', batch["sentence"])
#     batch["sentence"] = re.sub('[ô]', 'o', batch["sentence"])
#     batch["sentence"] = re.sub('[û]', 'u', batch["sentence"])
#     batch["sentence"] = re.sub('&', 'and', batch["sentence"])
    return batch

In [27]:
# Apply replace_hatted_characters to every example of both splits.
common_voice_train = common_voice_train.map(replace_hatted_characters)
common_voice_test = common_voice_test.map(replace_hatted_characters)

  0%|          | 0/4963 [00:00<?, ?ex/s]

  0%|          | 0/2084 [00:00<?, ?ex/s]

In [28]:
def extract_all_chars(batch):
    """Collect the distinct characters used across a batch of sentences.

    Args:
        batch: Mapping whose ``"sentence"`` entry is an iterable of strings.

    Returns:
        A dict with ``"vocab"`` (a one-element list holding the list of unique
        characters) and ``"all_text"`` (a one-element list holding all
        sentences joined by single spaces).
    """
    joined_text = " ".join(batch["sentence"])
    unique_chars = [*set(joined_text)]
    return {"vocab": [unique_chars], "all_text": [joined_text]}

In [29]:
# Run extract_all_chars once over each whole split (batch_size=-1 together
# with batched=True), keeping the result in memory and dropping the original columns.
vocab_map_options = dict(batched=True, batch_size=-1, keep_in_memory=True)
vocab_train = common_voice_train.map(extract_all_chars, remove_columns=common_voice_train.column_names, **vocab_map_options)
vocab_test = common_voice_test.map(extract_all_chars, remove_columns=common_voice_test.column_names, **vocab_map_options)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [30]:
# Union of the characters seen in the train and test splits.
train_chars = set(vocab_train["vocab"][0])
test_chars = set(vocab_test["vocab"][0])
vocab_list = list(train_chars | test_chars)

In [31]:
# Assign each character an integer id following sorted character order.
vocab_dict = {char: index for index, char in enumerate(sorted(vocab_list))}
vocab_dict

{' ': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'x': 22,
 'z': 23,
 'ā': 24,
 'č': 25,
 'ē': 26,
 'ģ': 27,
 'ī': 28,
 'ķ': 29,
 'ļ': 30,
 'ņ': 31,
 'š': 32,
 'ū': 33,
 'ž': 34}

In [32]:
import json

from transformers import Wav2Vec2CTCTokenizer

# Use "|" as the word delimiter token in place of the raw space character.
# The membership guard lets this cell be re-run without a KeyError once the
# space entry has already been replaced.
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")

# Append the special tokens after the existing ids; setdefault leaves the ids
# untouched if the cell is executed again.
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
print(len(vocab_dict))

# Write the vocabulary file that the tokenizer is loaded from below.
with open('./vocab.json', 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file)

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

repo_name = "wav2vec2-large-xls-r-300m-latvian"

# tokenizer.save_pretrained(repo_name)

# Upload the tokenizer to the Hub repository; being the last expression, the
# returned value is shown as the cell output.
tokenizer.push_to_hub(repo_name)

37


file ./config.json not found
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Cloning https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-latvian into local empty directory.
To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-latvian
   c9ee86b..4051bde  main -> main



'https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-latvian/commit/4051bde73a4b7c152b72a22d5babb62c46df97b5'

In [66]:
# Download the robust-speech-event evaluation script, copy it together with the
# training script into the wav2vec2-large-xls-r-300m-irish directory, then list
# that directory.
!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
!cp eval.py wav2vec2-large-xls-r-300m-irish
!cp run_speech_recognition_ctc.py wav2vec2-large-xls-r-300m-irish
!ls -ltr wav2vec2-large-xls-r-300m-irish

--2022-01-30 07:10:29--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4738 (4.6K) [text/plain]
Saving to: ‘eval.py’


2022-01-30 07:10:29 (16.6 MB/s) - ‘eval.py’ saved [4738/4738]

total 1232584
-rw-r--r-- 1 ovh ovh        300 Jan 30 02:51 vocab.json
-rw-r--r-- 1 ovh ovh        260 Jan 30 02:51 tokenizer_config.json
-rw-r--r-- 1 ovh ovh        309 Jan 30 02:51 special_tokens_map.json
-rw-r--r-- 1 ovh ovh         23 Jan 30 02:51 added_tokens.json
drwxr-xr-x 2 ovh ovh       4096 Jan 30 04:36 checkpoint-500
drwxr-xr-x 2 ovh ovh       4096 Jan 30 06:22 checkpoint-1000
-rw-r--r-- 1 ovh ovh       2521 Jan 30 07:06 trainer_state.json
-rw-r--r-

In [67]:
!cd wav2vec2-large-xls-r-300m-i;python eval.py \
    --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config kmr --split test --log_outputs

Error: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp-a34b3233.so.1 library.
	Try to import numpy first or set the threading layer accordingly. Set MKL_SERVICE_FORCE_INTEL to force it.


In [68]:
!cd wav2vec2-large-xls-r-300m-irish; python eval.py \
    --model_id ./ --dataset speech-recognition-community-v2/dev_data \
    --config kmr --split validation --chunk_length_s 10 --stride_length_s 1

Error: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp-a34b3233.so.1 library.
	Try to import numpy first or set the threading layer accordingly. Set MKL_SERVICE_FORCE_INTEL to force it.


In [None]:
# from transformers import AutoModelForCTC, Wav2Vec2Processor

# model = AutoModelForCTC.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")
# processor = Wav2Vec2Processor.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")



Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/574 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

In [None]:
# from transformers import AutoModelForCTC, AutoProcessor
# from datasets import load_dataset

# model = AutoModelForCTC.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")
# processor = AutoProcessor.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")

# input_values = processor(common_voice_test[0]["audio"]["array"], return_tensors="pt", sampling_rate=16_000).input_values
# # input_values = input_values.to("cuda")

# logits = model(input_values).logits

# assert logits.shape[-1] == 32, logits.shape[-1]

Downloading:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/520 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

AssertionError: 55

In [1]:
from datasets import Audio, Dataset, load_dataset, load_metric
from transformers import AutoFeatureExtractor, pipeline

# Load the "lv" configuration of Common Voice 7.0 (train and validation splits concatenated).
dataset = load_dataset("mozilla-foundation/common_voice_7_0", "lv", use_auth_token=True, split="train+validation")

# For testing: only process the first ten examples.
dataset = dataset.select(range(10))

# Hub repository that both the feature extractor and the pipeline model are loaded from.
repo_name = 'infinitejoy/wav2vec2-large-xls-r-300m-latvian'

# Load the feature extractor and read the sampling rate it is configured with.
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)
# feature_extractor = processor_with_lm.feature_extractor
sampling_rate = feature_extractor.sampling_rate

# Cast the audio column to that sampling rate (resamples the audio).
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

# Load the evaluation pipeline.
asr = pipeline("automatic-speech-recognition", model=repo_name, feature_extractor=feature_extractor)

# map function to decode audio
def map_to_pred(batch):
    """Transcribe one example with the ``asr`` pipeline.

    Args:
        batch: Mapping with the raw waveform under ``batch["audio"]["array"]``
            and the reference transcription under ``batch["sentence"]``.

    Returns:
        The same ``batch`` object with ``"prediction"`` (the pipeline's
        ``"text"`` output) and ``"target"`` (a copy of ``"sentence"``) added.
    """
    waveform = batch["audio"]["array"]
    transcription = asr(waveform)["text"]

    batch["prediction"] = transcription
    batch["target"] = batch["sentence"]
    return batch

# run inference on all examples
# Transcribe every selected example; the original columns are dropped so only
# the "prediction" and "target" columns added by map_to_pred remain.
result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

# Print the predictions first, then the reference sentences.
for column_name in ("prediction", "target"):
    print(result[column_name])

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/lv/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)


Downloading:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/342 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502 [00:00<?, ?B/s]

  0%|          | 0/10 [00:00<?, ?ex/s]

['nebija nekā tīra ko uzvilkt', 'cēlonis tam ne viens vien', 'visi vilki nav pelēki', 'iedzert aukstu alu būtu labi', 'vai mani mati bija glīti', 'lēnām nesasteidz', 'nerunā man rupjības', 'es vairs nevaru būt tavs elks', 'es atradu mūsu zemes gabalu', 'ko tas sīkais sūds ar mani darītu']
['Nebija nekā tīra, ko uzvilkt?', 'Cēlonis tam - ne viens vien.', 'Visi vilki nav pelēki.', 'Iedzert aukstu alu būtu labi.', 'Vai mani mati bija glīti?', 'Lēnām, nesasteidz.', 'Nerunā man rupjības.', 'Es vairs nevaru būt tavs elks.', 'Es atradu mūsu zemes gabalu.', 'Ko tas sīkais sūds ar mani darītu?']


In [6]:
# Show the first prediction with any literal "[UNK]" markers removed.
result[0]["prediction"].replace('[UNK]', '')

"e'ess' qted j'ms' ' ɓ'jhm s' s'm' jtj' jtr'm 'v' ɓ'x'"