In [11]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCTC, Wav2Vec2Processor, AutoProcessor, Wav2Vec2ProcessorWithLM
from datasets import load_dataset, load_metric, Audio
from pyctcdecode import build_ctcdecoder
from pydub import AudioSegment
from pydub.playback import play

import numpy as np
import torch
import kenlm
import pandas as pd
import random
import soundfile as sf
from tqdm.auto import tqdm

In [17]:
# Location of the KenLM n-gram file (.arpa) that is handed to build_ctcdecoder.
KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/language_model/km_text.arpa'

In [5]:
# Load the processor stored under "vitouphy/xls-r-300m-km"; its tokenizer and
# feature extractor are reused when the LM-backed processor is assembled.
processor = AutoProcessor.from_pretrained("vitouphy/xls-r-300m-km")

In [7]:
# Order the tokenizer vocabulary by token id and lower-case every token.
vocab_dict = processor.tokenizer.get_vocab()
tokens_by_id = sorted(vocab_dict.items(), key=lambda pair: pair[1])
sorted_vocab_dict = {token.lower(): token_id for token, token_id in tokens_by_id}
print(sorted_vocab_dict)

{'|': 0, 'ក': 1, 'ខ': 2, 'គ': 3, 'ឃ': 4, 'ង': 5, 'ច': 6, 'ឆ': 7, 'ជ': 8, 'ឈ': 9, 'ញ': 10, 'ដ': 11, 'ឋ': 12, 'ឌ': 13, 'ឍ': 14, 'ណ': 15, 'ត': 16, 'ថ': 17, 'ទ': 18, 'ធ': 19, 'ន': 20, 'ប': 21, 'ផ': 22, 'ព': 23, 'ភ': 24, 'ម': 25, 'យ': 26, 'រ': 27, 'ល': 28, 'វ': 29, 'ស': 30, 'ហ': 31, 'ឡ': 32, 'អ': 33, 'ឥ': 34, 'ឧ': 35, 'ឪ': 36, 'ឫ': 37, 'ឬ': 38, 'ឭ': 39, 'ឮ': 40, 'ឯ': 41, 'ឱ': 42, 'ា': 43, 'ិ': 44, 'ី': 45, 'ឹ': 46, 'ឺ': 47, 'ុ': 48, 'ូ': 49, 'ួ': 50, 'ើ': 51, 'ឿ': 52, 'ៀ': 53, 'េ': 54, 'ែ': 55, 'ៃ': 56, 'ោ': 57, 'ៅ': 58, 'ំ': 59, 'ះ': 60, 'ៈ': 61, '៉': 62, '៊': 63, '់': 64, '៌': 65, '៍': 66, '៎': 67, '៏': 68, '័': 69, '្': 70, '[unk]': 71, '[pad]': 72, '<s>': 73, '</s>': 74}


In [10]:
# Build the CTC decoder from the id-ordered labels and the KenLM file.
decoder = build_ctcdecoder(
    kenlm_model_path=KENLM_MODEL_LOC,
    labels=list(sorted_vocab_dict),
)

Loading the LM will be faster if you build a binary file.
Reading /workspace/xls-r-300m-km/language_model/km_text.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Only 82 unigrams passed as vocabulary. Is this small or artificial data?


In [12]:
# Bundle the loaded processor's tokenizer and feature extractor with the decoder.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    decoder=decoder,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
)

In [15]:
# Write the LM-enabled processor into the directory "vitouphy/xls-r-300m-km".
# NOTE(review): the recorded run of this cell ended with
#   FileExistsError: ... 'vitouphy/xls-r-300m-km/language_model'
# presumably that sub-directory was left over from an earlier save — confirm,
# and clear it before re-running this cell.
processor_with_lm.save_pretrained("vitouphy/xls-r-300m-km")

FileExistsError: [Errno 17] File exists: 'vitouphy/xls-r-300m-km/language_model'

In [None]:
# Load a CTC model from the current working directory.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# may never have been run — confirm before relying on it.
model = AutoModelForCTC.from_pretrained(".")

In [None]:
# %%bash 
# wget https://www.openslr.org/resources/42/km_kh_male.zip
# unzip km_kh_male.zip

--2022-02-03 05:13:35--  https://www.openslr.org/resources/42/km_kh_male.zip
Resolving www.openslr.org (www.openslr.org)... 46.101.158.64
Connecting to www.openslr.org (www.openslr.org)|46.101.158.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 866086951 (826M) [application/zip]
Saving to: ‘km_kh_male.zip’

     0K .......... .......... .......... .......... ..........  0%  264K 53m29s
    50K .......... .......... .......... .......... ..........  0%  525K 40m9s
   100K .......... .......... .......... .......... ..........  0%  523K 35m45s
   150K .......... .......... .......... .......... ..........  0%  523K 33m33s
   200K .......... .......... .......... .......... ..........  0%  264K 37m31s
   250K .......... .......... .......... .......... ..........  0%  523K 35m45s
   300K .......... .......... .......... .......... ..........  0%  526K 34m28s
   350K .......... .......... .......... .......... ..........  0%  524K 33m31s
   400K .......... ....

Archive:  km_kh_male.zip
   creating: km_kh_male/
  inflating: km_kh_male/line_index.tsv  
   creating: km_kh_male/wavs/
  inflating: km_kh_male/wavs/khm_1161_3945210975.wav  
  inflating: km_kh_male/wavs/khm_1161_3840820726.wav  
  inflating: km_kh_male/wavs/khm_1161_3632689663.wav  
  inflating: km_kh_male/wavs/khm_1161_3514535297.wav  
  inflating: km_kh_male/wavs/khm_1161_3445330166.wav  
  inflating: km_kh_male/wavs/khm_1161_3170598248.wav  
  inflating: km_kh_male/wavs/khm_1161_3030414325.wav  
  inflating: km_kh_male/wavs/khm_1161_2858870182.wav  
  inflating: km_kh_male/wavs/khm_1161_2835582962.wav  
  inflating: km_kh_male/wavs/khm_1161_2662937440.wav  
  inflating: km_kh_male/wavs/khm_1161_2449542221.wav  
  inflating: km_kh_male/wavs/khm_1161_2294049689.wav  
  inflating: km_kh_male/wavs/khm_1161_2136371765.wav  
  inflating: km_kh_male/wavs/khm_1161_2092267195.wav  
  inflating: km_kh_male/wavs/khm_1161_2061040949.wav  
  inflating: km_kh_male/wavs/khm_1161_2054764716.wav  

### Load KH Data

In [6]:
# from sklearn.model_selection import train_test_split
# import pandas as pd
# from datasets import load_dataset
 
# colnames=['path','drop','text'] 
# df  = pd.read_csv('km_kh_male/line_index.tsv',sep='\t',header=None,names=colnames)
# df['path'] = '/workspace/xls-r-300m-km/km_kh_male/wavs/'+df['path'] +'.wav'

# train_valid, test = train_test_split(df, test_size=0.1)
# train, valid = train_test_split(train_valid, test_size=0.1)

# train.to_csv('./km_kh_male/line_index_train.csv')
# valid.to_csv('./km_kh_male/line_index_valid.csv')
# test.to_csv('./km_kh_male/line_index_test.csv')

In [2]:
# Read the train and validation CSV index files; a lone CSV file is exposed
# under the 'train' split name, hence split='train' for both.
common_voice_train, common_voice_valid = (
    load_dataset('csv', data_files=csv_path, split='train')
    for csv_path in (
        'km_kh_male/line_index_train.csv',
        'km_kh_male/line_index_valid.csv',
    )
)

Using custom data configuration default-decaf49f8e8b5be8
Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-decaf49f8e8b5be8/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)
Using custom data configuration default-2ae3784a8d52f12b
Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-2ae3784a8d52f12b/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


In [3]:
# Drop the two unused columns and rename 'text' to 'sentence' in both splits.
common_voice_train, common_voice_valid = (
    split.remove_columns(["Unnamed: 0", "drop"]).rename_column('text', 'sentence')
    for split in (common_voice_train, common_voice_valid)
)

### Clean Up the Text

In [4]:
# # Remove character
# import re
# chars_to_remove_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\']'

# def remove_special_characters(batch):
#     batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"])
#     return batch

In [5]:
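# # TODO: somehow this breaks things. WHY? (Possibly because the second call below maps `common_voice_test`, which is not defined in this notebook.)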
# common_voice_train = common_voice_train.map(remove_special_characters, num_proc=16)
# common_voice_valid = common_voice_test.map(remove_special_characters, num_proc=16)

In [6]:
common_voice_train[0]

{'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_1443_4015603856.wav',
 'sentence': 'ទេសចរណ៍ នៅ ខេត្ត ព្រះ សីហនុ នា រដូវ បុណ្យ ភ្ជុំ បិណ្ឌ នេះ មាន ការ កើន ឡើង យ៉ាង ខ្លាំង'}

### Build Character Vocabulary

In [7]:
# Collect the distinct characters of every sentence, split by split.
vocab_train = []
vocab_test  = []

for example in tqdm(common_voice_train):
    vocab_train += set(example['sentence'])

for example in tqdm(common_voice_valid):
    vocab_test += set(example['sentence'])

  0%|          | 0/2353 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [8]:
# Merge the characters of both splits and number them in sorted order.
vocab_list = list(set(vocab_train).union(set(vocab_test)))
vocab_dict = {char: index for index, char in enumerate(sorted(vocab_list))}

In [9]:
print(vocab_dict)

{' ': 0, 'ក': 1, 'ខ': 2, 'គ': 3, 'ឃ': 4, 'ង': 5, 'ច': 6, 'ឆ': 7, 'ជ': 8, 'ឈ': 9, 'ញ': 10, 'ដ': 11, 'ឋ': 12, 'ឌ': 13, 'ឍ': 14, 'ណ': 15, 'ត': 16, 'ថ': 17, 'ទ': 18, 'ធ': 19, 'ន': 20, 'ប': 21, 'ផ': 22, 'ព': 23, 'ភ': 24, 'ម': 25, 'យ': 26, 'រ': 27, 'ល': 28, 'វ': 29, 'ស': 30, 'ហ': 31, 'ឡ': 32, 'អ': 33, 'ឥ': 34, 'ឧ': 35, 'ឪ': 36, 'ឫ': 37, 'ឬ': 38, 'ឭ': 39, 'ឮ': 40, 'ឯ': 41, 'ឱ': 42, 'ា': 43, 'ិ': 44, 'ី': 45, 'ឹ': 46, 'ឺ': 47, 'ុ': 48, 'ូ': 49, 'ួ': 50, 'ើ': 51, 'ឿ': 52, 'ៀ': 53, 'េ': 54, 'ែ': 55, 'ៃ': 56, 'ោ': 57, 'ៅ': 58, 'ំ': 59, 'ះ': 60, 'ៈ': 61, '៉': 62, '៊': 63, '់': 64, '៌': 65, '៍': 66, '៎': 67, '៏': 68, '័': 69, '្': 70}


In [10]:
# Use a visible "|" in place of the blank space so the delimiter is easy to read.
vocab_dict["|"] = vocab_dict.pop(" ")

# Append the unknown and padding tokens with the next two free ids.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

len(vocab_dict)

73

In [11]:
print(vocab_dict)

{'ក': 1, 'ខ': 2, 'គ': 3, 'ឃ': 4, 'ង': 5, 'ច': 6, 'ឆ': 7, 'ជ': 8, 'ឈ': 9, 'ញ': 10, 'ដ': 11, 'ឋ': 12, 'ឌ': 13, 'ឍ': 14, 'ណ': 15, 'ត': 16, 'ថ': 17, 'ទ': 18, 'ធ': 19, 'ន': 20, 'ប': 21, 'ផ': 22, 'ព': 23, 'ភ': 24, 'ម': 25, 'យ': 26, 'រ': 27, 'ល': 28, 'វ': 29, 'ស': 30, 'ហ': 31, 'ឡ': 32, 'អ': 33, 'ឥ': 34, 'ឧ': 35, 'ឪ': 36, 'ឫ': 37, 'ឬ': 38, 'ឭ': 39, 'ឮ': 40, 'ឯ': 41, 'ឱ': 42, 'ា': 43, 'ិ': 44, 'ី': 45, 'ឹ': 46, 'ឺ': 47, 'ុ': 48, 'ូ': 49, 'ួ': 50, 'ើ': 51, 'ឿ': 52, 'ៀ': 53, 'េ': 54, 'ែ': 55, 'ៃ': 56, 'ោ': 57, 'ៅ': 58, 'ំ': 59, 'ះ': 60, 'ៈ': 61, '៉': 62, '៊': 63, '់': 64, '៌': 65, '៍': 66, '៎': 67, '៏': 68, '័': 69, '្': 70, '|': 0, '[UNK]': 71, '[PAD]': 72}


In [12]:
import json

# Serialise the vocabulary to vocab.json in the current directory.
with open('vocab.json', mode='w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

# Tokenizer

In [13]:
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor

In [14]:
# './' makes the tokenizer load the vocab.json in the current directory.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "./",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
# Combine both into the single processor used by the rest of the notebook.
processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

In [26]:
# def speech_file_to_array_fn(batch):
#     audio_array, sampling_rate = torchaudio.load(batch["path"])
#     batch["audio"] = {
#         "array": audio_array[0].numpy(),
#         "path": batch["path"],
#         "sampling_rate": sampling_rate
#     }
#     return batch

In [27]:
# common_voice_train = common_voice_train.map(speech_file_to_array_fn)
# common_voice_test = common_voice_test.map(speech_file_to_array_fn)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [15]:
# Turn the 'path' column into a 16 kHz Audio feature and rename it to 'audio'.
common_voice_train, common_voice_valid = (
    split.cast_column("path", Audio(sampling_rate=16_000)).rename_column('path', 'audio')
    for split in (common_voice_train, common_voice_valid)
)

In [16]:
common_voice_train[0]

{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_1443_4015603856.wav',
  'array': array([-1.3359112e-06,  1.5759380e-06, -2.0205737e-06, ...,
         -7.6091878e-06,  9.0511895e-07,  0.0000000e+00], dtype=float32),
  'sampling_rate': 16000},
 'sentence': 'ទេសចរណ៍ នៅ ខេត្ត ព្រះ សីហនុ នា រដូវ បុណ្យ ភ្ជុំ បិណ្ឌ នេះ មាន ការ កើន ឡើង យ៉ាង ខ្លាំង'}

In [17]:
import IPython.display as ipd
import numpy as np
import random

# Preview one random training example: its transcript, waveform shape and
# sampling rate, followed by an audio player.
rand_int = random.randint(0, len(common_voice_train)-1)

# Fetch the row once and reuse it instead of indexing the dataset (and thereby
# re-reading the audio entry) for every printed field.
sample = common_voice_train[rand_int]
sample_audio = sample["audio"]

print("Target text:", sample["sentence"])
print("Input array shape:", sample_audio["array"].shape)
print("Sampling rate:", sample_audio["sampling_rate"])
# Play back at the rate stored with the sample rather than a hard-coded 16000,
# so the player stays correct if the Audio cast above is ever changed.
ipd.Audio(data=sample_audio["array"], autoplay=False, rate=sample_audio["sampling_rate"])

Target text: បញ្ជី ឈ្មោះ បោះឆ្នោត ខេត្ត កំពង់ចាម
Input array shape: (65536,)
Sampling rate: 16000


In [18]:
def prepare_dataset(batch):
    """Convert one dataset row into model inputs and label ids.

    `batch` is a single example (one row), not a batch of rows. The row's
    audio array and sampling rate are passed through the global `processor`,
    and its sentence is encoded to token ids with the processor in target mode.

    Adds the keys "input_values", "input_length" and "labels" to `batch` and
    returns it.
    """
    waveform = batch["audio"]

    # The processor returns a batched result; keep its only entry.
    features = processor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    batch["input_values"] = features.input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Inside this context the processor call encodes text to ids.
    with processor.as_target_processor():
        encoded_sentence = processor(batch["sentence"])
        batch["labels"] = encoded_sentence.input_ids
    return batch

In [19]:
common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, num_proc=16)
common_voice_valid = common_voice_valid.map(prepare_dataset, remove_columns=common_voice_valid.column_names, num_proc=16)

Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-decaf49f8e8b5be8/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-a8ad7f3bec152712.arrow
Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-decaf49f8e8b5be8/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-5802da9af6ac9ac7.arrow
Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-decaf49f8e8b5be8/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-418585d4baf07152.arrow
Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-decaf49f8e8b5be8/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-c7e5028c91005615.arrow
Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-decaf49f8e8b5be8/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-b96

In [20]:
# Very long samples can lead to OOM, so we should filter them out.
# max_input_length_in_sec = 5.0
# common_voice_train = common_voice_train.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

In [21]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate a list of examples into one padded batch for CTC training.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor whose ``pad`` method is used for both inputs and labels.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded unchanged to ``processor.pad``:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (or no padding if only a
              single sequence is provided).
            * :obj:`'max_length'`: pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio inputs and label ids have different lengths and are padded by
        # different methods, so they are collected and padded separately.
        audio_items = []
        label_items = []
        for feature in features:
            audio_items.append({"input_values": feature["input_values"]})
            label_items.append({"input_ids": feature["labels"]})

        batch = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")

        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(label_items, padding=self.padding, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding; set them to
        # -100 so they are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [22]:
# Collator instance that is handed to the Trainer as `data_collator`.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [23]:
# Load the "wer" metric; compute_metrics calls it on the decoded strings.
wer_metric = load_metric("wer")
# cer_metric = load_metric("cer")

In [24]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Reads `pred.predictions` (logits) and `pred.label_ids`, decodes both with
    the global `tokenizer`, and scores them with the global `wer_metric`.
    Note that `pred.label_ids` is modified in place.

    Returns:
        dict with the single key "wer".
    """
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # -100 marks label positions excluded from the loss; swap in the pad token
    # id so the labels can be decoded.
    ignored_positions = pred.label_ids == -100
    pred.label_ids[ignored_positions] = tokenizer.pad_token_id

    hypotheses = tokenizer.batch_decode(predicted_ids)
    # The reference ids are decoded with group_tokens=False, unlike the predictions.
    references = tokenizer.batch_decode(pred.label_ids, group_tokens=False)

    return {"wer": wer_metric.compute(predictions=hypotheses, references=references)}

In [25]:
from transformers import Wav2Vec2ForCTC

# Start from the pretrained XLS-R checkpoint and override its config for CTC fine-tuning.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    # vocabulary size and pad id come from the processor's tokenizer
    vocab_size=len(processor.tokenizer),
    pad_token_id=processor.tokenizer.pad_token_id,
    ctc_loss_reduction="mean",
    # dropout-related overrides
    attention_dropout=0.1,
    feat_proj_dropout=0.0,
    layerdrop=0.0,
    # mask_* overrides
    mask_time_prob=0.75,
    mask_time_length=10,
    mask_feature_prob=0.25,
    mask_feature_length=64,
)

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['project_hid.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_q.weight', 'project_hid.weight', 'quantizer.weight_proj.bias', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it 

In [26]:
# NOTE(review): presumably keeps the pretrained feature encoder fixed during
# fine-tuning — confirm against the transformers documentation.
model.freeze_feature_encoder()

In [27]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # checkpoints and the final model are written to the current directory
    output_dir='.',
    # batching
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # optimisation schedule
    num_train_epochs=100,
    learning_rate=5e-5,
    warmup_steps=1000,
    # memory / precision
    fp16=True,
    gradient_checkpointing=True,
    # evaluation, logging and checkpointing cadence
    evaluation_strategy="steps",
    eval_steps=400,
    save_steps=400,
    logging_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
)

In [29]:
from transformers import Trainer

# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # the feature extractor is what gets passed in the `tokenizer` slot here
    tokenizer=processor.feature_extractor,
)

Using amp half precision backend


In [30]:
# Run fine-tuning with the settings in `training_args`; the recorded run
# finished at global_step=7300.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.
***** Running training *****
  Num examples = 2353
  Num Epochs = 100
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 7300


Step,Training Loss,Validation Loss,Wer
400,5.0795,4.412057,1.0
800,3.5658,3.520337,1.0
1200,3.3689,2.898376,0.999596
1600,2.01,1.004076,0.728814
2000,1.6783,0.694064,0.59887
2400,1.527,0.559943,0.528249
2800,1.4278,0.482676,0.48063
3200,1.3458,0.442869,0.453188
3600,1.2893,0.415609,0.43301
4000,1.2441,0.402046,0.403955


The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.
***** Running Evaluation *****
  Num examples = 262
  Batch size = 8
Saving model checkpoint to ./checkpoint-400
Configuration saved in ./checkpoint-400/config.json
Model weights saved in ./checkpoint-400/pytorch_model.bin
Configuration saved in ./checkpoint-400/preprocessor_config.json
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.
***** Running Evaluation *****
  Num examples = 262
  Batch size = 8
Saving model checkpoint to ./checkpoint-800
Configuration saved in ./checkpoint-800/config.json
Model weights saved in ./checkpoint-800/pytorch_model.bin
Configuration saved in ./checkpoint-800/preprocessor_config.json
Deleting older checkpoint [checkpoint-3200] due to args.save_total_limit
The following columns in the evaluation set  don't have a c

TrainOutput(global_step=7300, training_loss=2.0282830110314776, metrics={'train_runtime': 14754.0737, 'train_samples_per_second': 15.948, 'train_steps_per_second': 0.495, 'total_flos': 3.5572390287970673e+19, 'train_loss': 2.0282830110314776, 'epoch': 99.99})

In [31]:
1

1

In [32]:
# Push the tokenizer files to 'vitouphy/xls-r-300m-km' (the recorded output
# shows a push to huggingface.co).
tokenizer.push_to_hub('vitouphy/xls-r-300m-km')

tokenizer config file saved in vitouphy/xls-r-300m-km/tokenizer_config.json
Special tokens file saved in vitouphy/xls-r-300m-km/special_tokens_map.json
added tokens file saved in vitouphy/xls-r-300m-km/added_tokens.json
To https://huggingface.co/vitouphy/xls-r-300m-km
   3ef5dfc..cb4f72c  main -> main



'https://huggingface.co/vitouphy/xls-r-300m-km/commit/cb4f72cb420eee8ca1f44b582a9d3cfbcd258f3d'

In [34]:
# Metadata passed as keyword arguments to trainer.create_model_card.
kwargs = {
    "finetuned_from": "facebook/wav2vec2-xls-r-300m",
    "tasks": "speech-recognition",
    "tags": ["automatic-speech-recognition", "openslr", "robust-speech-event", "km"],
    # plain string: the original f-prefix had no placeholders to interpolate
    "dataset_args": "Config: km, Training split: train, Eval split: validation",
    "dataset": "openslr",
    "language": "km"
}

In [35]:
# Generate the model card from the Trainer, using the metadata in `kwargs`.
trainer.create_model_card(**kwargs)

Dropping the following result as it does not have all the necessary fields:
{}


In [36]:
# Upload the model to 'vitouphy/xls-r-300m-km'; the recorded output shows
# config.json and pytorch_model.bin being saved and pushed.
model.push_to_hub('vitouphy/xls-r-300m-km')

Configuration saved in vitouphy/xls-r-300m-km/config.json
Model weights saved in vitouphy/xls-r-300m-km/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 3.39k/1.18G [00:00<?, ?B/s]

To https://huggingface.co/vitouphy/xls-r-300m-km
   cb4f72c..8fe8876  main -> main



'https://huggingface.co/vitouphy/xls-r-300m-km/commit/8fe88762a9fca1dce5e056605465042b5700b69e'

In [38]:
# Save the model through the Trainer; the recorded output shows the files
# being written to '.', the configured output_dir.
trainer.save_model()

Saving model checkpoint to .
Configuration saved in ./config.json
Model weights saved in ./pytorch_model.bin
Configuration saved in ./preprocessor_config.json
