In [1]:
from transformers import AutoProcessor

In [2]:
# Load the processor published alongside the fine-tuned checkpoint; its
# tokenizer and feature extractor are reused by the cells further down.
processor = AutoProcessor.from_pretrained(
    pretrained_model_name_or_path="chmanoj/xls-r-300m-te",
)

In [3]:
# Build the label table for the CTC decoder: tokens ordered by their id, with
# every token lower-cased.
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {
    token.lower(): token_id
    for token, token_id in sorted(vocab_dict.items(), key=lambda item: item[1])
}

# Lower-casing can merge two distinct tokens (e.g. "A" and "a") into a single
# dict key. That would silently shorten the label list built from these keys
# and shift every later label against the model's output columns, so fail
# loudly instead of producing a misaligned decoder.
if len(sorted_vocab_dict) != len(vocab_dict):
    raise ValueError(
        "Lower-casing the tokenizer vocabulary merged "
        f"{len(vocab_dict) - len(sorted_vocab_dict)} token(s); "
        "decoder labels would no longer line up with the model outputs."
    )

In [4]:
from pyctcdecode import build_ctcdecoder

In [9]:
# One-off environment commands, kept commented out (presumably already run
# once for this environment — TODO confirm).
# Shows which Python interpreter the shell resolves to:
# !which python

# Installs kenlm from the GitHub master archive (unpinned). The decoder cell
# passes `kenlm_model_path`, so pyctcdecode presumably needs this package.
# !pip install https://github.com/kpu/kenlm/archive/master.zip

In [5]:
# Create the LM-backed CTC decoder. Labels are the vocabulary keys in id order;
# the ARPA path is relative, so it is resolved against the working directory.
decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict),
    kenlm_model_path="3gram_correct.arpa",
)

Loading the LM will be faster if you build a binary file.
Reading /mnt/c/Projects/Speech/xls-R-finetuning/xls-r-300m-te/3gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Unigrams and labels don't seem to agree.


In [6]:
from transformers import Wav2Vec2ProcessorWithLM

# Combine the original processor's feature extractor and tokenizer with the
# language-model decoder into a single LM-boosted processor.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    decoder=decoder,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
)

In [7]:
import os
# Display the working directory: the relative paths used in this notebook
# resolve against it, and it is also the directory the LM processor is saved to.
os.getcwd()

'/mnt/c/Projects/Speech/xls-R-finetuning/xls-r-300m-te'

In [8]:
# Write the LM-boosted processor into the current working directory.
processor_with_lm.save_pretrained(save_directory=os.getcwd())

In [10]:
# Convert the ARPA language model to a binary file with the locally built KenLM
# `build_binary` tool (all paths are relative to the working directory).
# NOTE(review): no cell in this notebook removes the source .arpa afterwards —
# confirm whether it should be deleted from language_model/ before pushing.
!../kenlm/build/bin/build_binary language_model/3gram_correct.arpa language_model/3gram.bin

Reading language_model/3gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS


In [11]:
from huggingface_hub import Repository

In [15]:
# Wrap the current directory in a huggingface_hub Repository handle.
# NOTE(review): newer huggingface_hub releases reportedly deprecate
# `Repository` — confirm against the installed version.
repo = Repository(
    local_dir=".",
)

In [16]:
# Commit and push the repository contents to the Hub; the value returned by
# the call is displayed as the cell output.
repo.push_to_hub(
    commit_message="Upload lm-boosted decoder",
)

Upload file language_model/3gram.bin:   0%|          | 32.0k/771M [00:00<?, ?B/s]

Upload file language_model/unigrams.txt:   0%|          | 32.0k/39.0M [00:00<?, ?B/s]

To https://huggingface.co/chmanoj/xls-r-300m-te
   aa77a85..dbca3b5  main -> main



'https://huggingface.co/chmanoj/xls-r-300m-te/commit/dbca3b5d87436c5615b2460922b94a15a878c713'

## Evaluation

In [None]:
# Runs eval.py for the model on the "test" split of openslr_SLR66 (config "te")
# with output logging enabled. Left commented out — this cell has no execution
# count — so remove the leading `#` to run it.
#!python eval.py --model_id="chmanoj/xls-r-300m-te" --dataset="openslr_SLR66" --config="te" --split="test" --log_outputs

In [10]:
from huggingface_hub.repocard import metadata_load

In [18]:
# Read the metadata of the local README.md so the recorded evaluation metrics
# can be inspected. The name `x` is kept because a later cell displays it.
x = metadata_load(local_path="README.md")

In [19]:
# Show the metadata loaded from README.md (`x` is the result of metadata_load).
x

{'language': ['te'],
 'license': 'apache-2.0',
 'tags': ['automatic-speech-recognition',
  'openslr_SLR66',
  'generated_from_trainer',
  'robust-speech-event'],
 'datasets': ['openslr', 'SLR66'],
 'metrics': ['wer'],
 'model-index': [{'name': 'xls-r-300m-te',
   'results': [{'task': {'type': 'automatic-speech-recognition',
      'name': 'Speech Recognition'},
     'dataset': {'type': 'openslr', 'name': 'Open SLR', 'args': 'SLR66'},
     'metrics': [{'type': 'wer',
       'value': 24.695121951219512,
       'name': 'Test WER'},
      {'type': 'cer', 'value': 4.861934182322532, 'name': 'Test CER'}]}]}]}