AndrewMcDowell committed
Commit f7f72aa (1 parent: e39fb91)

Training in progress, step 1000

.gitignore ADDED
@@ -0,0 +1 @@
+ checkpoint-*/
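
This keeps the intermediate checkpoint-<step>/ directories (written every --save_steps=1000 steps, per run_training.sh below) out of version control, so commits like this one ("Training in progress, step 1000") carry only the top-level model files.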
.ipynb_checkpoints/run_speech_recognition_ctc_bnb-checkpoint.py ADDED
@@ -0,0 +1,771 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ 
+ """ Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
+ 
+ import functools
+ import json
+ import logging
+ import os
+ import re
+ import sys
+ import warnings
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Optional, Union
+ 
+ import datasets
+ import numpy as np
+ import torch
+ from datasets import DatasetDict, load_dataset, load_metric
+ 
+ import bitsandbytes as bnb
+ import transformers
+ from transformers import (
+     AutoConfig,
+     AutoFeatureExtractor,
+     AutoModelForCTC,
+     AutoProcessor,
+     AutoTokenizer,
+     HfArgumentParser,
+     Trainer,
+     TrainingArguments,
+     Wav2Vec2Processor,
+     set_seed,
+ )
+ from transformers.trainer_pt_utils import get_parameter_names
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
+ from transformers.utils import check_min_version
+ from transformers.utils.versions import require_version
+ 
+ 
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risk.
+ check_min_version("4.16.0.dev0")
+ 
+ require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
+ 
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ def list_field(default=None, metadata=None):
+     return field(default_factory=lambda: default, metadata=metadata)
+ 
+ 
+ @dataclass
+ class ModelArguments:
+     """
+     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+     """
+ 
+     model_name_or_path: str = field(
+         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+     )
+     tokenizer_name_or_path: Optional[str] = field(
+         default=None,
+         metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
+     )
+     cache_dir: Optional[str] = field(
+         default=None,
+         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+     )
+     freeze_feature_encoder: bool = field(
+         default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
+     )
+     attention_dropout: float = field(
+         default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
+     )
+     activation_dropout: float = field(
+         default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
+     )
+     feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
+     hidden_dropout: float = field(
+         default=0.0,
+         metadata={
+             "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
+         },
+     )
+     final_dropout: float = field(
+         default=0.0,
+         metadata={"help": "The dropout probability for the final projection layer."},
+     )
+     mask_time_prob: float = field(
+         default=0.05,
+         metadata={
+             "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector "
+             "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature "
+             "vectors will be masked along the time axis."
+         },
+     )
+     mask_time_length: int = field(
+         default=10,
+         metadata={"help": "Length of vector span to mask along the time axis."},
+     )
+     mask_feature_prob: float = field(
+         default=0.0,
+         metadata={
+             "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector "
+             "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
+         },
+     )
+     mask_feature_length: int = field(
+         default=10,
+         metadata={"help": "Length of vector span to mask along the feature axis."},
+     )
+     layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
+     ctc_loss_reduction: Optional[str] = field(
+         default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
+     )
+ 
+ 
+ @dataclass
+ class DataTrainingArguments:
+     """
+     Arguments pertaining to what data we are going to input our model for training and eval.
+ 
+     Using `HfArgumentParser` we can turn this class
+     into argparse arguments to be able to specify them on
+     the command line.
+     """
+ 
+     dataset_name: str = field(
+         metadata={"help": "The name of the dataset to use (via the datasets library)."}
+     )
+     dataset_config_name: Optional[str] = field(
+         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+     )
+     train_split_name: str = field(
+         default="train+validation",
+         metadata={
+             "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train+validation'"
+         },
+     )
+     eval_split_name: str = field(
+         default="test",
+         metadata={
+             "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
+         },
+     )
+     audio_column_name: str = field(
+         default="audio",
+         metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
+     )
+     text_column_name: str = field(
+         default="text",
+         metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
+     )
+     overwrite_cache: bool = field(
+         default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+     )
+     preprocessing_num_workers: Optional[int] = field(
+         default=None,
+         metadata={"help": "The number of processes to use for the preprocessing."},
+     )
+     max_train_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+             "value if set."
+         },
+     )
+     max_eval_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+             "value if set."
+         },
+     )
+     chars_to_ignore: Optional[List[str]] = list_field(
+         default=None,
+         metadata={"help": "A list of characters to remove from the transcripts."},
+     )
+     eval_metrics: List[str] = list_field(
+         default=["wer"],
+         metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
+     )
+     max_duration_in_seconds: float = field(
+         default=20.0,
+         metadata={
+             "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds"
+         },
+     )
+     min_duration_in_seconds: float = field(
+         default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
+     )
+     preprocessing_only: bool = field(
+         default=False,
+         metadata={
+             "help": "Whether to only do data preprocessing and skip training. "
+             "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
+             "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
+             "so that the cached datasets can consequently be loaded in distributed training"
+         },
+     )
+     use_auth_token: bool = field(
+         default=False,
+         metadata={
+             "help": "If :obj:`True`, will use the token generated when running "
+             ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
+         },
+     )
+     unk_token: str = field(
+         default="[UNK]",
+         metadata={"help": "The unk token for the tokenizer"},
+     )
+     pad_token: str = field(
+         default="[PAD]",
+         metadata={"help": "The padding token for the tokenizer"},
+     )
+     word_delimiter_token: str = field(
+         default="|",
+         metadata={"help": "The word delimiter token for the tokenizer"},
+     )
+     phoneme_language: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "The target language that should be"
+             " passed to the tokenizer for tokenization. Note that"
+             " this is only relevant if the model classifies the"
+             " input audio to a sequence of phoneme sequences."
+         },
+     )
+ 
+ 
+ @dataclass
+ class DataCollatorCTCWithPadding:
+     """
+     Data collator that will dynamically pad the inputs received.
+     Args:
+         processor (:class:`~transformers.AutoProcessor`)
+             The processor used for processing the data.
+         padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+             Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+             among:
+             * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+               sequence is provided).
+             * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+               maximum acceptable input length for the model if that argument is not provided.
+             * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+               different lengths).
+         max_length (:obj:`int`, `optional`):
+             Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+         max_length_labels (:obj:`int`, `optional`):
+             Maximum length of the ``labels`` returned list and optionally padding length (see above).
+         pad_to_multiple_of (:obj:`int`, `optional`):
+             If set will pad the sequence to a multiple of the provided value.
+             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+             7.5 (Volta).
+     """
+ 
+     processor: AutoProcessor
+     padding: Union[bool, str] = "longest"
+     pad_to_multiple_of: Optional[int] = None
+     pad_to_multiple_of_labels: Optional[int] = None
+ 
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         # split inputs and labels since they have to be of different lengths and need
+         # different padding methods
+         input_features = [{"input_values": feature["input_values"]} for feature in features]
+         label_features = [{"input_ids": feature["labels"]} for feature in features]
+ 
+         batch = self.processor.pad(
+             input_features,
+             padding=self.padding,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             return_tensors="pt",
+         )
+ 
+         with self.processor.as_target_processor():
+             labels_batch = self.processor.pad(
+                 label_features,
+                 padding=self.padding,
+                 pad_to_multiple_of=self.pad_to_multiple_of_labels,
+                 return_tensors="pt",
+             )
+ 
+         # replace padding with -100 to ignore loss correctly
+         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+ 
+         batch["labels"] = labels
+ 
+         return batch
+ 
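+ # Illustrative example of the collator's behaviour (comments only, not executed):
+ # given features {"input_values": [0.1, 0.2, 0.3], "labels": [5, 2]} and
+ # {"input_values": [0.4], "labels": [7]}, `processor.pad` returns a (2, 3)
+ # float tensor of input_values plus an attention_mask, and the labels come
+ # back as [[5, 2], [7, -100]], so the padded label position is ignored by
+ # the CTC loss.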
+ 
+ def create_vocabulary_from_data(
+     datasets: DatasetDict,
+     word_delimiter_token: Optional[str] = None,
+     unk_token: Optional[str] = None,
+     pad_token: Optional[str] = None,
+ ):
+     # Given training and test labels create vocabulary
+     def extract_all_chars(batch):
+         all_text = " ".join(batch["target_text"])
+         vocab = list(set(all_text))
+         return {"vocab": [vocab], "all_text": [all_text]}
+ 
+     vocabs = datasets.map(
+         extract_all_chars,
+         batched=True,
+         batch_size=-1,
+         keep_in_memory=True,
+         remove_columns=datasets["train"].column_names,
+     )
+ 
+     # take union of all unique characters in each dataset
+     vocab_set = functools.reduce(
+         lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
+     )
+ 
+     vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
+ 
+     # replace white space with delimiter token
+     if word_delimiter_token is not None:
+         vocab_dict[word_delimiter_token] = vocab_dict[" "]
+         del vocab_dict[" "]
+ 
+     # add unk and pad token
+     if unk_token is not None:
+         vocab_dict[unk_token] = len(vocab_dict)
+ 
+     if pad_token is not None:
+         vocab_dict[pad_token] = len(vocab_dict)
+ 
+     return vocab_dict
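+ 
+ 
+ # Illustrative (not executed): for target texts "ab" and "ba c", the character
+ # set {" ", "a", "b", "c"} sorts to give {" ": 0, "a": 1, "b": 2, "c": 3};
+ # the space is then swapped for the word delimiter and the special tokens are
+ # appended, yielding {"|": 0, "a": 1, "b": 2, "c": 3, "[UNK]": 4, "[PAD]": 5}.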
+ 
+ 
+ def main():
+     # See all possible arguments in src/transformers/training_args.py
+     # or by passing the --help flag to this script.
+     # We now keep distinct sets of args, for a cleaner separation of concerns.
+ 
+     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+         # If we pass only one argument to the script and it's the path to a json file,
+         # let's parse it to get our arguments.
+         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+     else:
+         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+ 
+     # Detecting last checkpoint.
+     last_checkpoint = None
+     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+         last_checkpoint = get_last_checkpoint(training_args.output_dir)
+         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+             raise ValueError(
+                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                 "Use --overwrite_output_dir to overcome."
+             )
+         elif last_checkpoint is not None:
+             logger.info(
+                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+             )
+ 
+     # Setup logging
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         datefmt="%m/%d/%Y %H:%M:%S",
+         handlers=[logging.StreamHandler(sys.stdout)],
+     )
+     logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
+ 
+     # Log on each process the small summary:
+     logger.warning(
+         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+         f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+     )
+     # Set the verbosity to info of the Transformers logger (on main process only):
+     if is_main_process(training_args.local_rank):
+         transformers.utils.logging.set_verbosity_info()
+     logger.info("Training/evaluation parameters %s", training_args)
+ 
+     # Set seed before initializing model.
+     set_seed(training_args.seed)
+ 
+     # 1. First, let's load the dataset
+     raw_datasets = DatasetDict()
+ 
+     if training_args.do_train:
+         raw_datasets["train"] = load_dataset(
+             data_args.dataset_name,
+             data_args.dataset_config_name,
+             split=data_args.train_split_name,
+             use_auth_token=data_args.use_auth_token,
+         )
+ 
+         if data_args.audio_column_name not in raw_datasets["train"].column_names:
+             raise ValueError(
+                 f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
+                 "Make sure to set `--audio_column_name` to the correct audio column - one of "
+                 f"{', '.join(raw_datasets['train'].column_names)}."
+             )
+ 
+         if data_args.text_column_name not in raw_datasets["train"].column_names:
+             raise ValueError(
+                 f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
+                 "Make sure to set `--text_column_name` to the correct text column - one of "
+                 f"{', '.join(raw_datasets['train'].column_names)}."
+             )
+ 
+         if data_args.max_train_samples is not None:
+             raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
+ 
+     if training_args.do_eval:
+         raw_datasets["eval"] = load_dataset(
+             data_args.dataset_name,
+             data_args.dataset_config_name,
+             split=data_args.eval_split_name,
+             use_auth_token=data_args.use_auth_token,
+         )
+ 
+         if data_args.max_eval_samples is not None:
+             raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
+ 
+     # 2. We remove some special characters from the datasets
+     # that make training complicated and do not help in transcribing the speech
+     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
+     # that could be easily picked up by the model
+     from pykakasi import kakasi
+ 
+     kakasi = kakasi()
+     kakasi.setMode('J', 'H')  # Convert from kanji to hiragana
+     # kakasi.setMode("K", "H")  # Convert from katakana to hiragana
+     conv = kakasi.getConverter()
+ 
+     chars_to_ignore_regex = (
+         f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(\!]'
+     )
+     text_column_name = data_args.text_column_name
+ 
+     def remove_special_characters(batch):
+         if chars_to_ignore_regex is not None:
+             batch["target_text"] = conv.do(re.sub(chars_to_ignore_regex, "", batch[text_column_name])) + " "
+         else:
+             batch["target_text"] = batch[text_column_name].lower() + " "
+         return batch
+ 
+     with training_args.main_process_first(desc="dataset map special characters removal"):
+         raw_datasets = raw_datasets.map(
+             remove_special_characters,
+             remove_columns=[text_column_name],
+             desc="remove special characters from datasets",
+         )
+ 
+     # save special tokens for tokenizer
+     word_delimiter_token = data_args.word_delimiter_token
+     unk_token = data_args.unk_token
+     pad_token = data_args.pad_token
+ 
+     # 3. Next, let's load the config as we might need it to create
+     # the tokenizer
+     # load config
+     config = AutoConfig.from_pretrained(
+         model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+     )
+ 
+     # 4. Next, if no tokenizer file is defined,
+     # we create the vocabulary of the model by extracting all unique characters from
+     # the training and evaluation datasets
+     # We need to make sure that only first rank saves vocabulary
+     # make sure all processes wait until vocab is created
+     tokenizer_name_or_path = model_args.tokenizer_name_or_path
+     tokenizer_kwargs = {}
+     if tokenizer_name_or_path is None:
+         # save vocab in training output dir
+         tokenizer_name_or_path = training_args.output_dir
+ 
+         vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
+ 
+         with training_args.main_process_first():
+             if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
+                 os.remove(vocab_file)
+ 
+         with training_args.main_process_first(desc="dataset map vocabulary creation"):
+             if not os.path.isfile(vocab_file):
+                 os.makedirs(tokenizer_name_or_path, exist_ok=True)
+                 vocab_dict = create_vocabulary_from_data(
+                     raw_datasets,
+                     word_delimiter_token=word_delimiter_token,
+                     unk_token=unk_token,
+                     pad_token=pad_token,
+                 )
+ 
+                 # save vocab dict to be loaded into tokenizer
+                 with open(vocab_file, "w") as file:
+                     json.dump(vocab_dict, file)
+ 
+         # if tokenizer has just been created
+         # it is defined by `tokenizer_class` if present in config else by `model_type`
+         tokenizer_kwargs = {
+             "config": config if config.tokenizer_class is not None else None,
+             "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
+             "unk_token": unk_token,
+             "pad_token": pad_token,
+             "word_delimiter_token": word_delimiter_token,
+         }
+ 
+     # 5. Now we can instantiate the feature extractor, tokenizer and model
+     # Note for distributed training, the .from_pretrained methods guarantee that only
+     # one local process can concurrently download model & vocab.
+ 
+     # load feature_extractor and tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(
+         tokenizer_name_or_path,
+         use_auth_token=data_args.use_auth_token,
+         **tokenizer_kwargs,
+     )
+     feature_extractor = AutoFeatureExtractor.from_pretrained(
+         model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+     )
+ 
+     # adapt config
+     config.update(
+         {
+             "feat_proj_dropout": model_args.feat_proj_dropout,
+             "attention_dropout": model_args.attention_dropout,
+             "hidden_dropout": model_args.hidden_dropout,
+             "final_dropout": model_args.final_dropout,
+             "mask_time_prob": model_args.mask_time_prob,
+             "mask_time_length": model_args.mask_time_length,
+             "mask_feature_prob": model_args.mask_feature_prob,
+             "mask_feature_length": model_args.mask_feature_length,
+             "gradient_checkpointing": training_args.gradient_checkpointing,
+             "layerdrop": model_args.layerdrop,
+             "ctc_loss_reduction": model_args.ctc_loss_reduction,
+             "pad_token_id": tokenizer.pad_token_id,
+             "vocab_size": len(tokenizer),
+             "activation_dropout": model_args.activation_dropout,
+         }
+     )
+ 
+     # create model
+     model = AutoModelForCTC.from_pretrained(
+         model_args.model_name_or_path,
+         cache_dir=model_args.cache_dir,
+         config=config,
+         use_auth_token=data_args.use_auth_token,
+     )
+ 
+     # freeze encoder
+     if model_args.freeze_feature_encoder:
+         model.freeze_feature_encoder()
+ 
+     # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
+     # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
+     # so that we just need to set the correct target sampling rate and normalize the input
+     # via the `feature_extractor`
+ 
+     # make sure that dataset decodes audio with correct sampling rate
+     dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
+     if dataset_sampling_rate != feature_extractor.sampling_rate:
+         raw_datasets = raw_datasets.cast_column(
+             data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
+         )
+ 
+     # derive max & min input length for sample rate & max duration
+     max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
+     min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
+     audio_column_name = data_args.audio_column_name
+     num_workers = data_args.preprocessing_num_workers
+ 
+     # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
+     phoneme_language = data_args.phoneme_language
+ 
+     # Preprocessing the datasets.
+     # We need to read the audio files as arrays and tokenize the targets.
+     def prepare_dataset(batch):
+         # load audio
+         sample = batch[audio_column_name]
+ 
+         inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
+         batch["input_values"] = inputs.input_values[0]
+         batch["input_length"] = len(batch["input_values"])
+ 
+         # encode targets
+         additional_kwargs = {}
+         if phoneme_language is not None:
+             additional_kwargs["phonemizer_lang"] = phoneme_language
+ 
+         batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
+         return batch
+ 
+     with training_args.main_process_first(desc="dataset map preprocessing"):
+         vectorized_datasets = raw_datasets.map(
+             prepare_dataset,
+             remove_columns=next(iter(raw_datasets.values())).column_names,
+             num_proc=num_workers,
+             desc="preprocess datasets",
+         )
+ 
+     def is_audio_in_length_range(length):
+         return length > min_input_length and length < max_input_length
+ 
+     # filter data that is shorter than min_input_length
+     vectorized_datasets = vectorized_datasets.filter(
+         is_audio_in_length_range,
+         num_proc=num_workers,
+         input_columns=["input_length"],
+     )
+ 
+     # 7. Next, we can prepare the training.
+     # Let's use word error rate (WER) as our evaluation metric,
+     # instantiate a data collator and the trainer
+ 
+     # Define evaluation metrics during training, *i.e.* word error rate, character error rate
+     eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
+ 
+     # for large datasets it is advised to run the preprocessing on a
+     # single machine first with ``args.preprocessing_only`` since there will most likely
+     # be a timeout when running the script in distributed mode.
+     # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
+     # cached dataset
+     if data_args.preprocessing_only:
+         logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
+         return
+ 
+     def compute_metrics(pred):
+         pred_logits = pred.predictions
+         pred_ids = np.argmax(pred_logits, axis=-1)
+ 
+         pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
+ 
+         pred_str = tokenizer.batch_decode(pred_ids)
+         # we do not want to group tokens when computing the metrics
+         label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
+ 
+         metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
+ 
+         return metrics
+ 
+     # Now save everything to be able to create a single processor later
+     if is_main_process(training_args.local_rank):
+         # save feature extractor, tokenizer and config
+         feature_extractor.save_pretrained(training_args.output_dir)
+         tokenizer.save_pretrained(training_args.output_dir)
+         config.save_pretrained(training_args.output_dir)
+ 
+     try:
+         processor = AutoProcessor.from_pretrained(training_args.output_dir)
+     except (OSError, KeyError):
+         warnings.warn(
+             "Loading a processor from a feature extractor config that does not"
+             " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
+             " attribute to your `preprocessor_config.json` file to suppress this warning: "
+             " `'processor_class': 'Wav2Vec2Processor'`",
+             FutureWarning,
+         )
+         processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
+ 
+     # Instantiate custom data collator
+     data_collator = DataCollatorCTCWithPadding(processor=processor)
+ 
+     decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm])
+     decay_parameters = [name for name in decay_parameters if "bias" not in name]
+     optimizer_grouped_parameters = [
+         {
+             "params": [p for n, p in model.named_parameters() if n in decay_parameters],
+             "weight_decay": training_args.weight_decay,
+         },
+         {
+             "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
+             "weight_decay": 0.0,
+         },
+     ]
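+     # Note: weight decay is applied only to the first group above, since
+     # `get_parameter_names` excludes LayerNorm parameters and the list
+     # comprehension drops biases - the usual convention for transformer
+     # fine-tuning. `bnb.optim.Adam8bit` below keeps Adam's two moment states
+     # in 8-bit precision instead of 32-bit, roughly a 4x saving in
+     # optimizer-state memory.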
+     optimizer = bnb.optim.Adam8bit(
+         params=optimizer_grouped_parameters,
+         lr=training_args.learning_rate,
+         betas=(training_args.adam_beta1, training_args.adam_beta2),
+         eps=training_args.adam_epsilon,
+     )
+ 
+     optimizers = (optimizer, None)
+ 
+     # Initialize Trainer
+     trainer = Trainer(
+         model=model,
+         data_collator=data_collator,
+         args=training_args,
+         compute_metrics=compute_metrics,
+         train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
+         eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
+         tokenizer=feature_extractor,
+         optimizers=optimizers,
+     )
+ 
+     # 8. Finally, we can start training
+ 
+     # Training
+     if training_args.do_train:
+ 
+         # use last checkpoint if it exists
+         if last_checkpoint is not None:
+             checkpoint = last_checkpoint
+         elif os.path.isdir(model_args.model_name_or_path):
+             checkpoint = model_args.model_name_or_path
+         else:
+             checkpoint = None
+ 
+         train_result = trainer.train(resume_from_checkpoint=checkpoint)
+         trainer.save_model()
+ 
+         metrics = train_result.metrics
+         max_train_samples = (
+             data_args.max_train_samples
+             if data_args.max_train_samples is not None
+             else len(vectorized_datasets["train"])
+         )
+         metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
+ 
+         trainer.log_metrics("train", metrics)
+         trainer.save_metrics("train", metrics)
+         trainer.save_state()
+ 
+     # Evaluation
+     results = {}
+     if training_args.do_eval:
+         logger.info("*** Evaluate ***")
+         metrics = trainer.evaluate()
+         max_eval_samples = (
+             data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
+         )
+         metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
+ 
+         trainer.log_metrics("eval", metrics)
+         trainer.save_metrics("eval", metrics)
+ 
+     # Write model card and (optionally) push to hub
+     config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
+     kwargs = {
+         "finetuned_from": model_args.model_name_or_path,
+         "tasks": "speech-recognition",
+         "tags": ["automatic-speech-recognition", data_args.dataset_name],
+         "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
+         "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
+     }
+     if "common_voice" in data_args.dataset_name:
+         kwargs["language"] = config_name
+ 
+     if training_args.push_to_hub:
+         trainer.push_to_hub(**kwargs)
+     else:
+         trainer.create_model_card(**kwargs)
+ 
+     return results
+ 
+ 
+ if __name__ == "__main__":
+     main()
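
The -100 sentinel produced by the collator is what keeps padded label positions out of the loss: Trainer passes `labels` through to the model, and the CTC loss ignores positions equal to -100. A minimal standalone sketch of that masking step (plain PyTorch with toy values; not part of the committed script):

    import torch

    input_ids = torch.tensor([[5, 2], [7, 0]])       # 0 is the tokenizer's pad id
    attention_mask = torch.tensor([[1, 1], [1, 0]])  # second sequence has one real token
    labels = input_ids.masked_fill(attention_mask.ne(1), -100)
    print(labels)  # [[5, 2], [7, -100]]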
.ipynb_checkpoints/run_training-checkpoint.sh ADDED
@@ -0,0 +1,32 @@
+ python run_speech_recognition_ctc_bnb.py \
+     --dataset_name="mozilla-foundation/common_voice_8_0" \
+     --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
+     --dataset_config_name="ja" \
+     --output_dir="./" \
+     --overwrite_output_dir \
+     --num_train_epochs="10" \
+     --per_device_train_batch_size="48" \
+     --per_device_eval_batch_size="8" \
+     --learning_rate="7.5e-5" \
+     --warmup_steps="2000" \
+     --length_column_name="input_length" \
+     --evaluation_strategy="steps" \
+     --text_column_name="sentence" \
+     --save_steps="1000" \
+     --eval_steps="1000" \
+     --logging_steps="100" \
+     --layerdrop="0.0" \
+     --activation_dropout="0.1" \
+     --save_total_limit="4" \
+     --freeze_feature_encoder \
+     --feat_proj_dropout="0.0" \
+     --mask_time_prob="0.75" \
+     --mask_time_length="10" \
+     --mask_feature_prob="0.25" \
+     --mask_feature_length="64" \
+     --gradient_checkpointing \
+     --use_auth_token \
+     --fp16 \
+     --group_by_length \
+     --do_train --do_eval \
+     --push_to_hub
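
Because HfArgumentParser falls back to `parse_json_file` when the script receives a single `*.json` argument, the same run can be driven from a config file instead of flags. A sketch with a hypothetical file name `train_config.json` (values copied from the flags above, abbreviated):

    {
      "dataset_name": "mozilla-foundation/common_voice_8_0",
      "model_name_or_path": "facebook/wav2vec2-xls-r-300m",
      "dataset_config_name": "ja",
      "output_dir": "./",
      "num_train_epochs": 10,
      "per_device_train_batch_size": 48,
      "learning_rate": 7.5e-5,
      "do_train": true,
      "do_eval": true
    }

    python run_speech_recognition_ctc_bnb.py train_config.json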
.ipynb_checkpoints/speech_training_notebook-checkpoint.ipynb ADDED
@@ -0,0 +1,1490 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "b7523cd66cf343f98fd3006be918a3b6",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Downloading: 0%| | 0.00/10.1k [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "251cac7b8968405eafd54e2d29165b40",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Downloading: 0%| | 0.00/2.98k [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "528c6a67efea4512b04b06a32156d5b7",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Downloading: 0%| | 0.00/53.1k [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Downloading and preparing dataset common_voice/ja to /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ja/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8...\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "6c21c5f782734b3bb3f545cef5b59ee0",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Downloading: 0%| | 0.00/958M [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "0 examples [00:00, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "0 examples [00:00, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "0 examples [00:00, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "0 examples [00:00, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "0 examples [00:00, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Dataset common_voice downloaded and prepared to /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ja/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8. Subsequent calls will reuse this data.\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ja/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "10623\n"
+      ]
+     }
+    ],
+    "source": [
+     "from datasets import Audio, Dataset, load_dataset, load_metric\n",
+     "from transformers import AutoFeatureExtractor, pipeline\n",
+     "\n",
+     "language_code = \"ja\"\n",
+     "dataset_name = \"mozilla-foundation/common_voice_8_0\"\n",
+     "\n",
+     "common_voice_train = load_dataset(dataset_name, language_code, use_auth_token=True, split=\"train+validation\")\n",
+     "common_voice_test = load_dataset(dataset_name, language_code, use_auth_token=True, split=\"test\")\n",
+     "\n",
+     "\n",
+     "print(len(common_voice_train))\n",
+     "\n",
+     "# # for testing: only process the first two examples as a test\n",
+     "# dataset = dataset.select(range(10))\n",
+     "\n",
+     "\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Collecting pykakasi\n",
+       " Downloading pykakasi-2.2.1-py3-none-any.whl (2.4 MB)\n",
+       " |████████████████████████████████| 2.4 MB 9.9 MB/s \n",
+       "\u001b[?25hCollecting jaconv\n",
+       " Downloading jaconv-0.3.tar.gz (15 kB)\n",
+       " Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+       "\u001b[?25hCollecting deprecated\n",
+       " Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)\n",
+       "Collecting wrapt<2,>=1.10\n",
+       " Downloading wrapt-1.13.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (84 kB)\n",
+       " |████████████████████████████████| 84 kB 12.8 MB/s \n",
+       "\u001b[?25hBuilding wheels for collected packages: jaconv\n",
+       " Building wheel for jaconv (setup.py) ... \u001b[?25ldone\n",
+       "\u001b[?25h Created wheel for jaconv: filename=jaconv-0.3-py3-none-any.whl size=15553 sha256=fd764f215e4d567cb60062a7052497b66729e9e2190e2e00153e0d19734088e7\n",
+       " Stored in directory: /workspace/.cache/pip/wheels/73/e8/fb/b4ad8117719f79ac73bc05406d1768f845688cdbeed7aad87e\n",
+       "Successfully built jaconv\n",
+       "Installing collected packages: wrapt, jaconv, deprecated, pykakasi\n",
+       "Successfully installed deprecated-1.2.13 jaconv-0.3 pykakasi-2.2.1 wrapt-1.13.3\n",
+       "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.2 is available.\n",
+       "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
+      ]
+     }
+    ],
+    "source": [
+     "!pip install pykakasi"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "にんじゃ ひらがな kana\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/tmp/ipykernel_2159/3076271513.py:4: DeprecationWarning: Call to deprecated method setMode. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n",
+       " kakasi.setMode('J', 'H') #Convert from kanji to hiragana\n",
+       "/tmp/ipykernel_2159/3076271513.py:6: DeprecationWarning: Call to deprecated method getConverter. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n",
+       " conv = kakasi.getConverter()\n",
+       "/tmp/ipykernel_2159/3076271513.py:10: DeprecationWarning: Call to deprecated method do. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n",
+       " print(conv.do(str))\n"
+      ]
+     }
+    ],
+    "source": [
+     "from pykakasi import kakasi\n",
+     "\n",
+     "kakasi = kakasi()\n",
+     "kakasi.setMode('J', 'H') # Convert from kanji to hiragana\n",
+     "# kakasi.setMode(\"K\", \"H\") # Convert from katakana to hiragana\n",
+     "conv = kakasi.getConverter()\n",
+     "\n",
+     "str = 'にんじゃ 平仮名 kana'\n",
+     "\n",
+     "print(conv.do(str))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "repo_name = 'https://huggingface.co/AndrewMcDowell/wav2vec2-xls-r-1B-german'\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
+     "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
+     "\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 15,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "ad26c4d7d02948a3bc30d86a0f3527c8",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "0ex [00:00, ?ex/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/tmp/ipykernel_2159/322450745.py:5: DeprecationWarning: Call to deprecated method do. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n",
+       " batch[\"sentence\"] = conv.do(re.sub(chars_to_remove_regex, '', batch[\"sentence\"]))\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "93295f1cd50f4557a96ff1bf139c9a37",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "0ex [00:00, ?ex/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "import re\n",
+     "chars_to_remove_regex = '[\\,\\?\\!\\-\\;\\:\\\"\\“\\%\\‘\\”\\�\\—\\’\\…\\–\\(\\,\\[\\]\\)\\(\\!]'\n",
+     "# \\.\n",
+     "def remove_special_characters(batch):\n",
+     "    batch[\"sentence\"] = conv.do(re.sub(chars_to_remove_regex, '', batch[\"sentence\"]))\n",
+     "    return batch\n",
+     "\n",
+     "common_voice_train = common_voice_train.map(remove_special_characters)\n",
+     "common_voice_test = common_voice_test.map(remove_special_characters)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 6,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Collecting num2words\n",
+       " Downloading num2words-0.5.10-py3-none-any.whl (101 kB)\n",
+       " |████████████████████████████████| 101 kB 7.9 MB/s \n",
+       "\u001b[?25hCollecting docopt>=0.6.2\n",
+       " Downloading docopt-0.6.2.tar.gz (25 kB)\n",
+       " Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+       "\u001b[?25hBuilding wheels for collected packages: docopt\n",
+       " Building wheel for docopt (setup.py) ... \u001b[?25ldone\n",
+       "\u001b[?25h Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13704 sha256=7cda85e4b3980668714aad8f5d706fb5b189c2804ce1d99ca2380537fdc78031\n",
+       " Stored in directory: /workspace/.cache/pip/wheels/56/ea/58/ead137b087d9e326852a851351d1debf4ada529b6ac0ec4e8c\n",
+       "Successfully built docopt\n",
+       "Installing collected packages: docopt, num2words\n",
+       "Successfully installed docopt-0.6.2 num2words-0.5.10\n",
+       "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.2 is available.\n",
+       "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
+      ]
+     }
+    ],
+    "source": [
+     "!pip install num2words"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "0da8fd9cdae64c1fa80fbcfc412bcf9c",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "0ex [00:00, ?ex/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "\n",
+     "from num2words import num2words\n",
+     "import regex as re\n",
+     "matches = []\n",
+     "\n",
+     "def replace_numbers(match):\n",
+     "    match = match.group()\n",
+     "    matches.append(match)\n",
+     "    return num2words(match, lang='de')\n",
+     "\n",
+     "def replace_numbers_in_batch(batch):\n",
+     "    batch[\"sentence\"] = re.sub(r'\\d+(?:,\\d+)?', replace_numbers, batch[\"sentence\"])\n",
+     "    return batch\n",
+     "\n",
+     "common_voice_test_2 = common_voice_test.map(replace_numbers_in_batch)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 10,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "54d62ea7a0214b6abc5de1f106b330dc",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "0ex [00:00, ?ex/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "common_voice_train_2 = common_voice_train.map(replace_numbers_in_batch)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 11,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "0"
+       ]
+      },
+      "execution_count": 11,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "len(matches)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# def replace_accented_characters(batch):\n",
+     "#     accented_string = u'Málaga'\n",
+     "#     # accented_string is of type 'unicode'\n",
+     "#     import unidecode\n",
+     "#     unaccented_string = unidecode.unidecode(accented_string)\n",
+     "#     batch[\"sentence\"] = re.sub('[â]', 'a', batch[\"sentence\"])\n",
+     "#     batch[\"sentence\"] = re.sub('[î]', 'i', batch[\"sentence\"])\n",
+     "#     batch[\"sentence\"] = re.sub('[ô]', 'o', batch[\"sentence\"])\n",
+     "#     batch[\"sentence\"] = re.sub('[û]', 'u', batch[\"sentence\"])\n",
+     "#     return batch\n",
+     "\n",
+     "import unicodedata\n",
+     "\n",
+     "def strip_accents(batch):\n",
+     "    batch[\"sentence\"] = ''.join(c for c in unicodedata.normalize('NFD', batch[\"sentence\"]) if unicodedata.category(c) != 'Mn')\n",
+     "    return batch\n",
+     "\n",
+     "common_voice_train = common_voice_train.map(strip_accents)\n",
+     "common_voice_test = common_voice_test.map(strip_accents)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 6,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def extract_all_chars(batch):\n",
+     "    all_text = \" \".join(batch[\"sentence\"])\n",
+     "    vocab = list(set(all_text))\n",
+     "    return {\"vocab\": [vocab], \"all_text\": [all_text]}"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 16,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "c40f4d6b6bb74a56b2c570a3a53d7f4b",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        " 0%| | 0/1 [00:00<?, ?ba/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "f69b6a3c0b54477ea15c56b02464bacd",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        " 0%| | 0/1 [00:00<?, ?ba/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)\n",
+     "vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 17,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "vocab_list = list(set(vocab_train[\"vocab\"][0]) | set(vocab_test[\"vocab\"][0]))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 19,
+    "metadata": {
+     "collapsed": true,
+     "jupyter": {
+      "outputs_hidden": true
+     }
+    },
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "[['ん',\n",
+        " 'ン',\n",
+        " 'ダ',\n",
+        " 'S',\n",
+        " 'う',\n",
+        " 'た',\n",
+        " 'ぽ',\n",
+        " 'P',\n",
+        " ':',\n",
+        " '々',\n",
+        " 'か',\n",
+        " 'ぞ',\n",
+        " 'よ',\n",
+        " 'や',\n",
+        " 'ヨ',\n",
+        " 'ゃ',\n",
+        " 'Q',\n",
+        " 'N',\n",
+        " 'だ',\n",
+        " 'を',\n",
+        " 'L',\n",
+        " 'h',\n",
+        " 'F',\n",
+        " 'E',\n",
+        " 'ピ',\n",
+        " 'ち',\n",
+        " 'ボ',\n",
+        " 'w',\n",
+        " 'リ',\n",
+        " 'ゲ',\n",
+        " 'フ',\n",
+        " 'あ',\n",
+        " 'ウ',\n",
+        " 'め',\n",
+        " 'タ',\n",
+        " 'ぬ',\n",
+        " 'せ',\n",
+        " 'て',\n",
+        " 'b',\n",
+        " '」',\n",
+        " 'す',\n",
+        " 'び',\n",
+        " 'ば',\n",
+        " 'ア',\n",
+        " 'A',\n",
+        " 'r',\n",
+        " 'ャ',\n",
+        " 'イ',\n",
+        " 'へ',\n",
+        " 'ぶ',\n",
+        " 'は',\n",
+        " 'u',\n",
+        " 'と',\n",
+        " '繫',\n",
+        " 'ぎ',\n",
+        " 'バ',\n",
+        " 'ノ',\n",
+        " 'I',\n",
+        " 'ざ',\n",
+        " 'R',\n",
+        " 'チ',\n",
+        " 'A',\n",
+        " '「',\n",
+        " 'G',\n",
+        " 'ェ',\n",
+        " 'く',\n",
+        " 'け',\n",
+        " 'ぇ',\n",
+        " '?',\n",
+        " '〜',\n",
+        " 'つ',\n",
+        " 'わ',\n",
+        " 'こ',\n",
+        " 'ス',\n",
+        " 'ズ',\n",
+        " 'p',\n",
+        " 'y',\n",
+        " 'ぼ',\n",
+        " 'し',\n",
+        " '、',\n",
+        " '!',\n",
+        " 'ゼ',\n",
+        " 's',\n",
+        " 'U',\n",
+        " 'き',\n",
+        " 'ゥ',\n",
+        " '・',\n",
+        " 'が',\n",
+        " 'も',\n",
+        " 'エ',\n",
+        " 'ク',\n",
+        " 'づ',\n",
+        " 'O',\n",
+        " 'グ',\n",
+        " 'ブ',\n",
+        " 'ゅ',\n",
+        " 'ィ',\n",
+        " 'ぁ',\n",
+        " 'd',\n",
+        " 't',\n",
+        " 'j',\n",
+        " 'n',\n",
+        " 'ロ',\n",
+        " 'g',\n",
+        " 'ー',\n",
+        " '/',\n",
+        " 'ナ',\n",
+        " 'ヅ',\n",
+        " 'の',\n",
+        " 'ケ',\n",
+        " 'ほ',\n",
+        " '・',\n",
+        " ')',\n",
+        " 'J',\n",
+        " 'D',\n",
+        " 'ネ',\n",
+        " 'お',\n",
+        " 'パ',\n",
+        " 'ム',\n",
+        " 'む',\n",
+        " 'ラ',\n",
+        " 'ミ',\n",
+        " 'い',\n",
+        " 'ろ',\n",
+        " 'c',\n",
+        " '=',\n",
+        " 'z',\n",
+        " 'ベ',\n",
+        " 'O',\n",
+        " 'h',\n",
+        " 'プ',\n",
+        " 'o',\n",
+        " 'ザ',\n",
+        " '&',\n",
+        " '『',\n",
+        " 'ソ',\n",
+        " '.',\n",
+        " 'ヴ',\n",
+        " 'l',\n",
+        " 'ド',\n",
+        " 'み',\n",
+        " 'v',\n",
+        " 'x',\n",
+        " 'Y',\n",
+        " 'ガ',\n",
+        " 'に',\n",
+        " 'ヌ',\n",
+        " 'ら',\n",
+        " 'ヘ',\n",
+        " 'ょ',\n",
+        " 'カ',\n",
+        " '。',\n",
+        " 'ギ',\n",
+        " 'C',\n",
+        " 'ぜ',\n",
+        " 'モ',\n",
+        " 'キ',\n",
+        " 'i',\n",
+        " 'j',\n",
+        " '.',\n",
+        " \"'\",\n",
+        " 'M',\n",
+        " 'ご',\n",
+        " 'ど',\n",
+        " 'ハ',\n",
+        " 'ね',\n",
+        " 'で',\n",
+        " 'W',\n",
+        " 'ぴ',\n",
+        " 'T',\n",
+        " 'ぷ',\n",
+        " ' ',\n",
+        " 'マ',\n",
+        " '―',\n",
+        " 'ビ',\n",
+        " 'H',\n",
+        " 'デ',\n",
+        " 'f',\n",
+        " 'ゾ',\n",
+        " '-',\n",
+        " 'ポ',\n",
+        " 'K',\n",
+        " 'ヤ',\n",
+        " 'ユ',\n",
+        " 'シ',\n",
+        " 'ペ',\n",
+        " 'Z',\n",
+        " 'ぱ',\n",
+        " 'ふ',\n",
+        " 'る',\n",
+        " 'べ',\n",
+        " 'ヒ',\n",
+        " 'e',\n",
+        " 'そ',\n",
+        " 'テ',\n",
+        " 'サ',\n",
+        " 'V',\n",
+        " 'れ',\n",
+        " '」',\n",
+        " 'じ',\n",
+        " 'ワ',\n",
+        " 'レ',\n",
+        " 'X',\n",
+        " 'ォ',\n",
+        " 'ュ',\n",
+        " 'ジ',\n",
+        " 'k',\n",
+        " 'な',\n",
+        " 'ニ',\n",
+        " 'り',\n",
+        " 'q',\n",
+        " 'U',\n",
+        " 'ひ',\n",
+        " 'げ',\n",
+        " '&',\n",
+        " 'ゆ',\n",
+        " 'っ',\n",
+        " 'ず',\n",
+        " 'ゴ',\n",
+        " '「',\n",
+        " 'a',\n",
+        " 'ぢ',\n",
+        " 'ル',\n",
+        " 'さ',\n",
+        " 'ぺ',\n",
+        " 'm',\n",
+        " 'ョ',\n",
+        " 'ト',\n",
+        " 'ツ',\n",
+        " 'ホ',\n",
+        " 'コ',\n",
+        " 'オ',\n",
+        " 'セ',\n",
+        " 'え',\n",
+        " 'ま',\n",
+        " 'メ',\n",
+        " 'ァ',\n",
+        " 'F',\n",
+        " 'ぐ',\n",
+        " 'B',\n",
+        " '』',\n",
+        " 'ッ']]"
+       ]
+      },
+      "execution_count": 19,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "# vocab_train[\"vocab\"]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 18,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "249\n",
+       "['ダ', 'た', 'P', 'か', 'よ', 'や', 'Q', 'を', 'F', 'h', 'E', 'ち', 'リ', 'ゲ', 'フ', 'め', 'タ', 'せ', 'b', '」', 'ば', 'ア', 'A', 'ャ', 'イ', 'ぶ', 'は', 'u', 'と', 'ノ', 'I', 'R', '「', 'G', 'ェ', 'く', '?', '〜', 'つ', 'こ', 'S', 'ぼ', 'ゼ', 's', 'U', 'き', 'ゥ', 'が', 'も', 'エ', 'ク', 'づ', 'グ', 'ブ', 'ゅ', 'ィ', 't', 'n', 'ロ', 'ー', '/', 'の', 'ケ', '・', 'J', 'お', 'む', 'P', 'ベ', 'h', 'プ', 'o', '&', '『', 'ソ', '.', 'ヴ', 'ド', 'み', 'Y', 'ガ', 'ょ', 'カ', 'C', 'ぜ', 'j', '.', 'ご', 'ど', 'ハ', 'ね', 'W', 'j', 'T', ' ', 'マ', '―', '-', 'デ', 'ゾ', 'ポ', 'K', 'ペ', 'ぱ', 'ふ', 'べ', 'ヒ', 'e', 'サ', 'N', 'X', 'ュ', 'k', 'り', 'U', 'ひ', 'げ', 'ゆ', 'ず', 'ゴ', 'a', 'ョ', 'ツ', '〇', 'え', 'F', 'B', '』', 'ッ', 'ん', 'ン', 'S', 'う', 'ぽ', ':', '々', 'ぞ', 'N', 'ヨ', 'ゃ', 'だ', 'L', 'ピ', 'ボ', 'w', 'ウ', 'あ', 'ヶ', 'ぬ', 'て', 'す', 'び', 'r', 'へ', '繫', 'バ', 'ぎ', 'ざ', 'A', 'チ', 'け', 'ぇ', 'わ', 'ス', 'p', 'ズ', 'y', 'し', '、', '!', 'G', '・', 'O', 'ぁ', 'd', 'g', 'ナ', 'ヅ', 'ほ', ')', 'D', 'ネ', 'パ', 'ム', 'ミ', '=', 'z', 'い', 'ろ', 'c', 'O', 'ザ', 'l', 'v', 'x', 'ヌ', 'に', 'ら', 'ヘ', '。', 'ギ', 'モ', 'D', 'キ', 'i', \"'\", 'M', 'で', 'ぴ', 'ぷ', 'ビ', 'H', 'f', 'ヤ', 'ユ', 'シ', 'Z', 'る', 'そ', 'テ', 'V', 'れ', '」', 'じ', 'ワ', 'レ', 'ォ', 'ジ', 'な', 'ニ', 'q', '&', 'っ', '「', 'ぢ', 'ル', 'さ', 'ぺ', 'm', 'ト', 'ホ', 'コ', 'オ', 'セ', 'ま', 'メ', 'ァ', 'ぐ', 'ラ']\n"
+      ]
+     }
+    ],
+    "source": [
+     "print(len(vocab_list))\n",
+     "print(vocab_list)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 26,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "j_vocab = {\"<pad>\": 0, \"<s>\": 1, \"</s>\": 2, \"<unk>\": 3, \"|\": 4, \"'\": 5, \"-\": 6, \"A\": 7, \"B\": 8, \"C\": 9, \"D\": 10, \"E\": 11, \"F\": 12, \"G\": 13, \"H\": 14, \"I\": 15, \"J\": 16, \"K\": 17, \"L\": 18, \"M\": 19, \"N\": 20, \"O\": 21, \"P\": 22, \"Q\": 23, \"R\": 24, \"S\": 25, \"T\": 26, \"U\": 27, \"V\": 28, \"W\": 29, \"X\": 30, \"Y\": 31, \"Z\": 32, \"Ä\": 33, \"Í\": 34, \"Ó\": 35, \"Ö\": 36, \"Ü\": 37}\n
833
+ ]
834
+ },
835
+ {
836
+ "cell_type": "code",
837
+ "execution_count": 48,
838
+ "metadata": {},
839
+ "outputs": [],
840
+ "source": [
841
+ "manually_kept_values = ['ß', 'ä', 'ö', 'ü']\n",
842
+ "\n",
843
+ "punctuation = ['.', ]"
844
+ ]
845
+ },
846
+ {
847
+ "cell_type": "code",
848
+ "execution_count": 50,
849
+ "metadata": {},
850
+ "outputs": [
851
+ {
852
+ "name": "stdout",
853
+ "output_type": "stream",
854
+ "text": [
855
+ "['$', '&', '(', ')', '*', '+', '.', '/', '=', '@', '[', ']', '_', '`', '¡', '§', '«', '°', '´', 'µ', '·', '»', '×', 'à', 'á', 'â', 'ã', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ø', 'ù', 'ú', 'û', 'ý', 'þ', 'ā', 'ă', 'ą', 'ć', 'č', 'ď', 'đ', 'ē', 'ė', 'ę', 'ě', 'ğ', 'ġ', 'ħ', 'ī', 'ı', 'ł', 'ń', 'ņ', 'ň', 'ō', 'ŏ', 'ő', 'œ', 'ř', 'ś', 'ş', 'š', 'ť', 'ū', 'ů', 'ź', 'ż', 'ž', 'ơ', 'ǐ', 'ǔ', 'ș', 'ț', 'ə', 'ʻ', 'ʾ', 'ʿ', '̆', '̇', '̥', 'а', 'в', 'е', 'и', 'к', 'м', 'о', 'р', 'с', 'ф', 'ч', 'ш', 'ѹ', 'א', 'ב', 'נ', 'ע', 'ש', '་', 'ན', 'ḫ', 'ṟ', 'ṣ', 'ṭ', 'ạ', 'ả', 'ắ', 'ằ', 'ế', 'ễ', 'ệ', 'ọ', 'ồ', 'ộ', 'ụ', 'ứ', '‑', '‚', '„', '‟', '′', '″', '‹', '›', '→', '−', '≡', '⟨', '⟩', 'カ', '东', '临', '乡', '关', '合', '城', '孙', '尣', '幺', '支', '比', '毛', '泽', '無', '生', '臣', '辶', '道', '镇', '黃']\n"
856
+ ]
857
+ }
858
+ ],
859
+ "source": [
860
+ "odd_values = []\n",
861
+ "for index, value in enumerate(sorted(vocab_list)):\n",
863
+ " if value not in j_vocab and not (16 <= index <= 41 or value == ' ') and value not in manually_kept_values:\n",
864
+ " odd_values.append(value)\n",
865
+ "# print(index, value)\n",
866
+ " \n",
867
+ "print(odd_values)"
868
+ ]
869
+ },
870
+ {
871
+ "cell_type": "code",
872
+ "execution_count": 63,
873
+ "metadata": {},
874
+ "outputs": [
875
+ {
876
+ "name": "stdout",
877
+ "output_type": "stream",
878
+ "text": [
879
+ "$ & ( ) * + . / = @ [ ] _ ` ¡ § « ° ´ µ · » × à á â ã å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ø ù ú û ý þ ā ă ą ć č ď đ ē ė ę ě ğ ġ ħ ī ı ł ń ņ ň ō ŏ ő œ ř ś ş š ť ū ů ź ż ž ơ ǐ ǔ ș ț ə ʻ ʾ ʿ ̆ ̇ ̥ а в е и к м о р с ф ч ш ѹ א ב נ ע ש ་ ན ḫ ṟ ṣ ṭ ạ ả ắ ằ ế ễ ệ ọ ồ ộ ụ ứ ‑ ‚ „ ‟ ′ ″ ‹ › → − ≡ ⟨ ⟩ カ 东 临 乡 关 合 城 孙 尣 幺 支 比 毛 泽 無 生 臣 辶 道 镇 黃\n"
880
+ ]
881
+ }
882
+ ],
883
+ "source": [
884
+ "print(\" \".join(odd_values))\n",
885
+ "\n",
886
+ "# for value in odd_values:\n",
887
+ "# if value not in manually_kept_values:\n",
888
+ "# print(value)"
889
+ ]
890
+ },
891
+ {
892
+ "cell_type": "code",
893
+ "execution_count": null,
894
+ "metadata": {},
895
+ "outputs": [],
896
+ "source": [
897
+ "$ & ( ) * + = @ [ ] _ ` ¡ § « ° ´ µ · » × à á â ã å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ø ù ú û ý þ ā ă ą ć č ď đ ē ė ę ě ğ ġ ħ ī ı ł ń ņ ň ō ŏ ő œ ř ś ş š ť ū ů ź ż ž ơ ǐ ǔ ș ț ə ʻ ʾ ʿ ̆ ̇ ̥ а в е и к м о р с ф ч ш ѹ א ב נ ע ש ་ ན ḫ ṟ ṣ ṭ ạ ả ắ ằ ế ễ ệ ọ ồ ộ ụ ứ ‑ ‚ „ ‟ ′ ″ ‹ › → − ≡ ⟨ ⟩ カ 东 临 乡 关 合 城 孙 尣 幺 支 比 毛 泽 無 生 臣 辶 道 镇 黃"
898
+ ]
899
+ },
900
+ {
901
+ "cell_type": "code",
902
+ "execution_count": 54,
903
+ "metadata": {},
904
+ "outputs": [],
905
+ "source": [
906
+ "filtered_vocab_list = [value for value in vocab_list if value not in odd_values]"
907
+ ]
908
+ },
909
+ {
910
+ "cell_type": "code",
911
+ "execution_count": 55,
912
+ "metadata": {},
913
+ "outputs": [
914
+ {
915
+ "data": {
916
+ "text/plain": [
917
+ "['ß',\n",
918
+ " 'j',\n",
919
+ " 'r',\n",
920
+ " 'h',\n",
921
+ " 'd',\n",
922
+ " 'l',\n",
923
+ " 'z',\n",
924
+ " 'n',\n",
925
+ " 'm',\n",
926
+ " 'c',\n",
927
+ " 'ä',\n",
928
+ " \"'\",\n",
929
+ " 'g',\n",
930
+ " 'e',\n",
931
+ " 'w',\n",
932
+ " 's',\n",
933
+ " 'u',\n",
934
+ " 'k',\n",
935
+ " 'o',\n",
936
+ " 'f',\n",
937
+ " ' ',\n",
938
+ " 'y',\n",
939
+ " 'v',\n",
940
+ " 'ö',\n",
941
+ " 'ü',\n",
942
+ " 'p',\n",
943
+ " 'a',\n",
944
+ " 'x',\n",
945
+ " 'b',\n",
946
+ " 'q',\n",
947
+ " 't',\n",
948
+ " 'i']"
949
+ ]
950
+ },
951
+ "execution_count": 55,
952
+ "metadata": {},
953
+ "output_type": "execute_result"
954
+ }
955
+ ],
956
+ "source": [
957
+ "filtered_vocab_list"
958
+ ]
959
+ },
960
+ {
961
+ "cell_type": "code",
962
+ "execution_count": 21,
963
+ "metadata": {},
964
+ "outputs": [
965
+ {
966
+ "ename": "NameError",
967
+ "evalue": "name 'word_delimiter_token' is not defined",
968
+ "output_type": "error",
969
+ "traceback": [
970
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
971
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
972
+ "Input \u001b[0;32mIn [21]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m vocab_dict \u001b[38;5;241m=\u001b[39m {v: k \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28msorted\u001b[39m(vocab_list))}\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# replace white space with delimiter token\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mword_delimiter_token\u001b[49m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 5\u001b[0m vocab_dict[word_delimiter_token] \u001b[38;5;241m=\u001b[39m vocab_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m vocab_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
973
+ "\u001b[0;31mNameError\u001b[0m: name 'word_delimiter_token' is not defined"
974
+ ]
975
+ }
976
+ ],
977
+ "source": [
978
+ "vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n",
979
+ "\n",
980
+ "# replace white space with delimiter token\n",
981
+ "if word_delimiter_token is not None:\n",
982
+ " vocab_dict[word_delimiter_token] = vocab_dict[\" \"]\n",
983
+ " del vocab_dict[\" \"]\n",
984
+ "\n",
985
+ "# add unk and pad token\n",
986
+ "if unk_token is not None:\n",
987
+ " vocab_dict[unk_token] = len(vocab_dict)\n",
988
+ "\n",
989
+ "if pad_token is not None:\n",
990
+ " vocab_dict[pad_token] = len(vocab_dict)"
991
+ ]
992
+ },
993
+ {
994
+ "cell_type": "code",
995
+ "execution_count": 58,
996
+ "metadata": {},
997
+ "outputs": [
998
+ {
999
+ "data": {
1000
+ "application/vnd.jupyter.widget-view+json": {
1001
+ "model_id": "59e89471ea85449ebbc709d0a9d7325c",
1002
+ "version_major": 2,
1003
+ "version_minor": 0
1004
+ },
1005
+ "text/plain": [
1006
+ " 0%| | 0/437 [00:00<?, ?ba/s]"
1007
+ ]
1008
+ },
1009
+ "metadata": {},
1010
+ "output_type": "display_data"
1011
+ },
1012
+ {
1013
+ "name": "stdout",
1014
+ "output_type": "stream",
1015
+ "text": [
1016
+ "OOV found in 421223 samples, and they were removed from training set\n",
1017
+ "The final training set size is 14947\n"
1018
+ ]
1019
+ }
1020
+ ],
1021
+ "source": [
1022
+ "vocab_set = set(filtered_vocab_list)\n",
1023
+ "train_dataset_size = len(common_voice_train)\n",
1024
+ "common_voice_train_2 = common_voice_train.filter(\n",
1025
+ " lambda example: vocab_set.issuperset(example[\"sentence\"].replace(\" \", \"\"))\n",
1026
+ ")\n",
1027
+ "print(f\"OOV found in {train_dataset_size - len(common_voice_train_2)} samples, and they were removed from training set\")\n",
1028
+ "print(f\"The final training set size is {len(common_voice_train_2)}\")"
1029
+ ]
1030
+ },
1031
+ {
1032
+ "cell_type": "code",
1033
+ "execution_count": 38,
1034
+ "metadata": {
1035
+ "collapsed": true,
1036
+ "jupyter": {
1037
+ "outputs_hidden": true
1038
+ }
1039
+ },
1040
+ "outputs": [
1041
+ {
1042
+ "ename": "KeyboardInterrupt",
1043
+ "evalue": "",
1044
+ "output_type": "error",
1045
+ "traceback": [
1046
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1047
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
1048
+ "Input \u001b[0;32mIn [38]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m odd_example_texts \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m common_voice_train:\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m odd_values:\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msentence\u001b[39m\u001b[38;5;124m\"\u001b[39m]: \n",
1049
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1664\u001b[0m, in \u001b[0;36mDataset._iter\u001b[0;34m(self, decoded)\u001b[0m\n\u001b[1;32m 1658\u001b[0m \u001b[38;5;124;03m\"\"\"Iterate through the examples.\u001b[39;00m\n\u001b[1;32m 1659\u001b[0m \n\u001b[1;32m 1660\u001b[0m \u001b[38;5;124;03mIf a formatting is set with :meth:`Dataset.set_format` rows will be returned with the\u001b[39;00m\n\u001b[1;32m 1661\u001b[0m \u001b[38;5;124;03mselected format.\u001b[39;00m\n\u001b[1;32m 1662\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_rows):\n\u001b[0;32m-> 1664\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecoded\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecoded\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
1050
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1915\u001b[0m, in \u001b[0;36mDataset._getitem\u001b[0;34m(self, key, decoded, **kwargs)\u001b[0m\n\u001b[1;32m 1913\u001b[0m formatter \u001b[38;5;241m=\u001b[39m get_formatter(format_type, features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures, decoded\u001b[38;5;241m=\u001b[39mdecoded, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mformat_kwargs)\n\u001b[1;32m 1914\u001b[0m pa_subtable \u001b[38;5;241m=\u001b[39m query_table(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data, key, indices\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1915\u001b[0m formatted_output \u001b[38;5;241m=\u001b[39m \u001b[43mformat_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1916\u001b[0m \u001b[43m \u001b[49m\u001b[43mpa_subtable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformatter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformatter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformat_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformat_columns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_all_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_all_columns\u001b[49m\n\u001b[1;32m 1917\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m formatted_output\n",
1051
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:533\u001b[0m, in \u001b[0;36mformat_table\u001b[0;34m(table, key, formatter, format_columns, output_all_columns)\u001b[0m\n\u001b[1;32m 531\u001b[0m python_formatter \u001b[38;5;241m=\u001b[39m PythonFormatter(features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 532\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m format_columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 533\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mformatter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquery_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m format_columns:\n",
1052
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:282\u001b[0m, in \u001b[0;36mFormatter.__call__\u001b[0;34m(self, pa_table, query_type)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, pa_table: pa\u001b[38;5;241m.\u001b[39mTable, query_type: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[RowFormat, ColumnFormat, BatchFormat]:\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrow\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 282\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mformat_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mformat_column(pa_table)\n",
1053
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:313\u001b[0m, in \u001b[0;36mPythonFormatter.format_row\u001b[0;34m(self, pa_table)\u001b[0m\n\u001b[1;32m 311\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpython_arrow_extractor()\u001b[38;5;241m.\u001b[39mextract_row(pa_table)\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoded:\n\u001b[0;32m--> 313\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpython_features_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 314\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m row\n",
1054
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:222\u001b[0m, in \u001b[0;36mPythonFeaturesDecoder.decode_row\u001b[0;34m(self, row)\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_row\u001b[39m(\u001b[38;5;28mself\u001b[39m, row: \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mdict\u001b[39m:\n\u001b[0;32m--> 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures \u001b[38;5;28;01melse\u001b[39;00m row\n",
1055
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1318\u001b[0m, in \u001b[0;36mFeatures.decode_example\u001b[0;34m(self, example)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 1319\u001b[0m column_name: decode_nested_example(feature, value)\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n",
1056
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1319\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[0;32m-> 1319\u001b[0m column_name: \u001b[43mdecode_nested_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n",
1057
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1056\u001b[0m, in \u001b[0;36mdecode_nested_example\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;66;03m# Object with special decoding:\u001b[39;00m\n\u001b[1;32m 1055\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, (Audio, Image)):\n\u001b[0;32m-> 1056\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mschema\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1057\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n",
1058
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:97\u001b[0m, in \u001b[0;36mAudio.decode_example\u001b[0;34m(self, value)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn audio sample should have one of \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpath\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbytes\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m but both are None in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m path\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmp3\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m---> 97\u001b[0m array, sampling_rate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decode_mp3\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file:\n",
1059
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:183\u001b[0m, in \u001b[0;36mAudio._decode_mp3\u001b[0;34m(self, path_or_file)\u001b[0m\n\u001b[1;32m 181\u001b[0m array \u001b[38;5;241m=\u001b[39m array\u001b[38;5;241m.\u001b[39mnumpy()\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmono:\n\u001b[0;32m--> 183\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43marray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmean\u001b[49m\u001b[43m(\u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array, sampling_rate\n",
1060
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/numpy/core/_methods.py:154\u001b[0m, in \u001b[0;36m_mean\u001b[0;34m(a, axis, dtype, out, keepdims)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# Cast bool, unsigned int, and int to float64 by default\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, (\u001b[43mnt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minteger\u001b[49m, nt\u001b[38;5;241m.\u001b[39mbool_)):\n\u001b[1;32m 155\u001b[0m dtype \u001b[38;5;241m=\u001b[39m mu\u001b[38;5;241m.\u001b[39mdtype(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mf8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, nt\u001b[38;5;241m.\u001b[39mfloat16):\n",
1061
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
1062
+ ]
1063
+ }
1064
+ ],
1065
+ "source": []
1066
+ },
1067
+ {
1068
+ "cell_type": "code",
1069
+ "execution_count": null,
1070
+ "metadata": {},
1071
+ "outputs": [],
1072
+ "source": []
1073
+ },
1074
+ {
1075
+ "cell_type": "code",
1076
+ "execution_count": null,
1077
+ "metadata": {},
1078
+ "outputs": [],
1079
+ "source": []
1080
+ },
1081
+ {
1082
+ "cell_type": "code",
1083
+ "execution_count": 43,
1084
+ "metadata": {},
1085
+ "outputs": [
1086
+ {
1087
+ "data": {
1088
+ "text/plain": [
1089
+ "0"
1090
+ ]
1091
+ },
1092
+ "execution_count": 43,
1093
+ "metadata": {},
1094
+ "output_type": "execute_result"
1095
+ }
1096
+ ],
1097
+ "source": []
1098
+ },
1099
+ {
1100
+ "cell_type": "code",
1101
+ "execution_count": 22,
1102
+ "metadata": {},
1103
+ "outputs": [
1104
+ {
1105
+ "name": "stdout",
1106
+ "output_type": "stream",
1107
+ "text": [
1108
+ "0 \n",
1109
+ "1 &\n",
1110
+ "2 '\n",
1111
+ "3 .\n",
1112
+ "4 /\n",
1113
+ "5 A\n",
1114
+ "6 B\n",
1115
+ "7 C\n",
1116
+ "8 D\n",
1117
+ "9 E\n",
1118
+ "10 F\n",
1119
+ "11 G\n",
1120
+ "12 H\n",
1121
+ "13 I\n",
1122
+ "14 J\n",
1123
+ "15 K\n",
1124
+ "16 L\n",
1125
+ "17 M\n",
1126
+ "18 N\n",
1127
+ "19 O\n",
1128
+ "20 P\n",
1129
+ "21 Q\n",
1130
+ "22 R\n",
1131
+ "23 S\n",
1132
+ "24 T\n",
1133
+ "25 U\n",
1134
+ "26 V\n",
1135
+ "27 W\n",
1136
+ "28 X\n",
1137
+ "29 Y\n",
1138
+ "30 Z\n",
1139
+ "31 a\n",
1140
+ "32 b\n",
1141
+ "33 c\n",
1142
+ "34 d\n",
1143
+ "35 e\n",
1144
+ "36 f\n",
1145
+ "37 g\n",
1146
+ "38 h\n",
1147
+ "39 i\n",
1148
+ "40 j\n",
1149
+ "41 k\n",
1150
+ "42 l\n",
1151
+ "43 m\n",
1152
+ "44 n\n",
1153
+ "45 o\n",
1154
+ "46 p\n",
1155
+ "47 q\n",
1156
+ "48 r\n",
1157
+ "49 s\n",
1158
+ "50 t\n",
1159
+ "51 u\n",
1160
+ "52 v\n",
1161
+ "53 w\n",
1162
+ "54 x\n",
1163
+ "55 y\n",
1164
+ "56 z\n",
1165
+ "57 ―\n",
1166
+ "58 、\n",
1167
+ "59 。\n",
1168
+ "60 々\n",
1169
+ "61 〇\n",
1170
+ "62 「\n",
1171
+ "63 」\n",
1172
+ "64 『\n",
1173
+ "65 』\n",
1174
+ "66 〜\n",
1175
+ "67 ぁ\n",
1176
+ "68 あ\n",
1177
+ "69 い\n",
1178
+ "70 う\n",
1179
+ "71 ぇ\n",
1180
+ "72 え\n",
1181
+ "73 お\n",
1182
+ "74 か\n",
1183
+ "75 が\n",
1184
+ "76 き\n",
1185
+ "77 ぎ\n",
1186
+ "78 く\n",
1187
+ "79 ぐ\n",
1188
+ "80 け\n",
1189
+ "81 げ\n",
1190
+ "82 こ\n",
1191
+ "83 ご\n",
1192
+ "84 さ\n",
1193
+ "85 ざ\n",
1194
+ "86 し\n",
1195
+ "87 じ\n",
1196
+ "88 す\n",
1197
+ "89 ず\n",
1198
+ "90 せ\n",
1199
+ "91 ぜ\n",
1200
+ "92 そ\n",
1201
+ "93 ぞ\n",
1202
+ "94 た\n",
1203
+ "95 だ\n",
1204
+ "96 ち\n",
1205
+ "97 ぢ\n",
1206
+ "98 っ\n",
1207
+ "99 つ\n",
1208
+ "100 づ\n",
1209
+ "101 て\n",
1210
+ "102 で\n",
1211
+ "103 と\n",
1212
+ "104 ど\n",
1213
+ "105 な\n",
1214
+ "106 に\n",
1215
+ "107 ぬ\n",
1216
+ "108 ね\n",
1217
+ "109 の\n",
1218
+ "110 は\n",
1219
+ "111 ば\n",
1220
+ "112 ぱ\n",
1221
+ "113 ひ\n",
1222
+ "114 び\n",
1223
+ "115 ぴ\n",
1224
+ "116 ふ\n",
1225
+ "117 ぶ\n",
1226
+ "118 ぷ\n",
1227
+ "119 へ\n",
1228
+ "120 べ\n",
1229
+ "121 ぺ\n",
1230
+ "122 ほ\n",
1231
+ "123 ぼ\n",
1232
+ "124 ぽ\n",
1233
+ "125 ま\n",
1234
+ "126 み\n",
1235
+ "127 む\n",
1236
+ "128 め\n",
1237
+ "129 も\n",
1238
+ "130 ゃ\n",
1239
+ "131 や\n",
1240
+ "132 ゅ\n",
1241
+ "133 ゆ\n",
1242
+ "134 ょ\n",
1243
+ "135 よ\n",
1244
+ "136 ら\n",
1245
+ "137 り\n",
1246
+ "138 る\n",
1247
+ "139 れ\n",
1248
+ "140 ろ\n",
1249
+ "141 わ\n",
1250
+ "142 を\n",
1251
+ "143 ん\n",
1252
+ "144 ァ\n",
1253
+ "145 ア\n",
1254
+ "146 ィ\n",
1255
+ "147 イ\n",
1256
+ "148 ゥ\n",
1257
+ "149 ウ\n",
1258
+ "150 ェ\n",
1259
+ "151 エ\n",
1260
+ "152 ォ\n",
1261
+ "153 オ\n",
1262
+ "154 カ\n",
1263
+ "155 ガ\n",
1264
+ "156 キ\n",
1265
+ "157 ギ\n",
1266
+ "158 ク\n",
1267
+ "159 グ\n",
1268
+ "160 ケ\n",
1269
+ "161 ゲ\n",
1270
+ "162 コ\n",
1271
+ "163 ゴ\n",
1272
+ "164 サ\n",
1273
+ "165 ザ\n",
1274
+ "166 シ\n",
1275
+ "167 ジ\n",
1276
+ "168 ス\n",
1277
+ "169 ズ\n",
1278
+ "170 セ\n",
1279
+ "171 ゼ\n",
1280
+ "172 ソ\n",
1281
+ "173 ゾ\n",
1282
+ "174 タ\n",
1283
+ "175 ダ\n",
1284
+ "176 チ\n",
1285
+ "177 ッ\n",
1286
+ "178 ツ\n",
1287
+ "179 ヅ\n",
1288
+ "180 テ\n",
1289
+ "181 デ\n",
1290
+ "182 ト\n",
1291
+ "183 ド\n",
1292
+ "184 ナ\n",
1293
+ "185 ニ\n",
1294
+ "186 ヌ\n",
1295
+ "187 ネ\n",
1296
+ "188 ノ\n",
1297
+ "189 ハ\n",
1298
+ "190 バ\n",
1299
+ "191 パ\n",
1300
+ "192 ヒ\n",
1301
+ "193 ビ\n",
1302
+ "194 ピ\n",
1303
+ "195 フ\n",
1304
+ "196 ブ\n",
1305
+ "197 プ\n",
1306
+ "198 ヘ\n",
1307
+ "199 ベ\n",
1308
+ "200 ペ\n",
1309
+ "201 ホ\n",
1310
+ "202 ボ\n",
1311
+ "203 ポ\n",
1312
+ "204 マ\n",
1313
+ "205 ミ\n",
1314
+ "206 ム\n",
1315
+ "207 メ\n",
1316
+ "208 モ\n",
1317
+ "209 ャ\n",
1318
+ "210 ヤ\n",
1319
+ "211 ュ\n",
1320
+ "212 ユ\n",
1321
+ "213 ョ\n",
1322
+ "214 ヨ\n",
1323
+ "215 ラ\n",
1324
+ "216 リ\n",
1325
+ "217 ル\n",
1326
+ "218 レ\n",
1327
+ "219 ロ\n",
1328
+ "220 ワ\n",
1329
+ "221 ン\n",
1330
+ "222 ヴ\n",
1331
+ "223 ヶ\n",
1332
+ "224 ・\n",
1333
+ "225 ー\n",
1334
+ "226 繫\n",
1335
+ "227 !\n",
1336
+ "228 &\n",
1337
+ "229 )\n",
1338
+ "230 -\n",
1339
+ "231 .\n",
1340
+ "232 :\n",
1341
+ "233 =\n",
1342
+ "234 ?\n",
1343
+ "235 A\n",
1344
+ "236 D\n",
1345
+ "237 F\n",
1346
+ "238 G\n",
1347
+ "239 N\n",
1348
+ "240 O\n",
1349
+ "241 P\n",
1350
+ "242 S\n",
1351
+ "243 U\n",
1352
+ "244 h\n",
1353
+ "245 j\n",
1354
+ "246 「\n",
1355
+ "247 」\n",
1356
+ "248 ・\n"
1357
+ ]
1358
+ }
1359
+ ],
1360
+ "source": [
1361
+ "vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n",
1362
+ "for key, value in enumerate(vocab_dict):\n",
1363
+ " print(key, value)"
1364
+ ]
1365
+ },
1366
+ {
1367
+ "cell_type": "code",
1368
+ "execution_count": null,
1369
+ "metadata": {},
1370
+ "outputs": [],
1371
+ "source": [
1372
+ "def create_vocabulary_from_data(\n",
1373
+ " datasets: DatasetDict,\n",
1374
+ " word_delimiter_token: Optional[str] = None,\n",
1375
+ " unk_token: Optional[str] = None,\n",
1376
+ " pad_token: Optional[str] = None,\n",
1377
+ "):\n",
1378
+ " # Given training and test labels create vocabulary\n",
1379
+ " def extract_all_chars(batch):\n",
1380
+ " all_text = \" \".join(batch[\"target_text\"])\n",
1381
+ " vocab = list(set(all_text))\n",
1382
+ " return {\"vocab\": [vocab], \"all_text\": [all_text]}\n",
1383
+ "\n",
1384
+ " vocabs = datasets.map(\n",
1385
+ " extract_all_chars,\n",
1386
+ " batched=True,\n",
1387
+ " batch_size=-1,\n",
1388
+ " keep_in_memory=True,\n",
1389
+ " remove_columns=datasets[\"train\"].column_names,\n",
1390
+ " )\n",
1391
+ "\n",
1392
+ " # take union of all unique characters in each dataset\n",
1393
+ " vocab_set = functools.reduce(\n",
1394
+ " lambda vocab_1, vocab_2: set(vocab_1[\"vocab\"][0]) | set(vocab_2[\"vocab\"][0]), vocabs.values()\n",
1395
+ " )\n",
1396
+ "\n",
1397
+ " vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}\n",
1398
+ "\n",
1399
+ " # replace white space with delimiter token\n",
1400
+ " if word_delimiter_token is not None:\n",
1401
+ " vocab_dict[word_delimiter_token] = vocab_dict[\" \"]\n",
1402
+ " del vocab_dict[\" \"]\n",
1403
+ "\n",
1404
+ " # add unk and pad token\n",
1405
+ " if unk_token is not None:\n",
1406
+ " vocab_dict[unk_token] = len(vocab_dict)\n",
1407
+ "\n",
1408
+ " if pad_token is not None:\n",
1409
+ " vocab_dict[pad_token] = len(vocab_dict)\n",
1410
+ "\n",
1411
+ " return vocab_dict"
1412
+ ]
1413
+ },
1414
+ {
1415
+ "cell_type": "code",
1416
+ "execution_count": null,
1417
+ "metadata": {},
1418
+ "outputs": [],
1419
+ "source": []
1420
+ },
1421
+ {
1422
+ "cell_type": "code",
1423
+ "execution_count": null,
1424
+ "metadata": {},
1425
+ "outputs": [],
1426
+ "source": []
1427
+ },
1428
+ {
1429
+ "cell_type": "code",
1430
+ "execution_count": null,
1431
+ "metadata": {},
1432
+ "outputs": [],
1433
+ "source": []
1434
+ },
1435
+ {
1436
+ "cell_type": "code",
1437
+ "execution_count": null,
1438
+ "metadata": {},
1439
+ "outputs": [],
1440
+ "source": [
1441
+ "# load processor\n",
1442
+ "feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)\n",
1443
+ "# feature_extractor = processor_with_lm.feature_extractor\n",
1444
+ "sampling_rate = feature_extractor.sampling_rate\n",
1445
+ "\n",
1446
+ "# resample audio\n",
1447
+ "dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=sampling_rate))\n",
1448
+ "\n",
1449
+ "# load eval pipeline\n",
1450
+ "asr = pipeline(\"automatic-speech-recognition\", model=repo_name, feature_extractor=feature_extractor)\n",
1451
+ "\n",
1452
+ "# map function to decode audio\n",
1453
+ "def map_to_pred(batch):\n",
1454
+ " prediction = asr(\n",
1455
+ " batch[\"audio\"][\"array\"])\n",
1456
+ "\n",
1457
+ " batch[\"prediction\"] = prediction[\"text\"]\n",
1458
+ " batch[\"target\"] = batch[\"sentence\"]\n",
1459
+ " return batch\n",
1460
+ "\n",
1461
+ "# run inference on all examples\n",
1462
+ "result = dataset.map(map_to_pred, remove_columns=dataset.column_names)\n",
1463
+ "print(result[\"prediction\"])\n",
1464
+ "\n",
1465
+ "result[0]['target']"
1466
+ ]
1467
+ }
1468
+ ],
1469
+ "metadata": {
1470
+ "kernelspec": {
1471
+ "display_name": "Python 3 (ipykernel)",
1472
+ "language": "python",
1473
+ "name": "python3"
1474
+ },
1475
+ "language_info": {
1476
+ "codemirror_mode": {
1477
+ "name": "ipython",
1478
+ "version": 3
1479
+ },
1480
+ "file_extension": ".py",
1481
+ "mimetype": "text/x-python",
1482
+ "name": "python",
1483
+ "nbconvert_exporter": "python",
1484
+ "pygments_lexer": "ipython3",
1485
+ "version": "3.8.8"
1486
+ }
1487
+ },
1488
+ "nbformat": 4,
1489
+ "nbformat_minor": 4
1490
+ }
.ipynb_checkpoints/vocab-checkpoint.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"&": 1, "'": 2, ".": 3, "/": 4, "A": 5, "B": 6, "C": 7, "D": 8, "E": 9, "F": 10, "G": 11, "H": 12, "I": 13, "J": 14, "K": 15, "L": 16, "M": 17, "N": 18, "O": 19, "P": 20, "Q": 21, "R": 22, "S": 23, "T": 24, "U": 25, "V": 26, "W": 27, "X": 28, "Y": 29, "Z": 30, "a": 31, "b": 32, "c": 33, "d": 34, "e": 35, "f": 36, "g": 37, "h": 38, "i": 39, "j": 40, "k": 41, "l": 42, "m": 43, "n": 44, "o": 45, "p": 46, "q": 47, "r": 48, "s": 49, "t": 50, "u": 51, "v": 52, "w": 53, "x": 54, "y": 55, "z": 56, "\u2015": 57, "\u3001": 58, "\u3002": 59, "\u3005": 60, "\u3007": 61, "\u300c": 62, "\u300d": 63, "\u300e": 64, "\u300f": 65, "\u301c": 66, "\u3041": 67, "\u3042": 68, "\u3044": 69, "\u3046": 70, "\u3047": 71, "\u3048": 72, "\u304a": 73, "\u304b": 74, "\u304c": 75, "\u304d": 76, "\u304e": 77, "\u304f": 78, "\u3050": 79, "\u3051": 80, "\u3052": 81, "\u3053": 82, "\u3054": 83, "\u3055": 84, "\u3056": 85, "\u3057": 86, "\u3058": 87, "\u3059": 88, "\u305a": 89, "\u305b": 90, "\u305c": 91, "\u305d": 92, "\u305e": 93, "\u305f": 94, "\u3060": 95, "\u3061": 96, "\u3062": 97, "\u3063": 98, "\u3064": 99, "\u3065": 100, "\u3066": 101, "\u3067": 102, "\u3068": 103, "\u3069": 104, "\u306a": 105, "\u306b": 106, "\u306c": 107, "\u306d": 108, "\u306e": 109, "\u306f": 110, "\u3070": 111, "\u3071": 112, "\u3072": 113, "\u3073": 114, "\u3074": 115, "\u3075": 116, "\u3076": 117, "\u3077": 118, "\u3078": 119, "\u3079": 120, "\u307a": 121, "\u307b": 122, "\u307c": 123, "\u307d": 124, "\u307e": 125, "\u307f": 126, "\u3080": 127, "\u3081": 128, "\u3082": 129, "\u3083": 130, "\u3084": 131, "\u3085": 132, "\u3086": 133, "\u3087": 134, "\u3088": 135, "\u3089": 136, "\u308a": 137, "\u308b": 138, "\u308c": 139, "\u308d": 140, "\u308f": 141, "\u3092": 142, "\u3093": 143, "\u30a1": 144, "\u30a2": 145, "\u30a3": 146, "\u30a4": 147, "\u30a5": 148, "\u30a6": 149, "\u30a7": 150, "\u30a8": 151, "\u30a9": 152, "\u30aa": 153, "\u30ab": 154, "\u30ac": 155, "\u30ad": 156, "\u30ae": 157, "\u30af": 158, "\u30b0": 159, "\u30b1": 160, "\u30b2": 161, "\u30b3": 162, "\u30b4": 163, "\u30b5": 164, "\u30b6": 165, "\u30b7": 166, "\u30b8": 167, "\u30b9": 168, "\u30ba": 169, "\u30bb": 170, "\u30bc": 171, "\u30bd": 172, "\u30be": 173, "\u30bf": 174, "\u30c0": 175, "\u30c1": 176, "\u30c3": 177, "\u30c4": 178, "\u30c5": 179, "\u30c6": 180, "\u30c7": 181, "\u30c8": 182, "\u30c9": 183, "\u30ca": 184, "\u30cb": 185, "\u30cc": 186, "\u30cd": 187, "\u30ce": 188, "\u30cf": 189, "\u30d0": 190, "\u30d1": 191, "\u30d2": 192, "\u30d3": 193, "\u30d4": 194, "\u30d5": 195, "\u30d6": 196, "\u30d7": 197, "\u30d8": 198, "\u30d9": 199, "\u30da": 200, "\u30db": 201, "\u30dc": 202, "\u30dd": 203, "\u30de": 204, "\u30df": 205, "\u30e0": 206, "\u30e1": 207, "\u30e2": 208, "\u30e3": 209, "\u30e4": 210, "\u30e5": 211, "\u30e6": 212, "\u30e7": 213, "\u30e8": 214, "\u30e9": 215, "\u30ea": 216, "\u30eb": 217, "\u30ec": 218, "\u30ed": 219, "\u30ef": 220, "\u30f3": 221, "\u30f4": 222, "\u30f6": 223, "\u30fb": 224, "\u30fc": 225, "\u7e6b": 226, "\uff06": 227, "\uff09": 228, "\uff0d": 229, "\uff0e": 230, "\uff1a": 231, "\uff1d": 232, "\uff1f": 233, "\uff21": 234, "\uff24": 235, "\uff26": 236, "\uff27": 237, "\uff2e": 238, "\uff2f": 239, "\uff30": 240, "\uff33": 241, "\uff35": 242, "\uff48": 243, "\uff4a": 244, "\uff62": 245, "\uff63": 246, "\uff65": 247, "|": 0, "[UNK]": 248, "[PAD]": 249}
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 250, "</s>": 251}
config.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-xls-r-300m",
3
+ "activation_dropout": 0.1,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 768,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.0,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.0,
57
+ "hidden_size": 1024,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 4096,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.0,
62
+ "mask_feature_length": 64,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.25,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.75,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 16,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 24,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 1024,
79
+ "pad_token_id": 249,
80
+ "proj_codevector_dim": 768,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.17.0.dev0",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 252,
106
+ "xvector_output_dim": 512
107
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a045ec80cccde6513efafa22639d8feb4ad1eed1931045d55322e78ce00a922
3
+ size 1262956849
run_speech_recognition_ctc_bnb.py ADDED
@@ -0,0 +1,771 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
15
+
16
+ """ Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
17
+
18
+ import functools
19
+ import json
20
+ import logging
21
+ import os
22
+ import re
23
+ import sys
24
+ import warnings
25
+ from dataclasses import dataclass, field
26
+ from typing import Dict, List, Optional, Union
27
+
28
+ import datasets
29
+ import numpy as np
30
+ import torch
31
+ from datasets import DatasetDict, load_dataset, load_metric
32
+
33
+ import bitsandbytes as bnb
34
+ import transformers
35
+ from transformers import (
36
+ AutoConfig,
37
+ AutoFeatureExtractor,
38
+ AutoModelForCTC,
39
+ AutoProcessor,
40
+ AutoTokenizer,
41
+ HfArgumentParser,
42
+ Trainer,
43
+ TrainingArguments,
44
+ Wav2Vec2Processor,
45
+ set_seed,
46
+ )
47
+ from transformers.trainer_pt_utils import get_parameter_names
48
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
49
+ from transformers.utils import check_min_version
50
+ from transformers.utils.versions import require_version
51
+
52
+
53
+
54
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
55
+ check_min_version("4.16.0.dev0")
56
+
57
+ require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
58
+
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+
63
+ def list_field(default=None, metadata=None):
64
+ return field(default_factory=lambda: default, metadata=metadata)
65
+
66
+
67
+ @dataclass
68
+ class ModelArguments:
69
+ """
70
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
71
+ """
72
+
73
+ model_name_or_path: str = field(
74
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
75
+ )
76
+ tokenizer_name_or_path: Optional[str] = field(
77
+ default=None,
78
+ metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
79
+ )
80
+ cache_dir: Optional[str] = field(
81
+ default=None,
82
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
83
+ )
84
+ freeze_feature_encoder: bool = field(
85
+ default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
86
+ )
87
+ attention_dropout: float = field(
88
+ default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
89
+ )
90
+ activation_dropout: float = field(
91
+ default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
92
+ )
93
+ feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
94
+ hidden_dropout: float = field(
95
+ default=0.0,
96
+ metadata={
97
+ "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
98
+ },
99
+ )
100
+ final_dropout: float = field(
101
+ default=0.0,
102
+ metadata={"help": "The dropout probability for the final projection layer."},
103
+ )
104
+ mask_time_prob: float = field(
105
+ default=0.05,
106
+ metadata={
107
+ "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
108
+ "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
109
+ "vectors will be masked along the time axis."
110
+ },
111
+ )
112
+ mask_time_length: int = field(
113
+ default=10,
114
+ metadata={"help": "Length of vector span to mask along the time axis."},
115
+ )
116
+ mask_feature_prob: float = field(
117
+ default=0.0,
118
+ metadata={
119
+ "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
120
+ "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
121
+ },
122
+ )
123
+ mask_feature_length: int = field(
124
+ default=10,
125
+ metadata={"help": "Length of vector span to mask along the feature axis."},
126
+ )
127
+ layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
128
+ ctc_loss_reduction: Optional[str] = field(
129
+ default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
130
+ )
131
+
132
+
133
+ @dataclass
134
+ class DataTrainingArguments:
135
+ """
136
+ Arguments pertaining to what data we are going to input our model for training and eval.
137
+
138
+ Using `HfArgumentParser` we can turn this class
139
+ into argparse arguments to be able to specify them on
140
+ the command line.
141
+ """
142
+
143
+ dataset_name: str = field(
144
+ metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
145
+ )
146
+ dataset_config_name: str = field(
147
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
148
+ )
149
+ train_split_name: str = field(
150
+ default="train+validation",
151
+ metadata={
152
+ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
153
+ },
154
+ )
155
+ eval_split_name: str = field(
156
+ default="test",
157
+ metadata={
158
+ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
159
+ },
160
+ )
161
+ audio_column_name: str = field(
162
+ default="audio",
163
+ metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
164
+ )
165
+ text_column_name: str = field(
166
+ default="text",
167
+ metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
168
+ )
169
+ overwrite_cache: bool = field(
170
+ default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
171
+ )
172
+ preprocessing_num_workers: Optional[int] = field(
173
+ default=None,
174
+ metadata={"help": "The number of processes to use for the preprocessing."},
175
+ )
176
+ max_train_samples: Optional[int] = field(
177
+ default=None,
178
+ metadata={
179
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
180
+ "value if set."
181
+ },
182
+ )
183
+ max_eval_samples: Optional[int] = field(
184
+ default=None,
185
+ metadata={
186
+ "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
187
+ "value if set."
188
+ },
189
+ )
190
+ chars_to_ignore: Optional[List[str]] = list_field(
191
+ default=None,
192
+ metadata={"help": "A list of characters to remove from the transcripts."},
193
+ )
194
+ eval_metrics: List[str] = list_field(
195
+ default=["wer"],
196
+ metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
197
+ )
198
+ max_duration_in_seconds: float = field(
199
+ default=20.0,
200
+ metadata={
201
+ "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
202
+ },
203
+ )
204
+ min_duration_in_seconds: float = field(
205
+ default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
206
+ )
207
+ preprocessing_only: bool = field(
208
+ default=False,
209
+ metadata={
210
+ "help": "Whether to only do data preprocessing and skip training. "
211
+ "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
212
+ "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
213
+ "so that the cached datasets can consequently be loaded in distributed training"
214
+ },
215
+ )
216
+ use_auth_token: bool = field(
217
+ default=False,
218
+ metadata={
219
+ "help": "If :obj:`True`, will use the token generated when running"
220
+ ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
221
+ },
222
+ )
223
+ unk_token: str = field(
224
+ default="[UNK]",
225
+ metadata={"help": "The unk token for the tokenizer"},
226
+ )
227
+ pad_token: str = field(
228
+ default="[PAD]",
229
+ metadata={"help": "The padding token for the tokenizer"},
230
+ )
231
+ word_delimiter_token: str = field(
232
+ default="|",
233
+ metadata={"help": "The word delimiter token for the tokenizer"},
234
+ )
235
+ phoneme_language: Optional[str] = field(
236
+ default=None,
237
+ metadata={
238
+ "help": "The target language that should be used be"
239
+ " passed to the tokenizer for tokenization. Note that"
240
+ " this is only relevant if the model classifies the"
241
+ " input audio to a sequence of phoneme sequences."
242
+ },
243
+ )
244
+
245
+
246
+ @dataclass
247
+ class DataCollatorCTCWithPadding:
248
+ """
249
+ Data collator that will dynamically pad the inputs received.
250
+ Args:
251
+ processor (:class:`~transformers.AutoProcessor`)
252
+ The processor used for processing the data.
253
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
254
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
255
+ among:
256
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
257
+ sequence is provided).
258
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
259
+ maximum acceptable input length for the model if that argument is not provided.
260
+ * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
261
+ different lengths).
262
+ max_length (:obj:`int`, `optional`):
263
+ Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
264
+ max_length_labels (:obj:`int`, `optional`):
265
+ Maximum length of the ``labels`` returned list and optionally padding length (see above).
266
+ pad_to_multiple_of (:obj:`int`, `optional`):
267
+ If set will pad the sequence to a multiple of the provided value.
268
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
269
+ 7.5 (Volta).
270
+ """
271
+
272
+ processor: AutoProcessor
273
+ padding: Union[bool, str] = "longest"
274
+ pad_to_multiple_of: Optional[int] = None
275
+ pad_to_multiple_of_labels: Optional[int] = None
276
+
277
+ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
278
+ # split inputs and labels since they have to be of different lengths and need
279
+ # different padding methods
280
+ input_features = [{"input_values": feature["input_values"]} for feature in features]
281
+ label_features = [{"input_ids": feature["labels"]} for feature in features]
282
+
283
+ batch = self.processor.pad(
284
+ input_features,
285
+ padding=self.padding,
286
+ pad_to_multiple_of=self.pad_to_multiple_of,
287
+ return_tensors="pt",
288
+ )
289
+
290
+ with self.processor.as_target_processor():
291
+ labels_batch = self.processor.pad(
292
+ label_features,
293
+ padding=self.padding,
294
+ pad_to_multiple_of=self.pad_to_multiple_of_labels,
295
+ return_tensors="pt",
296
+ )
297
+
298
+ # replace padding with -100 to ignore loss correctly
299
+ labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
300
+
301
+ batch["labels"] = labels
302
+
303
+ return batch
304
+
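+ # A minimal sketch (illustrative only, not executed) of what the collator
+ # returns for a toy batch of two features with inputs of lengths 3 and 5:
+ # the inputs are padded to the longest length in the batch and the shorter
+ # label sequence is padded with -100 so the CTC loss ignores those positions:
+ #   features = [
+ #       {"input_values": [0.1, 0.2, 0.3], "labels": [5, 2]},
+ #       {"input_values": [0.1, 0.2, 0.3, 0.4, 0.5], "labels": [7]},
+ #   ]
+ #   batch = DataCollatorCTCWithPadding(processor=processor)(features)
+ #   batch["input_values"].shape  # torch.Size([2, 5])
+ #   batch["labels"]              # tensor([[   5,    2], [   7, -100]])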
305
+
306
+ def create_vocabulary_from_data(
307
+ datasets: DatasetDict,
308
+ word_delimiter_token: Optional[str] = None,
309
+ unk_token: Optional[str] = None,
310
+ pad_token: Optional[str] = None,
311
+ ):
312
+ # Given training and test labels create vocabulary
313
+ def extract_all_chars(batch):
314
+ all_text = " ".join(batch["target_text"])
315
+ vocab = list(set(all_text))
316
+ return {"vocab": [vocab], "all_text": [all_text]}
317
+
318
+ vocabs = datasets.map(
319
+ extract_all_chars,
320
+ batched=True,
321
+ batch_size=-1,
322
+ keep_in_memory=True,
323
+ remove_columns=datasets["train"].column_names,
324
+ )
325
+
326
+ # take union of all unique characters in each dataset
327
+ vocab_set = functools.reduce(
328
+ lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
329
+ )
330
+
331
+ vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
332
+
333
+ # replace white space with delimiter token
334
+ if word_delimiter_token is not None:
335
+ vocab_dict[word_delimiter_token] = vocab_dict[" "]
336
+ del vocab_dict[" "]
337
+
338
+ # add unk and pad token
339
+ if unk_token is not None:
340
+ vocab_dict[unk_token] = len(vocab_dict)
341
+
342
+ if pad_token is not None:
343
+ vocab_dict[pad_token] = len(vocab_dict)
344
+
345
+ return vocab_dict
346
+
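+ # Illustrative example (toy data): for target texts ["ab", "ba c"] the unique
+ # characters are {" ", "a", "b", "c"}; with word_delimiter_token="|",
+ # unk_token="[UNK]" and pad_token="[PAD]" the function returns
+ #   {"a": 1, "b": 2, "c": 3, "|": 0, "[UNK]": 4, "[PAD]": 5}
+ # i.e. the delimiter token takes over the index of the whitespace character.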
347
+
348
+ def main():
349
+ # See all possible arguments in src/transformers/training_args.py
350
+ # or by passing the --help flag to this script.
351
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
352
+
353
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
354
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
355
+ # If we pass only one argument to the script and it's the path to a json file,
356
+ # let's parse it to get our arguments.
357
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
358
+ else:
359
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
360
+
361
+ # Detecting last checkpoint.
362
+ last_checkpoint = None
363
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
364
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
365
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
366
+ raise ValueError(
367
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
368
+ "Use --overwrite_output_dir to overcome."
369
+ )
370
+ elif last_checkpoint is not None:
371
+ logger.info(
372
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
373
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
374
+ )
375
+
376
+ # Setup logging
377
+ logging.basicConfig(
378
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
379
+ datefmt="%m/%d/%Y %H:%M:%S",
380
+ handlers=[logging.StreamHandler(sys.stdout)],
381
+ )
382
+ logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
383
+
384
+ # Log on each process the small summary:
385
+ logger.warning(
386
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
387
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
388
+ )
389
+ # Set the verbosity to info of the Transformers logger (on main process only):
390
+ if is_main_process(training_args.local_rank):
391
+ transformers.utils.logging.set_verbosity_info()
392
+ logger.info("Training/evaluation parameters %s", training_args)
393
+
394
+ # Set seed before initializing model.
395
+ set_seed(training_args.seed)
396
+
397
+ # 1. First, let's load the dataset
398
+ raw_datasets = DatasetDict()
399
+
400
+ if training_args.do_train:
401
+ raw_datasets["train"] = load_dataset(
402
+ data_args.dataset_name,
403
+ data_args.dataset_config_name,
404
+ split=data_args.train_split_name,
405
+ use_auth_token=data_args.use_auth_token,
406
+ )
407
+
408
+ if data_args.audio_column_name not in raw_datasets["train"].column_names:
409
+ raise ValueError(
410
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
411
+ "Make sure to set `--audio_column_name` to the correct audio column - one of "
412
+ f"{', '.join(raw_datasets['train'].column_names)}."
413
+ )
414
+
415
+ if data_args.text_column_name not in raw_datasets["train"].column_names:
416
+ raise ValueError(
417
+ f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
418
+ "Make sure to set `--text_column_name` to the correct text column - one of "
419
+ f"{', '.join(raw_datasets['train'].column_names)}."
420
+ )
421
+
422
+ if data_args.max_train_samples is not None:
423
+ raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
424
+
425
+ if training_args.do_eval:
426
+ raw_datasets["eval"] = load_dataset(
427
+ data_args.dataset_name,
428
+ data_args.dataset_config_name,
429
+ split=data_args.eval_split_name,
430
+ use_auth_token=data_args.use_auth_token,
431
+ )
432
+
433
+ if data_args.max_eval_samples is not None:
434
+ raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
435
+
436
+ # 2. We remove some special characters from the datasets
437
+ # that make training complicated and do not help in transcribing the speech
438
+ # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
439
+ # that could be easily picked up by the model
440
+ from pykakasi import kakasi
441
+
442
+ kakasi = kakasi()
443
+ kakasi.setMode('J', 'H')  # Convert from kanji to hiragana
444
+ # kakasi.setMode("K", "H")  # Convert from katakana to hiragana
445
+ conv = kakasi.getConverter()
446
+
447
+ chars_to_ignore_regex = (
448
+ f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(\!]'
449
+ )
450
+ text_column_name = data_args.text_column_name
451
+
454
+ def remove_special_characters(batch):
455
+ if chars_to_ignore_regex is not None:
456
+ batch["target_text"] = conv.do(re.sub(chars_to_ignore_regex, "", batch[text_column_name])) + " "
457
+ else:
458
+ batch["target_text"] = batch[text_column_name].lower() + " "
459
+ return batch
460
+
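+ # Illustrative example (assuming --text_column_name="sentence" and that kakasi
+ # reads 世界 as せかい):
+ #   remove_special_characters({"sentence": "世界!"})
+ #   # -> {..., "target_text": "せかい "}
+ # i.e. punctuation matched by the regex is stripped and kanji are converted to
+ # hiragana before the trailing space is appended.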
461
+ with training_args.main_process_first(desc="dataset map special characters removal"):
462
+ raw_datasets = raw_datasets.map(
463
+ remove_special_characters,
464
+ remove_columns=[text_column_name],
465
+ desc="remove special characters from datasets",
466
+ )
467
+
468
+ # save special tokens for tokenizer
469
+ word_delimiter_token = data_args.word_delimiter_token
470
+ unk_token = data_args.unk_token
471
+ pad_token = data_args.pad_token
472
+
473
+ # 3. Next, let's load the config as we might need it to create
474
+ # the tokenizer
475
+ # load config
476
+ config = AutoConfig.from_pretrained(
477
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
478
+ )
479
+
480
+ # 4. Next, if no tokenizer file is defined,
481
+ # we create the vocabulary of the model by extracting all unique characters from
482
+ # the training and evaluation datasets
483
+ # We need to make sure that only the first rank saves the vocabulary
484
+ # make sure all processes wait until vocab is created
485
+ tokenizer_name_or_path = model_args.tokenizer_name_or_path
486
+ tokenizer_kwargs = {}
487
+ if tokenizer_name_or_path is None:
488
+ # save vocab in training output dir
489
+ tokenizer_name_or_path = training_args.output_dir
490
+
491
+ vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
492
+
493
+ with training_args.main_process_first():
494
+ if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
495
+ os.remove(vocab_file)
496
+
497
+ with training_args.main_process_first(desc="dataset map vocabulary creation"):
498
+ if not os.path.isfile(vocab_file):
499
+ os.makedirs(tokenizer_name_or_path, exist_ok=True)
500
+ vocab_dict = create_vocabulary_from_data(
501
+ raw_datasets,
502
+ word_delimiter_token=word_delimiter_token,
503
+ unk_token=unk_token,
504
+ pad_token=pad_token,
505
+ )
506
+
507
+ # save vocab dict to be loaded into tokenizer
508
+ with open(vocab_file, "w") as file:
509
+ json.dump(vocab_dict, file)
510
+
511
+ # if tokenizer has just been created
512
+ # it is defined by `tokenizer_class` if present in config else by `model_type`
513
+ tokenizer_kwargs = {
514
+ "config": config if config.tokenizer_class is not None else None,
515
+ "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
516
+ "unk_token": unk_token,
517
+ "pad_token": pad_token,
518
+ "word_delimiter_token": word_delimiter_token,
519
+ }
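+ # e.g. for a wav2vec2 checkpoint whose config carries no `tokenizer_class`,
+ # passing tokenizer_type="wav2vec2" lets AutoTokenizer build a CTC tokenizer
+ # from the freshly written vocab.json (illustrative; behavior depends on the
+ # installed transformers version).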
520
+
521
+ # 5. Now we can instantiate the feature extractor, tokenizer and model
522
+ # Note for distributed training, the .from_pretrained methods guarantee that only
523
+ # one local process can concurrently download model & vocab.
524
+
525
+ # load feature_extractor and tokenizer
526
+ tokenizer = AutoTokenizer.from_pretrained(
527
+ tokenizer_name_or_path,
528
+ use_auth_token=data_args.use_auth_token,
529
+ **tokenizer_kwargs,
530
+ )
531
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
532
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
533
+ )
534
+
535
+ # adapt config
536
+ config.update(
537
+ {
538
+ "feat_proj_dropout": model_args.feat_proj_dropout,
539
+ "attention_dropout": model_args.attention_dropout,
540
+ "hidden_dropout": model_args.hidden_dropout,
541
+ "final_dropout": model_args.final_dropout,
542
+ "mask_time_prob": model_args.mask_time_prob,
543
+ "mask_time_length": model_args.mask_time_length,
544
+ "mask_feature_prob": model_args.mask_feature_prob,
545
+ "mask_feature_length": model_args.mask_feature_length,
546
+ "gradient_checkpointing": training_args.gradient_checkpointing,
547
+ "layerdrop": model_args.layerdrop,
548
+ "ctc_loss_reduction": model_args.ctc_loss_reduction,
549
+ "pad_token_id": tokenizer.pad_token_id,
550
+ "vocab_size": len(tokenizer),
551
+ "activation_dropout": model_args.activation_dropout,
552
+ }
553
+ )
554
+
555
+ # create model
556
+ model = AutoModelForCTC.from_pretrained(
557
+ model_args.model_name_or_path,
558
+ cache_dir=model_args.cache_dir,
559
+ config=config,
560
+ use_auth_token=data_args.use_auth_token,
561
+ )
562
+
563
+ # freeze encoder
564
+ if model_args.freeze_feature_encoder:
565
+ model.freeze_feature_encoder()
566
+
567
+ # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
568
+ # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
569
+ # so that we just need to set the correct target sampling rate and normalize the input
570
+ # via the `feature_extractor`
571
+
572
+ # make sure that dataset decodes audio with correct sampling rate
573
+ dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
574
+ if dataset_sampling_rate != feature_extractor.sampling_rate:
575
+ raw_datasets = raw_datasets.cast_column(
576
+ data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
577
+ )
578
+
579
+ # derive max & min input length for sample rate & max duration
580
+ max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
581
+ min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
582
+ audio_column_name = data_args.audio_column_name
583
+ num_workers = data_args.preprocessing_num_workers
584
+
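+ # Illustrative arithmetic: with a 16 kHz feature extractor and
+ # --max_duration_in_seconds=20, max_input_length = 20 * 16000 = 320000
+ # samples; audio whose length falls outside (min_input_length,
+ # max_input_length) is filtered out further below.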
585
+ # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
586
+ phoneme_language = data_args.phoneme_language
587
+
588
+ # Preprocessing the datasets.
589
+ # We need to read the audio files as arrays and tokenize the targets.
590
+ def prepare_dataset(batch):
591
+ # load audio
592
+ sample = batch[audio_column_name]
593
+
594
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
595
+ batch["input_values"] = inputs.input_values[0]
596
+ batch["input_length"] = len(batch["input_values"])
597
+
598
+ # encode targets
599
+ additional_kwargs = {}
600
+ if phoneme_language is not None:
601
+ additional_kwargs["phonemizer_lang"] = phoneme_language
602
+
603
+ batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
604
+ return batch
605
+
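+ # Sketch of the fields produced per example (illustrative):
+ #   batch["input_values"]  # 1-D float array of normalized audio samples
+ #   batch["input_length"]  # number of samples, used below for length filtering
+ #   batch["labels"]        # list of token ids for the target transcription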
606
+ with training_args.main_process_first(desc="dataset map preprocessing"):
607
+ vectorized_datasets = raw_datasets.map(
608
+ prepare_dataset,
609
+ remove_columns=next(iter(raw_datasets.values())).column_names,
610
+ num_proc=num_workers,
611
+ desc="preprocess datasets",
612
+ )
613
+
614
+ def is_audio_in_length_range(length):
615
+ return min_input_length < length < max_input_length
616
+
617
+ # filter data that is shorter than min_input_length or longer than max_input_length
618
+ vectorized_datasets = vectorized_datasets.filter(
619
+ is_audio_in_length_range,
620
+ num_proc=num_workers,
621
+ input_columns=["input_length"],
622
+ )
623
+
624
+ # 7. Next, we can prepare the training.
625
+ # Let's use word error rate (WER) as our evaluation metric,
626
+ # instantiate a data collator and the trainer
627
+
628
+ # Define evaluation metrics during training, *i.e.* word error rate, character error rate
629
+ eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
630
+
631
+ # for large datasets it is advised to run the preprocessing on a
632
+ # single machine first with ``args.preprocessing_only`` since there will most likely
633
+ # be a timeout when running the script in distributed mode.
634
+ # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
635
+ # cached dataset
636
+ if data_args.preprocessing_only:
637
+ logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
638
+ return
639
+
640
+ def compute_metrics(pred):
641
+ pred_logits = pred.predictions
642
+ pred_ids = np.argmax(pred_logits, axis=-1)
643
+
644
+ pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
645
+
646
+ pred_str = tokenizer.batch_decode(pred_ids)
647
+ # we do not want to group tokens when computing the metrics
648
+ label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
649
+
650
+ metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
651
+
652
+ return metrics
653
+
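+ # Illustrative example (assuming "wer" is among --eval_metrics): for
+ # pred_str = ["hello world"] and label_str = ["hello there world"],
+ # load_metric("wer").compute(predictions=pred_str, references=label_str)
+ # returns 1/3 -- one deletion against three reference words.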
654
+ # Now save everything to be able to create a single processor later
655
+ if is_main_process(training_args.local_rank):
656
+ # save feature extractor, tokenizer and config
657
+ feature_extractor.save_pretrained(training_args.output_dir)
658
+ tokenizer.save_pretrained(training_args.output_dir)
659
+ config.save_pretrained(training_args.output_dir)
660
+
661
+ try:
662
+ processor = AutoProcessor.from_pretrained(training_args.output_dir)
663
+ except (OSError, KeyError):
664
+ warnings.warn(
665
+ "Loading a processor from a feature extractor config that does not"
666
+ " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
667
+ " attribute to your `preprocessor_config.json` file to suppress this warning: "
668
+ " `'processor_class': 'Wav2Vec2Processor'`",
669
+ FutureWarning,
670
+ )
671
+ processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
672
+
673
+ # Instantiate custom data collator
674
+ data_collator = DataCollatorCTCWithPadding(processor=processor)
675
+
676
+ decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm])
677
+ decay_parameters = [name for name in decay_parameters if "bias" not in name]
678
+ optimizer_grouped_parameters = [
679
+ {
680
+ "params": [p for n, p in model.named_parameters() if n in decay_parameters],
681
+ "weight_decay": training_args.weight_decay,
682
+ },
683
+ {
684
+ "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
685
+ "weight_decay": 0.0,
686
+ },
687
+ ]
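+ # The first group (all weights except biases and LayerNorm parameters) gets
+ # weight decay, the second gets none. bitsandbytes' Adam8bit stores optimizer
+ # state in 8 bits, cutting optimizer memory roughly 4x versus 32-bit Adam.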
688
+ optimizer = bnb.optim.Adam8bit(
689
+ params=optimizer_grouped_parameters,
690
+ lr=training_args.learning_rate,
691
+ betas=(training_args.adam_beta1, training_args.adam_beta2),
692
+ eps=training_args.adam_epsilon,
693
+ )
694
+
695
+ optimizers = (optimizer, None)
696
+
697
+ # Initialize Trainer
698
+ trainer = Trainer(
699
+ model=model,
700
+ data_collator=data_collator,
701
+ args=training_args,
702
+ compute_metrics=compute_metrics,
703
+ train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
704
+ eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
705
+ tokenizer=feature_extractor,
706
+ optimizers=optimizers,
707
+ )
708
+
709
+ # 8. Finally, we can start training
710
+
711
+ # Training
712
+ if training_args.do_train:
713
+
714
+ # use last checkpoint if it exists
715
+ if last_checkpoint is not None:
716
+ checkpoint = last_checkpoint
717
+ elif os.path.isdir(model_args.model_name_or_path):
718
+ checkpoint = model_args.model_name_or_path
719
+ else:
720
+ checkpoint = None
721
+
722
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
723
+ trainer.save_model()
724
+
725
+ metrics = train_result.metrics
726
+ max_train_samples = (
727
+ data_args.max_train_samples
728
+ if data_args.max_train_samples is not None
729
+ else len(vectorized_datasets["train"])
730
+ )
731
+ metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
732
+
733
+ trainer.log_metrics("train", metrics)
734
+ trainer.save_metrics("train", metrics)
735
+ trainer.save_state()
736
+
737
+ # Evaluation
738
+ results = {}
739
+ if training_args.do_eval:
740
+ logger.info("*** Evaluate ***")
741
+ metrics = trainer.evaluate()
742
+ max_eval_samples = (
743
+ data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
744
+ )
745
+ metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
746
+
747
+ trainer.log_metrics("eval", metrics)
748
+ trainer.save_metrics("eval", metrics)
749
+
750
+ # Write model card and (optionally) push to hub
751
+ config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
752
+ kwargs = {
753
+ "finetuned_from": model_args.model_name_or_path,
754
+ "tasks": "speech-recognition",
755
+ "tags": ["automatic-speech-recognition", data_args.dataset_name],
756
+ "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
757
+ "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
758
+ }
759
+ if "common_voice" in data_args.dataset_name:
760
+ kwargs["language"] = config_name
761
+
762
+ if training_args.push_to_hub:
763
+ trainer.push_to_hub(**kwargs)
764
+ else:
765
+ trainer.create_model_card(**kwargs)
766
+
767
+ return results
768
+
769
+
770
+ if __name__ == "__main__":
771
+ main()
run_training.sh ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python run_speech_recognition_ctc_bnb.py \
2
+ --dataset_name="mozilla-foundation/common_voice_8_0" \
3
+ --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
4
+ --dataset_config_name="ja" \
5
+ --output_dir="./" \
6
+ --overwrite_output_dir \
7
+ --num_train_epochs="10" \
8
+ --per_device_train_batch_size="48" \
9
+ --per_device_eval_batch_size="8" \
10
+ --learning_rate="7.5e-5" \
11
+ --warmup_steps="2000" \
12
+ --length_column_name="input_length" \
13
+ --evaluation_strategy="steps" \
14
+ --text_column_name="sentence" \
15
+ --save_steps="1000" \
16
+ --eval_steps="1000" \
17
+ --logging_steps="100" \
18
+ --layerdrop="0.0" \
19
+ --activation_dropout="0.1" \
20
+ --save_total_limit="4" \
21
+ --freeze_feature_encoder \
22
+ --feat_proj_dropout="0.0" \
23
+ --mask_time_prob="0.75" \
24
+ --mask_time_length="10" \
25
+ --mask_feature_prob="0.25" \
26
+ --mask_feature_length="64" \
27
+ --gradient_checkpointing \
28
+ --use_auth_token \
29
+ --fp16 \
30
+ --group_by_length \
31
+ --do_train --do_eval \
32
+ --push_to_hub
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
speech_training_notebook.ipynb ADDED
@@ -0,0 +1,1490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "application/vnd.jupyter.widget-view+json": {
11
+ "model_id": "b7523cd66cf343f98fd3006be918a3b6",
12
+ "version_major": 2,
13
+ "version_minor": 0
14
+ },
15
+ "text/plain": [
16
+ "Downloading: 0%| | 0.00/10.1k [00:00<?, ?B/s]"
17
+ ]
18
+ },
19
+ "metadata": {},
20
+ "output_type": "display_data"
21
+ },
22
+ {
23
+ "data": {
24
+ "application/vnd.jupyter.widget-view+json": {
25
+ "model_id": "251cac7b8968405eafd54e2d29165b40",
26
+ "version_major": 2,
27
+ "version_minor": 0
28
+ },
29
+ "text/plain": [
30
+ "Downloading: 0%| | 0.00/2.98k [00:00<?, ?B/s]"
31
+ ]
32
+ },
33
+ "metadata": {},
34
+ "output_type": "display_data"
35
+ },
36
+ {
37
+ "data": {
38
+ "application/vnd.jupyter.widget-view+json": {
39
+ "model_id": "528c6a67efea4512b04b06a32156d5b7",
40
+ "version_major": 2,
41
+ "version_minor": 0
42
+ },
43
+ "text/plain": [
44
+ "Downloading: 0%| | 0.00/53.1k [00:00<?, ?B/s]"
45
+ ]
46
+ },
47
+ "metadata": {},
48
+ "output_type": "display_data"
49
+ },
50
+ {
51
+ "name": "stdout",
52
+ "output_type": "stream",
53
+ "text": [
54
+ "Downloading and preparing dataset common_voice/ja to /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ja/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8...\n"
55
+ ]
56
+ },
57
+ {
58
+ "data": {
59
+ "application/vnd.jupyter.widget-view+json": {
60
+ "model_id": "6c21c5f782734b3bb3f545cef5b59ee0",
61
+ "version_major": 2,
62
+ "version_minor": 0
63
+ },
64
+ "text/plain": [
65
+ "Downloading: 0%| | 0.00/958M [00:00<?, ?B/s]"
66
+ ]
67
+ },
68
+ "metadata": {},
69
+ "output_type": "display_data"
70
+ },
71
+ {
72
+ "data": {
73
+ "application/vnd.jupyter.widget-view+json": {
74
+ "model_id": "",
75
+ "version_major": 2,
76
+ "version_minor": 0
77
+ },
78
+ "text/plain": [
79
+ "0 examples [00:00, ? examples/s]"
80
+ ]
81
+ },
82
+ "metadata": {},
83
+ "output_type": "display_data"
84
+ },
85
+ {
86
+ "data": {
87
+ "application/vnd.jupyter.widget-view+json": {
88
+ "model_id": "",
89
+ "version_major": 2,
90
+ "version_minor": 0
91
+ },
92
+ "text/plain": [
93
+ "0 examples [00:00, ? examples/s]"
94
+ ]
95
+ },
96
+ "metadata": {},
97
+ "output_type": "display_data"
98
+ },
99
+ {
100
+ "data": {
101
+ "application/vnd.jupyter.widget-view+json": {
102
+ "model_id": "",
103
+ "version_major": 2,
104
+ "version_minor": 0
105
+ },
106
+ "text/plain": [
107
+ "0 examples [00:00, ? examples/s]"
108
+ ]
109
+ },
110
+ "metadata": {},
111
+ "output_type": "display_data"
112
+ },
113
+ {
114
+ "data": {
115
+ "application/vnd.jupyter.widget-view+json": {
116
+ "model_id": "",
117
+ "version_major": 2,
118
+ "version_minor": 0
119
+ },
120
+ "text/plain": [
121
+ "0 examples [00:00, ? examples/s]"
122
+ ]
123
+ },
124
+ "metadata": {},
125
+ "output_type": "display_data"
126
+ },
127
+ {
128
+ "data": {
129
+ "application/vnd.jupyter.widget-view+json": {
130
+ "model_id": "",
131
+ "version_major": 2,
132
+ "version_minor": 0
133
+ },
134
+ "text/plain": [
135
+ "0 examples [00:00, ? examples/s]"
136
+ ]
137
+ },
138
+ "metadata": {},
139
+ "output_type": "display_data"
140
+ },
141
+ {
142
+ "name": "stdout",
143
+ "output_type": "stream",
144
+ "text": [
145
+ "Dataset common_voice downloaded and prepared to /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ja/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8. Subsequent calls will reuse this data.\n"
146
+ ]
147
+ },
148
+ {
149
+ "name": "stderr",
150
+ "output_type": "stream",
151
+ "text": [
152
+ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ja/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)\n"
153
+ ]
154
+ },
155
+ {
156
+ "name": "stdout",
157
+ "output_type": "stream",
158
+ "text": [
159
+ "10623\n"
160
+ ]
161
+ }
162
+ ],
163
+ "source": [
164
+ "from datasets import Audio, Dataset, load_dataset, load_metric\n",
165
+ "from transformers import AutoFeatureExtractor, pipeline\n",
166
+ "\n",
167
+ "language_code = \"ja\"\n",
168
+ "dataset_name = \"mozilla-foundation/common_voice_8_0\"\n",
169
+ "\n",
170
+ "common_voice_train = load_dataset(dataset_name, language_code, use_auth_token=True, split=\"train+validation\")\n",
171
+ "common_voice_test = load_dataset(dataset_name, language_code, use_auth_token=True, split=\"test\")\n",
172
+ "\n",
173
+ "\n",
174
+ "print(len(common_voice_train))\n",
175
+ "\n",
176
+ "# # for testing: only process the first two examples as a test\n",
177
+ "# dataset = dataset.select(range(10))\n",
178
+ "\n",
179
+ "\n"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 1,
185
+ "metadata": {},
186
+ "outputs": [
187
+ {
188
+ "name": "stdout",
189
+ "output_type": "stream",
190
+ "text": [
191
+ "Collecting pykakasi\n",
192
+ " Downloading pykakasi-2.2.1-py3-none-any.whl (2.4 MB)\n",
193
+ " |████████████████████████████████| 2.4 MB 9.9 MB/s \n",
194
+ "\u001b[?25hCollecting jaconv\n",
195
+ " Downloading jaconv-0.3.tar.gz (15 kB)\n",
196
+ " Preparing metadata (setup.py) ... \u001b[?25ldone\n",
197
+ "\u001b[?25hCollecting deprecated\n",
198
+ " Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)\n",
199
+ "Collecting wrapt<2,>=1.10\n",
200
+ " Downloading wrapt-1.13.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (84 kB)\n",
201
+ " |████████████████████████████████| 84 kB 12.8 MB/s \n",
202
+ "\u001b[?25hBuilding wheels for collected packages: jaconv\n",
203
+ " Building wheel for jaconv (setup.py) ... \u001b[?25ldone\n",
204
+ "\u001b[?25h Created wheel for jaconv: filename=jaconv-0.3-py3-none-any.whl size=15553 sha256=fd764f215e4d567cb60062a7052497b66729e9e2190e2e00153e0d19734088e7\n",
205
+ " Stored in directory: /workspace/.cache/pip/wheels/73/e8/fb/b4ad8117719f79ac73bc05406d1768f845688cdbeed7aad87e\n",
206
+ "Successfully built jaconv\n",
207
+ "Installing collected packages: wrapt, jaconv, deprecated, pykakasi\n",
208
+ "Successfully installed deprecated-1.2.13 jaconv-0.3 pykakasi-2.2.1 wrapt-1.13.3\n",
209
+ "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.2 is available.\n",
210
+ "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "!pip install pykakasi"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 4,
221
+ "metadata": {},
222
+ "outputs": [
223
+ {
224
+ "name": "stdout",
225
+ "output_type": "stream",
226
+ "text": [
227
+ "にんじゃ ひらがな kana\n"
228
+ ]
229
+ },
230
+ {
231
+ "name": "stderr",
232
+ "output_type": "stream",
233
+ "text": [
234
+ "/tmp/ipykernel_2159/3076271513.py:4: DeprecationWarning: Call to deprecated method setMode. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n",
235
+ " kakasi.setMode('J', 'H') #Convert from kanji to hiragana\n",
236
+ "/tmp/ipykernel_2159/3076271513.py:6: DeprecationWarning: Call to deprecated method getConverter. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n",
237
+ " conv = kakasi.getConverter()\n",
238
+ "/tmp/ipykernel_2159/3076271513.py:10: DeprecationWarning: Call to deprecated method do. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n",
239
+ " print(conv.do(str))\n"
240
+ ]
241
+ }
242
+ ],
243
+ "source": [
244
+ "from pykakasi import kakasi\n",
245
+ "\n",
246
+ "kakasi = kakasi()\n",
247
+ "kakasi.setMode('J', 'H') #Convert from kanji to hiragana\n",
248
+ "# kakasi.setMode(\"K\", \"H\") #Convert from katakana to hiragana\n",
249
+ "conv = kakasi.getConverter()\n",
250
+ "\n",
251
+ "str = 'にんじゃ 平仮名 kana'\n",
252
+ "\n",
253
+ "print(conv.do(str))"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": 3,
259
+ "metadata": {},
260
+ "outputs": [],
261
+ "source": [
262
+ "repo_name = 'https://huggingface.co/AndrewMcDowell/wav2vec2-xls-r-1B-german'\n"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": 4,
268
+ "metadata": {},
269
+ "outputs": [],
270
+ "source": [
271
+ "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
272
+ "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
273
+ "\n"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 15,
279
+ "metadata": {},
280
+ "outputs": [
281
+ {
282
+ "data": {
283
+ "application/vnd.jupyter.widget-view+json": {
284
+ "model_id": "ad26c4d7d02948a3bc30d86a0f3527c8",
285
+ "version_major": 2,
286
+ "version_minor": 0
287
+ },
288
+ "text/plain": [
289
+ "0ex [00:00, ?ex/s]"
290
+ ]
291
+ },
292
+ "metadata": {},
293
+ "output_type": "display_data"
294
+ },
295
+ {
296
+ "name": "stderr",
297
+ "output_type": "stream",
298
+ "text": [
299
+ "/tmp/ipykernel_2159/322450745.py:5: DeprecationWarning: Call to deprecated method do. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n",
300
+ " batch[\"sentence\"] = conv.do(re.sub(chars_to_remove_regex, '', batch[\"sentence\"]))\n"
301
+ ]
302
+ },
303
+ {
304
+ "data": {
305
+ "application/vnd.jupyter.widget-view+json": {
306
+ "model_id": "93295f1cd50f4557a96ff1bf139c9a37",
307
+ "version_major": 2,
308
+ "version_minor": 0
309
+ },
310
+ "text/plain": [
311
+ "0ex [00:00, ?ex/s]"
312
+ ]
313
+ },
314
+ "metadata": {},
315
+ "output_type": "display_data"
316
+ }
317
+ ],
318
+ "source": [
319
+ "import re\n",
320
+ "chars_to_remove_regex = '[\\,\\?\\!\\-\\;\\:\\\"\\“\\%\\‘\\”\\�\\—\\’\\…\\–\\(\\,\\[\\]\\)\\(\\!]'\n",
321
+ "# \\.\n",
322
+ "def remove_special_characters(batch):\n",
323
+ " batch[\"sentence\"] = conv.do(re.sub(chars_to_remove_regex, '', batch[\"sentence\"]))\n",
324
+ " return batch\n",
325
+ "\n",
326
+ "common_voice_train = common_voice_train.map(remove_special_characters)\n",
327
+ "common_voice_test = common_voice_test.map(remove_special_characters)"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": 6,
333
+ "metadata": {},
334
+ "outputs": [
335
+ {
336
+ "name": "stdout",
337
+ "output_type": "stream",
338
+ "text": [
339
+ "Collecting num2words\n",
340
+ " Downloading num2words-0.5.10-py3-none-any.whl (101 kB)\n",
341
+ " |████████████████████████████████| 101 kB 7.9 MB/s \n",
342
+ "\u001b[?25hCollecting docopt>=0.6.2\n",
343
+ " Downloading docopt-0.6.2.tar.gz (25 kB)\n",
344
+ " Preparing metadata (setup.py) ... \u001b[?25ldone\n",
345
+ "\u001b[?25hBuilding wheels for collected packages: docopt\n",
346
+ " Building wheel for docopt (setup.py) ... \u001b[?25ldone\n",
347
+ "\u001b[?25h Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13704 sha256=7cda85e4b3980668714aad8f5d706fb5b189c2804ce1d99ca2380537fdc78031\n",
348
+ " Stored in directory: /workspace/.cache/pip/wheels/56/ea/58/ead137b087d9e326852a851351d1debf4ada529b6ac0ec4e8c\n",
349
+ "Successfully built docopt\n",
350
+ "Installing collected packages: docopt, num2words\n",
351
+ "Successfully installed docopt-0.6.2 num2words-0.5.10\n",
352
+ "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.2 is available.\n",
353
+ "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
354
+ ]
355
+ }
356
+ ],
357
+ "source": [
358
+ "!pip install num2words"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 7,
364
+ "metadata": {},
365
+ "outputs": [
366
+ {
367
+ "data": {
368
+ "application/vnd.jupyter.widget-view+json": {
369
+ "model_id": "0da8fd9cdae64c1fa80fbcfc412bcf9c",
370
+ "version_major": 2,
371
+ "version_minor": 0
372
+ },
373
+ "text/plain": [
374
+ "0ex [00:00, ?ex/s]"
375
+ ]
376
+ },
377
+ "metadata": {},
378
+ "output_type": "display_data"
379
+ }
380
+ ],
381
+ "source": [
382
+ "\n",
383
+ "from num2words import num2words\n",
384
+ "import regex as re\n",
385
+ "matches = []\n",
386
+ "\n",
387
+ "def replace_numbers(match):\n",
388
+ " match = match.group()\n",
389
+ " matches.append(match)\n",
390
+ " return num2words(match, lang='de')\n",
391
+ "\n",
392
+ "def replace_numbers_in_batch(batch):\n",
393
+ " batch[\"sentence\"] = re.sub(r'\\d+(?:,\\d+)?', replace_numbers, batch[\"sentence\"])\n",
394
+ " return batch\n",
395
+ "\n",
396
+ "common_voice_test_2 = common_voice_test.map(replace_numbers_in_batch)"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "code",
401
+ "execution_count": 10,
402
+ "metadata": {},
403
+ "outputs": [
404
+ {
405
+ "data": {
406
+ "application/vnd.jupyter.widget-view+json": {
407
+ "model_id": "54d62ea7a0214b6abc5de1f106b330dc",
408
+ "version_major": 2,
409
+ "version_minor": 0
410
+ },
411
+ "text/plain": [
412
+ "0ex [00:00, ?ex/s]"
413
+ ]
414
+ },
415
+ "metadata": {},
416
+ "output_type": "display_data"
417
+ }
418
+ ],
419
+ "source": [
420
+ "common_voice_train_2 = common_voice_train.map(replace_numbers_in_batch)"
421
+ ]
422
+ },
423
+ {
424
+ "cell_type": "code",
425
+ "execution_count": 11,
426
+ "metadata": {},
427
+ "outputs": [
428
+ {
429
+ "data": {
430
+ "text/plain": [
431
+ "0"
432
+ ]
433
+ },
434
+ "execution_count": 11,
435
+ "metadata": {},
436
+ "output_type": "execute_result"
437
+ }
438
+ ],
439
+ "source": [
440
+ "len(matches)"
441
+ ]
442
+ },
443
+ {
444
+ "cell_type": "code",
445
+ "execution_count": null,
446
+ "metadata": {},
447
+ "outputs": [],
448
+ "source": [
449
+ "# def replace_accented_characters(batch):\n",
450
+ "# accented_string = u'Málaga'\n",
451
+ "# # accented_string is of type 'unicode'\n",
452
+ "# import unidecode\n",
453
+ "# unaccented_string = unidecode.unidecode(accented_string)\n",
454
+ "# batch[\"sentence\"] = re.sub('[â]', 'a', batch[\"sentence\"])\n",
455
+ "# batch[\"sentence\"] = re.sub('[î]', 'i', batch[\"sentence\"])\n",
456
+ "# batch[\"sentence\"] = re.sub('[ô]', 'o', batch[\"sentence\"])\n",
457
+ "# batch[\"sentence\"] = re.sub('[û]', 'u', batch[\"sentence\"])\n",
458
+ "# return batch\n",
459
+ "\n",
460
+ "def strip_accents(batch):\n",
461
+ " return ''.join(c for c in unicodedata.normalize('NFD', batch[\"sentence\"]) if unicodedata.category(c) != 'Mn')\n",
462
+ "\n",
463
+ "common_voice_train = common_voice_train.map(replace_accented_characters)\n",
464
+ "common_voice_test = common_voice_test.map(replace_accented_characters)"
465
+ ]
466
+ },
467
+ {
468
+ "cell_type": "code",
469
+ "execution_count": null,
470
+ "metadata": {},
471
+ "outputs": [],
472
+ "source": []
473
+ },
474
+ {
475
+ "cell_type": "code",
476
+ "execution_count": 6,
477
+ "metadata": {},
478
+ "outputs": [],
479
+ "source": [
480
+ "def extract_all_chars(batch):\n",
481
+ " all_text = \" \".join(batch[\"sentence\"])\n",
482
+ " vocab = list(set(all_text))\n",
483
+ " return {\"vocab\": [vocab], \"all_text\": [all_text]}"
484
+ ]
485
+ },
486
+ {
487
+ "cell_type": "code",
488
+ "execution_count": null,
489
+ "metadata": {},
490
+ "outputs": [],
491
+ "source": []
492
+ },
493
+ {
494
+ "cell_type": "code",
495
+ "execution_count": 16,
496
+ "metadata": {},
497
+ "outputs": [
498
+ {
499
+ "data": {
500
+ "application/vnd.jupyter.widget-view+json": {
501
+ "model_id": "c40f4d6b6bb74a56b2c570a3a53d7f4b",
502
+ "version_major": 2,
503
+ "version_minor": 0
504
+ },
505
+ "text/plain": [
506
+ " 0%| | 0/1 [00:00<?, ?ba/s]"
507
+ ]
508
+ },
509
+ "metadata": {},
510
+ "output_type": "display_data"
511
+ },
512
+ {
513
+ "data": {
514
+ "application/vnd.jupyter.widget-view+json": {
515
+ "model_id": "f69b6a3c0b54477ea15c56b02464bacd",
516
+ "version_major": 2,
517
+ "version_minor": 0
518
+ },
519
+ "text/plain": [
520
+ " 0%| | 0/1 [00:00<?, ?ba/s]"
521
+ ]
522
+ },
523
+ "metadata": {},
524
+ "output_type": "display_data"
525
+ }
526
+ ],
527
+ "source": [
528
+ "vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)\n",
529
+ "vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 17,
535
+ "metadata": {},
536
+ "outputs": [],
537
+ "source": [
538
+ "vocab_list = list(set(vocab_train[\"vocab\"][0]) | set(vocab_test[\"vocab\"][0]))"
539
+ ]
540
+ },
541
+ {
542
+ "cell_type": "code",
543
+ "execution_count": 19,
544
+ "metadata": {
545
+ "collapsed": true,
546
+ "jupyter": {
547
+ "outputs_hidden": true
548
+ }
549
+ },
550
+ "outputs": [
551
+ {
552
+ "data": {
553
+ "text/plain": [
554
+ "[['ん',\n",
555
+ " 'ン',\n",
556
+ " 'ダ',\n",
557
+ " 'S',\n",
558
+ " 'う',\n",
559
+ " 'た',\n",
560
+ " 'ぽ',\n",
561
+ " 'P',\n",
562
+ " ':',\n",
563
+ " '々',\n",
564
+ " 'か',\n",
565
+ " 'ぞ',\n",
566
+ " 'よ',\n",
567
+ " 'や',\n",
568
+ " 'ヨ',\n",
569
+ " 'ゃ',\n",
570
+ " 'Q',\n",
571
+ " 'N',\n",
572
+ " 'だ',\n",
573
+ " 'を',\n",
574
+ " 'L',\n",
575
+ " 'h',\n",
576
+ " 'F',\n",
577
+ " 'E',\n",
578
+ " 'ピ',\n",
579
+ " 'ち',\n",
580
+ " 'ボ',\n",
581
+ " 'w',\n",
582
+ " 'リ',\n",
583
+ " 'ゲ',\n",
584
+ " 'フ',\n",
585
+ " 'あ',\n",
586
+ " 'ウ',\n",
587
+ " 'め',\n",
588
+ " 'タ',\n",
589
+ " 'ぬ',\n",
590
+ " 'せ',\n",
591
+ " 'て',\n",
592
+ " 'b',\n",
593
+ " '」',\n",
594
+ " 'す',\n",
595
+ " 'び',\n",
596
+ " 'ば',\n",
597
+ " 'ア',\n",
598
+ " 'A',\n",
599
+ " 'r',\n",
600
+ " 'ャ',\n",
601
+ " 'イ',\n",
602
+ " 'へ',\n",
603
+ " 'ぶ',\n",
604
+ " 'は',\n",
605
+ " 'u',\n",
606
+ " 'と',\n",
607
+ " '繫',\n",
608
+ " 'ぎ',\n",
609
+ " 'バ',\n",
610
+ " 'ノ',\n",
611
+ " 'I',\n",
612
+ " 'ざ',\n",
613
+ " 'R',\n",
614
+ " 'チ',\n",
615
+ " 'A',\n",
616
+ " '「',\n",
617
+ " 'G',\n",
618
+ " 'ェ',\n",
619
+ " 'く',\n",
620
+ " 'け',\n",
621
+ " 'ぇ',\n",
622
+ " '?',\n",
623
+ " '〜',\n",
624
+ " 'つ',\n",
625
+ " 'わ',\n",
626
+ " 'こ',\n",
627
+ " 'ス',\n",
628
+ " 'ズ',\n",
629
+ " 'p',\n",
630
+ " 'y',\n",
631
+ " 'ぼ',\n",
632
+ " 'し',\n",
633
+ " '、',\n",
634
+ " '!',\n",
635
+ " 'ゼ',\n",
636
+ " 's',\n",
637
+ " 'U',\n",
638
+ " 'き',\n",
639
+ " 'ゥ',\n",
640
+ " '・',\n",
641
+ " 'が',\n",
642
+ " 'も',\n",
643
+ " 'エ',\n",
644
+ " 'ク',\n",
645
+ " 'づ',\n",
646
+ " 'O',\n",
647
+ " 'グ',\n",
648
+ " 'ブ',\n",
649
+ " 'ゅ',\n",
650
+ " 'ィ',\n",
651
+ " 'ぁ',\n",
652
+ " 'd',\n",
653
+ " 't',\n",
654
+ " 'j',\n",
655
+ " 'n',\n",
656
+ " 'ロ',\n",
657
+ " 'g',\n",
658
+ " 'ー',\n",
659
+ " '/',\n",
660
+ " 'ナ',\n",
661
+ " 'ヅ',\n",
662
+ " 'の',\n",
663
+ " 'ケ',\n",
664
+ " 'ほ',\n",
665
+ " '・',\n",
666
+ " ')',\n",
667
+ " 'J',\n",
668
+ " 'D',\n",
669
+ " 'ネ',\n",
670
+ " 'お',\n",
671
+ " 'パ',\n",
672
+ " 'ム',\n",
673
+ " 'む',\n",
674
+ " 'ラ',\n",
675
+ " 'ミ',\n",
676
+ " 'い',\n",
677
+ " 'ろ',\n",
678
+ " 'c',\n",
679
+ " '=',\n",
680
+ " 'z',\n",
681
+ " 'ベ',\n",
682
+ " 'O',\n",
683
+ " 'h',\n",
684
+ " 'プ',\n",
685
+ " 'o',\n",
686
+ " 'ザ',\n",
687
+ " '&',\n",
688
+ " '『',\n",
689
+ " 'ソ',\n",
690
+ " '.',\n",
691
+ " 'ヴ',\n",
692
+ " 'l',\n",
693
+ " 'ド',\n",
694
+ " 'み',\n",
695
+ " 'v',\n",
696
+ " 'x',\n",
697
+ " 'Y',\n",
698
+ " 'ガ',\n",
699
+ " 'に',\n",
700
+ " 'ヌ',\n",
701
+ " 'ら',\n",
702
+ " 'ヘ',\n",
703
+ " 'ょ',\n",
704
+ " 'カ',\n",
705
+ " '。',\n",
706
+ " 'ギ',\n",
707
+ " 'C',\n",
708
+ " 'ぜ',\n",
709
+ " 'モ',\n",
710
+ " 'キ',\n",
711
+ " 'i',\n",
712
+ " 'j',\n",
713
+ " '.',\n",
714
+ " \"'\",\n",
715
+ " 'M',\n",
716
+ " 'ご',\n",
717
+ " 'ど',\n",
718
+ " 'ハ',\n",
719
+ " 'ね',\n",
720
+ " 'で',\n",
721
+ " 'W',\n",
722
+ " 'ぴ',\n",
723
+ " 'T',\n",
724
+ " 'ぷ',\n",
725
+ " ' ',\n",
726
+ " 'マ',\n",
727
+ " '―',\n",
728
+ " 'ビ',\n",
729
+ " 'H',\n",
730
+ " 'デ',\n",
731
+ " 'f',\n",
732
+ " 'ゾ',\n",
733
+ " '-',\n",
734
+ " 'ポ',\n",
735
+ " 'K',\n",
736
+ " 'ヤ',\n",
737
+ " 'ユ',\n",
738
+ " 'シ',\n",
739
+ " 'ペ',\n",
740
+ " 'Z',\n",
741
+ " 'ぱ',\n",
742
+ " 'ふ',\n",
743
+ " 'る',\n",
744
+ " 'べ',\n",
745
+ " 'ヒ',\n",
746
+ " 'e',\n",
747
+ " 'そ',\n",
748
+ " 'テ',\n",
749
+ " 'サ',\n",
750
+ " 'V',\n",
751
+ " 'れ',\n",
752
+ " '」',\n",
753
+ " 'じ',\n",
754
+ " 'ワ',\n",
755
+ " 'レ',\n",
756
+ " 'X',\n",
757
+ " 'ォ',\n",
758
+ " 'ュ',\n",
759
+ " 'ジ',\n",
760
+ " 'k',\n",
761
+ " 'な',\n",
762
+ " 'ニ',\n",
763
+ " 'り',\n",
764
+ " 'q',\n",
765
+ " 'U',\n",
766
+ " 'ひ',\n",
767
+ " 'げ',\n",
768
+ " '&',\n",
769
+ " 'ゆ',\n",
770
+ " 'っ',\n",
771
+ " 'ず',\n",
772
+ " 'ゴ',\n",
773
+ " '「',\n",
774
+ " 'a',\n",
775
+ " 'ぢ',\n",
776
+ " 'ル',\n",
777
+ " 'さ',\n",
778
+ " 'ぺ',\n",
779
+ " 'm',\n",
780
+ " 'ョ',\n",
781
+ " 'ト',\n",
782
+ " 'ツ',\n",
783
+ " 'ホ',\n",
784
+ " 'コ',\n",
785
+ " 'オ',\n",
786
+ " 'セ',\n",
787
+ " 'え',\n",
788
+ " 'ま',\n",
789
+ " 'メ',\n",
790
+ " 'ァ',\n",
791
+ " 'F',\n",
792
+ " 'ぐ',\n",
793
+ " 'B',\n",
794
+ " '』',\n",
795
+ " 'ッ']]"
796
+ ]
797
+ },
798
+ "execution_count": 19,
799
+ "metadata": {},
800
+ "output_type": "execute_result"
801
+ }
802
+ ],
803
+ "source": [
804
+ "# vocab_train[\"vocab\"]"
805
+ ]
806
+ },
807
+ {
808
+ "cell_type": "code",
809
+ "execution_count": 18,
810
+ "metadata": {},
811
+ "outputs": [
812
+ {
813
+ "name": "stdout",
814
+ "output_type": "stream",
815
+ "text": [
816
+ "249\n",
817
+ "['ダ', 'た', 'P', 'か', 'よ', 'や', 'Q', 'を', 'F', 'h', 'E', 'ち', 'リ', 'ゲ', 'フ', 'め', 'タ', 'せ', 'b', '」', 'ば', 'ア', 'A', 'ャ', 'イ', 'ぶ', 'は', 'u', 'と', 'ノ', 'I', 'R', '「', 'G', 'ェ', 'く', '?', '〜', 'つ', 'こ', 'S', 'ぼ', 'ゼ', 's', 'U', 'き', 'ゥ', 'が', 'も', 'エ', 'ク', 'づ', 'グ', 'ブ', 'ゅ', 'ィ', 't', 'n', 'ロ', 'ー', '/', 'の', 'ケ', '・', 'J', 'お', 'む', 'P', 'ベ', 'h', 'プ', 'o', '&', '『', 'ソ', '.', 'ヴ', 'ド', 'み', 'Y', 'ガ', 'ょ', 'カ', 'C', 'ぜ', 'j', '.', 'ご', 'ど', 'ハ', 'ね', 'W', 'j', 'T', ' ', 'マ', '―', '-', 'デ', 'ゾ', 'ポ', 'K', 'ペ', 'ぱ', 'ふ', 'べ', 'ヒ', 'e', 'サ', 'N', 'X', 'ュ', 'k', 'り', 'U', 'ひ', 'げ', 'ゆ', 'ず', 'ゴ', 'a', 'ョ', 'ツ', '〇', 'え', 'F', 'B', '』', 'ッ', 'ん', 'ン', 'S', 'う', 'ぽ', ':', '々', 'ぞ', 'N', 'ヨ', 'ゃ', 'だ', 'L', 'ピ', 'ボ', 'w', 'ウ', 'あ', 'ヶ', 'ぬ', 'て', 'す', 'び', 'r', 'へ', '繫', 'バ', 'ぎ', 'ざ', 'A', 'チ', 'け', 'ぇ', 'わ', 'ス', 'p', 'ズ', 'y', 'し', '、', '!', 'G', '・', 'O', 'ぁ', 'd', 'g', 'ナ', 'ヅ', 'ほ', ')', 'D', 'ネ', 'パ', 'ム', 'ミ', '=', 'z', 'い', 'ろ', 'c', 'O', 'ザ', 'l', 'v', 'x', 'ヌ', 'に', 'ら', 'ヘ', '。', 'ギ', 'モ', 'D', 'キ', 'i', \"'\", 'M', 'で', 'ぴ', 'ぷ', 'ビ', 'H', 'f', 'ヤ', 'ユ', 'シ', 'Z', 'る', 'そ', 'テ', 'V', 'れ', '」', 'じ', 'ワ', 'レ', 'ォ', 'ジ', 'な', 'ニ', 'q', '&', 'っ', '「', 'ぢ', 'ル', 'さ', 'ぺ', 'm', 'ト', 'ホ', 'コ', 'オ', 'セ', 'ま', 'メ', 'ァ', 'ぐ', 'ラ']\n"
818
+ ]
819
+ }
820
+ ],
821
+ "source": [
822
+ "print(len(vocab_list))\n",
823
+ "print(vocab_list)"
824
+ ]
825
+ },
826
+ {
827
+ "cell_type": "code",
828
+ "execution_count": 26,
829
+ "metadata": {},
830
+ "outputs": [],
831
+ "source": [
832
+ "j_vocab = {\"<pad>\": 0, \"<s>\": 1, \"</s>\": 2, \"<unk>\": 3, \"|\": 4, \"'\": 5, \"-\": 6, \"A\": 7, \"B\": 8, \"C\": 9, \"D\": 10, \"E\": 11, \"F\": 12, \"G\": 13, \"H\": 14, \"I\": 15, \"J\": 16, \"K\": 17, \"L\": 18, \"M\": 19, \"N\": 20, \"O\": 21, \"P\": 22, \"Q\": 23, \"R\": 24, \"S\": 25, \"T\": 26, \"U\": 27, \"V\": 28, \"W\": 29, \"X\": 30, \"Y\": 31, \"Z\": 32, \"Ä\": 33, \"Í\": 34, \"Ó\": 35, \"Ö\": 36, \"Ü\": 37}\n"
833
+ ]
834
+ },
835
+ {
836
+ "cell_type": "code",
837
+ "execution_count": 48,
838
+ "metadata": {},
839
+ "outputs": [],
840
+ "source": [
841
+ "manually_kept_values = ['ß', 'ä', 'ö', 'ü']\n",
842
+ "\n",
843
+ "punctuation = ['.', ]"
844
+ ]
845
+ },
846
+ {
847
+ "cell_type": "code",
848
+ "execution_count": 50,
849
+ "metadata": {},
850
+ "outputs": [
851
+ {
852
+ "name": "stdout",
853
+ "output_type": "stream",
854
+ "text": [
855
+ "['$', '&', '(', ')', '*', '+', '.', '/', '=', '@', '[', ']', '_', '`', '¡', '§', '«', '°', '´', 'µ', '·', '»', '×', 'à', 'á', 'â', 'ã', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ø', 'ù', 'ú', 'û', 'ý', 'þ', 'ā', 'ă', 'ą', 'ć', 'č', 'ď', 'đ', 'ē', 'ė', 'ę', 'ě', 'ğ', 'ġ', 'ħ', 'ī', 'ı', 'ł', 'ń', 'ņ', 'ň', 'ō', 'ŏ', 'ő', 'œ', 'ř', 'ś', 'ş', 'š', 'ť', 'ū', 'ů', 'ź', 'ż', 'ž', 'ơ', 'ǐ', 'ǔ', 'ș', 'ț', 'ə', 'ʻ', 'ʾ', 'ʿ', '̆', '̇', '̥', 'а', 'в', 'е', 'и', 'к', 'м', 'о', 'р', 'с', 'ф', 'ч', 'ш', 'ѹ', 'א', 'ב', 'נ', 'ע', 'ש', '་', 'ན', 'ḫ', 'ṟ', 'ṣ', 'ṭ', 'ạ', 'ả', 'ắ', 'ằ', 'ế', 'ễ', 'ệ', 'ọ', 'ồ', 'ộ', 'ụ', 'ứ', '‑', '‚', '„', '‟', '′', '″', '‹', '›', '→', '−', '≡', '⟨', '⟩', 'カ', '东', '临', '乡', '关', '合', '城', '孙', '尣', '幺', '支', '比', '毛', '泽', '無', '生', '臣', '辶', '道', '镇', '黃']\n"
856
+ ]
857
+ }
858
+ ],
859
+ "source": [
860
+ "odd_values = []\n",
861
+ "for index, value in enumerate(sorted(vocab_list)):\n",
862
+ "# if :\n",
863
+ " if value not in j_vocab and not (16 <= index <= 41 or value == ' ') and value not in manually_kept_values:\n",
864
+ " odd_values.append(value)\n",
865
+ "# print(index, value)\n",
866
+ " \n",
867
+ "print(odd_values)"
868
+ ]
869
+ },
870
+ {
871
+ "cell_type": "code",
872
+ "execution_count": 63,
873
+ "metadata": {},
874
+ "outputs": [
875
+ {
876
+ "name": "stdout",
877
+ "output_type": "stream",
878
+ "text": [
879
+ "$ & ( ) * + . / = @ [ ] _ ` ¡ § « ° ´ µ · » × à á â ã å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ø ù ú û ý þ ā ă ą ć č ď đ ē ė ę ě ğ ġ ħ ī ı ł ń ņ ň ō ŏ ő œ ř ś ş š ť ū ů ź ż ž ơ ǐ ǔ ș ț ə ʻ ʾ ʿ ̆ ̇ ̥ а в е и к м о р с ф ч ш ѹ א ב נ ע ש ་ ན ḫ ṟ ṣ ṭ ạ ả ắ ằ ế ễ ệ ọ ồ ộ ụ ứ ‑ ‚ „ ‟ ′ ″ ‹ › → − ≡ ⟨ ⟩ カ 东 临 乡 关 合 城 孙 尣 幺 支 比 毛 泽 無 生 臣 辶 道 镇 黃\n"
880
+ ]
881
+ }
882
+ ],
883
+ "source": [
884
+ "print(\" \".join(odd_values))\n",
885
+ "\n",
886
+ "# for value in odd_values:\n",
887
+ "# if value not in manually_kept_values:\n",
888
+ "# print(value)"
889
+ ]
890
+ },
891
+ {
892
+ "cell_type": "code",
893
+ "execution_count": null,
894
+ "metadata": {},
895
+ "outputs": [],
896
+ "source": [
897
+ "$ & ( ) * + = @ [ ] _ ` ¡ § « ° ´ µ · » × à á â ã å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ø ù ú û ý þ ā ă ą ć č ď đ ē ė ę ě ğ ġ ħ ī ı ł ń ņ ň ō ŏ ő œ ř ś ş š ť ū ů ź ż ž ơ ǐ ǔ ș ț ə ʻ ʾ ʿ ̆ ̇ ̥ а в е и к м о р с ф ч ш ѹ א ב נ ע ש ་ ན ḫ ṟ ṣ ṭ ạ ả ắ ằ ế ễ ệ ọ ồ ộ ụ ứ ‑ ‚ „ ‟ ′ ″ ‹ › → − ≡ ⟨ ⟩ カ 东 临 乡 关 合 城 孙 尣 幺 支 比 毛 泽 無 生 臣 辶 道 镇 黃"
898
+ ]
899
+ },
900
+ {
901
+ "cell_type": "code",
902
+ "execution_count": 54,
903
+ "metadata": {},
904
+ "outputs": [],
905
+ "source": [
906
+ "filtered_vocab_list = [value for value in vocab_list if value not in odd_values]"
907
+ ]
908
+ },
909
+ {
910
+ "cell_type": "code",
911
+ "execution_count": 55,
912
+ "metadata": {},
913
+ "outputs": [
914
+ {
915
+ "data": {
916
+ "text/plain": [
917
+ "['ß',\n",
918
+ " 'j',\n",
919
+ " 'r',\n",
920
+ " 'h',\n",
921
+ " 'd',\n",
922
+ " 'l',\n",
923
+ " 'z',\n",
924
+ " 'n',\n",
925
+ " 'm',\n",
926
+ " 'c',\n",
927
+ " 'ä',\n",
928
+ " \"'\",\n",
929
+ " 'g',\n",
930
+ " 'e',\n",
931
+ " 'w',\n",
932
+ " 's',\n",
933
+ " 'u',\n",
934
+ " 'k',\n",
935
+ " 'o',\n",
936
+ " 'f',\n",
937
+ " ' ',\n",
938
+ " 'y',\n",
939
+ " 'v',\n",
940
+ " 'ö',\n",
941
+ " 'ü',\n",
942
+ " 'p',\n",
943
+ " 'a',\n",
944
+ " 'x',\n",
945
+ " 'b',\n",
946
+ " 'q',\n",
947
+ " 't',\n",
948
+ " 'i']"
949
+ ]
950
+ },
951
+ "execution_count": 55,
952
+ "metadata": {},
953
+ "output_type": "execute_result"
954
+ }
955
+ ],
956
+ "source": [
957
+ "filtered_vocab_list"
958
+ ]
959
+ },
960
+ {
961
+ "cell_type": "code",
962
+ "execution_count": 21,
963
+ "metadata": {},
964
+ "outputs": [
965
+ {
966
+ "ename": "NameError",
967
+ "evalue": "name 'word_delimiter_token' is not defined",
968
+ "output_type": "error",
969
+ "traceback": [
970
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
971
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
972
+ "Input \u001b[0;32mIn [21]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m vocab_dict \u001b[38;5;241m=\u001b[39m {v: k \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28msorted\u001b[39m(vocab_list))}\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# replace white space with delimiter token\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mword_delimiter_token\u001b[49m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 5\u001b[0m vocab_dict[word_delimiter_token] \u001b[38;5;241m=\u001b[39m vocab_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m vocab_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
973
+ "\u001b[0;31mNameError\u001b[0m: name 'word_delimiter_token' is not defined"
974
+ ]
975
+ }
976
+ ],
977
+ "source": [
978
+ "vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n",
979
+ "\n",
980
+ "# replace white space with delimiter token\n",
981
+ "if word_delimiter_token is not None:\n",
982
+ " vocab_dict[word_delimiter_token] = vocab_dict[\" \"]\n",
983
+ " del vocab_dict[\" \"]\n",
984
+ "\n",
985
+ "# add unk and pad token\n",
986
+ "if unk_token is not None:\n",
987
+ " vocab_dict[unk_token] = len(vocab_dict)\n",
988
+ "\n",
989
+ "if pad_token is not None:\n",
990
+ " vocab_dict[pad_token] = len(vocab_dict)"
991
+ ]
992
+ },
993
+ {
994
+ "cell_type": "code",
995
+ "execution_count": 58,
996
+ "metadata": {},
997
+ "outputs": [
998
+ {
999
+ "data": {
1000
+ "application/vnd.jupyter.widget-view+json": {
1001
+ "model_id": "59e89471ea85449ebbc709d0a9d7325c",
1002
+ "version_major": 2,
1003
+ "version_minor": 0
1004
+ },
1005
+ "text/plain": [
1006
+ " 0%| | 0/437 [00:00<?, ?ba/s]"
1007
+ ]
1008
+ },
1009
+ "metadata": {},
1010
+ "output_type": "display_data"
1011
+ },
1012
+ {
1013
+ "name": "stdout",
1014
+ "output_type": "stream",
1015
+ "text": [
1016
+ "OOV found in 421223 samples, and they were removed from training set\n",
1017
+ "The final training set size is 14947\n"
1018
+ ]
1019
+ }
1020
+ ],
1021
+ "source": [
1022
+ "vocab_set = set(filtered_vocab_list)\n",
1023
+ "train_dataset_size = len(common_voice_train)\n",
1024
+ "common_voice_train_2 = common_voice_train.filter(\n",
1025
+ " lambda example: vocab_set.issuperset(example[\"sentence\"].replace(\" \", \"\"))\n",
1026
+ ")\n",
1027
+ "print(f\"OOV found in {train_dataset_size - len(common_voice_train_2)} samples, and they were removed from training set\")\n",
1028
+ "print(f\"The final training set size is {len(common_voice_train_2)}\")"
1029
+ ]
1030
+ },
1031
+ {
1032
+ "cell_type": "code",
1033
+ "execution_count": 38,
1034
+ "metadata": {
1035
+ "collapsed": true,
1036
+ "jupyter": {
1037
+ "outputs_hidden": true
1038
+ }
1039
+ },
1040
+ "outputs": [
1041
+ {
1042
+ "ename": "KeyboardInterrupt",
1043
+ "evalue": "",
1044
+ "output_type": "error",
1045
+ "traceback": [
1046
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1047
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
1048
+ "Input \u001b[0;32mIn [38]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m odd_example_texts \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m common_voice_train:\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m odd_values:\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msentence\u001b[39m\u001b[38;5;124m\"\u001b[39m]: \n",
1049
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1664\u001b[0m, in \u001b[0;36mDataset._iter\u001b[0;34m(self, decoded)\u001b[0m\n\u001b[1;32m 1658\u001b[0m \u001b[38;5;124;03m\"\"\"Iterate through the examples.\u001b[39;00m\n\u001b[1;32m 1659\u001b[0m \n\u001b[1;32m 1660\u001b[0m \u001b[38;5;124;03mIf a formatting is set with :meth:`Dataset.set_format` rows will be returned with the\u001b[39;00m\n\u001b[1;32m 1661\u001b[0m \u001b[38;5;124;03mselected format.\u001b[39;00m\n\u001b[1;32m 1662\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_rows):\n\u001b[0;32m-> 1664\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecoded\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecoded\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
1050
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1915\u001b[0m, in \u001b[0;36mDataset._getitem\u001b[0;34m(self, key, decoded, **kwargs)\u001b[0m\n\u001b[1;32m 1913\u001b[0m formatter \u001b[38;5;241m=\u001b[39m get_formatter(format_type, features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures, decoded\u001b[38;5;241m=\u001b[39mdecoded, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mformat_kwargs)\n\u001b[1;32m 1914\u001b[0m pa_subtable \u001b[38;5;241m=\u001b[39m query_table(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data, key, indices\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1915\u001b[0m formatted_output \u001b[38;5;241m=\u001b[39m \u001b[43mformat_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1916\u001b[0m \u001b[43m \u001b[49m\u001b[43mpa_subtable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformatter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformatter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformat_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformat_columns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_all_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_all_columns\u001b[49m\n\u001b[1;32m 1917\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m formatted_output\n",
1051
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:533\u001b[0m, in \u001b[0;36mformat_table\u001b[0;34m(table, key, formatter, format_columns, output_all_columns)\u001b[0m\n\u001b[1;32m 531\u001b[0m python_formatter \u001b[38;5;241m=\u001b[39m PythonFormatter(features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 532\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m format_columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 533\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mformatter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquery_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m format_columns:\n",
1052
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:282\u001b[0m, in \u001b[0;36mFormatter.__call__\u001b[0;34m(self, pa_table, query_type)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, pa_table: pa\u001b[38;5;241m.\u001b[39mTable, query_type: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[RowFormat, ColumnFormat, BatchFormat]:\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrow\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 282\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mformat_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mformat_column(pa_table)\n",
1053
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:313\u001b[0m, in \u001b[0;36mPythonFormatter.format_row\u001b[0;34m(self, pa_table)\u001b[0m\n\u001b[1;32m 311\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpython_arrow_extractor()\u001b[38;5;241m.\u001b[39mextract_row(pa_table)\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoded:\n\u001b[0;32m--> 313\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpython_features_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 314\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m row\n",
1054
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:222\u001b[0m, in \u001b[0;36mPythonFeaturesDecoder.decode_row\u001b[0;34m(self, row)\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_row\u001b[39m(\u001b[38;5;28mself\u001b[39m, row: \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mdict\u001b[39m:\n\u001b[0;32m--> 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures \u001b[38;5;28;01melse\u001b[39;00m row\n",
1055
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1318\u001b[0m, in \u001b[0;36mFeatures.decode_example\u001b[0;34m(self, example)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 1319\u001b[0m column_name: decode_nested_example(feature, value)\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n",
1056
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1319\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[0;32m-> 1319\u001b[0m column_name: \u001b[43mdecode_nested_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n",
1057
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1056\u001b[0m, in \u001b[0;36mdecode_nested_example\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;66;03m# Object with special decoding:\u001b[39;00m\n\u001b[1;32m 1055\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, (Audio, Image)):\n\u001b[0;32m-> 1056\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mschema\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1057\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n",
1058
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:97\u001b[0m, in \u001b[0;36mAudio.decode_example\u001b[0;34m(self, value)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn audio sample should have one of \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpath\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbytes\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m but both are None in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m path\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmp3\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m---> 97\u001b[0m array, sampling_rate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decode_mp3\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file:\n",
1059
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:183\u001b[0m, in \u001b[0;36mAudio._decode_mp3\u001b[0;34m(self, path_or_file)\u001b[0m\n\u001b[1;32m 181\u001b[0m array \u001b[38;5;241m=\u001b[39m array\u001b[38;5;241m.\u001b[39mnumpy()\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmono:\n\u001b[0;32m--> 183\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43marray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmean\u001b[49m\u001b[43m(\u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array, sampling_rate\n",
1060
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/numpy/core/_methods.py:154\u001b[0m, in \u001b[0;36m_mean\u001b[0;34m(a, axis, dtype, out, keepdims)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# Cast bool, unsigned int, and int to float64 by default\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, (\u001b[43mnt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minteger\u001b[49m, nt\u001b[38;5;241m.\u001b[39mbool_)):\n\u001b[1;32m 155\u001b[0m dtype \u001b[38;5;241m=\u001b[39m mu\u001b[38;5;241m.\u001b[39mdtype(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mf8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, nt\u001b[38;5;241m.\u001b[39mfloat16):\n",
1061
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
1062
+ ]
1063
+ }
1064
+ ],
1065
+ "source": []
1066
+ },
1099
+ {
1100
+ "cell_type": "code",
1101
+ "execution_count": 22,
1102
+ "metadata": {},
1103
+ "outputs": [
1104
+ {
1105
+ "name": "stdout",
1106
+ "output_type": "stream",
1107
+ "text": [
1108
+ "0 \n",
1109
+ "1 &\n",
1110
+ "2 '\n",
1111
+ "3 .\n",
1112
+ "4 /\n",
1113
+ "5 A\n",
1114
+ "6 B\n",
1115
+ "7 C\n",
1116
+ "8 D\n",
1117
+ "9 E\n",
1118
+ "10 F\n",
1119
+ "11 G\n",
1120
+ "12 H\n",
1121
+ "13 I\n",
1122
+ "14 J\n",
1123
+ "15 K\n",
1124
+ "16 L\n",
1125
+ "17 M\n",
1126
+ "18 N\n",
1127
+ "19 O\n",
1128
+ "20 P\n",
1129
+ "21 Q\n",
1130
+ "22 R\n",
1131
+ "23 S\n",
1132
+ "24 T\n",
1133
+ "25 U\n",
1134
+ "26 V\n",
1135
+ "27 W\n",
1136
+ "28 X\n",
1137
+ "29 Y\n",
1138
+ "30 Z\n",
1139
+ "31 a\n",
1140
+ "32 b\n",
1141
+ "33 c\n",
1142
+ "34 d\n",
1143
+ "35 e\n",
1144
+ "36 f\n",
1145
+ "37 g\n",
1146
+ "38 h\n",
1147
+ "39 i\n",
1148
+ "40 j\n",
1149
+ "41 k\n",
1150
+ "42 l\n",
1151
+ "43 m\n",
1152
+ "44 n\n",
1153
+ "45 o\n",
1154
+ "46 p\n",
1155
+ "47 q\n",
1156
+ "48 r\n",
1157
+ "49 s\n",
1158
+ "50 t\n",
1159
+ "51 u\n",
1160
+ "52 v\n",
1161
+ "53 w\n",
1162
+ "54 x\n",
1163
+ "55 y\n",
1164
+ "56 z\n",
1165
+ "57 ―\n",
1166
+ "58 、\n",
1167
+ "59 。\n",
1168
+ "60 々\n",
1169
+ "61 〇\n",
1170
+ "62 「\n",
1171
+ "63 」\n",
1172
+ "64 『\n",
1173
+ "65 』\n",
1174
+ "66 〜\n",
1175
+ "67 ぁ\n",
1176
+ "68 あ\n",
1177
+ "69 い\n",
1178
+ "70 う\n",
1179
+ "71 ぇ\n",
1180
+ "72 え\n",
1181
+ "73 お\n",
1182
+ "74 か\n",
1183
+ "75 が\n",
1184
+ "76 き\n",
1185
+ "77 ぎ\n",
1186
+ "78 く\n",
1187
+ "79 ぐ\n",
1188
+ "80 け\n",
1189
+ "81 げ\n",
1190
+ "82 こ\n",
1191
+ "83 ご\n",
1192
+ "84 さ\n",
1193
+ "85 ざ\n",
1194
+ "86 し\n",
1195
+ "87 じ\n",
1196
+ "88 す\n",
1197
+ "89 ず\n",
1198
+ "90 せ\n",
1199
+ "91 ぜ\n",
1200
+ "92 そ\n",
1201
+ "93 ぞ\n",
1202
+ "94 た\n",
1203
+ "95 だ\n",
1204
+ "96 ち\n",
1205
+ "97 ぢ\n",
1206
+ "98 っ\n",
1207
+ "99 つ\n",
1208
+ "100 づ\n",
1209
+ "101 て\n",
1210
+ "102 で\n",
1211
+ "103 と\n",
1212
+ "104 ど\n",
1213
+ "105 な\n",
1214
+ "106 に\n",
1215
+ "107 ぬ\n",
1216
+ "108 ね\n",
1217
+ "109 の\n",
1218
+ "110 は\n",
1219
+ "111 ば\n",
1220
+ "112 ぱ\n",
1221
+ "113 ひ\n",
1222
+ "114 び\n",
1223
+ "115 ぴ\n",
1224
+ "116 ふ\n",
1225
+ "117 ぶ\n",
1226
+ "118 ぷ\n",
1227
+ "119 へ\n",
1228
+ "120 べ\n",
1229
+ "121 ぺ\n",
1230
+ "122 ほ\n",
1231
+ "123 ぼ\n",
1232
+ "124 ぽ\n",
1233
+ "125 ま\n",
1234
+ "126 み\n",
1235
+ "127 む\n",
1236
+ "128 め\n",
1237
+ "129 も\n",
1238
+ "130 ゃ\n",
1239
+ "131 や\n",
1240
+ "132 ゅ\n",
1241
+ "133 ゆ\n",
1242
+ "134 ょ\n",
1243
+ "135 よ\n",
1244
+ "136 ら\n",
1245
+ "137 り\n",
1246
+ "138 る\n",
1247
+ "139 れ\n",
1248
+ "140 ろ\n",
1249
+ "141 わ\n",
1250
+ "142 を\n",
1251
+ "143 ん\n",
1252
+ "144 ァ\n",
1253
+ "145 ア\n",
1254
+ "146 ィ\n",
1255
+ "147 イ\n",
1256
+ "148 ゥ\n",
1257
+ "149 ウ\n",
1258
+ "150 ェ\n",
1259
+ "151 エ\n",
1260
+ "152 ォ\n",
1261
+ "153 オ\n",
1262
+ "154 カ\n",
1263
+ "155 ガ\n",
1264
+ "156 キ\n",
1265
+ "157 ギ\n",
1266
+ "158 ク\n",
1267
+ "159 グ\n",
1268
+ "160 ケ\n",
1269
+ "161 ゲ\n",
1270
+ "162 コ\n",
1271
+ "163 ゴ\n",
1272
+ "164 サ\n",
1273
+ "165 ザ\n",
1274
+ "166 シ\n",
1275
+ "167 ジ\n",
1276
+ "168 ス\n",
1277
+ "169 ズ\n",
1278
+ "170 セ\n",
1279
+ "171 ゼ\n",
1280
+ "172 ソ\n",
1281
+ "173 ゾ\n",
1282
+ "174 タ\n",
1283
+ "175 ダ\n",
1284
+ "176 チ\n",
1285
+ "177 ッ\n",
1286
+ "178 ツ\n",
1287
+ "179 ヅ\n",
1288
+ "180 テ\n",
1289
+ "181 デ\n",
1290
+ "182 ト\n",
1291
+ "183 ド\n",
1292
+ "184 ナ\n",
1293
+ "185 ニ\n",
1294
+ "186 ヌ\n",
1295
+ "187 ネ\n",
1296
+ "188 ノ\n",
1297
+ "189 ハ\n",
1298
+ "190 バ\n",
1299
+ "191 パ\n",
1300
+ "192 ヒ\n",
1301
+ "193 ビ\n",
1302
+ "194 ピ\n",
1303
+ "195 フ\n",
1304
+ "196 ブ\n",
1305
+ "197 プ\n",
1306
+ "198 ヘ\n",
1307
+ "199 ベ\n",
1308
+ "200 ペ\n",
1309
+ "201 ホ\n",
1310
+ "202 ボ\n",
1311
+ "203 ポ\n",
1312
+ "204 マ\n",
1313
+ "205 ミ\n",
1314
+ "206 ム\n",
1315
+ "207 メ\n",
1316
+ "208 モ\n",
1317
+ "209 ャ\n",
1318
+ "210 ヤ\n",
1319
+ "211 ュ\n",
1320
+ "212 ユ\n",
1321
+ "213 ョ\n",
1322
+ "214 ヨ\n",
1323
+ "215 ラ\n",
1324
+ "216 リ\n",
1325
+ "217 ル\n",
1326
+ "218 レ\n",
1327
+ "219 ロ\n",
1328
+ "220 ワ\n",
1329
+ "221 ン\n",
1330
+ "222 ヴ\n",
1331
+ "223 ヶ\n",
1332
+ "224 ・\n",
1333
+ "225 ー\n",
1334
+ "226 繫\n",
1335
+ "227 !\n",
1336
+ "228 &\n",
1337
+ "229 )\n",
1338
+ "230 -\n",
1339
+ "231 .\n",
1340
+ "232 :\n",
1341
+ "233 =\n",
1342
+ "234 ?\n",
1343
+ "235 A\n",
1344
+ "236 D\n",
1345
+ "237 F\n",
1346
+ "238 G\n",
1347
+ "239 N\n",
1348
+ "240 O\n",
1349
+ "241 P\n",
1350
+ "242 S\n",
1351
+ "243 U\n",
1352
+ "244 h\n",
1353
+ "245 j\n",
1354
+ "246 「\n",
1355
+ "247 」\n",
1356
+ "248 ・\n"
1357
+ ]
1358
+ }
1359
+ ],
1360
+ "source": [
1361
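+ "# Build the char-to-index vocabulary and print each entry for inspection.\n",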
+ "vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n",
1362
+ "for key, value in enumerate(vocab_dict):\n",
1363
+ " print(key, value)"
1364
+ ]
1365
+ },
1366
+ {
1367
+ "cell_type": "code",
1368
+ "execution_count": null,
1369
+ "metadata": {},
1370
+ "outputs": [],
1371
+ "source": [
1372
+ "def create_vocabulary_from_data(\n",
1373
+ " datasets: DatasetDict,\n",
1374
+ " word_delimiter_token: Optional[str] = None,\n",
1375
+ " unk_token: Optional[str] = None,\n",
1376
+ " pad_token: Optional[str] = None,\n",
1377
+ "):\n",
1378
+ " # Given training and test labels create vocabulary\n",
1379
+ " def extract_all_chars(batch):\n",
1380
+ " all_text = \" \".join(batch[\"target_text\"])\n",
1381
+ " vocab = list(set(all_text))\n",
1382
+ " return {\"vocab\": [vocab], \"all_text\": [all_text]}\n",
1383
+ "\n",
1384
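+ " # batch_size=-1 maps each split as a single batch, so one vocab row is returned per split\n",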
+ " vocabs = datasets.map(\n",
1385
+ " extract_all_chars,\n",
1386
+ " batched=True,\n",
1387
+ " batch_size=-1,\n",
1388
+ " keep_in_memory=True,\n",
1389
+ " remove_columns=datasets[\"train\"].column_names,\n",
1390
+ " )\n",
1391
+ "\n",
1392
+ " # take union of all unique characters in each dataset\n",
1393
+ " vocab_set = functools.reduce(\n",
1394
+ " lambda vocab_1, vocab_2: set(vocab_1[\"vocab\"][0]) | set(vocab_2[\"vocab\"][0]), vocabs.values()\n",
1395
+ " )\n",
1396
+ "\n",
1397
+ " vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}\n",
1398
+ "\n",
1399
+ " # replace white space with delimiter token\n",
1400
+ " if word_delimiter_token is not None:\n",
1401
+ " vocab_dict[word_delimiter_token] = vocab_dict[\" \"]\n",
1402
+ " del vocab_dict[\" \"]\n",
1403
+ "\n",
1404
+ " # add unk and pad token\n",
1405
+ " if unk_token is not None:\n",
1406
+ " vocab_dict[unk_token] = len(vocab_dict)\n",
1407
+ "\n",
1408
+ " if pad_token is not None:\n",
1409
+ " vocab_dict[pad_token] = len(vocab_dict)\n",
1410
+ "\n",
1411
+ " return vocab_dict"
1412
+ ]
1413
+ },
1435
+ {
1436
+ "cell_type": "code",
1437
+ "execution_count": null,
1438
+ "metadata": {},
1439
+ "outputs": [],
1440
+ "source": [
1441
+ "# load processor\n",
1442
+ "feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)\n",
1443
+ "# feature_extractor = processor_with_lm.feature_extractor\n",
1444
+ "sampling_rate = feature_extractor.sampling_rate\n",
1445
+ "\n",
1446
+ "# resample audio\n",
1447
+ "dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=sampling_rate))\n",
1448
+ "\n",
1449
+ "# load eval pipeline\n",
1450
+ "asr = pipeline(\"automatic-speech-recognition\", model=repo_name, feature_extractor=feature_extractor)\n",
1451
+ "\n",
1452
+ "# map function to decode audio\n",
1453
+ "def map_to_pred(batch):\n",
1454
+ " prediction = asr(\n",
1455
+ " batch[\"audio\"][\"array\"])\n",
1456
+ "\n",
1457
+ " batch[\"prediction\"] = prediction[\"text\"]\n",
1458
+ " batch[\"target\"] = batch[\"sentence\"]\n",
1459
+ " return batch\n",
1460
+ "\n",
1461
+ "# run inference on all examples\n",
1462
+ "result = dataset.map(map_to_pred, remove_columns=dataset.column_names)\n",
1463
+ "print(result[\"prediction\"])\n",
1464
+ "\n",
1465
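+ "# show the reference transcription of the first example\n",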
+ "result[0]['target']"
1466
+ ]
1467
+ }
1468
+ ],
1469
+ "metadata": {
1470
+ "kernelspec": {
1471
+ "display_name": "Python 3 (ipykernel)",
1472
+ "language": "python",
1473
+ "name": "python3"
1474
+ },
1475
+ "language_info": {
1476
+ "codemirror_mode": {
1477
+ "name": "ipython",
1478
+ "version": 3
1479
+ },
1480
+ "file_extension": ".py",
1481
+ "mimetype": "text/x-python",
1482
+ "name": "python",
1483
+ "nbconvert_exporter": "python",
1484
+ "pygments_lexer": "ipython3",
1485
+ "version": "3.8.8"
1486
+ }
1487
+ },
1488
+ "nbformat": 4,
1489
+ "nbformat_minor": 4
1490
+ }
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70e27818f3fc71ffdfcc80419d1967fd61208e9dc6b1b3d61fd6629f0946734b
3
+ size 2991
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"&": 1, "'": 2, ".": 3, "/": 4, "A": 5, "B": 6, "C": 7, "D": 8, "E": 9, "F": 10, "G": 11, "H": 12, "I": 13, "J": 14, "K": 15, "L": 16, "M": 17, "N": 18, "O": 19, "P": 20, "Q": 21, "R": 22, "S": 23, "T": 24, "U": 25, "V": 26, "W": 27, "X": 28, "Y": 29, "Z": 30, "a": 31, "b": 32, "c": 33, "d": 34, "e": 35, "f": 36, "g": 37, "h": 38, "i": 39, "j": 40, "k": 41, "l": 42, "m": 43, "n": 44, "o": 45, "p": 46, "q": 47, "r": 48, "s": 49, "t": 50, "u": 51, "v": 52, "w": 53, "x": 54, "y": 55, "z": 56, "―": 57, "、": 58, "。": 59, "々": 60, "〇": 61, "「": 62, "」": 63, "『": 64, "』": 65, "〜": 66, "ぁ": 67, "あ": 68, "い": 69, "う": 70, "ぇ": 71, "え": 72, "お": 73, "か": 74, "が": 75, "き": 76, "ぎ": 77, "く": 78, "ぐ": 79, "け": 80, "げ": 81, "こ": 82, "ご": 83, "さ": 84, "ざ": 85, "し": 86, "じ": 87, "す": 88, "ず": 89, "せ": 90, "ぜ": 91, "そ": 92, "ぞ": 93, "た": 94, "だ": 95, "ち": 96, "ぢ": 97, "っ": 98, "つ": 99, "づ": 100, "て": 101, "で": 102, "と": 103, "ど": 104, "な": 105, "に": 106, "ぬ": 107, "ね": 108, "の": 109, "は": 110, "ば": 111, "ぱ": 112, "ひ": 113, "び": 114, "ぴ": 115, "ふ": 116, "ぶ": 117, "ぷ": 118, "へ": 119, "べ": 120, "ぺ": 121, "ほ": 122, "ぼ": 123, "ぽ": 124, "ま": 125, "み": 126, "む": 127, "め": 128, "も": 129, "ゃ": 130, "や": 131, "ゅ": 132, "ゆ": 133, "ょ": 134, "よ": 135, "ら": 136, "り": 137, "る": 138, "れ": 139, "ろ": 140, "わ": 141, "を": 142, "ん": 143, "ァ": 144, "ア": 145, "ィ": 146, "イ": 147, "ゥ": 148, "ウ": 149, "ェ": 150, "エ": 151, "ォ": 152, "オ": 153, "カ": 154, "ガ": 155, "キ": 156, "ギ": 157, "ク": 158, "グ": 159, "ケ": 160, "ゲ": 161, "コ": 162, "ゴ": 163, "サ": 164, "ザ": 165, "シ": 166, "ジ": 167, "ス": 168, "ズ": 169, "セ": 170, "ゼ": 171, "ソ": 172, "ゾ": 173, "タ": 174, "ダ": 175, "チ": 176, "ッ": 177, "ツ": 178, "ヅ": 179, "テ": 180, "デ": 181, "ト": 182, "ド": 183, "ナ": 184, "ニ": 185, "ヌ": 186, "ネ": 187, "ノ": 188, "ハ": 189, "バ": 190, "パ": 191, "ヒ": 192, "ビ": 193, "ピ": 194, "フ": 195, "ブ": 196, "プ": 197, "ヘ": 198, "ベ": 199, "ペ": 200, "ホ": 201, "ボ": 202, "ポ": 203, "マ": 204, "ミ": 205, "ム": 206, "メ": 207, "モ": 208, "ャ": 209, "ヤ": 210, "ュ": 211, "ユ": 212, "ョ": 213, "ヨ": 214, "ラ": 215, "リ": 216, "ル": 217, "レ": 218, "ロ": 219, "ワ": 220, "ン": 221, "ヴ": 222, "ヶ": 223, "・": 224, "ー": 225, "繫": 226, "&": 227, ")": 228, "-": 229, ".": 230, ":": 231, "=": 232, "?": 233, "A": 234, "D": 235, "F": 236, "G": 237, "N": 238, "O": 239, "P": 240, "S": 241, "U": 242, "h": 243, "j": 244, "「": 245, "」": 246, "・": 247, "|": 0, "[UNK]": 248, "[PAD]": 249}