lilitket committed
Commit
5f1c16f
1 Parent(s): aa3e1cb

Source Files

Files changed (6)
  1. cleaning.py +15 -0
  2. collator.py +90 -0
  3. compute_wer.py +179 -0
  4. fine_tune.py +357 -0
  5. lm_fusion.py +56 -0
  6. utils.py +62 -0
cleaning.py ADDED
@@ -0,0 +1,15 @@
+ import sys
+ import fire
+
+ from aspram.utils import clean_characters
+
+
+ def exec(lower: bool = False, only_mesropatar: bool = False):
+     for line in sys.stdin:
+         line = line.strip()
+         line = clean_characters(dict(sentence=line), lower=lower, only_mesropatar=only_mesropatar)['sentence']
+         sys.stdout.write(line + "\n")
+
+
+ if __name__ == '__main__':
+     fire.Fire(exec)
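+
+ # Hypothetical usage as a stdin/stdout filter (flag names follow the arguments of exec above):
+ #   cat corpus.txt | python cleaning.py --lower --only_mesropatar > corpus.clean.txt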
collator.py ADDED
@@ -0,0 +1,90 @@
+ from typing import Any, Dict, List, Optional, Union
+
+ import numpy as np
+ import torch
+
+ from transformers import Wav2Vec2Processor
+
+ from audiomentations import (
+     Compose,
+     AddGaussianNoise,
+     AddGaussianSNR,
+     ClippingDistortion,
+     FrequencyMask,
+     Gain,
+     LoudnessNormalization,
+     Normalize,
+     PitchShift,
+     PolarityInversion,
+     Shift,
+     TimeMask,
+     TimeStretch,
+ )
+
+
+ class DataCollatorCTCWithPadding:
+
+     def __init__(
+         self,
+         processor: Wav2Vec2Processor,
+         padding: Union[bool, str] = True,
+         sample_rate: int = 16_000,
+         apply_gaussian_noise_with_p: float = 0,
+         apply_gain_with_p: float = 0,
+         apply_pitch_shift_with_p: float = 0,
+         apply_time_stretch_with_p: float = 0,
+     ):
+         self.processor = processor
+         self.padding = padding
+         self.apply_gaussian_noise_with_p = apply_gaussian_noise_with_p
+         self.apply_gain_with_p = apply_gain_with_p
+         self.apply_pitch_shift_with_p = apply_pitch_shift_with_p
+         self.apply_time_stretch_with_p = apply_time_stretch_with_p
+         self.sample_rate = sample_rate
+
+         self.augmentator = None
+         if self.apply_gaussian_noise_with_p + self.apply_gain_with_p + self.apply_pitch_shift_with_p + self.apply_time_stretch_with_p > 0:
+             self.augmentator = Compose([
+                 TimeStretch(min_rate=0.8, max_rate=1.2, leave_length_unchanged=False, p=self.apply_time_stretch_with_p),
+                 PitchShift(min_semitones=-1, max_semitones=1, p=self.apply_pitch_shift_with_p),
+                 Gain(min_gain_in_db=-1, max_gain_in_db=1, p=self.apply_gain_with_p),
+                 AddGaussianNoise(min_amplitude=0.0001, max_amplitude=0.001, p=self.apply_gaussian_noise_with_p),
+             ])
+
+     def _apply_augmentation(self, input_values: List[float]):
+         """Apply the configured audio augmentations to the given input_values."""
+         if self.augmentator is not None:
+             return self.augmentator(samples=np.array(input_values), sample_rate=self.sample_rate).tolist()
+         else:
+             return input_values
+
+     def __call__(
+         self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
+     ) -> Dict[str, torch.Tensor]:
+         # TODO maybe disable augmentation in inference mode?
+         input_features = [
+             {"input_values": self._apply_augmentation(feature["input_values"])} for feature in features
+         ]
+         label_features = [{"input_ids": feature["labels"]} for feature in features]
+
+         batch = self.processor.pad(
+             input_features,
+             padding=self.padding,
+             return_tensors="pt",
+         )
+         with self.processor.as_target_processor():
+             labels_batch = self.processor.pad(
+                 label_features,
+                 padding=self.padding,
+                 return_tensors="pt",
+             )
+
+         # replace padding with -100 so padded label positions are ignored by the CTC loss
+         labels = labels_batch["input_ids"].masked_fill(
+             labels_batch.attention_mask.ne(1), -100
+         )
+
+         batch["labels"] = labels
+
+         return batch
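+
+ # A minimal usage sketch (hypothetical values; `processor` is a previously loaded Wav2Vec2Processor):
+ #   collator = DataCollatorCTCWithPadding(processor=processor, apply_gain_with_p=0.2)
+ #   batch = collator([{"input_values": [0.0, 0.1, ...], "labels": [3, 7, 2]}])
+ #   # -> dict of padded "input_values" (plus "attention_mask" when the feature extractor
+ #   #    returns one) and "labels", with label padding replaced by -100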
compute_wer.py ADDED
@@ -0,0 +1,179 @@
+ import torch
+ import numpy as np
+ from tqdm import tqdm
+
+ from transformers import AutoModelForCTC, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
+
+ from datasets import load_dataset, load_metric, Audio
+
+ import fire
+
+ from aspram.utils import clean_characters, prepare_dataset
+
+ # import sentencepiece as spm
+
+ # repo_name = "20220414-210228_lm"
+ # repo_name = "./20220414-210228_lm_spm_bpe"
+ def exec(
+     *,
+     repo_name: str,
+     dataset: str = "yerevann/common_voice_9_0",
+     cuda: bool = True,
+     batch_size: int = 8,
+     beam_width: int = 1,
+     j: int = 1,
+     sample_rate: int = 16_000,
+     alpha: float = None,
+     beta: float = None,
+     unk_score_offset: float = None,
+     lm_score_boundary: bool = None,
+     beam_prune_logp: float = None,
+     token_min_logp: float = None,
+     output_file: str = None,
+ ):
+
+     # repo_name = "20220428-094209--72000_lm"
+
+     print(f'loading model {repo_name}')
+     model = Wav2Vec2ForCTC.from_pretrained(repo_name)
+     print('done')
+     if cuda:
+         print('CUDA mode')
+         model.cuda()
+
+     if repo_name.endswith('_lm'):
+         processor = Wav2Vec2ProcessorWithLM.from_pretrained(repo_name, sample_rate=sample_rate)
+         with_lm = True
+     else:
+         processor = Wav2Vec2Processor.from_pretrained(repo_name, sample_rate=sample_rate)
+         with_lm = False
+
+     common_voice_test = load_dataset(
+         dataset,
+         "hy-AM",
+         split="test",
+         use_auth_token=True,
+     )
+     common_voice_test = common_voice_test.map(clean_characters)
+     common_voice_test = common_voice_test.cast_column(
+         "audio", Audio(sampling_rate=sample_rate)
+     )
+     common_voice_test = common_voice_test.map(
+         prepare_dataset,
+         remove_columns=common_voice_test.column_names,
+         fn_kwargs=dict(processor=processor)
+     )
+
+
+     # wer_metric = load()...
+     # for batch in batched_dataset:
+     #     input_dict = processer(batch)
+     #     logits = model(input...)
+     #     wer_metric.update(true, pred)
+     #     wer_metric.compute
+
+     # def exec_cer_wer(batch_size: int = 8, **kwargs):
+     def predict(batch):
+         # print(1)
+         input_dict = processor(
+             batch["input_values"],
+             return_tensors="pt",
+             padding=True,
+             sampling_rate=sample_rate
+         )
+         # print(2)
+         with torch.no_grad():
+             x = input_dict.input_values
+             if cuda:
+                 x = x.cuda()
+             logits = model(x).logits
+         # print(3)
+         if with_lm:
+             # print(beam_size)
+             # sp = spm.SentencePieceProcessor()
+             # sp.load('head_mes_lower_bpe.model')
+
+             pred = processor.batch_decode(
+                 logits.cpu().numpy(),
+                 beam_width=beam_width,
+                 alpha=alpha,
+                 beta=beta,
+                 unk_score_offset=unk_score_offset,
+                 lm_score_boundary=lm_score_boundary,
+                 num_processes=j,
+                 beam_prune_logp=beam_prune_logp,  # -1000,
+                 token_min_logp=token_min_logp,
+                 # sp=sp,
+             ).text
+         else:
+             pred = processor.batch_decode(
+                 logits.cpu().numpy().argmax(-1),
+             )
+         # print(pred)
+         # print(pred)
+
+         return {
+             'sentence': pred
+         }
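+
+     # Note: with an LM-fused checkpoint (repo names ending in "_lm"), batch_decode above runs
+     # pyctcdecode beam search with the decoding parameters passed in; otherwise it falls back
+     # to greedy argmax CTC decoding.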
+
+     with_predictions = common_voice_test.map(predict, batched=True, batch_size=batch_size)
+
+     def detokenize(sample):
+         if '▁' in sample['sentence']:
+             print("------ ", sample)
+             sample['sentence'] = sample['sentence'].replace(' ', '').replace('▁', ' ')
+             print("------ ", sample)
+         return sample
+
+     with_predictions = with_predictions.map(detokenize)
+
+     common_voice_test_transcription = load_dataset(
+         dataset,
+         "hy-AM",
+         split="test",
+         use_auth_token=True,
+     )
+
+     with_predictions = with_predictions.map(clean_characters, fn_kwargs=dict(lower=True, only_mesropatar=True))
+     common_voice_test_transcription = common_voice_test_transcription.map(clean_characters, fn_kwargs=dict(lower=True, only_mesropatar=True))
+
+     predictions = with_predictions['sentence']
+     references = common_voice_test_transcription['sentence']
+
+     wer_metric = load_metric("wer")
+     cer_metric = load_metric("cer")
+
+     for ref, pred in zip(references, predictions):
+         print(f' REF:\t{ref}')
+         print(f'PRED:\t{pred}')
+         print('\n')
+
+     wer = wer_metric.compute(predictions=predictions, references=references)
+     cer = cer_metric.compute(predictions=predictions, references=references)
+     print("wer: ", wer)
+     print("cer: ", cer)
+
+     df = common_voice_test_transcription.to_pandas()['sentence']
+     df = df.to_frame()
+     df["predictions"] = with_predictions.to_pandas()['sentence']
+
+     # df.insert(2, "predictions", with_predictions['sentence'], True)
+
+     if output_file is not None:
+         df.to_csv(output_file)
+
+     # exec_cer_wer(beam_width=beam_width, batch_size=batch_size)
+
+     # for pruning_score in {-10, -100, -2000}:
+     #     for alpha in {1, 0.5, 1.5}:
+     #         for beta in {1, 0.5, 1.5}:
+     #             for beam_size in {0, 2, 4, 6}:
+     #                 print("Configuration:")
+     #                 print("alpha {alpha} beta {beta}, beam_width {beam_size}, pruning_score {pruning_score}".format(alpha=alpha, beta=beta, beam_size=beam_size, pruning_score=pruning_score))
+     #                 exec_cer_wer(alpha, beta, 2**beam_size, pruning_score, batch_size=batch_size)
+     #                 print('\n\n')
+
+ if __name__ == "__main__":
+     fire.Fire(exec)
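+
+ # Hypothetical invocation (flag names follow the keyword-only arguments of exec above;
+ # the checkpoint name is only an example taken from the comments at the top of this file):
+ #   python compute_wer.py --repo_name 20220414-210228_lm --batch_size 8 --beam_width 64 \
+ #       --alpha 0.5 --beta 1.5 --output_file predictions.csv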
fine_tune.py ADDED
@@ -0,0 +1,357 @@
+ from typing import Any, Dict, List, Optional, Union
+
+ import os
+ import json
+ import time
+
+ import numpy as np
+
+ from transformers import Trainer
+ from transformers import Wav2Vec2ForCTC
+ from transformers import TrainingArguments
+ from transformers import Wav2Vec2Processor
+ from transformers import Wav2Vec2CTCTokenizer
+ from transformers import Wav2Vec2FeatureExtractor
+
+ from datasets import load_dataset, load_metric, Audio, concatenate_datasets, load_from_disk
+
+ from aim import Run
+ from aim.hugging_face import AimCallback
+
+ import fire
+
+ from aspram.collator import DataCollatorCTCWithPadding
+ from aspram.utils import clean_characters, extract_all_chars, prepare_dataset
+
+
+ def load_data(dataset_name: str, *, split: str):
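+     """Load a dataset by name; the name supports a small composition grammar:
+     'a+b' concatenates datasets a and b, and '3*a' (or 'a*3') repeats dataset a three times.
+     Names containing 'teacher' are loaded from disk, 'common_voice' names come from the Hub,
+     and anything else is treated as a Hub dataset with an 'hy_am' config.
+     """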
+     dataset_name = dataset_name.replace(' ', '')
+
+     if '+' in dataset_name:
+         return concatenate_datasets([
+             load_data(name, split=split)
+             for name in dataset_name.split('+')
+         ])
+
+     if '*' in dataset_name:
+         a, _, b = dataset_name.partition('*')
+         if a.isnumeric():
+             num_repeats = int(a)
+             dataset_name = b
+         else:
+             num_repeats = int(b)
+             dataset_name = a
+
+         dataset = load_data(dataset_name, split=split)
+
+         return concatenate_datasets([
+             dataset
+             for _ in range(num_repeats)
+         ])
+
+     if 'teacher' in dataset_name:
+         dataset = load_from_disk(
+             dataset_name,
+         ).filter(
+             lambda sample: len(sample['audio']['array']) < 250_000
+         )
+     elif 'common_voice' in dataset_name:
+         dataset = load_dataset(
+             dataset_name,
+             "hy-AM",
+             split="train+validation+other" if split == 'train' else split,
+             use_auth_token=True,
+         )
+     else:
+         dataset = load_dataset(
+             dataset_name,
+             'hy_am',
+             split='train',
+         ).map(
+             lambda sample: dict(sentence=sample['transcription'])
+         ).filter(
+             lambda sample: sample['num_samples'] < 250_000
+         )
+
+     non_wanted_column_names = set(dataset.column_names) - set(['audio', 'path', 'sentence', 'client_id'])
+
+     dataset = dataset.map(remove_columns=list(non_wanted_column_names)).cast_column("audio", Audio(sampling_rate=16_000))
+
+     return dataset
+
+
+ def exec(
+     *,
+     batch_size: int,
+     lr: float,
+     warmup_steps: int = 2000,
+     grad_acc: int = 1,
+     group_by_length: bool = True,
+     fp16: bool = True,
+     bf16: bool = False,
+     pretrained_model: str = "facebook/wav2vec2-xls-r-2b",
+     dataset: str = "mozilla-foundation/common_voice_8_0",
+     num_train_epochs: int = 1200,
+     blacklist_enabled: bool = True,
+     seed: int = 42,
+     # random augment
+     apply_gaussian_noise_with_p: float = 0,
+     apply_gain_with_p: float = 0,
+     apply_pitch_shift_with_p: float = 0,
+     apply_time_stretch_with_p: float = 0,
+     # spec augment
+     mask_time_prob: float = 0.05,  # value that is used in the previous models
+     mask_time_length: int = 10,
+     mask_time_min_masks: int = 2,
+     mask_feature_prob: float = 0,
+     mask_feature_length: int = 10,
+     mask_feature_min_masks: int = 0,
+
+     layerdrop: float = 0,
+     activation_dropout: float = 0.1,
+
+     lower: bool = False,
+     only_mesropatar: bool = False,
+     gradient_checkpointing: bool = False,
+     resume_from_hash: str = None,
+ ):
+     if bf16:
+         fp16 = False
+     fire_args = locals()
+
+     run = Run(resume_from_hash, log_system_params=(not resume_from_hash))
+     if not resume_from_hash:
+         timestr = time.strftime("%Y%m%d-%H%M%S")
+         repo_name = os.path.join('models', timestr)
+         for key, value in fire_args.items():
+             run['hparams', key] = value
+             run['fire', key] = value
+     else:
+         repo_name = run['hparams', 'output_dir']
+     run_hash = run.hash
+     run = None
+
+
+     train_dataset = load_data(dataset, split="train")
+
+     blacklist_client_ids = set()
+     blacklist_sentences = set()
+
+     if blacklist_enabled:
+         blacklist_client_ids = {
+             "93fa435db2b9e077af647c9f846d8b6031bcb1f6cd731e894a835e70a0ab4aec1faffce01c882bdcdcb854b98b601c83a1c412bae8e5ee411556f0e2f88c1c5c",
+             "f0aba38a8ab8705a40d05d96829ded5738a7eec7a9a182394c2ed288fc1c64553abcb1e0c4c966ffab9e8b76c27616b9f0503f92c42fe11249af36c50d3de5ef",
+             "a528aa436a34dce3b4ddc198c105ebb904967acdd04157bd1b0e0b2ffadd99b36a6cc5fe76f23c3dd2263d1507bec6038c41cb521ac8ee34126133e559df9e75",
+             "b83375c41b8ef9ab1b64491b624302b1541b0ba8496ed4e5cb4a751766d7a2cf7430e49e7118eaac98f5ae478d8cdd2b59d18526632297185bbc2e10e2126b18",
+             "330411ed21c5d9cda96180ac633b4dd10f5b6e50968e83a64f0016c9e15f22445fa8f396ef92b70ff03fc78e36b35b1693af60431b61b50b706aa58a00f80641",
+         }
+
+     # valid_dataset = load_data(dataset, split="test")
+     valid_dataset = load_data("yerevann/common_voice_9_0", split="test")
+
+     # train_client_ids = set(train_dataset['client_id']) - { None }
+     valid_client_ids = set(valid_dataset['client_id']) - { None }
+     blacklist_sentences = set(valid_dataset['sentence'])
+     blacklist_client_ids |= valid_client_ids
+
+     train_dataset = train_dataset.filter(
+         lambda sample: (
+             sample.get("client_id") not in blacklist_client_ids
+             and
+             sample.get("sentence") not in blacklist_sentences
+         )
+     )
+
+     # print('\n' * 10 + '================================' + '\n' * 10)
+     # print(train_client_ids & valid_client_ids)
+     # print('\n' * 10 + '================================' + '\n' * 10)
+
+     # train_dataset = train_dataset.remove_columns(
+     #     [
+     #         "accent",
+     #         "age",
+     #         "client_id",
+     #         "down_votes",
+     #         "gender",
+     #         "locale",
+     #         "segment",
+     #         "up_votes",
+     #     ]
+     # )
+     # valid_dataset = valid_dataset.remove_columns(
+     #     [
+     #         "accent",
+     #         "age",
+     #         "client_id",
+     #         "down_votes",
+     #         "gender",
+     #         "locale",
+     #         "segment",
+     #         "up_votes",
+     #     ]
+     # )
+
+     train_dataset = train_dataset.map(clean_characters, fn_kwargs=dict(lower=lower, only_mesropatar=only_mesropatar))
+     valid_dataset = valid_dataset.map(clean_characters, fn_kwargs=dict(lower=lower, only_mesropatar=only_mesropatar))
+
+     if 'models/' in pretrained_model:
+         tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model)
+     elif not resume_from_hash:
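+         # Build a character-level vocabulary from the train and validation transcripts;
+         # " " is remapped to the word-delimiter token "|" below, and [UNK]/[PAD] are appended.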
+         vocab_train = train_dataset.map(
+             extract_all_chars,
+             batched=True,
+             batch_size=-1,
+             keep_in_memory=True,
+             remove_columns=train_dataset.column_names,
+         )
+         vocab_valid = valid_dataset.map(
+             extract_all_chars,
+             batched=True,
+             batch_size=-1,
+             keep_in_memory=True,
+             remove_columns=valid_dataset.column_names,
+         )
+         vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_valid["vocab"][0]))
+         vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
+         vocab_dict["|"] = vocab_dict[" "]
+         del vocab_dict[" "]
+
+         vocab_dict["[UNK]"] = len(vocab_dict)
+         vocab_dict["[PAD]"] = len(vocab_dict)
+
+         with open("vocab.json", "w") as vocab_file:
+             json.dump(vocab_dict, vocab_file)
+
+         tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
+             "./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
+         )
+         tokenizer.push_to_hub(repo_name)  # something is wrong here
+     else:
+         tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_name)
+
+     feature_extractor = Wav2Vec2FeatureExtractor(
+         feature_size=1,
+         sampling_rate=16000,
+         padding_value=0.0,
+         do_normalize=True,
+         return_attention_mask=True,
+     )
+     processor = Wav2Vec2Processor(
+         feature_extractor=feature_extractor,
+         tokenizer=tokenizer,
+     )
+
+
+     train_dataset = train_dataset.cast_column(
+         "audio", Audio(sampling_rate=16_000)
+     )
+     valid_dataset = valid_dataset.cast_column(
+         "audio", Audio(sampling_rate=16_000)
+     )
+
+     train_dataset = train_dataset.map(
+         prepare_dataset, remove_columns=train_dataset.column_names,
+         fn_kwargs=dict(processor=processor)
+     )
+     valid_dataset = valid_dataset.map(
+         prepare_dataset, remove_columns=valid_dataset.column_names,
+         fn_kwargs=dict(processor=processor)
+     )
+
+     data_collator = DataCollatorCTCWithPadding(
+         processor=processor,
+         padding=True,
+         sample_rate=16_000,
+         apply_gaussian_noise_with_p=apply_gaussian_noise_with_p,
+         apply_gain_with_p=apply_gain_with_p,
+         apply_pitch_shift_with_p=apply_pitch_shift_with_p,
+         apply_time_stretch_with_p=apply_time_stretch_with_p,
+     )
+
+     def compute_metrics(pred):
+         pred_logits = pred.predictions
+         pred_ids = np.argmax(pred_logits, axis=-1)
+
+         pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
+
+         pred_str = processor.batch_decode(pred_ids)
+         # we do not want to group tokens when computing the metrics
+         label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
+
+         wer = wer_metric.compute(predictions=pred_str, references=label_str)
+         cer = cer_metric.compute(predictions=pred_str, references=label_str)
+
+         return {"wer": wer, "cer": cer}
+
+     wer_metric = load_metric("wer")
+     cer_metric = load_metric("cer")
+
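+     # model_init builds the model from `pretrained_model` with the SpecAugment, dropout and
+     # layerdrop settings given above; the convolutional feature extractor is kept frozen.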
+     def model_init():
+         from transformers import Wav2Vec2Config
+         model = Wav2Vec2ForCTC.from_pretrained(
+             pretrained_model,
+             attention_dropout=0.0,
+             hidden_dropout=0.0,
+             feat_proj_dropout=0.0,
+             mask_time_prob=mask_time_prob,
+             mask_time_length=mask_time_length,
+             mask_time_min_masks=mask_time_min_masks,
+             mask_feature_prob=mask_feature_prob,
+             mask_feature_length=mask_feature_length,
+             mask_feature_min_masks=mask_feature_min_masks,
+             layerdrop=layerdrop,
+             activation_dropout=activation_dropout,
+             ctc_loss_reduction="mean",
+             pad_token_id=processor.tokenizer.pad_token_id,
+             vocab_size=len(processor.tokenizer),
+         )
+         model.freeze_feature_extractor()
+         return model
+
+     training_args = TrainingArguments(
+         output_dir=repo_name,
+         group_by_length=group_by_length,
+         per_device_train_batch_size=batch_size,
+         gradient_accumulation_steps=grad_acc,
+         evaluation_strategy="steps",
+         num_train_epochs=num_train_epochs,
+         gradient_checkpointing=gradient_checkpointing if resume_from_hash is None else True,
+         fp16=fp16,
+         bf16=bf16,
+         save_steps=4000,
+         eval_steps=200,
+         logging_steps=200,
+         learning_rate=lr,  # TODO
+         warmup_steps=warmup_steps,
+         save_total_limit=1,
+         push_to_hub=True,
+         metric_for_best_model="eval_wer",
+         greater_is_better=False,
+         seed=seed,
+     )
+
+     aim_callback = AimCallback()
+     aim_callback._run_hash = run_hash
+
+
+     print(train_dataset)
+     # run = aim_callback.experiment
+
+     trainer = Trainer(
+         model_init=model_init,
+         data_collator=data_collator,
+         args=training_args,
+         compute_metrics=compute_metrics,
+         train_dataset=train_dataset,
+         eval_dataset=valid_dataset,
+         tokenizer=processor.feature_extractor,
+         callbacks=[aim_callback],
+     )
+
+     trainer.train(resume_from_checkpoint=bool(resume_from_hash))
+
+     trainer.push_to_hub()
+
+
+ if __name__ == "__main__":
+     fire.Fire(exec)
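+
+ # Hypothetical invocation (flags map to the keyword-only arguments of exec above; values are examples):
+ #   python fine_tune.py --batch_size 16 --lr 1e-4 --grad_acc 2 \
+ #       --apply_time_stretch_with_p 0.2 --lower --only_mesropatar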
lm_fusion.py ADDED
@@ -0,0 +1,56 @@
+ from transformers import AutoProcessor
+ from transformers import Wav2Vec2ProcessorWithLM
+
+ from pyctcdecode import build_ctcdecoder
+
+ from huggingface_hub import Repository
+
+ import logging
+
+ import fire
+
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ def exec(
+     kenlm_model_path: str,
+     model_name: str,
+     lm_model_name: str = "",
+ ):
+     if not lm_model_name:
+         lm_model_name = model_name + "_lm"
+     logger.info(f'writing to {lm_model_name}')
+     logger.info(f'loading processor of `{model_name}`')
+     processor = AutoProcessor.from_pretrained(model_name)
+     logger.info(f'done loading `{model_name}`')
+
+     vocab_dict = processor.tokenizer.get_vocab()
+     sorted_vocab_dict = {
+         k: v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])
+     }
+
+     logger.info(f'building ctc decoder from {kenlm_model_path}')
+     decoder = build_ctcdecoder(
+         labels=list(sorted_vocab_dict.keys()),
+         kenlm_model_path=kenlm_model_path,
+     )
+     logger.info('done')
+
+     processor_with_lm = Wav2Vec2ProcessorWithLM(
+         feature_extractor=processor.feature_extractor,
+         tokenizer=processor.tokenizer,
+         decoder=decoder,
+     )
+
+     # repo = Repository(
+     #     local_dir=lm_model_name, clone_from=model_name
+     # )  # model_name
+     # repo.push_to_hub()
+
+     processor_with_lm.save_pretrained(lm_model_name)
+
+
+ if __name__ == "__main__":
+     fire.Fire(exec)
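+
+ # Hypothetical invocation (the KenLM model path and checkpoint name are example values):
+ #   python lm_fusion.py --kenlm_model_path 5gram.bin --model_name 20220414-210228
+ # This saves a Wav2Vec2ProcessorWithLM (feature extractor + tokenizer + pyctcdecode decoder)
+ # under 20220414-210228_lm, the "_lm" suffix that compute_wer.py uses to detect LM fusion.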
utils.py ADDED
@@ -0,0 +1,62 @@
+ import re
+
+ def clean_characters(sample, lower: bool = False, only_mesropatar: bool = False):
+
+     if 'sentence' not in sample:
+         if 'transcription' not in sample:
+             raise NotImplementedError()
+         else:
+             sample['sentence'] = sample['transcription']
+
+     allowed_chars = (
+         "-"
+         "a-z"
+         "A-Z"
+         "0-9"
+         "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖ"
+         "աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆև"
+         " \"'։֊.:?;,ՙ՚՛՜՝՞՟\(\)"
+     )
+     if lower:
+         sample["sentence"] = sample["sentence"].lower()
+
+     if only_mesropatar:
+         allowed_chars = (
+             "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖ"
+             "աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆև"
+             " -"
+         )
+     sample["sentence"] = re.sub(f"[^{allowed_chars}]", "", sample["sentence"])
+     # print(sample["sentence"])
+     return sample
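+
+ # Illustrative example (hypothetical input; behaviour follows the character classes above):
+ #   clean_characters({"sentence": "Բարեւ, աշխարհ!"}, lower=True, only_mesropatar=True)
+ #   # -> {"sentence": "բարեւ աշխարհ"}   (punctuation and non-Mesropatar characters stripped)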
+
+ def extract_all_chars(batch):
+     all_text = " ".join(batch["sentence"])
+     vocab = list(set(all_text))
+     return {"vocab": [vocab], "all_text": [all_text]}
+
+ def prepare_dataset(sample, processor):
+     audio = sample["audio"]
+
+     sample["input_values"] = processor(
+         audio["array"], sampling_rate=audio["sampling_rate"]
+     ).input_values[0]
+     sample["input_length"] = len(sample["input_values"])
+
+     with processor.as_target_processor():
+         sample["labels"] = processor(sample["sentence"]).input_ids
+     return sample
+
+
+ def batched_prepare_dataset(batch, processor):
+     batch = batch.copy()
+     audio = batch["audio"]
+
+     batch["input_values"] = processor(
+         [i["array"] for i in audio], sampling_rate=16_000
+     ).input_values
+     batch["input_length"] = [len(i) for i in batch["input_values"]]
+
+     with processor.as_target_processor():
+         batch["labels"] = processor(batch["sentence"]).input_ids
+     return batch