Upload 10 files

Browse files

Files changed (10) hide show

README.md +100 -0
config.json +269 -0
dict.txt +0 -0
generation_config.json +9 -0
preprocessor_config.json +10 -0
pytorch_model.bin +3 -0
sample_news.wav +0 -0
sentencepiece.bpe.model +3 -0
special_tokens_map.json +15 -0
tokenizer_config.json +20 -0

README.md ADDED Viewed

	@@ -0,0 +1,100 @@

+---
+license: cc-by-nc-4.0
+language:
+- vi
+---
+### Vietnamese ASR sequence-to-sequence model. This model supports output normalizing text, labeling timestamps, and segmenting multiple speakers.
+```python
+# !pip install transformers, sentencepiece
+from transformers import SpeechEncoderDecoderModel
+from transformers import AutoFeatureExtractor, AutoTokenizer, GenerationConfig
+import torchaudio
+import torch
+model_path = 'nguyenvulebinh/wav2vec2-bartpho'
+model = SpeechEncoderDecoderModel.from_pretrained(model_path).eval()
+feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+if torch.cuda.is_available():
+  model = model.cuda()
+def decode_tokens(token_ids, skip_special_tokens=True, time_precision=0.02):
+    timestamp_begin = tokenizer.vocab_size
+    outputs = [[]]
+    for token in token_ids:
+        if token >= timestamp_begin:
+            timestamp = f" |{(token - timestamp_begin) * time_precision:.2f}| "
+            outputs.append(timestamp)
+            outputs.append([])
+        else:
+            outputs[-1].append(token)
+    outputs = [
+        s if isinstance(s, str) else tokenizer.decode(s, skip_special_tokens=skip_special_tokens) for s in outputs
+    ]
+    return "".join(outputs).replace("< |", "<|").replace("| >", "|>")
+def decode_wav(audio_wavs, asr_model, prefix=""):
+  device = next(asr_model.parameters()).device
+  input_values = feature_extractor.pad(
+    [{"input_values": feature} for feature in audio_wavs],
+    padding=True,
+    max_length=None,
+    pad_to_multiple_of=None,
+    return_tensors="pt",
+  )
+  output_beam_ids = asr_model.generate(
+    input_values['input_values'].to(device),
+    attention_mask=input_values['attention_mask'].to(device),
+    decoder_input_ids=tokenizer.batch_encode_plus([prefix] * len(audio_wavs), return_tensors="pt")['input_ids'][..., :-1].to(device),
+    generation_config=GenerationConfig(decoder_start_token_id=tokenizer.bos_token_id),
+    max_length=250,
+    num_beams=25,
+    no_repeat_ngram_size=4,
+    num_return_sequences=1,
+    early_stopping=True,
+    return_dict_in_generate=True,
+    output_scores=True,
+  )
+  output_text = [decode_tokens(sequence) for sequence in output_beam_ids.sequences]
+  return output_text
+# https://huggingface.co/nguyenvulebinh/wav2vec2-bartpho/resolve/main/sample_news.wav
+print(decode_wav([torchaudio.load('sample_news.wav')[0].squeeze()], model))
+# <|0.00| Gia đình cho biết, nhiều lần đã từng gọi điện báo chính quyền và lực lượng an ninh địa phương nhưng đều không có tác dụng |7.00|>
+# <|8.14| Không ai giúp đỡ được mình một chút nào cả, nên là lúc đó là lúc tuyệt vọng nhất, nó tra tấn mình cực kỳ khổ, gây cái tâm lý ức chế rất là nhiều, rất là lớn |19.02|>
+```
+### Citation
+This repository uses the idea from the following paper. Please cite the paper if this model is used to help produce published results or is incorporated into other software.
+```text
+@INPROCEEDINGS{10446589,
+  author={Nguyen, Thai-Binh and Waibel, Alexander},
+  booktitle={ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+  title={Synthetic Conversations Improve Multi-Talker ASR},
+  year={2024},
+  volume={},
+  number={},
+  pages={10461-10465},
+  keywords={Systematics;Error analysis;Knowledge based systems;Oral communication;Signal processing;Data models;Acoustics;multi-talker;asr;synthetic conversation},
+  doi={10.1109/ICASSP48485.2024.10446589}
+}
+```
+### Contact
+nguyenvulebinh@gmail.com
+[![Follow](https://img.shields.io/twitter/follow/nguyenvulebinh?style=social)](https://twitter.com/intent/follow?screen_name=nguyenvulebinh)

config.json ADDED Viewed

	@@ -0,0 +1,269 @@

+{
+  "_commit_hash": null,
+  "architectures": [
+    "SpeechEncoderDecoderModel"
+  ],
+  "decoder": {
+    "_name_or_path": "vinai/bartpho-syllable-base",
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "add_cross_attention": true,
+    "architectures": [
+      "MBartModel"
+    ],
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "classifier_dropout": 0.0,
+    "cross_attention_hidden_size": null,
+    "d_model": 768,
+    "decoder_attention_heads": 12,
+    "decoder_ffn_dim": 3072,
+    "decoder_layerdrop": 0.0,
+    "decoder_layers": 6,
+    "decoder_start_token_id": 2,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.1,
+    "early_stopping": false,
+    "encoder_attention_heads": 12,
+    "encoder_ffn_dim": 3072,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 6,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": 2,
+    "gradient_checkpointing": false,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "init_std": 0.02,
+    "is_decoder": true,
+    "is_encoder_decoder": true,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 1024,
+    "min_length": 0,
+    "model_type": "mbart",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 6,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_embedding": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": "BartphoTokenizer",
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "float32",
+    "torchscript": false,
+    "transformers_version": "4.30.2",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 41031
+  },
+  "encoder": {
+    "_name_or_path": "nguyenvulebinh/wav2vec2-large-vi",
+    "activation_dropout": 0.0,
+    "adapter_attn_dim": null,
+    "adapter_kernel_size": 3,
+    "adapter_stride": 2,
+    "add_adapter": true,
+    "add_cross_attention": false,
+    "apply_spec_augment": true,
+    "architectures": [
+      "Wav2Vec2ForPreTraining"
+    ],
+    "attention_dropout": 0.1,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 1,
+    "chunk_size_feed_forward": 0,
+    "classifier_proj_size": 256,
+    "codevector_dim": 768,
+    "contrastive_logits_temperature": 0.1,
+    "conv_bias": true,
+    "conv_dim": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "conv_kernel": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "conv_stride": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "cross_attention_hidden_size": null,
+    "ctc_loss_reduction": "sum",
+    "ctc_zero_infinity": false,
+    "decoder_start_token_id": null,
+    "diversity_loss_weight": 0.1,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "do_stable_layer_norm": true,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "feat_extract_activation": "gelu",
+    "feat_extract_dropout": 0.0,
+    "feat_extract_norm": "layer",
+    "feat_proj_dropout": 0.1,
+    "feat_quantizer_dropout": 0.0,
+    "final_dropout": 0.0,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "gradient_checkpointing": false,
+    "hidden_act": "gelu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.1,
+    "length_penalty": 1.0,
+    "mask_channel_length": 10,
+    "mask_channel_min_space": 1,
+    "mask_channel_other": 0.0,
+    "mask_channel_prob": 0.0,
+    "mask_channel_selection": "static",
+    "mask_feature_length": 10,
+    "mask_feature_min_masks": 0,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_min_masks": 2,
+    "mask_time_min_space": 1,
+    "mask_time_other": 0.0,
+    "mask_time_prob": 0.075,
+    "mask_time_selection": "static",
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "wav2vec2",
+    "no_repeat_ngram_size": 0,
+    "num_adapter_layers": 3,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_codevector_groups": 2,
+    "num_codevectors_per_group": 320,
+    "num_conv_pos_embedding_groups": 16,
+    "num_conv_pos_embeddings": 128,
+    "num_feat_extract_layers": 7,
+    "num_hidden_layers": 24,
+    "num_negatives": 100,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_size": 1024,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 0,
+    "prefix": null,
+    "problem_type": null,
+    "proj_codevector_dim": 768,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "tdnn_dilation": [
+      1,
+      2,
+      3,
+      1,
+      1
+    ],
+    "tdnn_dim": [
+      512,
+      512,
+      512,
+      512,
+      1500
+    ],
+    "tdnn_kernel": [
+      5,
+      3,
+      3,
+      1,
+      1
+    ],
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "float32",
+    "torchscript": false,
+    "transformers_version": "4.30.2",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_weighted_layer_sum": false,
+    "vocab_size": 96,
+    "xvector_output_dim": 512
+  },
+  "is_encoder_decoder": true,
+  "model_type": "speech-encoder-decoder",
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": null
+}

dict.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "decoder_start_token_id": 2,
+  "eos_token_id": 2,
+  "forced_eos_token_id": 2,
+  "pad_token_id": 1,
+  "transformers_version": "4.30.2"
+}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "Wav2Vec2Processor",
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:27dc020061122a4c4d162f08ca41ba37d62525b82b4ab5309898c9a268f63f61
+size 135

sample_news.wav ADDED Viewed

Binary file (609 kB). View file

sentencepiece.bpe.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b8a54190d2b9256881ed34ab5428786629f929dd5a579350a6ef4735b86a9208
+size 132

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "BartphoTokenizer",
+  "unk_token": "<unk>"
+}