change(): model

Files changed (11) hide show

README.md +0 -61
added_tokens.json +0 -3
bpe.codes +0 -0
config.json +52 -46
generation_config.json +7 -0
pytorch_model.bin +2 -2
special_tokens_map.json +102 -4
spiece.model +3 -0
tokenizer_config.json +108 -7
training_args.bin +1 -1
vocab.txt +0 -0

README.md DELETED Viewed

@@ -1,61 +0,0 @@
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-import random
-# Load the fine-tuned model and tokenizer
-model_for_generation = AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_similar_question_generation_model")
-tokenizer_for_generation = AutoTokenizer.from_pretrained("./fine_tuned_similar_question_generation_model")
-# Define a function to generate similar questions
-def generate_similar_questions(question):
-    # Tokenize the input question
-    input_ids = tokenizer_for_generation.encode(question, return_tensors="pt")
-    # Generate similar questions using the model
-    output_ids = model_for_generation.generate(
-        input_ids=input_ids,
-        num_beams=4,
-        max_length=64,
-        early_stopping=True,
-        num_return_sequences=2,
-        no_repeat_ngram_size=3,
-        temperature=1.0,
-    )
-    # Decode the output questions
-    output_questions = []
-    for output_id in output_ids:
-        output_question = tokenizer_for_generation.decode(output_id, skip_special_tokens=True)
-        if output_question.endswith("?"):
-            output_questions.append(output_question)
-        else:
-            continue
-    # If no valid question was generated, try again
-    if len(output_questions) == 0:
-        output_questions = generate_similar_questions(question)
-    # Return the output questions
-    return output_questions
-# Example usage:
-questions = [
-    "Bạn có thích đọc sách không?",
-    "Bạn thường xem phim ở đâu?",
-    "Bạn có thích ăn kem không?",
-    "Bạn đã từng đi du lịch ở nước ngoài chưa?",
-    "Bạn thường đọc sách ở thư viện hay mua sách về đọc?",
-    "Bạn thích chơi thể thao gì?",
-    "Bạn thích nghe nhạc thể loại gì?",
-    "Bạn có thường xem truyền hình không?",
-    "Bạn thường uống gì khi đi ăn ngoài?",
-    "Bạn đã từng đến Hà Nội chưa?"
-]
-for question in questions:
-    print(f"Input question: {question}")
-    similar_questions = generate_similar_questions(question)
-    print("Similar questions:")
-    for similar_question in similar_questions:
-        print(f"- {similar_question}")
-    print("\n")

added_tokens.json DELETED Viewed

@@ -1,3 +0,0 @@
-{
-  "<mask>": 64000
-}

bpe.codes DELETED Viewed

The diff for this file is too large to render. See raw diff

config.json CHANGED Viewed

@@ -1,54 +1,60 @@
 {
-  "_name_or_path": "vinai/phobert-base",
-  "activation_dropout": 0.0,
-  "activation_function": "gelu",
   "architectures": [
-    "BartForConditionalGeneration"
   ],
-  "attention_dropout": 0.0,
-  "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": 0,
-  "classifier_dropout": 0.0,
-  "d_model": 768,
-  "decoder_attention_heads": 16,
-  "decoder_ffn_dim": 4096,
-  "decoder_layerdrop": 0.0,
-  "decoder_layers": 12,
-  "decoder_start_token_id": 2,
-  "dropout": 0.1,
-  "encoder_attention_heads": 12,
-  "encoder_ffn_dim": 4096,
-  "encoder_layerdrop": 0.0,
-  "encoder_layers": 12,
-  "eos_token_id": 2,
-  "forced_eos_token_id": 2,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "id2label": {
-    "0": "LABEL_0",
-    "1": "LABEL_1",
-    "2": "LABEL_2"
-  },
-  "init_std": 0.02,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
   "is_encoder_decoder": true,
-  "label2id": {
-    "LABEL_0": 0,
-    "LABEL_1": 1,
-    "LABEL_2": 2
   },
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 258,
-  "model_type": "bart",
-  "num_hidden_layers": 12,
-  "pad_token_id": 1,
-  "scale_embedding": false,
-  "tokenizer_class": "PhobertTokenizer",
   "torch_dtype": "float32",
-  "transformers_version": "4.25.1",
-  "type_vocab_size": 1,
   "use_cache": true,
-  "vocab_size": 64001
 }

 {
+  "_name_or_path": "t5-small",
   "architectures": [
+    "T5ForConditionalGeneration"
   ],
+  "d_ff": 2048,
+  "d_kv": 64,
+  "d_model": 512,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "relu",
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "relu",
+  "initializer_factor": 1.0,
   "is_encoder_decoder": true,
+  "is_gated_act": false,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "n_positions": 512,
+  "num_decoder_layers": 6,
+  "num_heads": 8,
+  "num_layers": 6,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "task_specific_params": {
+    "summarization": {
+      "early_stopping": true,
+      "length_penalty": 2.0,
+      "max_length": 200,
+      "min_length": 30,
+      "no_repeat_ngram_size": 3,
+      "num_beams": 4,
+      "prefix": "summarize: "
+    },
+    "translation_en_to_de": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to German: "
+    },
+    "translation_en_to_fr": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to French: "
+    },
+    "translation_en_to_ro": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to Romanian: "
+    }
   },
   "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
   "use_cache": true,
+  "vocab_size": 32128
 }

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.26.1"
+}

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:572c85b9da8b92c11317385c16e8e26b29af1fc13ed2644aaeddd88957520b77
-size 1143647693

 version https://git-lfs.github.com/spec/v1
+oid sha256:b394894a2c51326b7ebfdbbd516a7a5a941ac19d7a555a9b539308053415acf4
+size 242071641

special_tokens_map.json CHANGED Viewed

@@ -1,9 +1,107 @@
 {
-  "bos_token": "<s>",
-  "cls_token": "<s>",
   "eos_token": "</s>",
-  "mask_token": "<mask>",
   "pad_token": "<pad>",
-  "sep_token": "</s>",
   "unk_token": "<unk>"
 }

 {
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
   "eos_token": "</s>",
   "pad_token": "<pad>",
   "unk_token": "<unk>"
 }

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+size 791656

tokenizer_config.json CHANGED Viewed

@@ -1,13 +1,114 @@
 {
-  "bos_token": "<s>",
-  "cls_token": "<s>",
   "eos_token": "</s>",
-  "mask_token": "<mask>",
-  "model_max_length": 256,
-  "name_or_path": "vinai/phobert-base",
   "pad_token": "<pad>",
-  "sep_token": "</s>",
   "special_tokens_map_file": null,
-  "tokenizer_class": "PhobertTokenizer",
   "unk_token": "<unk>"
 }

 {
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "clean_up_tokenization_spaces": true,
   "eos_token": "</s>",
+  "extra_ids": 100,
+  "model_max_length": 512,
+  "name_or_path": "t5-small",
   "pad_token": "<pad>",
+  "sp_model_kwargs": {},
   "special_tokens_map_file": null,
+  "tokenizer_class": "T5Tokenizer",
   "unk_token": "<unk>"
 }

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b98ab1d86ee27aba4003bd8d30f85a11d73488b1051a6e8ad066bc52c7124e50
 size 3579

 version https://git-lfs.github.com/spec/v1
+oid sha256:70027d7721d8d3f9865755baa3a5fdc396eacbf2c687be95832f22f56a43f62e
 size 3579

vocab.txt DELETED Viewed

The diff for this file is too large to render. See raw diff