Lê Nguyễn Minh Huy committed on
Commit c3d4b5c
1 Parent(s): 6495735

Initial commit

Files changed (6)
  1. README.md +56 -0
  2. config.json +38 -0
  3. pytorch_model.bin +3 -0
  4. spiece.model +3 -0
  5. tf_model.h5 +3 -0
  6. tokenizer.json +0 -0
README.md ADDED
@@ -0,0 +1,56 @@
+ ---
+ language: vi
+ datasets:
+ - cc100
+ tags:
+ - summarization
+ 
+ license: mit
+ 
+ widget:
+ - text: "vietnews: VietAI là tổ chức phi lợi nhuận với sứ mệnh ươm mầm tài năng về trí tuệ nhân tạo và xây dựng một cộng đồng các chuyên gia trong lĩnh vực trí tuệ nhân tạo đẳng cấp quốc tế tại Việt Nam."
+ ---
+ 
+ # ViT5-large Fine-tuned on `vietnews` Abstractive Summarization
+ 
+ State-of-the-art pretrained Transformer-based encoder-decoder model for Vietnamese.
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/vit5-pretrained-text-to-text-transformer-for/abstractive-text-summarization-on-vietnews)](https://paperswithcode.com/sota/abstractive-text-summarization-on-vietnews?p=vit5-pretrained-text-to-text-transformer-for)
+ 
+ ## How to use
+ For more details, check out [our GitHub repo](https://github.com/vietai/ViT5) and the [eval script](https://github.com/vietai/ViT5/blob/main/eval/Eval_vietnews_sum.ipynb).
+ 
+ ```python
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ 
+ tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-large-vietnews-summarization")
+ model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-large-vietnews-summarization")
+ model.cuda()
+ 
+ sentence = "VietAI là tổ chức phi lợi nhuận với sứ mệnh ươm mầm tài năng về trí tuệ nhân tạo và xây dựng một cộng đồng các chuyên gia trong lĩnh vực trí tuệ nhân tạo đẳng cấp quốc tế tại Việt Nam."
+ # Prepend the task prefix used during fine-tuning.
+ text = "vietnews: " + sentence + " </s>"
+ encoding = tokenizer(text, return_tensors="pt")
+ input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
+ outputs = model.generate(
+     input_ids=input_ids, attention_mask=attention_masks,
+     max_length=256,
+     early_stopping=True,
+ )
+ # Decode the generated token ids back to text.
+ for output in outputs:
+     line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+     print(line)
+ ```
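+ 
+ The commit also ships TensorFlow weights (`tf_model.h5`), so the same checkpoint should load through the TF classes as well. A minimal sketch, assuming `transformers` with TensorFlow installed:
+ 
+ ```python
+ from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
+ 
+ # Load the TensorFlow weights shipped in this commit (tf_model.h5).
+ tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-large-vietnews-summarization")
+ model = TFAutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-large-vietnews-summarization")
+ 
+ text = "vietnews: VietAI là tổ chức phi lợi nhuận với sứ mệnh ươm mầm tài năng về trí tuệ nhân tạo và xây dựng một cộng đồng các chuyên gia trong lĩnh vực trí tuệ nhân tạo đẳng cấp quốc tế tại Việt Nam. </s>"
+ encoding = tokenizer(text, return_tensors="tf")
+ outputs = model.generate(
+     input_ids=encoding["input_ids"],
+     attention_mask=encoding["attention_mask"],
+     max_length=256,
+ )
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```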
+ 
+ ## Citation
+ ```
+ @inproceedings{phan-etal-2022-vit5,
+     title = "{V}i{T}5: Pretrained Text-to-Text Transformer for {V}ietnamese Language Generation",
+     author = "Phan, Long and Tran, Hieu and Nguyen, Hieu and Trinh, Trieu H.",
+     booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Student Research Workshop",
+     year = "2022",
+     publisher = "Association for Computational Linguistics",
+     url = "https://aclanthology.org/2022.naacl-srw.18",
+     pages = "136--142",
+ }
+ ```
config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "d_ff": 2816,
+   "d_kv": 64,
+   "d_model": 1024,
+   "decoder_start_token_id": 0,
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "num_decoder_layers": 24,
+   "num_heads": 16,
+   "num_layers": 24,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "task_specific_params": {
+     "summarization": {
+       "early_stopping": true,
+       "length_penalty": 2.0,
+       "max_length": 256,
+       "no_repeat_ngram_size": 3,
+       "num_beams": 4,
+       "prefix": "summarization: "
+     }
+   },
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.18.0",
+   "use_cache": true,
+   "vocab_size": 36096
+ }
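The `task_specific_params.summarization` block above carries the generation defaults that the `transformers` summarization pipeline applies for this checkpoint (4-beam search, length penalty 2.0, max length 256). A minimal sketch of inspecting those defaults, assuming only that `transformers` is installed; note that the pipeline `prefix` stored here is "summarization: ", while the README example uses the "vietnews: " prefix:

```python
from transformers import AutoConfig

# Read the generation defaults stored in config.json.
config = AutoConfig.from_pretrained("VietAI/vit5-large-vietnews-summarization")
params = dict(config.task_specific_params["summarization"])

prefix = params.pop("prefix")  # "summarization: "
print(prefix, params)
# The remaining keys (num_beams, max_length, ...) can be passed
# straight to model.generate(**params) to reproduce the pipeline defaults.
```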
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c524804e16b992b38ed32b14cf495399e0c70dd0b26aa414f4d93d5acca4254
+ size 3165287717
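`pytorch_model.bin` (like the other binaries below) is stored as a Git LFS pointer: a `version` line naming the spec, the `oid` (SHA-256 of the real file), and its `size` in bytes. A downloaded copy can be checked against the pointer; a minimal sketch using only the standard library, assuming the weights sit at a local `pytorch_model.bin` path (hypothetical):

```python
import hashlib
from pathlib import Path

# Compare a downloaded file against its Git LFS pointer (oid sha256 + size).
path = Path("pytorch_model.bin")  # hypothetical local path
expected_oid = "7c524804e16b992b38ed32b14cf495399e0c70dd0b26aa414f4d93d5acca4254"
expected_size = 3165287717

h = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

assert path.stat().st_size == expected_size, "size mismatch"
assert h.hexdigest() == expected_oid, "sha256 mismatch"
print("pytorch_model.bin matches its LFS pointer")
```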
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59986b62f9f0b90edafb9b073ea7b93d21114a5841219a1ea2399ade73f729c6
+ size 820370
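`spiece.model` is the SentencePiece model that backs the T5 tokenizer. It can also be inspected directly; a sketch, assuming the `sentencepiece` package and a local copy of the file downloaded from this repo:

```python
import sentencepiece as spm

# Load the raw SentencePiece model behind the T5 tokenizer.
sp = spm.SentencePieceProcessor(model_file="spiece.model")  # local path

pieces = sp.encode("VietAI là tổ chức phi lợi nhuận", out_type=str)
ids = sp.encode("VietAI là tổ chức phi lợi nhuận")
print(pieces)        # subword pieces
print(sp.decode(ids))  # round-trips to the original text
```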
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2195c610ae03a6b541d53d3dc35b328860ba4148a85a02021696c0fa27d01500
+ size 3166229632
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff