fcakyon commited on
Commit
82ba154
1 Parent(s): 38ab4ea

initial commit

Browse files
README.md ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: tr
3
+ datasets:
4
+ - tquad1
5
+ - tquad2
6
+ - xquad
7
+ tags:
8
+ - text2text-generation
9
+ - question-generation
10
+ - answer-extraction
11
+ - question-answering
12
+ - text-generation
13
+ pipeline_tag: text2text-generation
14
+ widget:
15
+ - text: "answer: film ve TV haklarını context: Legendary Entertainment, 2016 yılında bilimkurgu romanı Dune'un film ve TV haklarını satın aldı. Geliştirme kısa bir süre sonra başladı. Villeneuve projeye olan ilgisini dile getirdi ve resmi olarak yönetmen olarak imza attı. Roth ve Spaihts ile birlikte çalışarak senaryoyu iki bölüme ayırdı ve 1965 romanının 21. yüzyıla güncellenmiş bir uyarlamasını ekledi."
16
+ example_title: "Question Generation (Movie)"
17
+ - text: "answer: bir antlaşma yaparak context: Fatih Sultan Mehmet, Cenevizlilerin önemli üslerinden Amasra’yı aldı. 1479’da bir antlaşma yaparak Venedik'le 16 yıllık savaşa son verdi."
18
+ example_title: "Question Generation (History)"
19
+ - text: "answer: Venedik'le context: Cenevizlilerin önemli üslerinden Amasra’yı aldı. 1479’da bir antlaşma yaparak Venedik'le 16 yıllık savaşa son verdi."
20
+ example_title: "Question Generation (History 2)"
21
+ - text: "extract answers: Cenevizlilerin önemli üslerinden Amasra’yı aldı. <hl> 1479’da bir antlaşma yaparak Venedik'le 16 yıllık savaşa son verdi. <hl>"
22
+ example_title: "Answer Extraction (History)"
23
+ - text: "question: Bu model ne işe yarar? context: Çalışmada sunulan yöntemle, Türkçe metinlerden otomatik olarak soru ve cevap üretilebilir. Bu proje ile paylaşılan kaynak kodu ile Türkçe Soru Üretme / Soru Cevaplama konularında yeni akademik çalışmalar yapılabilir. Projenin detaylarına paylaşılan Github ve Arxiv linklerinden ulaşılabilir."
24
+ example_title: "Question Answering (Open Domain)"
25
+ license: cc-by-4.0
26
+ ---
27
+
28
+ # mt5-small for Turkish Question Generation
29
+ Automated question generation and question answering using text-to-text transformers by OBSS AI.
30
+ ```python
31
+ from core.api import GenerationAPI
32
+ generation_api = GenerationAPI('mt5-small-3task-prepend-tquad2', qg_format='prepend')
33
+ ```
34
+
35
+ ## Citation 📜
36
+ ```
37
+ @article{akyon2021automated,
38
+ title={Automated question generation and question answering from Turkish texts using text-to-text transformers},
39
+ author={Akyon, Fatih Cagatay and Cavusoglu, Devrim and Cengiz, Cemil and Altinuc, Sinan Onur and Temizel, Alptekin},
40
+ journal={arXiv preprint arXiv:2111.06476},
41
+ year={2021}
42
+ }
43
+ ```
44
+
45
+ ## Overview ✔️
46
+ **Language model:** mt5-small
47
+ **Language:** Turkish
48
+ **Downstream-task:** Extractive QA/QG, Answer Extraction
49
+ **Training data:** TQuADv2-train
50
+ **Code:** https://github.com/obss/turkish-question-generation
51
+ **Paper:** https://arxiv.org/abs/2111.06476
52
+
53
+ ## Hyperparameters
54
+ ```
55
+ batch_size = 256
56
+ n_epochs = 15
57
+ base_LM_model = "mt5-small"
58
+ max_source_length = 512
59
+ max_target_length = 64
60
+ learning_rate = 1.0e-3
61
+ task_list = ["qa", "qg", "ans_ext"]
62
+ qg_format = "prepend"
63
+ ```
64
+
65
+ ## Performance
66
+ Refer to [paper](https://arxiv.org/abs/2111.06476).
67
+
68
+ ## Usage 🔥
69
+ ```python
70
+ from core.api import GenerationAPI
71
+
72
+ generation_api = GenerationAPI('mt5-small-3task-prepend-tquad2', qg_format='prepend')
73
+
74
+ context = """
75
+ Bu modelin eğitiminde, Türkçe soru cevap verileri kullanılmıştır.
76
+ Çalışmada sunulan yöntemle, Türkçe metinlerden otomatik olarak soru ve cevap
77
+ üretilebilir. Bu proje ile paylaşılan kaynak kodu ile Türkçe Soru Üretme
78
+ / Soru Cevaplama konularında yeni akademik çalışmalar yapılabilir.
79
+ Projenin detaylarına paylaşılan Github ve Arxiv linklerinden ulaşılabilir.
80
+ """
81
+
82
+ # a) Fully Automated Question Generation
83
+ generation_api(task='question-generation', context=context)
84
+
85
+ # b) Question Answering
86
+ question = "Bu model ne işe yarar?"
87
+ generation_api(task='question-answering', context=context, question=question)
88
+
89
+ # c) Answer Extraction
90
+ generation_api(task='answer-extraction', context=context)
91
+ ```
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<sep>": 250100, "<hl>": 250101}
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/mt5-small",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 1024,
7
+ "d_kv": 64,
8
+ "d_model": 512,
9
+ "decoder_start_token_id": 0,
10
+ "dropout_rate": 0.1,
11
+ "eos_token_id": 1,
12
+ "feed_forward_proj": "gated-gelu",
13
+ "initializer_factor": 1.0,
14
+ "is_encoder_decoder": true,
15
+ "layer_norm_epsilon": 1e-06,
16
+ "model_type": "mt5",
17
+ "num_decoder_layers": 8,
18
+ "num_heads": 6,
19
+ "num_layers": 8,
20
+ "pad_token_id": 0,
21
+ "relative_attention_num_buckets": 32,
22
+ "tie_word_embeddings": false,
23
+ "tokenizer_class": "T5Tokenizer",
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.12.3",
26
+ "use_cache": true,
27
+ "vocab_size": 250102
28
+ }
experiment_config.yaml ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _n_gpu: 1
2
+ adafactor: true
3
+ adam_beta1: 0.9
4
+ adam_beta2: 0.999
5
+ adam_epsilon: 1.0e-08
6
+ cache_dir: null
7
+ dataloader_drop_last: false
8
+ dataloader_num_workers: 0
9
+ dataloader_pin_memory: true
10
+ ddp_find_unused_parameters: null
11
+ debug: []
12
+ deepspeed: null
13
+ disable_tqdm: false
14
+ do_eval: true
15
+ do_predict: false
16
+ do_train: true
17
+ eval_accumulation_steps: 1
18
+ eval_dataset_list:
19
+ - tquad2-valid
20
+ - xquad.tr
21
+ eval_steps: 300
22
+ evaluation_strategy: &id001 !!python/object/apply:transformers.trainer_utils.IntervalStrategy
23
+ - steps
24
+ fp16: false
25
+ fp16_backend: auto
26
+ fp16_full_eval: false
27
+ fp16_opt_level: O1
28
+ freeze_embeddings: false
29
+ gradient_accumulation_steps: 4
30
+ greater_is_better: null
31
+ group_by_length: false
32
+ ignore_data_skip: false
33
+ label_names: null
34
+ label_smoothing_factor: 0
35
+ learning_rate: 0.001
36
+ length_column_name: length
37
+ load_best_model_at_end: false
38
+ local_rank: -1
39
+ log_level: -1
40
+ log_level_replica: -1
41
+ log_on_each_node: true
42
+ logging_dir: runs/mt5-small/3task/adafactor-1e3-15ep-prepend-tquad2train/runs/Sep04_12-32-14_palamut2.yonetim
43
+ logging_first_step: false
44
+ logging_steps: 500
45
+ logging_strategy: *id001
46
+ lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
47
+ - linear
48
+ max_grad_norm: 1.0
49
+ max_source_length: 512
50
+ max_steps: -1
51
+ max_target_length: 64
52
+ metric_for_best_model: null
53
+ model_name_or_path: google/mt5-small
54
+ model_type: mt5
55
+ mp_parameters: ''
56
+ mt5_qg_format: prepend
57
+ mt5_task_list:
58
+ - qa
59
+ - qg
60
+ - ans_ext
61
+ neptune_api_token: null
62
+ neptune_project: obss-ml/nqg-test
63
+ neptune_run: null
64
+ no_cuda: false
65
+ num_train_epochs: 15
66
+ output_dir: runs/mt5-small/3task/adafactor-1e3-15ep-prepend-tquad2train
67
+ overwrite_output_dir: false
68
+ past_index: -1
69
+ per_device_eval_batch_size: 64
70
+ per_device_train_batch_size: 64
71
+ per_gpu_eval_batch_size: null
72
+ per_gpu_train_batch_size: null
73
+ prediction_loss_only: false
74
+ prepare_data: true
75
+ push_to_hub: false
76
+ push_to_hub_model_id: adafactor-1e3-15ep-prepend-tquad2train
77
+ push_to_hub_organization: null
78
+ push_to_hub_token: null
79
+ remove_unused_columns: false
80
+ report_to:
81
+ - wandb
82
+ - neptune
83
+ resume_from_checkpoint: null
84
+ run_name: turque-mt5small-adafactor-1e3-15ep-tquad2train
85
+ save_on_each_node: false
86
+ save_steps: 500
87
+ save_strategy: *id001
88
+ save_total_limit: 1
89
+ seed: 42
90
+ sharded_ddp: []
91
+ skip_memory_metrics: true
92
+ tokenizer_path: tokenizers/mt5-small
93
+ tpu_metrics_debug: false
94
+ tpu_num_cores: null
95
+ train_dataset_list:
96
+ - tquad2-train
97
+ train_file_path: data/train_data.pt
98
+ use_legacy_prediction_loop: false
99
+ valid_dataset_list:
100
+ - tquad2-valid
101
+ valid_file_path: data/valid_data.pt
102
+ wandb_id: null
103
+ wandb_project: turkish-qa-qg
104
+ warmup_ratio: 0.0
105
+ warmup_steps: 0
106
+ weight_decay: 0.0
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca627752681a7d78c491af3722b5715dd8cdc5c1acfb0494cf0de583104a3944
3
+ size 1200734941
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
3
+ size 4309802
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 0, "additional_special_tokens": null, "special_tokens_map_file": "/truba/home/fakyon/.cache/huggingface/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276", "name_or_path": "tokenizers/mt5-small", "sp_model_kwargs": {}, "tokenizer_class": "T5Tokenizer"}