Vaino Hatanpaa committed on
Commit ceedef8
1 Parent(s): debeca1

add training and evaluation scripts

data/fine-tuning/create_online_reviews.py ADDED
@@ -0,0 +1,28 @@
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets
import os


def main():
    datasets.set_caching_enabled(False)
    tokenizer = AutoTokenizer.from_pretrained("/tokenizer/loc")
    # GPT-2-style tokenizers ship without a pad token, so reuse the EOS token
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    data_loc = "path/to/review/jsons"
    data_files = [fil.path for fil in os.scandir(data_loc)]
    dataset = load_dataset('online_reviews_loading.py', data_files=data_files)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    def process_rating(examples):
        # cast the integer ratings to floats so they can serve as regression targets
        examples["labels"] = [float(item) for item in examples["rating"]]
        return examples

    dataset = dataset["train"] \
        .map(tokenize_function, batched=True) \
        .map(process_rating, batched=True, remove_columns=['rating']) \
        .shuffle(seed=42) \
        .train_test_split(test_size=0.1)
    # save the processed dataset like the other preprocessing scripts do (output path is a placeholder)
    dataset.save_to_disk("/out_dir/online_reviews")


if __name__ == "__main__":
    main()

data/fine-tuning/create_xed.py ADDED
@@ -0,0 +1,57 @@
import numpy as np
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/checkpoint/loc")
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
out_dir = "/out_dir/xed"
max_length = 1024

# XED Finnish emotion data: "fi_annotated" carries lists of emotion labels,
# "fi_neutral" carries a single integer label per sentence
fi_annotated_raw = load_dataset("xed_en_fi", "fi_annotated")
fi_neutral_raw = load_dataset("xed_en_fi", "fi_neutral")

def to_arr(examples):
    # wrap each neutral label in a list so the column type matches the annotated subset
    labels = []
    for item in examples["labels"]:
        labels.append([item])
    return {"sentence": examples["sentence"], "labels": labels}

fi_neutral_mapped = fi_neutral_raw["train"].map(to_arr, batched=True)

fi_neutral_mapped_cast = fi_neutral_mapped.cast(fi_annotated_raw["train"].features)
# combine the neutral and annotated labels into a single dataset
concat_raw_set = concatenate_datasets([fi_neutral_mapped_cast, fi_annotated_raw["train"]])

def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=max_length)

def to_arr_2(examples):
    # turn each list of label indices into a 9-dimensional multi-hot vector
    labels = []
    for item in examples["labels"]:
        label = np.zeros(9)
        label[item] = 1
        labels.append(label.tolist())
    return {"sentence": examples["sentence"], "labels": labels}

tokenized_datasets = concat_raw_set \
    .map(tokenize_function, batched=True) \
    .map(to_arr_2, batched=True) \
    .shuffle(seed=42) \
    .train_test_split(test_size=0.1)
tokenized_datasets.save_to_disk(out_dir)

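
A small standalone check of the multi-hot conversion in to_arr_2 above (the label indices are made up for the example):

import numpy as np
item = [2, 5]          # a sentence annotated with emotion classes 2 and 5
label = np.zeros(9)
label[item] = 1        # NumPy fancy indexing sets both positions at once
print(label.tolist())  # [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
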
data/fine-tuning/create_yle.py ADDED
@@ -0,0 +1,95 @@
import os
import json
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer

root = r'G:\Data\yle\data'  # download from Kielipankki and extract

texts = []
subjects = []
first_subjects = []
first_ids = []
subject_ids = []

# The corpus is split across many JSON files
for path, subdirs, files in os.walk(root):
    for name in files:
        print(os.path.join(path, name))
        with open(os.path.join(path, name), encoding="utf8") as f:
            data = json.load(f)

        # Each file contains a JSON object with multiple articles
        for i in range(len(data["data"])):
            try:
                txt = ""
                s = []      # subjects of this article
                s_ids = []  # ids of those subjects
                # Keep only heading and text elements from the content blocks,
                # since we do not want metadata in a text dataset
                for c in data["data"][i]["content"]:
                    if c["type"] in ("heading", "text"):
                        txt += c["text"]
                        txt += "\n"
                first = ""
                first_id = None
                # An article has n subjects. Loop through them and also record which one
                # was listed first; keeping it as its own column is faster to use later.
                if "subjects" in data["data"][i]:  # check first that the article has subjects at all
                    first = data["data"][i]["subjects"][0]["title"]["fi"]
                    first_id = data["data"][i]["subjects"][0]["id"]
                    for subject in data["data"][i]["subjects"]:
                        s.append(subject["title"]["fi"])
                        s_ids.append(subject["id"])
                first_subjects.append(first)
                first_ids.append(first_id)
                texts.append(txt)
                subjects.append(s)
                subject_ids.append(s_ids)
            except Exception:
                # Some articles contain formatting errors; skip them, as they are a
                # negligible portion of the corpus.
                pass


dataset = Dataset.from_dict({"text": texts, "subjects": subjects, "first_subject": first_subjects, "first_ids": first_ids, "subject_ids": subject_ids})

tokenizer_loc = "/tokenizer_loc"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_loc)
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

def find_major_subject(example):
    good_subjects = ["urheilu","Kotimaan uutiset","Ulkomaat","jääkiekko","talous","politiikka","poliisi","Liikenne ja kuljetus","kulttuuri","puolueet","onnettomuudet","musiikki","Koulutus ja kasvatus","Venäjä","tieliikenne","luonto","autot","terveys","Helsinki","Pohjoismaat","kunnat","Eurooppa","rikokset","vaalit","Yhdysvallat","lainvalvonta"]
    import numpy as np  # re-import locally: module scope broke with num_proc on Windows
    main_subject = None
    label = np.zeros(len(good_subjects))  # one-hot label over the kept subjects
    for subject in example["subjects"]:
        if subject in good_subjects:
            main_subject = subject
            label[good_subjects.index(subject)] = 1
            break  # only the first matching subject is kept
    return {"main_subject": main_subject, "labels": label}

filtered = dataset.map(find_major_subject, num_proc=12).filter(lambda example: example['main_subject'] is not None)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=800)

tokenized_and_filtered_dataset = filtered.map(tokenize_function, batched=True)

tokenized_and_filtered_dataset.save_to_disk("/output/dir")

data/fine-tuning/create_ylilauta.py ADDED
@@ -0,0 +1,58 @@
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/tokenizer/loc")
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
out_dir = "/out_dir/ylilauta"
max_length = 1024

#checkpoint_loc = r"H:\Data_temp\checkpoints\good_large\checkpoint-67400"
#output_dir = r"H:\Data_temp\checkpoints\tests\yle"

path = r"/data/ylilauta-corpus/data/100-percent/train.txt"  # get from https://github.com/spyysalo/ylilauta-corpus
text = []
labels = []
with open(path, "r", encoding="utf-8") as f:
    for line in f:
        # each line holds the label first, then the message text
        parts = line.split(" ", maxsplit=1)
        labels.append(parts[0])
        text.append(parts[1])

data_dict = {"text": text, "labels": labels}
dataset = Dataset.from_dict(data_dict)
label_names = dataset.unique('labels')
n_labels = len(label_names)

def to_one_hot(examples):
    # turn the string label into a one-hot vector over all observed labels
    label = np.zeros(n_labels)
    label[label_names.index(examples["labels"])] = 1

    return {"text": examples["text"], "labels": label.tolist()}

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

tokenized = dataset.map(to_one_hot).map(tokenize_function).train_test_split(test_size=0.1)

tokenized.save_to_disk(out_dir)

data/fine-tuning/online_reviews_loading.py ADDED
@@ -0,0 +1,51 @@
import json
import datasets


# Dataset loading script that skips many optional details (version, configs, citation) but works
class NewDataset(datasets.GeneratorBasedBuilder):
    def _info(self):
        return datasets.DatasetInfo(
            description="beep boop",
            features=datasets.Features(
                {
                    "description": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "rating": datasets.Value("int32")
                }
            ),
            # no default supervised_keys
            supervised_keys=None,
            homepage="no",
            citation="no",
        )

    def _split_generators(self, dl_manager):
        files = self.config.data_files
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files["train"]})]

    def _generate_examples(
        self, files  # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    ):
        """Yields examples as (key, example) tuples."""
        # This method turns the files defined in _split_generators into (key, example) tuples.
        # The `key` is here for legacy reasons (tfds) and is not important in itself.
        key = 0
        for file in files:
            with open(file, encoding="utf-8") as f:
                data = json.load(f)

            # each file holds a list of products, each with its reviews
            for item in data:
                for review in item["reviews"]:
                    yield key, {
                        "description": item["description_raw"],
                        "text": review["reviewText"],
                        "rating": review["rating"],
                    }
                    key += 1

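
The loading script assumes each JSON file contains a list of products, each with a raw description and a list of reviews. A minimal sketch of the expected structure and of calling the script (file name and field values are illustrative; run from the same directory as the script):

import json
from datasets import load_dataset

sample = [
    {
        "description_raw": "Product description text",
        "reviews": [
            {"reviewText": "Hyvä tuote, suosittelen.", "rating": 5},
            {"reviewText": "Ei vastannut odotuksia.", "rating": 2},
        ],
    }
]
with open("reviews_sample.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False)

ds = load_dataset("online_reviews_loading.py", data_files=["reviews_sample.json"])
print(ds["train"][0])  # {'description': ..., 'text': ..., 'rating': 5}
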
data/tokenize.py ADDED
@@ -0,0 +1,50 @@
import datasets
from datasets import load_dataset
from transformers import GPT2TokenizerFast
from tokenizers.processors import TemplateProcessing

input_dir = "dataset_location"
tokenizer_file = "path/to/file"
output_dir = "output/dir"
tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_file)
# Add EOS tokens in the post-processing step, since the GPT-2 tokenizer does not add them by itself
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single="$0 " + tokenizer.eos_token,
    pair="$A " + tokenizer.eos_token + " $B:1 " + tokenizer.eos_token,
    special_tokens=[(tokenizer.eos_token, tokenizer.eos_token_id)],  # use the real EOS id instead of hard-coding 0
)

def tokenize_function(examples):
    return tokenizer(examples["text"])


def group_texts(examples):
    # Group texts into fixed-size blocks; based on the Hugging Face CLM example
    block_size = 1024
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_len = len(concatenated_examples[list(examples.keys())[0]])
    total_len = (total_len // block_size) * block_size  # drop the last partial block
    result = {
        k: [t[i:i + block_size] for i in range(0, total_len, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

def main():
    num_proc = 12  # set to something appropriate for your machine
    dataset = datasets.load_from_disk(input_dir)  # loads a saved dataset object from disk; you could also build one from an iterable or load one like:
    #dataset = load_dataset("Finnish-NLP/mc4_fi_cleaned", split="train").remove_columns(["timestamp","url"])  # example usage from the Hugging Face Hub

    # Tokenize, filter out very short texts and group the texts into blocks of attention size
    processed = dataset \
        .shuffle(seed=42, load_from_cache_file=False, writer_batch_size=100000) \
        .map(tokenize_function, batched=True, num_proc=num_proc, remove_columns=dataset.column_names, load_from_cache_file=False, writer_batch_size=100000) \
        .filter(lambda e: len(e["input_ids"]) > 20, num_proc=num_proc, load_from_cache_file=False, writer_batch_size=100000) \
        .map(group_texts, batched=True, num_proc=num_proc, load_from_cache_file=False, writer_batch_size=100000) \
        .train_test_split(test_size=0.05, load_from_cache_file=False, writer_batch_size=100000)
    processed.save_to_disk(output_dir)
    print(processed)

if __name__ == "__main__":
    main()

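
As a sanity check of the grouping logic, the same steps applied to a toy batch with block_size shrunk to 4 (values are illustrative):

toy = {"input_ids": [[1, 2, 3], [4, 5, 6, 7], [8, 9, 10]]}
block_size = 4
concatenated = {k: sum(toy[k], []) for k in toy}                         # [1, 2, ..., 10]
total_len = (len(concatenated["input_ids"]) // block_size) * block_size  # 8
blocks = [concatenated["input_ids"][i:i + block_size] for i in range(0, total_len, block_size)]
print(blocks)  # [[1, 2, 3, 4], [5, 6, 7, 8]] -- the trailing tokens 9 and 10 are dropped
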
data/train_tokenizer.py ADDED
@@ -0,0 +1,25 @@
import datasets
from transformers import AutoTokenizer

# Start from the original GPT-2 tokenizer and retrain its BPE on the Finnish corpus
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

input_dir = "/dataset/location"
dataset = datasets.load_from_disk(input_dir)

def get_training_corpus():
    # stream the corpus in chunks of 10 000 texts
    for start_idx in range(0, len(dataset), 10000):
        samples = dataset[start_idx : start_idx + 10000]
        yield samples["text"]

print("start")
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=50000)
print("end")
# save_vocabulary writes vocab.json and merges.txt; save_pretrained would additionally store the tokenizer config
tokenizer.save_vocabulary("/tokenizer_location")

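
The saved vocab.json and merges.txt can then be loaded back by the other scripts; a minimal sketch, assuming the output path above:

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("/tokenizer_location")
print(tokenizer.tokenize("Esimerkkilause suomeksi."))  # BPE pieces from the retrained vocabulary
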
evaluate_and_analyze/evaluate.py ADDED
@@ -0,0 +1,132 @@
import argparse
import pickle

import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

parser = argparse.ArgumentParser()
parser.add_argument('test', type=int)    # index into ds_names below
parser.add_argument('length', type=int)  # training-set size cap (kept for symmetry with finetune.py)
#parser.add_argument('--input_file', type=int)
args = parser.parse_args()

def compute_metrics(eval_pred):
    # Dump the raw logits and labels to disk (file names are hard-coded for the xed run)
    logits, labels = eval_pred
    with open("logits_{}.pickle".format("xed"), "wb") as handle:
        pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open("labels_{}.pickle".format("xed"), "wb") as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Continue in a Jupyter notebook from here
    return {}


class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

def main():
    ds_names = ["yle", "online_review", "xed", "ylilauta"]
    #ds_sizes = [1000, 3000, 10000, 32000, 9999999]
    print("test:", args.test)
    ds_name = ds_names[args.test]
    ds_size = args.length
    print(ds_name, ds_size)

    metric = compute_metrics

    #print("cuda_avail:",torch.cuda.is_available())
    #checkpoint_loc = "/media/volume/output/checkpoint-275000"
    #output_dir = "/media/volume/fi_nlp/output/finetune"
    #checkpoint_loc = r"H:\Data_temp\checkpoints\good_large\checkpoint-67400"
    output_dir = "/data/loc/" + ds_name

    # Most of these parameters are not used during evaluation, but the Trainer expects them
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-6,
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        lr_scheduler_type="linear",
        gradient_accumulation_steps=4,
        max_steps=10000,
        num_train_epochs=20000,
        save_total_limit=2,
        dataloader_num_workers=5,
        save_steps=100000,
        warmup_steps=500,
        do_eval=True,
        eval_steps=500,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=50,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        log_on_each_node=False,
        disable_tqdm=True
    )

    print(training_args)

    dataset = load_from_disk(r"/data_loc/" + ds_name)["test"]
    #dataset = load_from_disk(r"C:\Users\vin\Documents\Projects\dippa\tests\ylilauta\tokenized_set").train_test_split(test_size=0.1)

    trainer_class = MultilabelTrainer

    #print("num_labels",num_labels)
    model = AutoModelForSequenceClassification.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    tokenizer = AutoTokenizer.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    print("init trainer")
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
        tokenizer=tokenizer,
        compute_metrics=metric,
        data_collator=default_data_collator
    )
    #checkpoint = get_last_checkpoint(output_dir)
    #train_result = trainer.train()
    #trainer.save_state()
    metrics = trainer.evaluate()
    print(metrics)
    #trainer.save_model()  # saves the tokenizer too for easy upload

if __name__ == "__main__":
    main()

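
The pickled logits and labels are meant to be analyzed in a notebook; a minimal sketch of that follow-up step, computing an exact-match accuracy from the dumped files (file names as written in compute_metrics above):

import pickle
import numpy as np

with open("logits_xed.pickle", "rb") as handle:
    logits = pickle.load(handle)
with open("labels_xed.pickle", "rb") as handle:
    labels = pickle.load(handle)

# take the argmax as the single predicted class and compare it against the multi-hot labels
predictions = np.zeros(logits.shape)
predictions[np.arange(len(predictions)), logits.argmax(1)] = 1
print("exact-match acc:", np.all((predictions > 0.5) == (labels > 0.5), axis=1).mean())
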
evaluate_and_analyze/few_shot.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
evaluate_and_analyze/generation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
finetune.py ADDED
@@ -0,0 +1,150 @@
import argparse

import numpy as np
import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

parser = argparse.ArgumentParser()
parser.add_argument('test', type=int)    # index into ds_names below
parser.add_argument('length', type=int)  # number of training examples to use
#parser.add_argument('--input_file', type=int)
args = parser.parse_args()

def compute_metrics(eval_pred):
    # Exact-match accuracy: take the argmax as the single predicted class and
    # require it to match the (multi-hot) label vector exactly
    logits, labels = eval_pred

    predictions = np.zeros(logits.shape)
    predictions[np.arange(len(predictions)), logits.argmax(1)] = 1
    predictions = predictions > 0.5

    #predictions = logits > 0.5
    labels = labels > 0.5
    return {"acc": np.all(predictions == labels, axis=1).sum() / predictions.shape[0]}

def compute_metrics_regression(eval_pred):
    logits, labels = eval_pred

    labels = np.expand_dims(labels, 1)
    val = np.abs(logits - labels).mean()  # mean absolute error
    perc = ((np.abs(logits - labels).round() < 1).sum() * 100) / (len(labels))      # % of predictions roughly within 0.5 of the true rating
    perc_50 = ((np.abs(logits - labels).round()[0:50] < 1).sum() * 100) / (50)      # same, on the first 50 examples only

    return {"dev": val, "perc": perc, "perc_50": perc_50}


class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

def main():
    ds_names = ["yle", "online_reviews", "xed", "ylilauta"]
    #ds_sizes = [1000, 3000, 10000, 32000, 9999999]
    print("test:", args.test)
    ds_name = ds_names[args.test]
    ds_size = args.length
    print(ds_name, ds_size)

    metric = compute_metrics_regression if ds_name == "online_reviews" else compute_metrics

    #print("cuda_avail:",torch.cuda.is_available())
    #checkpoint_loc = "/media/volume/output/checkpoint-275000"
    #output_dir = "/media/volume/fi_nlp/output/finetune"
    #checkpoint_loc = r"H:\Data_temp\checkpoints\good_large\checkpoint-67400"
    output_dir = "/scratch/project_462000007/hatanpav/output/dippa/gpt/" + ds_name

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-6,
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        lr_scheduler_type="linear",
        gradient_accumulation_steps=2,  # assumes 4 nodes x 8 GPUs; on a single GPU, set to 64 to keep the same effective batch size
        max_steps=10000,
        num_train_epochs=20000,  # overridden by max_steps
        save_total_limit=2,
        dataloader_num_workers=5,
        save_steps=100000,
        warmup_steps=500,
        do_eval=True,
        eval_steps=500,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=50,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        log_on_each_node=False,
        disable_tqdm=True
    )

    print(training_args)

    dataset = load_from_disk(r"/path/to/data/" + ds_name)

    # Handle the regression-type task:
    n_labels = 1
    trainer_class = MultilabelTrainer
    try:
        n_labels = len(dataset["train"][0]["labels"])
    except TypeError:
        # the label is a single float (regression), so use the default Trainer
        n_labels = 1
        trainer_class = Trainer
    if ds_size > len(dataset["train"]):
        ds_size = len(dataset["train"])


    model = AutoModelForSequenceClassification.from_pretrained("/checkpoint/loc", num_labels=n_labels)
    tokenizer = AutoTokenizer.from_pretrained("/checkpoint/loc")
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    print("init trainer")
    train_set = dataset["train"].select(range(ds_size))
    test_set = dataset["test"]
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=test_set,
        tokenizer=tokenizer,
        compute_metrics=metric,
        data_collator=default_data_collator
    )
    checkpoint = None
    #checkpoint = get_last_checkpoint(output_dir)
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    #trainer.save_state()
    metrics = trainer.evaluate()
    print(metrics)
    trainer.save_model()  # saves the tokenizer too for easy upload

if __name__ == "__main__":
    main()

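
Both finetune.py and the evaluation script take the same two positional arguments: an index into ds_names (0 = yle, 1 = online reviews, 2 = xed, 3 = ylilauta) and a cap on the number of training examples. For example, running "python finetune.py 2 10000" fine-tunes on XED with at most 10 000 training examples; the placeholder data and checkpoint paths inside the scripts still need to point at real locations first.
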
train.py ADDED
@@ -0,0 +1,97 @@
import datasets
from transformers import (
    GPT2TokenizerFast,
    AutoConfig,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator
)
from transformers.trainer_utils import get_last_checkpoint
import torch
#from transformers.utils.dummy_tokenizers_objects import PreTrainedTokenizerFast

#config_name = "C:\\Users\\vin\\Documents\\Projects\\NLP\\kielimalli\\config.json"
#tokenizer_file = "C:\\Users\\vin\\Documents\\Projects\\NLP\\models\\tokens.json"
#input_dir = "H:\\Data_temp\\tokenized_dataset"
#output_dir = "H:\\Data_temp\\checkpoints\\model1"

def main():
    import os
    # enable these if required by your environment
    #os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    #torch.backends.cuda.matmul.allow_tf32 = True
    #torch.backends.cudnn.allow_tf32 = True

    config_name = "config_large_bpe.json"
    tokenizer_files = "/path/to/tokenizer/files"
    input_dir = "/data/dir"
    output_dir = "/out/dir"

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=2.067e-5,
        lr_scheduler_type="linear",
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        gradient_accumulation_steps=32,
        num_train_epochs=6.7,
        save_total_limit=2,
        dataloader_num_workers=10,
        save_steps=100,
        warmup_steps=1000,
        do_eval=True,
        eval_steps=1000,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=100,
        bf16=True,
        tf32=True,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        bf16_full_eval=True
    )

    print("setting up tokenizer...")
    tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_files)
    #tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # probably wrong
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    from tokenizers.processors import TemplateProcessing
    # Add EOS tokens in the post-processing step, since the GPT-2 tokenizer does not add them by itself
    tokenizer._tokenizer.post_processor = TemplateProcessing(
        single="$0 " + tokenizer.eos_token,
        pair="$A " + tokenizer.eos_token + " $B:1 " + tokenizer.eos_token,
        special_tokens=[(tokenizer.eos_token, tokenizer.eos_token_id)],  # use the real EOS id instead of hard-coding 0
    )

    print("loading model...")
    config = AutoConfig.from_pretrained(config_name)
    model = AutoModelForCausalLM.from_config(config)
    #model = AutoModelForCausalLM.from_pretrained("/checkpoint/dir")  # use this instead when restarting training and loading weights from a checkpoint
    model.gradient_checkpointing_enable()  # optional, trades compute for memory
    print("loading data...")
    dataset = datasets.load_from_disk(input_dir)

    print("starting training...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=default_data_collator,
        eval_dataset=dataset["test"].select(range(10000)),  # to save time, do not evaluate on the whole test set during training
        tokenizer=tokenizer
    )

    #checkpoint = None
    checkpoint = get_last_checkpoint(output_dir)
    print("checkpoint:", checkpoint)
    trainer.train(resume_from_checkpoint=checkpoint)

if __name__ == "__main__":
    main()

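
config_large_bpe.json itself is not part of this commit; an equivalent configuration file can be generated with transformers directly. A minimal sketch, where the vocabulary size matches data/train_tokenizer.py and the context length matches the block size in data/tokenize.py, while the width, depth and head count below are purely illustrative rather than the values of the actual model:

from transformers import GPT2Config

config = GPT2Config(
    vocab_size=50000,   # vocab_size used in data/train_tokenizer.py
    n_positions=1024,   # block_size used in data/tokenize.py
    n_embd=1280,        # illustrative width
    n_layer=36,         # illustrative depth
    n_head=20,          # illustrative head count
)
config.to_json_file("config_large_bpe.json")  # readable by AutoConfig.from_pretrained above
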