kkmkorea committed
Commit
471470c
•
1 Parent(s): a8805a4

Upload korscideberta-colab.py

Files changed (1)
  1. korscideberta-colab.py +187 -0
korscideberta-colab.py ADDED
@@ -0,0 +1,187 @@
+ # -*- coding: utf-8 -*-
+ """korscideberta.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1vJNUG_F5El5LY8xmmwRVXo66bYBfXtdz
+ """
+
+ #!git clone https://huggingface.co/kisti/korscideberta; cd korscideberta
+
+ # Commented out IPython magic to ensure Python compatibility.
+ #!pwd
+ #%cd ..
+ #!pip install konlpy
+ # %cd korscideberta
+
+ # Commented out IPython magic to ensure Python compatibility.
+ '''
+ ! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
+ # %cd Mecab-ko-for-Google-Colab/
+ ! bash install_mecab-ko_on_colab_light_220429.sh
+ # %cd ..
+ !pip install datasets transformers[sentencepiece]
+ '''
+
+ # Commented out IPython magic to ensure Python compatibility.
+ '''
+ !pip install -U accelerate; pip install -U transformers; pip install pydantic==1.8
+ '''
+
+ !pwd
+ # %cd /content/korscideberta
+
+ '''
+ # [Required] Download this code and the tokenizer from a Linux terminal
+ #git clone https://huggingface.co/kisti/korscideberta
+ #cd korscideberta
+
+ # [Required] Install the libraries (for Mecab and other detailed setup steps, see KorSciDeBERTa환경설치+파인튜닝.pdf)
+ !apt install git-lfs
+ '''
+
+ from datasets import load_dataset
+ import datasets
+ from huggingface_hub import notebook_login
+
+ notebook_login()  # Log in to the Hugging Face Hub
+ # Example token: hf_jRjLZcSBibYHwUaTjiNUEeoJlFxhFkGM
+
+ model_repository = "kisti/korscideberta"  # Hugging Face model name
+ #model_repository = "./"
+ from transformers import AutoTokenizer
+ from tokenization_korscideberta_v2 import DebertaV2Tokenizer
+ tokenizer = DebertaV2Tokenizer.from_pretrained(model_repository)
+ out = tokenizer.tokenize("<cls> 한국어 모델을 <s> 한국어 모델을 공유합니다. <s>")
+ print(str(out))
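+
+ # Optional sanity check (illustrative sketch, not required by the pipeline):
+ # the tokenizer is also callable and returns the input_ids/attention_mask
+ # that the model consumes.
+ enc = tokenizer("<cls> 한국어 모델을 공유합니다. <s>", max_length=512, truncation=True)
+ print(enc["input_ids"][:10])  # first few token ids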
+
+ # Load the dataset
+ #data_files = {"train": "문장의미-균등저도/test.json", "test": "문장의미-균등저도/train.json", 'dev': '문장의미-균등저도/dev.json'}
+ #dataset = load_dataset('json', data_files=data_files)
+ dataset = load_dataset('csv', data_files='data/Abstract_Annotation_Data_tagsentence.csv', split='train')
+ dataset = dataset.shuffle(seed=42)
+ dataset = dataset.train_test_split(test_size=0.1)
+ print("dataset:", str(dataset))
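+
+ # Quick look at one row (sketch): the CSV is expected to provide the
+ # 'sentence' and 'tag' columns used by preprocess_function below.
+ print(dataset["train"][0]["sentence"][:100], "->", dataset["train"][0]["tag"])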
+
+ # Tokenize the dataset and save it to disk
+ from datasets import ClassLabel
+ labels = [x for x in dataset['train']['tag']]
+ labels = list(set(labels))
+ labels.sort()
+ num_labels = len(labels)
+ print('Labels: ' + str(labels)[:200])
+ ClassLabels = ClassLabel(num_classes=len(labels), names=labels)
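+
+ # ClassLabel maps tag strings to integer ids and back; a minimal round-trip sketch:
+ example_tag = labels[0]
+ tag_id = ClassLabels.str2int(example_tag)
+ assert ClassLabels.int2str(tag_id) == example_tag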
+
+ def preprocess_function(example):
+     output_dict = tokenizer('<cls>' + example["sentence"] + '<s>', max_length=512, truncation=True)
+     output_dict['labels'] = ClassLabels.str2int(example['tag'])
+     return output_dict
+
+ #tokenized_datasets = dataset.map(preprocess_function, batched=False, remove_columns=dataset["train"].column_names)
+ tokenized_datasets = dataset.map(preprocess_function, batched=False)
+ tokenized_datasets = tokenized_datasets.cast_column("labels", ClassLabel(names=labels))
+
+ # Check the dataset tokenization
+ random_id = 1
+ print("Input IDS:", tokenized_datasets["train"][random_id]["input_ids"])
+ print("Labels:", tokenized_datasets["train"][random_id]["labels"])
+ tokenized_datasets.save_to_disk('data/tok')
+
+ # Load the KorSciDeBERTa model
+ from transformers import AutoModelForSequenceClassification
+
+ num_labels = len(labels)
+ def model_init():
+     #return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=7)
+     #return AutoModelForSequenceClassification.from_pretrained(model_repository, num_labels=num_labels, hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.25)
+     return AutoModelForSequenceClassification.from_pretrained(model_repository, num_labels=num_labels, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
+ model = model_init()
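+
+ # Sanity check (sketch): the classification head should expose one logit per tag.
+ print("num_labels:", model.config.num_labels)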
+
+ # Check the DataCollator
+ from transformers import DataCollatorWithPadding
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+ from collections import Counter
+ print("Test:", Counter(tokenized_datasets["test"]["labels"]))
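+
+ # Dynamic-padding sketch: collate two examples by hand. The string columns that
+ # map() kept ('sentence', 'tag') are filtered out here, as the Trainer does
+ # automatically via remove_unused_columns.
+ feats = [{k: tokenized_datasets["train"][i][k] for k in ("input_ids", "attention_mask", "labels")}
+          for i in range(2)]
+ print(data_collator(feats)["input_ids"].shape)  # padded to the longer sequence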
+
+ # Accuracy metric
+ from datasets import load_metric
+ accuracy = load_metric("accuracy")
+
+ import numpy as np
+ def compute_metrics(pred):
+     pred_logits = pred.predictions
+     pred_classes = np.argmax(pred_logits, axis=-1)
+     labels = np.asarray(pred.label_ids)
+     acc = accuracy.compute(predictions=pred_classes, references=labels)
+     return {"accuracy": acc["accuracy"]}
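+
+ # Minimal self-test of the metric with dummy logits (sketch):
+ # argmax of [[0.1, 0.9], [0.8, 0.2]] is [1, 0]; against references [1, 1]
+ # the accuracy is 0.5.
+ print(accuracy.compute(predictions=[1, 0], references=[1, 1]))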
+
+ # Configure training_args
+ # If the error below occurs, change output_dir and try again:
+ # MlflowException: Changing param values is not allowed. Param with key=
+
+ import gc
+ gc.collect()
+ from transformers import TrainingArguments
+ training_args = TrainingArguments(
+     output_dir="deberta_sent4455",
+     num_train_epochs=4,
+     #learning_rate=5e-5,
+     learning_rate=1.5e-5,
+     per_device_train_batch_size=16,
+     per_device_eval_batch_size=8,
+     weight_decay=0.01,
+     fp16=True,  # use mixed precision
+     fp16_opt_level="O1",  # mixed-precision mode (letter O, not zero)
+     warmup_steps=500,
+     logging_steps=200,
+     save_steps=2000,
+     eval_steps=500,
+     push_to_hub=True,
+     evaluation_strategy="steps",
+ )
+
+ # Set up the Trainer and start training
+ import gc
+ gc.collect()
+
+ from transformers import Trainer
+ trainer = Trainer(
+     args=training_args,
+     compute_metrics=compute_metrics,
+     model=model,
+     #tokenizer=tokenizer,  # triggers an error: TypeError: save_vocabulary() got an unexpected keyword argument 'filename_prefix'
+     data_collator=data_collator,
+     train_dataset=tokenized_datasets["train"],
+     eval_dataset=tokenized_datasets["test"],
+ )
+ train_metrics = trainer.train().metrics
+ trainer.save_metrics("train", train_metrics)
+ trainer.push_to_hub()
+ #### Fine-tuning and model upload complete
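+
+ # Inference sketch after fine-tuning (assumes torch is available and that the
+ # Trainer has written a checkpoint to output_dir, which push_to_hub() does):
+ import torch
+ finetuned = AutoModelForSequenceClassification.from_pretrained("deberta_sent4455")
+ inputs = tokenizer("<cls> 한국어 모델을 공유합니다. <s>", return_tensors="pt", truncation=True)
+ with torch.no_grad():
+     pred = finetuned(**inputs).logits.argmax(-1).item()
+ print(ClassLabels.int2str(pred))  # predicted tag name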
+
+ # Commented out IPython magic to ensure Python compatibility.
+ # %cd mecab
+ !bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh); cd mecab-0.996-ko-0.9.2;
+ !chmod 775 ./configure; ./configure; make; chmod 775 tests/*.sh; make check; make install
+
+ # Commented out IPython magic to ensure Python compatibility.
+ !pwd
+ # %cd mecab
+ !cd mecab-ko-dic-2.1.1-20180720; chmod 775 ./autogen.sh; ./autogen.sh; ./configure; make
+
+ #!mecab -d /usr/local/lib/mecab/dic/mecab-ko-dic
+
+ # Commented out IPython magic to ensure Python compatibility.
+ !pwd
+ !ls
+ # %cd korscideberta
+
+ ! unzip korscideberta.zip -d korscideberta; cd korscideberta
+
+ # Commented out IPython magic to ensure Python compatibility.
+ !pwd
+ # %cd korscideberta
+
+ ! pip3 install -r requirements.txt; pip install --upgrade nltk;
+ !pip uninstall -y torch torchtext torch-tensorrt; pip install --upgrade pip; pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html --default-timeout=100; pip install setuptools_scm six mlflow; pip install "numpy<1.24.0"; pip install .