"""korscideberta.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1vJNUG_F5El5LY8xmmwRVXo66bYBfXtdz |
|
""" |
|
|
|
# Optional (Colab): install the Mecab-ko morphological analyzer and the base libraries.
'''
|
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git |
|
# %cd Mecab-ko-for-Google-Colab/ |
|
! bash install_mecab-ko_on_colab_light_220429.sh |
|
# %cd .. |
|
!pip install datasets transformers[sentencepiece] |
|
''' |
|
|
|
|
|
# Optional: upgrade accelerate/transformers and pin pydantic to 1.8.
'''
|
!pip install -U accelerate; pip install -U transformers; pip install pydantic==1.8 |
|
''' |
|
|
|
!pwd |
|
|
|
|
|
''' |
|
# [Required] Download this code and the tokenizer from a Linux terminal:
|
#git clone https://huggingface.co/kisti/korscideberta |
|
#cd korscideberta |
|
|
|
# [Required] Install the libraries (for detailed setup of Mecab etc., see KorSciDeBERTa환경설치+파인튜닝.pdf)
|
!apt install git-lfs |
|
|
|
''' |
|
|
|
from datasets import load_dataset |
|
import datasets |
|
from huggingface_hub import notebook_login |
|
|
|
notebook_login()  # authenticate with the Hugging Face Hub (required for push_to_hub below)
|
|
|
|
|
model_repository = "kisti/korscideberta" |
|
|
|
# The custom tokenizer class ships with the korscideberta repo (tokenization_korscideberta_v2.py).
from tokenization_korscideberta_v2 import DebertaV2Tokenizer

tokenizer = DebertaV2Tokenizer.from_pretrained(model_repository)
|
out = tokenizer.tokenize("<cls> 한국어 모델을 <s> 한국어 모델을 공유합니다. <s>")
|
print(str(out)) |
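# Added sanity check (not in the original): round-trip the tokens through the vocabulary.
# This assumes the custom tokenizer follows the standard PreTrainedTokenizer interface,
# which provides convert_tokens_to_ids() and decode().
ids = tokenizer.convert_tokens_to_ids(out)
print("Token IDs:", ids)
print("Decoded:", tokenizer.decode(ids))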
|
|
|
|
|
|
|
|
|
dataset = load_dataset('csv', data_files='data/Abstract_Annotation_Data_tagsentence.csv', split='train') |
|
dataset = dataset.shuffle(seed=42) |
|
dataset = dataset.train_test_split(test_size=0.1) |
|
print("dataset:", str(dataset)) |
|
|
|
|
|
from datasets import ClassLabel |
|
# Collect the unique tags and sort them for a stable label order.
labels = sorted(set(dataset['train']['tag']))
|
num_labels = len(labels) |
|
print('Labels: '+str(labels)[:200]) |
|
ClassLabels = ClassLabel(num_classes=len(labels), names=labels) |
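# Added sanity check: ClassLabel maps tag strings to integer ids and back.
sample_tag = labels[0]
sample_id = ClassLabels.str2int(sample_tag)
print(f"{sample_tag} -> {sample_id} -> {ClassLabels.int2str(sample_id)}")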
|
|
|
def preprocess_function(example):
    # Wrap each sentence in the model's special tokens, matching the tokenizer demo above.
    output_dict = tokenizer('<cls>' + example["sentence"] + '<s>', max_length=512, truncation=True)
    output_dict['labels'] = ClassLabels.str2int(example['tag'])
    return output_dict
|
|
|
tokenized_datasets = dataset.map(preprocess_function, batched=False) |
|
tokenized_datasets = tokenized_datasets.cast_column("labels", ClassLabel(names=labels)) |
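# Added: confirm the cast; 'labels' should now be a ClassLabel feature carrying the tag names.
print(tokenized_datasets["train"].features["labels"])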
|
|
|
|
|
random_id = 1 |
|
print("Input IDS:", tokenized_datasets["train"][random_id]["input_ids"]) |
|
print("Labels:", tokenized_datasets["train"][random_id]["labels"]) |
|
tokenized_datasets.save_to_disk('data/tok') |
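# Added note: the tokenized dataset saved above can be reloaded later without
# re-running preprocessing, using the counterpart of save_to_disk:
# from datasets import load_from_disk
# tokenized_datasets = load_from_disk('data/tok')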
|
|
|
|
|
from transformers import AutoModelForSequenceClassification |
|
|
|
|
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_repository,
        num_labels=num_labels,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
    )
|
model = model_init() |
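# Added: report the model size; num_parameters() is a standard PreTrainedModel method.
print(f"Parameters: {model.num_parameters():,}")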
|
|
|
|
|
from transformers import DataCollatorWithPadding |
|
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) |
|
from collections import Counter |
|
print("Test:", Counter(tokenized_datasets["test"]["labels"])) |
|
|
|
|
|
from datasets import load_metric  # note: newer datasets versions moved metrics to the separate `evaluate` package
|
accuracy = load_metric("accuracy") |
|
|
|
import numpy as np |
|
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_classes = np.argmax(pred_logits, axis=-1)
    labels = np.asarray(pred.label_ids)
    acc = accuracy.compute(predictions=pred_classes, references=labels)
    return {"accuracy": acc["accuracy"]}
|
|
|
|
|
|
|
|
|
|
|
import gc |
|
gc.collect() |
|
from transformers import TrainingArguments |
|
training_args = TrainingArguments(
    output_dir="deberta_sent4455",
    num_train_epochs=4,
    learning_rate=1.5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    fp16=True,
    fp16_opt_level="O1",  # apex AMP level is the letter O, not zero; only used when apex is installed
    warmup_steps=500,
    logging_steps=200,
    save_steps=2000,
    eval_steps=500,
    push_to_hub=True,
    evaluation_strategy="steps",
)
|
|
|
|
|
import gc |
|
gc.collect() |
|
|
|
from transformers import Trainer |
|
trainer = Trainer(
    args=training_args,
    compute_metrics=compute_metrics,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,  # ensures the tokenizer is saved and pushed alongside the model
)
|
train_metrics = trainer.train().metrics |
|
trainer.save_metrics("train", train_metrics) |
|
trainer.push_to_hub() |
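# Added inference sketch (names are assumptions): load the fine-tuned checkpoint that
# the Trainer wrote to output_dir and classify one placeholder sentence, using the same
# <cls> ... <s> wrapping as preprocess_function.
import torch

finetuned = AutoModelForSequenceClassification.from_pretrained("deberta_sent4455")
finetuned.eval()
enc = tokenizer("<cls>" + "예시 문장입니다." + "<s>", max_length=512, truncation=True, return_tensors="pt")
with torch.no_grad():
    logits = finetuned(**enc).logits
print("Predicted tag:", ClassLabels.int2str(logits.argmax(dim=-1).item()))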
|
|
|
|
|
|
|
|
|
# Build Mecab from source. Each `!` line runs in its own shell, so a trailing `cd`
# does not persist; chain the build steps with && inside a single invocation instead.
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
!cd mecab-0.996-ko-0.9.2 && chmod 775 ./configure && ./configure && make && chmod 775 tests/*.sh && make check && make install
|
|
|
|
|
!pwd |
|
|
|
!cd mecab-ko-dic-2.1.1-20180720 && chmod 775 ./autogen.sh && ./autogen.sh && ./configure && make
|
|
|
|
|
|
|
|
|
!pwd |
|
!ls |
|
|
|
|
|
!unzip korscideberta.zip -d korscideberta
# %cd korscideberta
|
|
|
|
|
!pwd |
|
|
|
|
|
!pip3 install -r requirements.txt
!pip install --upgrade nltk

# Pin torch to a CUDA 11.1 build compatible with this repo, then install the package itself.
!pip uninstall -y torch torchtext torch-tensorrt
!pip install --upgrade pip
!pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html --default-timeout=100
!pip install setuptools_scm six mlflow
!pip install "numpy<1.24.0"
!pip install .