korscideberta / korscideberta-colab.py
kkmkorea's picture
Upload korscideberta-colab.py
471470c
# -*- coding: utf-8 -*-
"""korscideberta.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1vJNUG_F5El5LY8xmmwRVXo66bYBfXtdz
"""
#!git clone https://huggingface.co/kisti/korscideberta; cd korscideberta
# Commented out IPython magic to ensure Python compatibility.
#!pwd
#%cd ..
#!pip install konlpy
# %cd korscideberta
# Commented out IPython magic to ensure Python compatibility.
'''
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# %cd Mecab-ko-for-Google-Colab/
! bash install_mecab-ko_on_colab_light_220429.sh
# %cd ..
!pip install datasets transformers[sentencepiece]
'''
# Commented out IPython magic to ensure Python compatibility.
'''
!pip install -U accelerate; pip install -U transformers; pip install pydantic==1.8
'''
# Notebook shell escape: runs `pwd` to print the current working directory.
# NOTE(review): the leading "!" is IPython/Colab syntax, not Python, so this
# line is only valid when the file is executed cell-by-cell in a notebook.
!pwd
# %cd /content/korscideberta
'''
#[ํ•„์ˆ˜]๋ฆฌ๋ˆ…์Šค ํ„ฐ๋ฏธ๋„์—์„œ ๋ณธ ์ฝ”๋“œ ๋ฐ ํ† ํฌ๋‚˜์ด์ € ๋‹ค์šด๋กœ๋“œ
#git clone https://huggingface.co/kisti/korscideberta
#cd korscideberta
#[ํ•„์ˆ˜]๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์„ค์น˜(Mecab ๋“ฑ ์ž์„ธํ•œ ์„ค์น˜ ๋ฐฉ๋ฒ•์€ KorSciDeBERTaํ™˜๊ฒฝ์„ค์น˜+ํŒŒ์ธํŠœ๋‹.pdf ์ฐธ์กฐ)
!apt install git-lfs
'''
from datasets import load_dataset
import datasets
from huggingface_hub import notebook_login

# Interactive Hugging Face login. Needed because the training run below is
# configured with push_to_hub=True and ends with trainer.push_to_hub().
notebook_login()
# Access tokens look like "hf_" followed by a long random string.
# Never paste a real token into source code or comments.

# Hugging Face repository the tokenizer and the model are loaded from.
model_repository = "kisti/korscideberta"
#model_repository = "./"

from transformers import AutoTokenizer
# Tokenizer class from a local module (presumably the one shipped in the
# cloned korscideberta repository -- confirm it is on the import path).
from tokenization_korscideberta_v2 import DebertaV2Tokenizer

tokenizer = DebertaV2Tokenizer.from_pretrained(model_repository)

# Smoke test: tokenize a sample Korean sentence and print the tokens.
out = tokenizer.tokenize("<cls> 한국어 모델을 <s> 한국어 모델을 공유합니다. <s>")
print(str(out))

# Load the dataset
# Alternative JSON-based loading, kept for reference:
#data_files = {"train": "문장의미-균등저널/test.json", "test": "문장의미-균등저널/train.json", 'dev':'문장의미-균등저널/dev.json'}
#dataset = load_dataset('json', data_files=data_files)

# Read the annotated CSV as a single split, then shuffle it deterministically.
dataset = load_dataset('csv', data_files='data/Abstract_Annotation_Data_tagsentence.csv', split='train')
dataset = dataset.shuffle(seed=42)

# Hold out 10% as the "test" split. train_test_split shuffles again by
# default, so it gets its own seed to keep the split reproducible across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
print("dataset:", str(dataset))

# Tokenize the dataset, then save it
from datasets import ClassLabel

# Build the sorted label vocabulary from the 'tag' column of EVERY split.
# Using only the train split would make ClassLabels.str2int fail for a tag
# that happens to occur only in the held-out split.
labels = sorted({tag for split in dataset.values() for tag in split['tag']})
num_labels = len(labels)
print('Labels: '+str(labels)[:200])

# Maps tag strings to integer class ids; used by preprocess_function.
ClassLabels = ClassLabel(num_classes=num_labels, names=labels)
def preprocess_function(example):
    """Tokenize one example and attach its integer class label.

    The ``sentence`` field is wrapped as ``'<cls>' + sentence + '<s>'`` and
    tokenized with truncation at 512 tokens. The ``tag`` field is converted
    to its class id via ``ClassLabels`` and stored under ``'labels'``.

    Returns the tokenizer output with the extra ``'labels'`` entry.
    """
    wrapped_text = '<cls>' + example["sentence"] + '<s>'
    encoded = tokenizer(wrapped_text, max_length=512, truncation=True)
    encoded['labels'] = ClassLabels.str2int(example['tag'])
    return encoded
# Alternative that also drops the original columns after tokenization:
#tokenized_datasets = dataset.map(preprocess_function, batched=False, remove_columns=dataset["train"].column_names)

# Tokenize example by example, then declare 'labels' as a ClassLabel column.
tokenized_datasets = dataset.map(preprocess_function, batched=False).cast_column(
    "labels", ClassLabel(names=labels)
)

# Sanity check: print one tokenized training example.
random_id = 1
sample_row = tokenized_datasets["train"][random_id]
print("Input IDS:", sample_row["input_ids"])
print("Labels:", sample_row["labels"])

# Persist the tokenized splits to disk.
tokenized_datasets.save_to_disk('data/tok')
# Load the KorSciDeBERTa model
from transformers import AutoModelForSequenceClassification

num_labels = len(labels)


def model_init():
    """Return a sequence-classification model loaded from ``model_repository``.

    The model is requested with ``num_labels`` output classes and with both
    the hidden and attention dropout probabilities set to 0.1.
    """
    # Commented-out alternative from the original notebook:
    # hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.25
    dropout_settings = {
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
    }
    return AutoModelForSequenceClassification.from_pretrained(
        model_repository,
        num_labels=num_labels,
        **dropout_settings,
    )
# Instantiate the classifier once for the Trainer below.
model = model_init()

# Set up the data collator (built from the same tokenizer used above).
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Print how many held-out examples carry each label id.
from collections import Counter
test_label_counts = Counter(tokenized_datasets["test"]["labels"])
print("Test:", test_label_counts)

# Accuracy metric
import numpy as np


def compute_metrics(pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Accuracy is computed directly with NumPy instead of
    ``datasets.load_metric("accuracy")``: ``load_metric`` is deprecated and
    absent from recent ``datasets`` releases, and it has to fetch the metric
    script before it can be used.

    Args:
        pred: Object exposing ``predictions`` (per-class scores with the
            classes on the last axis, or a tuple whose first item is that
            array) and ``label_ids`` (integer class ids).

    Returns:
        dict: ``{"accuracy": fraction_of_correct_predictions}`` as a float.
    """
    pred_logits = pred.predictions
    # Some models return several outputs; the logits are the first element.
    if isinstance(pred_logits, tuple):
        pred_logits = pred_logits[0]
    pred_classes = np.argmax(pred_logits, axis=-1)
    labels = np.asarray(pred.label_ids)
    return {"accuracy": float(np.mean(pred_classes == labels))}
# training_args settings
# If the following error occurs, change output_dir and try again:
# MlflowException: Changing param values is not allowed. Param with key=
import gc
gc.collect()
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="deberta_sent4455",
    num_train_epochs=4,
    #learning_rate=5e-5,
    learning_rate=1.5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    fp16=True,  # Use mixed precision
    # Apex AMP optimization level: the letter "O" followed by a digit
    # ("O0".."O3"). The previous value "01" (digit zero) is not a valid level.
    fp16_opt_level="O1",
    warmup_steps=500,
    logging_steps=200,
    save_steps=2000,
    eval_steps=500,
    push_to_hub=True,
    evaluation_strategy="steps",  # evaluate every `eval_steps` steps
)
# Configure the Trainer, then start training
import gc
gc.collect()
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # tokenizer=tokenizer,  # causes an error: TypeError: save_vocabulary() got an unexpected keyword argument 'filename_prefix'
)

# Run fine-tuning, store the training metrics, then upload to the Hub.
train_output = trainer.train()
train_metrics = train_output.metrics
trainer.save_metrics("train", train_metrics)
trainer.push_to_hub()
#### Fine-tuning and model upload complete
# The remaining lines are notebook shell escapes ("!...") that build MeCab and
# install Python packages. They are IPython/Colab syntax, not Python.
# NOTE(review): they appear after the training code but look like environment
# setup -- presumably meant to be run first; confirm the intended cell order.
# Commented out IPython magic to ensure Python compatibility.
# %cd mecab
# Download konlpy's mecab.sh with curl and run it under bash.
# NOTE(review): each "!" line normally runs in its own subshell, so the
# trailing `cd` likely does not carry over to the next line -- confirm.
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh); cd mecab-0.996-ko-0.9.2;
# Make ./configure and the test scripts executable, then configure, build,
# run `make check`, and install.
!chmod 775 ./configure; ./configure; make; chmod 775 tests/*.sh; make check; make install
# Commented out IPython magic to ensure Python compatibility.
!pwd
# %cd mecab
# Inside mecab-ko-dic-2.1.1-20180720: run autogen.sh, configure, and make
# (no `make install` is issued on this line).
!cd mecab-ko-dic-2.1.1-20180720; chmod 775 ./autogen.sh; ./autogen.sh; ./configure; make
#!mecab -d /usr/local/lib/mecab/dic/mecab-ko-dic
# Commented out IPython magic to ensure Python compatibility.
!pwd
!ls
# %cd korscideberta
# Extract korscideberta.zip into the korscideberta directory.
! unzip korscideberta.zip -d korscideberta; cd korscideberta
# Commented out IPython magic to ensure Python compatibility.
!pwd
# %cd korscideberta
# Install the packages listed in requirements.txt and upgrade nltk.
! pip3 install -r requirements.txt; pip install --upgrade nltk;
# Replace the installed torch stack with the CUDA 11.1 builds pinned below,
# add setuptools_scm/six/mlflow, constrain numpy below 1.24.0, and finally
# install the package found in the current directory (`pip install .`).
!pip uninstall -y torch torchtext torch-tensorrt; pip install --upgrade pip; pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html --default-timeout=100; pip install setuptools_scm six mlflow; pip install "numpy<1.24.0"; pip install .