# -*- coding: utf-8 -*-
"""korscideberta.ipynb

Fine-tunes the KorSciDeBERTa model (kisti/korscideberta) for sentence
classification on the Abstract_Annotation_Data_tagsentence.csv dataset,
then pushes the fine-tuned model to the Hugging Face Hub.

Automatically generated by Colaboratory; cleaned up for readability.
Original file is located at
    https://colab.research.google.com/drive/1vJNUG_F5El5LY8xmmwRVXo66bYBfXtdz

Environment setup (run these in a shell / Colab cell before executing this
script — they are NOT valid plain Python and are kept here only as a record):

    # Clone the model repo (code + tokenizer):
    #   git clone https://huggingface.co/kisti/korscideberta
    #   cd korscideberta
    #
    # Mecab installation (Colab):
    #   git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
    #   cd Mecab-ko-for-Google-Colab/
    #   bash install_mecab-ko_on_colab_light_220429.sh
    #
    # Python dependencies:
    #   pip install konlpy
    #   pip install datasets transformers[sentencepiece]
    #   pip install -U accelerate transformers
    #   pip install pydantic==1.8
    #   apt install git-lfs
    #
    # Manual mecab build (alternative to the Colab script above):
    #   bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
    #   cd mecab-0.996-ko-0.9.2
    #   chmod 775 ./configure; ./configure; make
    #   chmod 775 tests/*.sh; make check; make install
    #   cd mecab-ko-dic-2.1.1-20180720
    #   chmod 775 ./autogen.sh; ./autogen.sh; ./configure; make
    #
    # Project install:
    #   unzip korscideberta.zip -d korscideberta; cd korscideberta
    #   pip3 install -r requirements.txt
    #   pip install --upgrade nltk
    #   pip uninstall -y torch torchtext torch-tensorrt
    #   pip install --upgrade pip
    #   pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 \
    #       torchaudio==0.10.1 \
    #       -f https://download.pytorch.org/whl/cu111/torch_stable.html \
    #       --default-timeout=100
    #   pip install setuptools_scm six mlflow
    #   pip install "numpy<1.24.0"
    #   pip install .

See "KorSciDeBERTa환경설치+파인튜닝.pdf" for detailed install instructions
(Mecab etc.).
"""
import gc

import numpy as np
from datasets import ClassLabel, load_dataset, load_metric
from huggingface_hub import notebook_login
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Project-local tokenizer implementation shipped with the korscideberta repo.
from tokenization_korscideberta_v2 import DebertaV2Tokenizer

# --- Hugging Face Hub login -------------------------------------------------
# Prompts for an access token.  NOTE(review): the original file embedded an
# example "hf_..." token in a comment — never commit real tokens; revoke any
# token that has been committed.
notebook_login()

# Hub id of the pretrained model to fine-tune (use "./" for a local checkout).
model_repository = "kisti/korscideberta"

# --- Tokenizer sanity check -------------------------------------------------
tokenizer = DebertaV2Tokenizer.from_pretrained(model_repository)
out = tokenizer.tokenize(" 한국어 모델을 한국어 모델을 공유합니다. ")
print(str(out))

# --- Dataset loading --------------------------------------------------------
# CSV with (at least) "sentence" and "tag" columns; shuffled, then split
# 90% train / 10% test.
dataset = load_dataset(
    "csv",
    data_files="data/Abstract_Annotation_Data_tagsentence.csv",
    split="train",
)
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.1)
print("dataset:", str(dataset))

# --- Label setup ------------------------------------------------------------
# Collect the distinct "tag" values from the training split; sort them so the
# label-to-id mapping is deterministic across runs.
labels = sorted(set(dataset["train"]["tag"]))
num_labels = len(labels)
print("Labels: " + str(labels)[:200])
ClassLabels = ClassLabel(num_classes=num_labels, names=labels)


def preprocess_function(example):
    """Tokenize one example's sentence and attach its integer label."""
    output_dict = tokenizer(example["sentence"], max_length=512, truncation=True)
    output_dict["labels"] = ClassLabels.str2int(example["tag"])
    return output_dict


tokenized_datasets = dataset.map(preprocess_function, batched=False)
# Cast so the datasets library knows "labels" is a ClassLabel feature.
tokenized_datasets = tokenized_datasets.cast_column("labels", ClassLabel(names=labels))

# Spot-check one tokenized example.
random_id = 1
print("Input IDS:", tokenized_datasets["train"][random_id]["input_ids"])
print("Labels:", tokenized_datasets["train"][random_id]["labels"])

# Persist the tokenized dataset for reuse.
tokenized_datasets.save_to_disk("data/tok")


# --- Model ------------------------------------------------------------------
def model_init():
    """Build a fresh classification head on top of the pretrained encoder."""
    # Dropout 0.1/0.1 — earlier experiments used 0.3/0.25.
    return AutoModelForSequenceClassification.from_pretrained(
        model_repository,
        num_labels=num_labels,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
    )


model = model_init()

# Dynamic padding per batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Label distribution of the held-out split.
from collections import Counter

print("Test:", Counter(tokenized_datasets["test"]["labels"]))

# --- Metric -----------------------------------------------------------------
accuracy = load_metric("accuracy")


def compute_metrics(pred):
    """Return accuracy from an EvalPrediction (argmax over logits)."""
    pred_logits = pred.predictions
    pred_classes = np.argmax(pred_logits, axis=-1)
    label_ids = np.asarray(pred.label_ids)
    acc = accuracy.compute(predictions=pred_classes, references=label_ids)
    return {"accuracy": acc["accuracy"]}


# --- Training arguments -----------------------------------------------------
# If you hit "MlflowException: Changing param values is not allowed", change
# output_dir and retry.
gc.collect()

training_args = TrainingArguments(
    output_dir="deberta_sent4455",
    num_train_epochs=4,
    learning_rate=1.5e-5,  # earlier runs used 5e-5
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    fp16=True,  # use mixed precision
    # BUGFIX: was "01" (digit zero) — Apex AMP opt levels are "O0".."O3"
    # with the letter O; "01" is rejected at runtime.
    fp16_opt_level="O1",
    warmup_steps=500,
    logging_steps=200,
    save_steps=2000,
    eval_steps=500,
    push_to_hub=True,
    evaluation_strategy="steps",
)

# --- Train and publish ------------------------------------------------------
gc.collect()

trainer = Trainer(
    args=training_args,
    compute_metrics=compute_metrics,
    model=model,
    # tokenizer=tokenizer,  # causes: TypeError: save_vocabulary() got an
    #                       # unexpected keyword argument 'filename_prefix'
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

train_metrics = trainer.train().metrics
trainer.save_metrics("train", train_metrics)
trainer.push_to_hub()
# Fine-tuning and model upload complete.