| import numpy as np |
| np.random.seed(42) |
| import random |
| random.seed(42) |
|
|
| import pandas as pd |
| from sklearn.metrics import classification_report |
|
|
| import logging |
| import os |
| import sys |
| from dataclasses import dataclass, field |
| from typing import Optional |
| import json |
|
|
| from copy import deepcopy |
|
|
| import torch |
|
|
| import transformers as transformers |
|
|
| from transformers import ( |
| HfArgumentParser, |
| Trainer, |
| TrainingArguments, |
| set_seed |
| ) |
| from transformers.file_utils import is_offline_mode |
| from transformers.trainer_utils import get_last_checkpoint |
| from transformers.utils import check_min_version |
| from transformers.utils.versions import require_version |
|
|
| from src.datasets import ContrastiveClassificationDataset |
| from src.data_collators import DataCollatorContrastiveClassification |
| from src.modeling import ContrastiveClassifierModel |
|
|
| from src.metrics import compute_metrics_bce |
|
|
| from transformers import EarlyStoppingCallback |
|
|
| from transformers.utils.hp_naming import TrialShortNamer |
|
|
| from pdb import set_trace |
|
|
| import json |
|
|
def model_fn(model_dir):
    """Load the tokenizer and the contrastive classifier used for inference.

    Args:
        model_dir: Location of the trained checkpoint; forwarded unchanged to
            ``ContrastiveClassifierModel`` as ``checkpoint_path``.

    Returns:
        A ``(model, tokenizer)`` tuple, which is the shape ``predict_fn``
        unpacks as its second argument.
    """
    # AutoTokenizer is not part of the module-level imports, so it is imported
    # here; without this the first call fails with a NameError.
    from transformers import AutoTokenizer

    base_model = 'roberta-base'

    # '[COL]' and '[VAL]' are registered as extra special tokens, so the
    # tokenizer length passed to the model below includes them.
    tokenizer = AutoTokenizer.from_pretrained(
        base_model,
        additional_special_tokens=('[COL]', '[VAL]'),
    )
    model = ContrastiveClassifierModel(
        checkpoint_path=model_dir,
        len_tokenizer=len(tokenizer),
        model=base_model,
        frozen=False,
    )
    return model, tokenizer
|
|
|
|
def predict_fn(data, model_and_tokenizer):
    """Run the classifier over ``data["inputs"]`` and return the rows predicted as 1.

    Args:
        data: Mapping whose ``"inputs"`` entry is handed to
            ``ContrastiveClassificationDataset`` as the test data.
        model_and_tokenizer: The ``(model, tokenizer)`` pair produced by
            ``model_fn``.

    Returns:
        ``{"values": rows}`` where ``rows`` is a list of row-value lists taken
        from the dataset's ``data`` table, restricted to rows whose
        ``'prediction'`` entry equals 1.
    """
    classifier, tok = model_and_tokenizer

    # The dataset receives the tokenizer *name*; only the collator gets the
    # tokenizer object that was loaded alongside the model.
    eval_set = ContrastiveClassificationDataset(
        data["inputs"],
        dataset_type='test',
        size=512,
        tokenizer='roberta-base',
        dataset='serialized',
    )
    collate = DataCollatorContrastiveClassification(tok)

    # No TrainingArguments are supplied, so Trainer falls back to its defaults.
    runner = Trainer(
        model=classifier,
        data_collator=collate,
        compute_metrics=compute_metrics_bce,
    )
    outputs = runner.predict(eval_set, metric_key_prefix="predict")

    # The predictions are written as a new column onto the object returned by
    # the dataset's ``data`` attribute.
    table = eval_set.data
    table['prediction'] = outputs.predictions

    # NOTE(review): the exact ``== 1`` match presumes predictions are already
    # hard 0/1 labels rather than logits or probabilities -- confirm against
    # the model's forward pass and compute_metrics_bce.
    is_match = table['prediction'] == 1
    return {"values": table[is_match].values.tolist()}
|
|
|
|
|
|