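# Inference script for Vietnamese fact verification: a fine-tuned mDeBERTa
# encoder produces sentence-pair embeddings, and a small linear head
# (NLI_model) classifies each (evidence, claim) pair as SUPPORTED, NEI,
# or REFUTED.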
import torch
from torch import nn

from transformers import AutoModel, AutoTokenizer
from datasets import Dataset, DatasetDict

from sklearn.metrics import classification_report

# Run on the GPU when available, otherwise fall back to the CPU
envir = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Verdict classes: NEI stands for "Not Enough Information"
int2label = {0: 'SUPPORTED', 1: 'NEI', 2: 'REFUTED'}

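# A lightweight classification head: one linear layer mapping a pooled mDeBERTa
# sentence embedding (mean or [CLS]) to the three verdict classes. The *_step
# methods follow PyTorch Lightning naming conventions, but this is a plain
# nn.Module and the steps are invoked manually.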
class NLI_model(nn.Module):
    def __init__(self, input_dims, class_weights=None):
        super().__init__()

        self.classification = nn.Sequential(
            nn.Linear(input_dims, 3)
        )

        # Weighted cross-entropy for training; default to uniform weights
        # (an all-zero weight vector would make the loss identically zero)
        if class_weights is None:
            class_weights = torch.ones(3)
        self.criterion = nn.CrossEntropyLoss(weight=class_weights)

    def forward(self, inputs):
        return self.classification(inputs)

    def training_step(self, train_batch, batch_idx=0):
        input_data, targets = train_batch
        outputs = self.forward(input_data)
        loss = self.criterion(outputs, targets)
        return loss

    def predict_step(self, batch, batch_idx=0):
        input_data, _ = batch
        outputs = self.forward(input_data)
        probs = outputs.softmax(dim=-1)
        # Highest-probability class and its probability for each sample
        top_probs, top_indices = torch.max(probs, dim=-1)
        return top_indices, top_probs

    def validation_step(self, val_batch, batch_idx=0):
        _, targets = val_batch
        pred_indices, _ = self.predict_step(val_batch, batch_idx)
        report = classification_report(targets.cpu().numpy(), pred_indices.cpu().numpy(),
                                       output_dict=True, zero_division=1)
        return report

    def test_step(self, batch, dict_form, batch_idx=0):
        _, targets = batch
        pred_indices, _ = self.predict_step(batch, batch_idx)
        report = classification_report(targets.cpu().numpy(), pred_indices.cpu().numpy(),
                                       output_dict=dict_form, zero_division=1)
        return report

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-5)


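# End-to-end inference for one (evidence, claim) pair:
#   1. tokenize the pair and embed it with the fine-tuned mDeBERTa encoder,
#   2. pool the last hidden state (token mean or the [CLS] vector),
#   3. classify the pooled embedding with the linear NLI head.
# Returns {'labels': <SUPPORTED|NEI|REFUTED>, 'confidence': <probability>}.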
def inferSample(evidence, claim, tokenizer, mDeBertaModel, classifierModel, input_type):

    def mDeBERTa_tokenize(data): # Embed (premise, hypothesis) pairs with mDeBERTa
        premises = [premise for premise, _ in data['sample']]
        hypothesis = [hypothesis for _, hypothesis in data['sample']]

        with torch.no_grad():
            # Pass the attention mask along with the input ids so the encoder
            # ignores padded positions
            encoded = tokenizer(premises, hypothesis, truncation=True,
                                return_tensors="pt", padding=True).to(envir)
            embedding = mDeBertaModel(**encoded).last_hidden_state

        # Mean-pool every token after [CLS], or take the [CLS] vector itself
        mean_embedding = torch.mean(embedding[:, 1:, :], dim=1)
        cls_embedding = embedding[:, 0, :]

        return {'mean': mean_embedding, 'cls': cls_embedding}

    def predict_mapping(batch):
        with torch.no_grad():
            predict_label, predict_prob = classifierModel.predict_step((batch[input_type].to(envir), None))
        return {'label': predict_label, 'prob': predict_prob}

    # Map the predicted class index to its string label; only one sample is
    # inferred at a time, so read the first (and only) record
    def output_predictedDataset(predict_dataset):
        record = predict_dataset[0]
        labels = int2label[record['label'].item()]
        confidence = record['prob'].item()

        return {'labels': labels, 'confidence': confidence}

    # Wrap the single (evidence, claim) pair as a one-row dataset
    dataset = {'sample': [(evidence, claim)], 'key': [0]}

    output_dataset = DatasetDict({
        'infer': Dataset.from_dict(dataset)
    })

    # Embed the sample, then keep only the chosen pooled embedding and the key
    # column, formatted as torch tensors
    tokenized_dataset = output_dataset.map(mDeBERTa_tokenize, batched=True, batch_size=1)
    tokenized_dataset = tokenized_dataset.with_format("torch", [input_type, 'key'])

    # Running inference step
    predicted_dataset = tokenized_dataset.map(predict_mapping, batched=True, batch_size=tokenized_dataset['infer'].num_rows)
    return output_predictedDataset(predicted_dataset['infer'])

if __name__ == '__main__':
    # CHANGE 'INPUT_TYPE' TO CHANGE MODEL
    INPUT_TYPE = 'mean' # USE 'mean' OR 'cls' LAST HIDDEN STATE

    # Load the tokenizer and the fine-tuned mDeBERTa encoder, and move the
    # encoder to the same device the inputs will be placed on
    tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")
    mDeBertaModel = AutoModel.from_pretrained(f"src/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-{INPUT_TYPE}").to(envir)
    mDeBertaModel.eval()  # disable dropout for deterministic embeddings

    # Load the linear classifier head from its checkpoint
    checkpoints = torch.load(f"src/mDeBERTa (ft) V6/{INPUT_TYPE}.pt", map_location=envir)
    classifierModel = NLI_model(768).to(envir)  # 768 = mDeBERTa-v3-base hidden size
    classifierModel.load_state_dict(checkpoints['model_state_dict'])
    classifierModel.eval()
    
    evidence = "Sau khi thẩm định, Liên đoàn Bóng đá châu Á AFC xác nhận thủ thành mới nhập quốc tịch của Việt Nam Filip Nguyễn đủ điều kiện thi đấu ở Asian Cup 2024."
    claim = "Filip Nguyễn đủ điều kiện dự Asian Cup 2024"
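    # English gloss of the Vietnamese sample, for reference:
    #   Evidence: "After review, the Asian Football Confederation (AFC) confirmed
    #   that Vietnam's newly naturalized goalkeeper Filip Nguyễn is eligible to
    #   play at the 2024 Asian Cup."
    #   Claim: "Filip Nguyễn is eligible for the 2024 Asian Cup."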
    print(inferSample(evidence, claim, tokenizer, mDeBertaModel, classifierModel, INPUT_TYPE))