alenusch/par_cls_bert · Hugging Face

Classifier that checks whether two sequences are paraphrases of each other.

Trained on top of ruBert by DeepPavlov.
Use it as follows:
import torch
import torch.nn as nn
import os
import copy
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

from transformers.file_utils import (
    cached_path,
    hf_bucket_url,
    is_remote_url,
)

# Turn off tokenizer-level parallelism (presumably to avoid fork-related
# warnings once DataLoader worker processes are started — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# URL of the classifier checkpoint stored in the model repository.
archive_file = hf_bucket_url(
    "alenusch/par_cls_bert",
    filename="rubert-base-cased_lr_2e-05_val_loss_0.66143_ep_4.pt",
    revision=None,
    mirror=None,
)

# Resolve the URL to a local file path; with force_download=False an
# already cached copy is expected to be reused.
resolved_archive_file = cached_path(
    archive_file,
    cache_dir=None,
    force_download=False,
    proxies=None,
    resume_download=False,
    local_files_only=False,
)

class SentencePairClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a single-logit linear head.

    The head consumes the encoder's pooled output and emits one raw logit
    per input pair; apply a sigmoid to turn it into a probability.
    """

    def __init__(self, bert_model):
        """Load the encoder and build the classification head.

        Args:
            bert_model: model name or path passed to
                ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.bert_layer = AutoModel.from_pretrained(bert_model)
        # Size the head from the encoder's own config so encoders whose
        # hidden width is not 768 also work; fall back to the previous
        # hard-coded value when the config does not expose ``hidden_size``.
        hidden_size = getattr(self.bert_layer.config, "hidden_size", 768)
        self.cls_layer = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(p=0.1)

    @autocast()
    def forward(self, input_ids, attn_masks, token_type_ids):
        """Return one raw logit per input pair.

        Args:
            input_ids: token id tensor for the encoded pairs.
            attn_masks: attention mask tensor matching ``input_ids``.
            token_type_ids: segment id tensor matching ``input_ids``.

        Returns:
            Tensor produced by the linear head (last dimension of size 1).
        """
        # return_dict=False makes the encoder return a tuple; only the
        # second element (pooled output) feeds the head.
        _, pooler_output = self.bert_layer(
            input_ids, attn_masks, token_type_ids, return_dict=False
        )
        return self.cls_layer(self.dropout(pooler_output))

class CustomDataset(Dataset):
    """Sentence-pair dataset that tokenizes each pair when it is indexed."""

    def __init__(self, data, maxlen, bert_model):
        """Store the pairs and load the tokenizer.

        Args:
            data: indexable collection whose items hold the first sentence
                at position 0 and the second at position 1.
            maxlen: length every encoded pair is padded/truncated to.
            bert_model: name or path passed to
                ``AutoTokenizer.from_pretrained``.
        """
        self.data = data
        self.maxlen = maxlen
        # Not read anywhere inside this class.
        self.targets = False
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the pair at ``index``.

        Returns:
            Tuple ``(input_ids, attention_mask, token_type_ids)`` with the
            leading dimension added by ``return_tensors='pt'`` squeezed out.
        """
        pair = self.data[index]
        first, second = str(pair[0]), str(pair[1])

        encoding = self.tokenizer(
            first,
            second,
            padding='max_length',    # pad up to max_length
            truncation=True,         # cut down to max_length
            max_length=self.maxlen,
            return_tensors='pt',     # torch tensors rather than lists
        )

        wanted = ('input_ids', 'attention_mask', 'token_type_ids')
        return tuple(encoding[key].squeeze(0) for key in wanted)

def get_probs_from_logits(logits):
    """Map raw logits to sigmoid probabilities.

    A trailing axis of size 1 is appended before the sigmoid, and the result
    is detached, moved to CPU and returned as a NumPy array.
    """
    expanded = logits.unsqueeze(-1)
    probabilities = expanded.sigmoid()
    return probabilities.detach().cpu().numpy()

def test_prediction(net, device, dataloader, with_labels=False):
    """Run ``net`` over ``dataloader`` and collect sigmoid probabilities.

    Args:
        net: model called as ``net(input_ids, attn_masks, token_type_ids)``.
        device: device every batch tensor is moved to before the forward pass.
        dataloader: iterable yielding 3-tuples of batch tensors.
        with_labels: accepted for interface compatibility; not used here.

    Returns:
        Flat list with one probability per example, in iteration order.
    """
    net.eval()
    collected = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for ids, masks, segments in tqdm(dataloader):
            ids = ids.to(device)
            masks = masks.to(device)
            segments = segments.to(device)

            batch_logits = net(ids, masks, segments).squeeze(-1)
            batch_probs = get_probs_from_logits(batch_logits).squeeze(-1)
            collected.extend(batch_probs.tolist())

    return collected

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cls_model = SentencePairClassifier(bert_model="alenusch/par_cls_bert")
if torch.cuda.device_count() > 1:
    # Wrap the classifier itself; referencing any other name here would
    # raise NameError on multi-GPU hosts.
    cls_model = nn.DataParallel(cls_model)

# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from GPU tensors can also be loaded on a CPU-only machine.
# NOTE(review): a DataParallel-wrapped model expects "module."-prefixed
# state-dict keys — confirm how the checkpoint was saved before relying on
# the multi-GPU path.
cls_model.load_state_dict(torch.load(resolved_archive_file, map_location=device))
cls_model.to(device)

# Each entry is one [sentence1, sentence2] pair to score.
variants = [["sentence1", "sentence2"]]
test_set = CustomDataset(variants, maxlen=512, bert_model="alenusch/par_cls_bert")
test_loader = DataLoader(test_set, batch_size=16, num_workers=5)
# One probability per pair, in the order given in `variants`.
res = test_prediction(net=cls_model, device=device, dataloader=test_loader, with_labels=False)