## Classifier to check whether two sequences are paraphrases of each other

Fine-tuned from DeepPavlov's ruBert.

Usage:
```
import os

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# hf_bucket_url/cached_path come from the transformers release this card was
# written against; newer releases provide huggingface_hub.hf_hub_download instead.
from transformers.file_utils import (
    cached_path,
    hf_bucket_url,
)

# Download the fine-tuned checkpoint from the Hub and cache it locally.
archive_file = hf_bucket_url(
    "alenusch/par_cls_bert",
    filename="rubert-base-cased_lr_2e-05_val_loss_0.66143_ep_4.pt",
    revision=None,
    mirror=None,
)
resolved_archive_file = cached_path(
    archive_file,
    cache_dir=None,
    force_download=False,
    proxies=None,
    resume_download=False,
    local_files_only=False,
)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

class SentencePairClassifier(nn.Module):
    """BERT encoder with a single linear classification layer on the pooled output."""

    def __init__(self, bert_model):
        super(SentencePairClassifier, self).__init__()
        self.bert_layer = AutoModel.from_pretrained(bert_model)
        self.cls_layer = nn.Linear(768, 1)  # one logit: paraphrase vs. not
        self.dropout = nn.Dropout(p=0.1)

    @autocast()
    def forward(self, input_ids, attn_masks, token_type_ids):
        cont_reps, pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids, return_dict=False)
        logits = self.cls_layer(self.dropout(pooler_output))
        return logits

class CustomDataset(Dataset):
    """Tokenizes sentence pairs into (token_ids, attention_mask, token_type_ids)."""

    def __init__(self, data, maxlen, bert_model):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)
        self.maxlen = maxlen

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sent1 = str(self.data[index][0])
        sent2 = str(self.data[index][1])
        encoded_pair = self.tokenizer(sent1, sent2,
                                      padding='max_length',  # pad to max_length
                                      truncation=True,       # truncate to max_length
                                      max_length=self.maxlen,
                                      return_tensors='pt')   # return torch.Tensor objects
        token_ids = encoded_pair['input_ids'].squeeze(0)            # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)      # 0 for padding, 1 for real tokens
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # 0 for 1st sentence tokens, 1 for 2nd
        return token_ids, attn_masks, token_type_ids

def get_probs_from_logits(logits):
    # A sigmoid turns each logit into a paraphrase probability.
    probs = torch.sigmoid(logits)
    return probs.detach().cpu().numpy()

def test_prediction(net, device, dataloader):
    net.eval()
    probs_all = []
    with torch.no_grad():
        for seq, attn_masks, token_type_ids in tqdm(dataloader):
            seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            probs = get_probs_from_logits(logits.squeeze(-1))
            probs_all += probs.tolist()
    return probs_all

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cls_model = SentencePairClassifier(bert_model="alenusch/par_cls_bert")
if torch.cuda.device_count() > 1:
    cls_model = nn.DataParallel(cls_model)

cls_model.load_state_dict(torch.load(resolved_archive_file, map_location=device))
cls_model.to(device)

variants = [["sentence1", "sentence2"]]
test_set = CustomDataset(variants, maxlen=512, bert_model="alenusch/par_cls_bert")
test_loader = DataLoader(test_set, batch_size=16, num_workers=5)
res = test_prediction(net=cls_model, device=device, dataloader=test_loader)
```
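
The model emits one sigmoid probability per sentence pair, so `res` above is a list of floats. A minimal sketch for turning the scores into labels, assuming the positive class corresponds to "paraphrase" (consistent with the single-logit head above); the 0.5 cutoff is an assumption, not something the checkpoint prescribes:

```
# Hypothetical post-processing: threshold each probability at 0.5.
# The cutoff is an assumption; tune it on your own validation data.
labels = ["paraphrase" if p > 0.5 else "not paraphrase" for p in res]
print(list(zip(variants, labels)))
```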