savasy committed on
Commit
924ef4a
1 Parent(s): dc964d8

Upload prompt.py

Files changed (1)
  1. prompt.py +112 -0
prompt.py ADDED
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch


class Prompting(object):
    """This class helps us implement a prompt-based learning model
    on top of a masked language model.
    """

    def __init__(self, **kwargs):
        """Constructor.

        Parameters
        ----------
        model: str
            Path or name of a pre-trained masked language model from the Hugging Face Hub.
        tokenizer: str, optional
            Path or name of the tokenizer if a different tokenizer is used;
            otherwise the model path is reused.
        """
        model_path = kwargs['model']
        tokenizer_path = kwargs['model']
        if "tokenizer" in kwargs:
            tokenizer_path = kwargs['tokenizer']
        self.model = AutoModelForMaskedLM.from_pretrained(model_path)
        # Load the tokenizer from its own path (falls back to the model path when not given).
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    def prompt_pred(self, text):
        """
        Predict the [MASK] token by listing candidate tokens with their scores,
        where the first token is the most likely.

        Parameters
        ----------
        text: str
            The text including a [MASK] token. Only a single [MASK] token is
            supported; if more are given, the first one is used.

        Returns
        -------
        list of (token, score)
            Every token in the LM vocabulary together with its score at the
            masked position, sorted by score in descending order. Scores are
            raw logits; uncomment the softmax line below to turn them into
            probabilities.
        """
        indexed_tokens = self.tokenizer(text, return_tensors="pt").input_ids
        tokenized_text = self.tokenizer.convert_ids_to_tokens(indexed_tokens[0])
        # take the first masked token
        mask_pos = tokenized_text.index(self.tokenizer.mask_token)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(indexed_tokens)
            predictions = outputs[0]
        values, indices = torch.sort(predictions[0, mask_pos], descending=True)
        # values = torch.nn.functional.softmax(values, dim=0)
        result = list(zip(self.tokenizer.convert_ids_to_tokens(indices.tolist()), values))
        self.scores_dict = {a: b for a, b in result}
        return result

    def compute_tokens_prob(self, text, token_list1, token_list2):
        """
        Compute the aggregated scores for the two given token lists.

        Parameters
        ----------
        token_list1: List[str]
            List of positive-polarity tokens such as "good", "great".
        token_list2: List[str]
            List of negative-polarity tokens such as "bad", "terrible".

        Returns
        -------
        torch.Tensor
            Softmax over the two aggregated scores, i.e. the probability of
            the first token list versus the second.
        """
        _ = self.prompt_pred(text)
        score1 = [self.scores_dict[token1] if token1 in self.scores_dict else 0
                  for token1 in token_list1]
        score1 = sum(score1)
        score2 = [self.scores_dict[token2] if token2 in self.scores_dict else 0
                  for token2 in token_list2]
        score2 = sum(score2)
        softmax_rt = torch.nn.functional.softmax(torch.Tensor([score1, score2]), dim=0)
        return softmax_rt

    def fine_tune(self, sentences, labels, prompt=" Since it was [MASK].",
                  goodToken="good", badToken="bad"):
        """
        Fine-tune the model on (sentence, label) pairs, where label 0 maps to
        goodToken and label 1 maps to badToken.
        """
        good = self.tokenizer.convert_tokens_to_ids(goodToken)
        bad = self.tokenizer.convert_tokens_to_ids(badToken)

        from transformers import AdamW
        optimizer = AdamW(self.model.parameters(), lr=1e-3)

        self.model.train()
        for sen, label in zip(sentences, labels):
            tokenized_text = self.tokenizer.tokenize(sen + prompt)
            indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])
            # take the first masked token
            mask_pos = tokenized_text.index(self.tokenizer.mask_token)
            outputs = self.model(tokens_tensor)
            predictions = outputs[0]
            # logits of the two candidate tokens at the masked position
            pred = predictions[0, mask_pos][[good, bad]]
            lossFunc = torch.nn.CrossEntropyLoss()
            # CrossEntropyLoss expects raw logits, so no extra softmax is applied here.
            loss = lossFunc(pred.unsqueeze(0), torch.tensor([label]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("done!")