from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

class Prompting(object):
  """
  Implements a prompt-based learning model on top of a pre-trained
  masked language model.
  """
  def __init__(self, **kwargs):
    """ Constructor.

    Parameters:
    ----------
    model: str
        path to a pre-trained masked language model on the HuggingFace Hub
    tokenizer: str, optional
        path to the tokenizer if a different tokenizer is used;
        otherwise the model path is reused
    """
    model_path = kwargs['model']
    tokenizer_path = kwargs.get('tokenizer', model_path)
    self.model = AutoModelForMaskedLM.from_pretrained(model_path)
    # load the tokenizer from its own path when one is given
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

  def prompt_pred(self, text):
    """
    Predict the [MASK] token by scoring every token in the vocabulary,
    with the most likely token first.

    Parameters:
    ----------
    text: str
        The text including a [MASK] token. Only a single [MASK] token is
        supported; if several are given, the first one is used.

    Returns:
    --------
    list of (token, score)
        Every token in the LM vocabulary together with its score for the
        masked position, sorted by score in descending order.
    """
    indexed_tokens = self.tokenizer(text, return_tensors="pt").input_ids
    tokenized_text = self.tokenizer.convert_ids_to_tokens(indexed_tokens[0])
    # position of the first masked token
    mask_pos = tokenized_text.index(self.tokenizer.mask_token)
    self.model.eval()
    with torch.no_grad():
      outputs = self.model(indexed_tokens)
      predictions = outputs[0]
    values, indices = torch.sort(predictions[0, mask_pos], descending=True)
    # values = torch.nn.functional.softmax(values, dim=0)
    result = list(zip(self.tokenizer.convert_ids_to_tokens(indices), values))
    self.scores_dict = dict(result)
    return result

  def compute_tokens_prob(self, text, token_list1, token_list2):
    """
    Compare the scores of two verbalizer token lists for the masked position.

    Parameters:
    ----------
    text: str
        The text including a [MASK] token.
    token_list1: List[str]
        tokens with positive polarity, such as "good", "great".
    token_list2: List[str]
        tokens with negative polarity, such as "bad", "terrible".

    Returns:
    --------
    torch.Tensor of shape (2,)
        softmax over the summed scores of the two lists,
        i.e. (probability of list1, probability of list2).
    """
    _ = self.prompt_pred(text)
    score1 = sum(self.scores_dict.get(token, 0) for token in token_list1)
    score2 = sum(self.scores_dict.get(token, 0) for token in token_list2)
    softmax_rt = torch.nn.functional.softmax(torch.Tensor([score1, score2]), dim=0)
    return softmax_rt

  def fine_tune(self, sentences, labels, prompt=" Since it was [MASK].", goodToken="good", badToken="bad"):
    """
    Fine-tune the model on (sentence, label) pairs, where label 0 corresponds
    to goodToken and label 1 to badToken.
    """
    good = self.tokenizer.convert_tokens_to_ids(goodToken)
    bad = self.tokenizer.convert_tokens_to_ids(badToken)

    # torch's AdamW; transformers.AdamW is deprecated
    optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-3)

    self.model.train()
    for sen, label in zip(sentences, labels):
      tokenized_text = self.tokenizer.tokenize(sen + prompt)
      indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
      tokens_tensor = torch.tensor([indexed_tokens])
      # position of the first masked token
      mask_pos = tokenized_text.index(self.tokenizer.mask_token)
      outputs = self.model(tokens_tensor)
      predictions = outputs[0]
      # logits of the two verbalizer tokens at the masked position
      pred = predictions[0, mask_pos][[good, bad]]
      lossFunc = torch.nn.CrossEntropyLoss()
      # CrossEntropyLoss expects raw logits, so no explicit softmax here
      loss = lossFunc(pred.unsqueeze(0), torch.tensor([label]))
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    print("done!")