p4vv37 committed on
Commit 9cad313
1 Parent(s): ed03b25

Evaluating code quality before comment generation.

Files changed (1)
  1. app.py +156 -3
app.py CHANGED
@@ -1,12 +1,151 @@
 import gradio as gr
 import requests
-from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM, T5Config
 import torch
 
-
 MAX_SOURCE_LENGTH = 512
 
 
+class ReviewerModel(T5ForConditionalGeneration):
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.cls_head = nn.Linear(self.config.d_model, 2, bias=True)
+        self.init()
+
+    def init(self):
+        nn.init.xavier_uniform_(self.lm_head.weight)
+        factor = self.config.initializer_factor
+        self.cls_head.weight.data.normal_(mean=0.0,
+                                          std=factor * ((self.config.d_model) ** -0.5))
+        self.cls_head.bias.data.zero_()
+
+    def forward(
+        self, *argv, **kwargs
+    ):
+        r"""
+        Doc from Huggingface transformers:
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ...,
+            config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for
+            labels in ``[0, ..., config.vocab_size]``
+        Returns:
+        Examples::
+            >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+            >>> # training
+            >>> input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
+            >>> labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+            >>> outputs = model(input_ids=input_ids, labels=labels)
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits
+            >>> # inference
+            >>> input_ids = tokenizer("summarize: studies have shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+            >>> outputs = model.generate(input_ids)
+            >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+            >>> # studies have shown that owning a dog is good for you.
+        """
+        if "cls" in kwargs:
+            assert (
+                "input_ids" in kwargs and
+                "labels" in kwargs and
+                "attention_mask" in kwargs
+            )
+            return self.cls(
+                input_ids=kwargs["input_ids"],
+                labels=kwargs["labels"],
+                attention_mask=kwargs["attention_mask"],
+            )
+        if "input_labels" in kwargs:
+            assert (
+                "input_ids" in kwargs and
+                "input_labels" in kwargs and
+                "decoder_input_ids" in kwargs and
+                "attention_mask" in kwargs and
+                "decoder_attention_mask" in kwargs
+            ), "Please give these arg keys."
+            input_ids = kwargs["input_ids"]
+            input_labels = kwargs["input_labels"]
+            decoder_input_ids = kwargs["decoder_input_ids"]
+            attention_mask = kwargs["attention_mask"]
+            decoder_attention_mask = kwargs["decoder_attention_mask"]
+            if "encoder_loss" not in kwargs:
+                encoder_loss = True
+            else:
+                encoder_loss = kwargs["encoder_loss"]
+            return self.review_forward(input_ids, input_labels, decoder_input_ids, attention_mask,
+                                       decoder_attention_mask, encoder_loss)
+        return super().forward(*argv, **kwargs)
+
+    def cls(
+        self,
+        input_ids,
+        labels,
+        attention_mask,
+    ):
+        encoder_outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_attentions=False,
+            return_dict=False
+        )
+        hidden_states = encoder_outputs[0]
+        first_hidden = hidden_states[:, 0, :]
+        first_hidden = nn.Dropout(0.3)(first_hidden)
+        logits = self.cls_head(first_hidden)
+        loss_fct = CrossEntropyLoss()
+        if labels is not None:
+            loss = loss_fct(logits, labels)
+            return loss
+        return logits
+
+    def review_forward(
+        self,
+        input_ids,
+        input_labels,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        encoder_loss=True
+    ):
+        encoder_outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_attentions=False,
+            return_dict=False
+        )
+        hidden_states = encoder_outputs[0]
+        decoder_inputs = self._shift_right(decoder_input_ids)
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_inputs,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            output_attentions=False,
+            return_dict=False
+        )
+        sequence_output = decoder_outputs[0]
+        if self.config.tie_word_embeddings:  # this is True by default
+            sequence_output = sequence_output * (self.model_dim ** -0.5)
+        if encoder_loss:
+            # print(self.encoder.get_input_embeddings().weight.shape)
+            cls_logits = nn.functional.linear(hidden_states, self.encoder.get_input_embeddings().weight)
+            # cls_logits = self.cls_head(hidden_states)
+        lm_logits = self.lm_head(sequence_output)
+        if decoder_input_ids is not None:
+            lm_loss_fct = CrossEntropyLoss(ignore_index=0)  # Warning: PAD_ID should be 0
+            loss = lm_loss_fct(lm_logits.view(-1, lm_logits.size(-1)), decoder_input_ids.view(-1))
+            if encoder_loss and input_labels is not None:
+                cls_loss_fct = CrossEntropyLoss(ignore_index=-100)
+                loss += cls_loss_fct(cls_logits.view(-1, cls_logits.size(-1)), input_labels.view(-1))
+            return loss
+        return cls_logits, lm_logits
+
+
 def prepare_models():
     tokenizer = AutoTokenizer.from_pretrained("microsoft/codereviewer")
 
@@ -24,7 +163,8 @@ def prepare_models():
     tokenizer.start_id = tokenizer.get_vocab()["<start>"]
     tokenizer.end_id = tokenizer.get_vocab()["<end>"]
 
-    model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/codereviewer")
+    config = T5Config.from_pretrained("microsoft/codereviewer")
+    model = ReviewerModel.from_pretrained("microsoft/codereviewer", config=config)
 
     model.eval()
     return tokenizer, model
@@ -104,6 +244,19 @@ def review_commit(user="p4vv37", repository="ueflow", commit="610a8c7b02b946bc9e
     for diff in fd.diffs:
         inputs = torch.tensor([encode_diff(tokenizer, diff, msg, source)], dtype=torch.long).to("cpu")
         inputs_mask = inputs.ne(tokenizer.pad_id)
+        logits = model(
+            input_ids=inputs,
+            cls=True,
+            attention_mask=inputs_mask,
+            labels=None,
+            use_cache=True,
+            num_beams=5,
+            early_stopping=True,
+            max_length=100
+        )
+        needs_review = torch.argmax(logits, dim=-1).cpu().numpy()[0]
+        if not needs_review:
+            continue
         preds = model.generate(inputs,
                                attention_mask=inputs_mask,
                                use_cache=True,
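
What the new code path does at runtime: review_commit now asks the classification head whether a diff needs a review at all, and only then generates a comment. Below is a minimal, hypothetical usage sketch of that flow, not code from this commit: it assumes prepare_models and encode_diff behave as in app.py, uses a made-up diff string, and decodes the prediction naively.

    import torch

    tokenizer, model = prepare_models()  # ReviewerModel wrapping microsoft/codereviewer

    # Hypothetical one-hunk diff; msg and source are left empty for the sketch.
    diff = "@@ -1 +1 @@\n- print('hi')\n+ print('hello')"
    inputs = torch.tensor([encode_diff(tokenizer, diff, "", "")], dtype=torch.long)
    mask = inputs.ne(tokenizer.pad_id)

    # cls=True routes ReviewerModel.forward() to the 2-way classification head:
    # index 0 = no review needed, index 1 = the diff deserves a comment.
    logits = model(input_ids=inputs, cls=True, attention_mask=mask, labels=None)

    if torch.argmax(logits, dim=-1).item():
        preds = model.generate(inputs,
                               attention_mask=mask,
                               use_cache=True,
                               num_beams=5,
                               early_stopping=True,
                               max_length=100)
        print(tokenizer.decode(preds[0], skip_special_tokens=True))

Note that only input_ids, attention_mask, and labels reach the cls path; the extra generation arguments passed in the commit (use_cache, num_beams, early_stopping, max_length) are ignored by forward() when cls=True.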