tombm committed
Commit 5212a08
1 Parent(s): b51e9e8

Add functionality to app

Files changed (6)
  1. .gitignore +5 -0
  2. README.md +3 -3
  3. app.py +67 -4
  4. gp.py +132 -0
  5. train.py +106 -0
  6. uq.py +102 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+.vscode/
+__pycache__/
+text/
+misc/
+bert-base-uncased-finetuned-cola/
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: Gp Uq Tester
+title: GP UQ Tester
 emoji: 📈
-colorFrom: gray
-colorTo: blue
+colorFrom: green
+colorTo: purple
 sdk: gradio
 sdk_version: 3.44.4
 app_file: app.py
app.py CHANGED
@@ -1,7 +1,70 @@
 import gradio as gr
-
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
+from transformers import pipeline, set_seed, AutoTokenizer
+from uq import BertForUQSequenceClassification
+
+
+def predict(sentence):
+    model_path = "tombm/bert-base-uncased-finetuned-cola"
+    classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)
+    label = classifier(sentence)
+    return label
+
+
+def uncertainty(sentence):
+    model_path = "tombm/bert-base-uncased-finetuned-cola"
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = BertForUQSequenceClassification.from_pretrained(model_path)
+
+    test_input = tokenizer(sentence, return_tensors="pt")
+    model.return_gp_cov = True
+    _, gp_cov = model(**test_input)
+
+    return str(gp_cov.item())
+
+
+with gr.Blocks() as demo:
+    set_seed(12)
+    intro_str = """The *cola* dataset focuses on determining whether sentences are grammatically correct.
+    Firstly, let's see how our finetuned model classifies two sentences,
+    the first of which is correct (i.e. valid) and the second is not (i.e. invalid):"""
+    gr.Markdown(value=intro_str)
+
+    gr.Interface(
+        fn=predict,
+        inputs=gr.Textbox(value="Good morning.", label="Input"),
+        outputs="label",
+    )
+    gr.Interface(
+        fn=predict,
+        inputs=gr.Textbox(
+            value="This sentence is sentence, this is a correct sentence!",
+            label="Input",
+        ),
+        outputs="label",
+    )
+
+    explain_str = """As we can see, our model correctly classifies the first sentence, but misclassifies the second.
+    Let's now inspect the uncertainties associated with each prediction generated by our GP head:"""
+    gr.Markdown(value=explain_str)
+
+    gr.Interface(
+        fn=uncertainty,
+        inputs=gr.Textbox(value="Good morning.", label="Input"),
+        outputs="text",
+    )  # should have low uncertainty
+    gr.Interface(
+        fn=uncertainty,
+        inputs=gr.Textbox(
+            value="This sentence is sentence, this is a correct sentence!",
+            label="Input",
+        ),
+        outputs="text",
+    )  # should have high uncertainty
+
+    final_str = """We can see here that the variance for the misclassified example is much higher than for the correctly
+    classified example. This is great, as now we have some indication of when our model might be uncertain!"""
+    gr.Markdown(value=final_str)
+
+demo.launch()
+# iface = gr.Interface(fn=predict, inputs="text", outputs="text")
+# iface.launch()
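The demo above scores each sentence separately, so `gp_cov` is a 1×1 matrix and `.item()` extracts the single variance. A minimal sketch of the same comparison done in one batch, assuming the Hub checkpoint `tombm/bert-base-uncased-finetuned-cola` is reachable and `uq.py` / `gp.py` are importable; the per-sentence variances are the diagonal of the returned GP covariance.

```python
# Minimal sketch: batch both demo sentences and read per-sentence variances
# off the diagonal of the GP covariance (assumes the Hub checkpoint is
# reachable and uq.py / gp.py are on the path).
import torch
from transformers import AutoTokenizer
from uq import BertForUQSequenceClassification

model_path = "tombm/bert-base-uncased-finetuned-cola"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = BertForUQSequenceClassification.from_pretrained(model_path)
model.return_gp_cov = True

sentences = ["Good morning.", "This sentence is sentence, this is a correct sentence!"]
batch = tokenizer(sentences, padding=True, return_tensors="pt")

with torch.no_grad():
    _, gp_cov = model(**batch)   # gp_cov has shape [2, 2]
print(torch.diag(gp_cov))        # expect a larger variance for the second sentence
```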
gp.py ADDED
@@ -0,0 +1,132 @@
+# Code for GP final layer adapted from this great repo:
+# https://github.com/kimjeyoung/SNGP-BERT-Pytorch .
+# We simplify things here a bit by removing the spectral
+# normalisation as the authors of the Plex paper say that this
+# isn't strictly necessary, so we just have a GP classification head on the model.
+
+import torch
+import math
+import copy
+from torch import nn
+
+
+def RandomFeatureLinear(i_dim, o_dim, bias=True, require_grad=False):
+    m = nn.Linear(i_dim, o_dim, bias)
+    nn.init.normal_(m.weight, mean=0.0, std=0.05)
+    m.weight.requires_grad = require_grad  # Freeze weights
+    if bias:
+        nn.init.uniform_(m.bias, a=0.0, b=2.0 * math.pi)  # Freeze bias
+        m.bias.requires_grad = require_grad
+    return m
+
+
+class GPClassificationHead(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        gp_kernel_scale=1.0,
+        num_inducing=1024,
+        gp_output_bias=0.0,
+        layer_norm_eps=1e-12,
+        scale_random_features=True,
+        normalize_input=True,
+        gp_cov_momentum=0.999,
+        gp_cov_ridge_penalty=1e-3,
+        epochs=40,
+        num_classes=3,
+        device="cpu",
+    ):
+        super(GPClassificationHead, self).__init__()
+        self.final_epochs = epochs - 1
+        self.gp_cov_ridge_penalty = gp_cov_ridge_penalty
+        self.gp_cov_momentum = gp_cov_momentum
+
+        self.pooled_output_dim = hidden_size
+        self.gp_input_scale = 1.0 / math.sqrt(gp_kernel_scale)
+        self.gp_feature_scale = math.sqrt(2.0 / float(num_inducing))
+        self.gp_output_bias = gp_output_bias
+        self.scale_random_features = scale_random_features
+        self.normalize_input = normalize_input
+        self.device = device
+
+        self._gp_input_normalize_layer = torch.nn.LayerNorm(
+            hidden_size, eps=layer_norm_eps
+        )
+        self._gp_output_layer = nn.Linear(
+            num_inducing, num_classes, bias=False
+        )  # gp_output_bias set to not trainable
+        self._gp_output_bias = torch.tensor([self.gp_output_bias] * num_classes).to(
+            device
+        )
+        self._random_feature = RandomFeatureLinear(self.pooled_output_dim, num_inducing)
+
+        # Inverse covariance matrix corresponding to RFF-GP posterior
+        self.initial_precision_matrix = self.gp_cov_ridge_penalty * torch.eye(
+            num_inducing
+        ).to(device)
+        self.precision_matrix = torch.nn.Parameter(
+            copy.deepcopy(self.initial_precision_matrix), requires_grad=False
+        )
+
+    def gp_layer(self, gp_inputs, update_cov=True):
+        if self.normalize_input:
+            gp_inputs = self._gp_input_normalize_layer(gp_inputs)
+
+        gp_feature = self._random_feature(gp_inputs)
+        gp_feature = torch.cos(gp_feature)
+
+        if self.scale_random_features:
+            gp_feature = gp_feature * self.gp_input_scale
+
+        gp_output = self._gp_output_layer(gp_feature).to(
+            self.device
+        ) + self._gp_output_bias.to(self.device)
+
+        if update_cov:
+            self.update_cov(gp_feature)
+        return gp_feature, gp_output
+
+    def reset_cov(self):
+        self.precision_matrix = torch.nn.Parameter(
+            copy.deepcopy(self.initial_precision_matrix), requires_grad=False
+        )
+
+    def update_cov(self, gp_feature):
+        # https://github.com/google/edward2/blob/main/edward2/tensorflow/layers/random_feature.py#L346
+        batch_size = gp_feature.size()[0]
+        precision_matrix_minibatch = torch.matmul(gp_feature.t(), gp_feature)
+
+        # Moving average updates to precision matrix
+        precision_matrix_minibatch = precision_matrix_minibatch / batch_size
+        precision_matrix_new = (
+            self.gp_cov_momentum * self.precision_matrix
+            + (1.0 - self.gp_cov_momentum) * precision_matrix_minibatch
+        )
+
+        self.precision_matrix = torch.nn.Parameter(
+            precision_matrix_new, requires_grad=False
+        )
+
+    def compute_predictive_covariance(self, gp_feature):
+        # https://github.com/google/edward2/blob/main/edward2/tensorflow/layers/random_feature.py#L403
+        # Covariance matrix of feature coefficient
+        feature_cov_matrix = torch.linalg.inv(self.precision_matrix)
+
+        # Predictive covariance matrix for the GP
+        cov_feature_product = (
+            torch.matmul(feature_cov_matrix, gp_feature.t()) * self.gp_cov_ridge_penalty
+        )
+        gp_cov_matrix = torch.matmul(gp_feature, cov_feature_product)
+        return gp_cov_matrix
+
+    def forward(
+        self,
+        input_features,
+        return_gp_cov: bool = False,
+        update_cov: bool = True,
+    ):
+        gp_feature, gp_output = self.gp_layer(input_features, update_cov=update_cov)
+        if return_gp_cov:
+            gp_cov_matrix = self.compute_predictive_covariance(gp_feature)
+            return gp_output, gp_cov_matrix
+        return gp_output
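To make the moving parts of `GPClassificationHead` concrete, here is a small usage sketch on dummy pooled features: the head builds random cosine features, accumulates a precision matrix over them when `update_cov=True`, and returns a batch-by-batch predictive covariance when `return_gp_cov=True`. The sizes below are illustrative assumptions, not values fixed by the commit.

```python
# Usage sketch for the GP head in isolation (dummy features, illustrative sizes).
import torch
from gp import GPClassificationHead

head = GPClassificationHead(hidden_size=768, num_classes=2, num_inducing=512)
features = torch.randn(4, 768)  # stand-in for pooled BERT outputs

with torch.no_grad():
    # Training-style pass: updates the precision matrix via the moving average
    logits = head(features, update_cov=True)
    print(logits.shape)  # torch.Size([4, 2])

    # Inference-style pass: keep the accumulated state, ask for the GP covariance
    logits, gp_cov = head(features, return_gp_cov=True, update_cov=False)
    print(gp_cov.shape)        # torch.Size([4, 4]); diagonal = per-example variance
    print(torch.diag(gp_cov))

# reset_cov() restores the initial ridge-only precision matrix, which is what
# the training callback in train.py relies on at the end of each epoch.
head.reset_cov()
```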
train.py ADDED
@@ -0,0 +1,106 @@
+# This is a heavily adapted version of this notebook:
+# https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb ,
+# where we show on a simple text classification problem how we can integrate
+# components for uncertainty quantification into large pretrained models.
+
+import evaluate
+import numpy as np
+from datasets import load_dataset
+from transformers import (
+    AutoTokenizer,
+    TrainingArguments,
+    Trainer,
+    TrainerCallback,
+)
+from uq import BertForUQSequenceClassification
+
+BATCH_SIZE = 16
+EVAL_BATCH_SIZE = 128
+DEVICE = "cpu"
+
+# cola dataset for determining whether sentences are grammatically correct
+task = "cola"
+model_checkpoint = "bert-base-uncased"
+dataset = load_dataset("glue", task)
+metric = evaluate.load("glue", task)
+
+# Load our tokenizer and tokenize our data as it streams in
+tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
+
+
+def tokenize_data(data):
+    # Will add input ID and attention mask columns to dataset
+    return tokenizer(data["sentence"], truncation=True)
+
+
+encoded_dataset = dataset.map(tokenize_data, batched=True)
+
+# Now we can load our pretrained model and introduce our uncertainty quantification component,
+# which in this case is a GP final layer without any spectral normalization of the transformer weights
+num_labels = 2
+id2label = {0: "Invalid", 1: "Valid"}
+label2id = {val: key for key, val in id2label.items()}
+model = BertForUQSequenceClassification.from_pretrained(
+    model_checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id
+)
+
+
+# Specify training arguments
+metric_name = "matthews_correlation"
+model_name = model_checkpoint.split("/")[-1]
+
+args = TrainingArguments(
+    f"{model_name}-finetuned-{task}",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=BATCH_SIZE,
+    per_device_eval_batch_size=EVAL_BATCH_SIZE,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    load_best_model_at_end=True,
+    metric_for_best_model=metric_name,
+    push_to_hub=True,
+    use_mps_device=False,
+    no_cuda=True,
+)
+
+
+# Set up metric tracking
+def compute_metrics(eval_predictions):
+    predictions, labels = eval_predictions
+    predictions = np.argmax(predictions, axis=1)
+    return metric.compute(predictions=predictions, references=labels)
+
+
+# Finally, set up trainer for finetuning the model
+model.to(DEVICE)
+trainer = Trainer(
+    model,
+    args,
+    train_dataset=encoded_dataset["train"],
+    eval_dataset=encoded_dataset["validation"],
+    tokenizer=tokenizer,
+    compute_metrics=compute_metrics,
+)
+
+
+# Add a callback to reset the covariance matrix at the end of each epoch: we only need
+# the covariance accumulated during the final epoch, so resetting ensures we don't
+# double count any of the data. We could use a more elegant solution, but the covariance
+# computation is very cheap, so doing it ~5 times rather than once isn't a big deal.
+class ResetCovarianceCallback(TrainerCallback):
+    def __init__(self, trainer) -> None:
+        super().__init__()
+        self._trainer = trainer
+
+    def on_epoch_end(self, args, state, control, **kwargs):
+        if control.should_evaluate:
+            self._trainer.model.classifier.reset_cov()
+
+
+trainer.add_callback(ResetCovarianceCallback(trainer))

trainer.train()

trainer.push_to_hub()
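As a quick illustration of the data and metric plumbing in this script (a sketch only, assuming the GLUE `cola` split and metric load exactly as above): `tokenize_data` adds token-id and attention-mask columns, and `compute_metrics` turns logits into class ids before handing them to the Matthews-correlation metric.

```python
# Sketch of what tokenize_data and compute_metrics operate on.
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("glue", "cola")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

example = dataset["train"][0]
print(example["sentence"], example["label"])  # raw sentence plus a 0/1 acceptability label
print(list(tokenizer(example["sentence"], truncation=True).keys()))
# ['input_ids', 'token_type_ids', 'attention_mask']

# compute_metrics receives (logits, labels); argmax converts logits to class ids
metric = evaluate.load("glue", "cola")
toy_logits = np.array([[0.1, 0.9], [0.8, 0.2]])
toy_labels = np.array([1, 0])
print(metric.compute(predictions=np.argmax(toy_logits, axis=1), references=toy_labels))
# {'matthews_correlation': 1.0}
```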
uq.py ADDED
@@ -0,0 +1,102 @@
+from torch import nn
+from torch.nn import CrossEntropyLoss, MSELoss
+from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel
+from gp import GPClassificationHead
+
+
+class BertForUQSequenceClassification(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = GPClassificationHead(
+            hidden_size=config.hidden_size,
+            num_classes=config.num_labels,
+            num_inducing=512,
+        )
+
+        self.return_gp_cov = False
+
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+                Classification (or regression if config.num_labels==1) loss.
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+                Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
+        """
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        if self.return_gp_cov:
+            logits, gp_cov = self.classifier(
+                pooled_output,
+                return_gp_cov=True,
+                update_cov=False,
+            )
+        else:
+            logits = self.classifier(pooled_output)
+
+        outputs = (logits,) + outputs[
+            2:
+        ]  # add hidden states and attention if they are here
+
+        if labels is not None:
+            if self.num_labels == 1:
+                # We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        if self.return_gp_cov:
+            return outputs, gp_cov
+        else:
+            return outputs  # (loss), logits, (hidden_states), (attentions)
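For reference, a short sketch of the return structure of `BertForUQSequenceClassification.forward`, assuming only that `bert-base-uncased` can be downloaded; the GP head is freshly initialised here, so the outputs only illustrate shapes and tuple ordering, not meaningful predictions.

```python
# Sketch of the forward-pass return structure with and without the GP covariance.
import torch
from transformers import AutoTokenizer
from uq import BertForUQSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForUQSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

batch = tokenizer(["Good morning.", "This sentence is sentence."], padding=True, return_tensors="pt")
labels = torch.tensor([1, 0])

with torch.no_grad():
    # Default path: (loss, logits) when labels are supplied
    loss, logits = model(**batch, labels=labels)
    print(loss.item(), logits.shape)  # scalar loss, torch.Size([2, 2])

    # UQ path: the same tuple plus the GP predictive covariance over the batch
    model.return_gp_cov = True
    (loss, logits), gp_cov = model(**batch, labels=labels)
    print(gp_cov.shape)  # torch.Size([2, 2]) covariance matrix for the two inputs
```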