Clemet committed on
Commit 049e137
1 Parent(s): e79483d

Upload 9 files

Files changed (5)
  1. app.py +178 -18
  2. model_lstm.pth +3 -0
  3. models.py +51 -0
  4. requirements.txt +3 -1
  5. utils.py +11 -5
app.py CHANGED
@@ -1,19 +1,34 @@
  import streamlit as st
- from utils import get_roberta, get_gpt, get_distilbert
+ from streamlit import components
+ from utils import get_roberta, get_gpt, get_distilbert, softmax
+ from models import load_custom_model
  import torch
+ import plotly.express as px
+ from plotly.subplots import make_subplots
+ import plotly.graph_objects as go
+ from bertviz.neuron_view import show
+ from bertviz import model_view, head_view
+ import pandas as pd
+ import warnings
+ warnings.filterwarnings('ignore')
+ st.set_page_config(page_title="Sentence Entailment", layout="wide")

-
-
-
- st.title('Sentence Entailment')
- col1, col2 = st.columns([1,1])
-
- with col1:
+ with st.sidebar:
+     st.title('Sentence Entailment')
      sentence1 = st.text_input('Premise')
-
- with col2:
      sentence2 = st.text_input('Hypothesis')
- btn = st.button("Submit")
+     btn = st.button("Submit")
+
+     # At least for RoBERTa
+     n_layers = 12
+     n_heads = 12
+     col1, col2 = st.columns([1,1])
+     with col1:
+         #layer = st.slider('Layer', 0, n_layers-1)
+         layer = st.number_input('Layer', min_value=0, max_value=11)
+     with col2:
+         #head = st.slider('Head', 0, n_heads-1)
+         head = st.number_input('Head', min_value=0, max_value=11)

  label_dict = {
      0 : 'entailment',
@@ -22,6 +37,15 @@ label_dict = {
  }

  if btn:
+
+     preds_tab, roberta_tab, distilbert_tab, gpt_tab, lstm_tab = st.tabs([
+         'Predictions',
+         'RoBERTa',
+         'DistilBERT',
+         'GPT',
+         'LSTM'
+     ])
+
      # Get Roberta Output
      roberta_tokenizer, roberta_model = get_roberta()
      roberta_input = roberta_tokenizer(
@@ -32,8 +56,13 @@ if btn:
          truncation=True,
          max_length=512
      )
-     roberta_logits = roberta_model(**roberta_input)['logits']
-     st.write('ROBERTA', label_dict[roberta_logits.argmax().item()])
+     roberta_outputs = roberta_model(**roberta_input)
+     roberta_logits = roberta_outputs['logits']
+     #roberta_attentions = roberta_outputs.attentions
+     #roberta_tokens = roberta_tokenizer.convert_ids_to_tokens(roberta_input['input_ids'][0])
+     #st.write('ROBERTA', label_dict[roberta_logits.argmax().item()])
+     roberta_prediction = label_dict[roberta_logits.argmax().item()]
+     roberta_probas = softmax(roberta_logits)

      distilbert_tokenizer, distilbert_model = get_distilbert()
      distilbert_input = distilbert_tokenizer(
@@ -44,9 +73,10 @@ if btn:
          truncation=True,
          max_length=512
      )
-     distilbert_logits = distilbert_model(**distilbert_input)['logits']
-     st.write('DistilBERT', label_dict[distilbert_logits.argmax().item()])
-     #
+     distilbert_output = distilbert_model(**distilbert_input)
+     distilbert_logits = distilbert_output['logits']
+     distilbert_prediction = label_dict[distilbert_logits.argmax().item()]
+     distilbert_probas = softmax(distilbert_logits)

      gpt_tokenizer, gpt_model = get_gpt()
      gpt_input = gpt_tokenizer(
@@ -57,5 +87,135 @@ if btn:
          return_tensors='pt'
      )

-     gpt_logits = gpt_model(**gpt_input)['logits']
-     st.write('GPT', label_dict[gpt_logits.argmax().item()])
+     gpt_outputs = gpt_model(**gpt_input)
+     gpt_logits = gpt_outputs['logits']
+     gpt_prediction = label_dict[gpt_logits.argmax().item()]
+     gpt_probas = softmax(gpt_logits)
+
+     lstm_model = load_custom_model('model_lstm.pth', model_type='lstm')
+     bos_token = roberta_tokenizer.bos_token  # beginning-of-sequence token
+     sep_token = roberta_tokenizer.sep_token  # separator token
+     eos_token = roberta_tokenizer.eos_token  # end-of-sequence token
+     sentence = bos_token + ' ' + sentence1 + ' ' + sep_token + ' ' + sentence2 + ' ' + eos_token
+     lstm_input = roberta_tokenizer.encode(sentence, add_special_tokens=False, padding='max_length', max_length=130, return_tensors="pt")
+
+     with torch.no_grad():
+         lstm_logits = lstm_model(lstm_input)
+     lstm_prediction = label_dict[lstm_logits.argmax().item()]
+     lstm_probas = softmax(lstm_logits)
+
+     with preds_tab:
+
+         col1, col2, col3, col4 = st.columns([1,1,1,1])
+         with col1:
+             # Pie chart of RoBERTa probabilities
+             fig = px.pie(title=f'RoBERTa : {roberta_prediction}', names=label_dict.values(), values=roberta_probas)
+             fig.update_layout(margin=dict(t=100, l=0, r=0, b=0), showlegend=False)
+             st.plotly_chart(fig, use_container_width=True)
+         with col2:
+             # Pie chart of DistilBERT probabilities
+             fig = px.pie(title=f'DistilBERT : {distilbert_prediction}', names=label_dict.values(), values=distilbert_probas)
+             fig.update_layout(margin=dict(t=100, l=0, r=0, b=0), showlegend=False)
+             st.plotly_chart(fig, use_container_width=True)
+
+         with col3:
+             # Pie chart of GPT probabilities
+             fig = px.pie(title=f'GPT : {gpt_prediction}', names=label_dict.values(), values=gpt_probas)
+             fig.update_layout(margin=dict(t=100, l=0, r=0, b=0), showlegend=False)
+             st.plotly_chart(fig, use_container_width=True)
+
+         with col4:
+             # Pie chart of LSTM probabilities
+             fig = px.pie(title=f'LSTM : {lstm_prediction}', names=label_dict.values(), values=lstm_probas)
+             fig.update_layout(margin=dict(t=100, l=0, r=0, b=0), showlegend=False)
+             st.plotly_chart(fig, use_container_width=True)
+
+     with roberta_tab:
+
+         with st.expander('Why RoBERTa?'):
+             st.write("""
+             Compared to BERT, RoBERTa introduces several optimizations in the pre-training process, such as training with larger batch sizes, omitting the next sentence prediction (NSP) pre-training phase, and using a larger corpus. These modifications demonstrated significant improvements on several NLP benchmarks. RoBERTa excels in tasks that require a deep understanding of the context and semantic relationships between sentences, which is essential here, where SNLI is used and the objective is to determine the relationship (entailment, contradiction, neutral) between a premise and a hypothesis.
+             """)
+
+         attentions = roberta_outputs.attentions
+         tokens = roberta_tokenizer.convert_ids_to_tokens(roberta_input['input_ids'][0])
+         with st.expander('Model View'):
+             st.write('Click on a cell for details')
+             components.v1.html(
+                 model_view(
+                     attention=attentions,
+                     tokens=tokens,
+                     html_action='return'
+                 )._repr_html_(), height=775, width=1000, scrolling=True)
+
+         with st.expander('Attention'):
+             attention_matrix = attentions[layer][0, head].detach().numpy()
+             separator_token = roberta_tokenizer.sep_token
+             sep_token_index = tokens.index(separator_token) if separator_token in tokens else len(tokens) - 1
+             tokens_a = tokens[1:sep_token_index]  # tokens of the first sentence
+             tokens_b = tokens[sep_token_index + 1:-1]  # tokens of the second sentence
+             attention_matrix_adjusted = attention_matrix[1:sep_token_index, sep_token_index + 1:-1]
+             df = pd.DataFrame(attention_matrix_adjusted)
+             tokens_a = [tok.split('Ġ')[-1] for tok in tokens_a]
+             tokens_b = [tok.split('Ġ')[-1] for tok in tokens_b]
+             df.index = tokens_a
+             df.columns = tokens_b
+             fig = px.imshow(df, text_auto=True)
+             fig.update_layout(margin=dict(t=0, r=0, l=0, b=0))
+             st.plotly_chart(fig)
+
+     with distilbert_tab:
+         with st.expander('Why DistilBERT?'):
+             st.write("""DistilBERT is a lightweight, optimized version of BERT, designed to deliver much of BERT's performance with a fraction of its computational resources. The knowledge of a pre-trained BERT model is "distilled" into DistilBERT, reducing the size of the model while retaining much of its learning capacity. This reduction in size translates into significantly faster training and inference, as well as lower memory usage. DistilBERT is therefore a wise choice for a wide range of NLP tasks, offering an effective compromise between performance and efficiency.""")
+         attentions = distilbert_output[-1]
+         tokens = distilbert_tokenizer.convert_ids_to_tokens(distilbert_input['input_ids'][0])
+         with st.expander('Model View'):
+             st.write('Click on a cell for details')
+             if layer > 5:
+                 st.info('Please select a layer index smaller than or equal to 5 for DistilBERT')
+             else:
+                 components.v1.html(
+                     model_view(
+                         attention=attentions,
+                         tokens=tokens,
+                         html_action='return'
+                     )._repr_html_(), height=375, width=1000, scrolling=True)
+
+         with st.expander('Attention'):
+             attention_matrix = attentions[layer][0, head].detach().numpy()
+             separator_token = distilbert_tokenizer.sep_token
+             sep_token_index = tokens.index(separator_token) if separator_token in tokens else len(tokens) - 1
+             tokens_a = tokens[1:sep_token_index]  # tokens of the first sentence
+             tokens_b = tokens[sep_token_index + 1:-1]  # tokens of the second sentence
+             attention_matrix_adjusted = attention_matrix[1:sep_token_index, sep_token_index + 1:-1]
+             df = pd.DataFrame(attention_matrix_adjusted)
+             tokens_a = [tok.split('Ġ')[-1] for tok in tokens_a]
+             tokens_b = [tok.split('Ġ')[-1] for tok in tokens_b]
+             df.index = tokens_a
+             df.columns = tokens_b
+             fig = px.imshow(df, text_auto=True)
+             fig.update_layout(margin=dict(t=0, r=0, l=0, b=0))
+             st.plotly_chart(fig)
+
+     with gpt_tab:
+         with st.expander('Why GPT?'):
+             st.write("""Using GPT for sequence classification leverages its text-generation capabilities for classification applications. Originally developed to generate text, GPT has a deep understanding of language that proves useful even for categorizing text. To adapt GPT to classification tasks, the model is fine-tuned on our dataset. This fine-tuning process lets GPT map text sequences to the defined categories effectively, adjusting its internal weights to optimize label prediction from the training data.""")
+         attentions = gpt_outputs[-1]
+         tokens = gpt_tokenizer.convert_ids_to_tokens(gpt_input['input_ids'][0])
+
+         with st.expander('Visualizations'):
+             st.warning('Not displayed for UX reasons (it causes UI lags and crashes), but the same views as for RoBERTa and DistilBERT could in theory be shown, since GPT is a transformer model too.')
+
+     with lstm_tab:
+         with st.expander('Why LSTM?'):
+             st.write("""We need a contextual analysis of word sequences in this premise-hypothesis problem.
+             LSTMs are designed to process data sequences by capturing long-term dependencies, making them suitable for this task where context and word order are important.""")
+
+         with st.expander('Architecture'):
+             st.write("""Embedding layer: converts word indices into dense vectors. Using an embedding layer is essential here to represent words in a vector space where semantic relationships can be learned. For the SNLI dataset used here, where understanding the meaning of words in context is essential, this is a consistent choice.""")
+             st.write("""Bidirectional: using a bidirectional LSTM allows the model to capture contextual information both before and after each word in the sequence, giving us a richer understanding of the overall meaning of the premise and hypothesis.""")
+             st.write("""Number of LSTM layers: stacking several LSTM layers enables the model to capture higher levels of semantic and syntactic abstraction. However, it is important to strike a balance to avoid overfitting and the training difficulties associated with deep networks. The choice of 6 layers gives us the best results.""")
+
+
+ else:
+     st.info('Enter 2 sentences')
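Side note: the 'Attention' expanders above slice one head's full attention matrix down to its premise-to-hypothesis block before plotting. A minimal sketch of that slicing logic (not part of the commit), using a hypothetical token list and random values in place of real model output:

import numpy as np

# Hypothetical tokens for "<s> premise </s> hypothesis </s>"; real tokenizer output may differ.
tokens = ['<s>', 'A', 'Ġman', 'Ġsleeps', '</s>', 'ĠA', 'Ġperson', 'Ġrests', '</s>']
attention_matrix = np.random.rand(len(tokens), len(tokens))  # stands in for attentions[layer][0, head]

sep_token_index = tokens.index('</s>')                        # position of the first separator
tokens_a = tokens[1:sep_token_index]                          # premise tokens (skip <s>)
tokens_b = tokens[sep_token_index + 1:-1]                     # hypothesis tokens (skip the final </s>)
premise_to_hypothesis = attention_matrix[1:sep_token_index, sep_token_index + 1:-1]
print(premise_to_hypothesis.shape)                            # (3, 3) for this token list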
model_lstm.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03d9ad5707ebeccec15ddcfc5418ea03dd24b416e6f4ab2d915118721dd838d0
+ size 31543848
models.py CHANGED
@@ -1,5 +1,15 @@
  from torch import nn
  from transformers import RobertaModel, RobertaConfig
+ import torch
+ from transformers import RobertaTokenizer
+
+ tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+
+ ### Special token definitions ###
+ # they mark, respectively, the start of the sequence, the separation between premise and hypothesis, and the end of the sequence #
+ bos_token = tokenizer.bos_token
+ sep_token = tokenizer.sep_token
+ eos_token = tokenizer.eos_token

  class RobertaSNLI(nn.Module):
      def __init__(self):
@@ -24,4 +34,45 @@ class RobertaSNLI(nn.Module):
          return self.output(roberta_out[:, 0]), attentions


+ class LstmSNLI(nn.Module):
+     def __init__(self):
+         super(LstmSNLI, self).__init__()
+         # Embedding layer: turns word indices into dense vectors
+         self.embedding = nn.Embedding(
+             num_embeddings=tokenizer.vocab_size,  # vocabulary size taken from the tokenizer
+             embedding_dim=128,                    # dimension of the embedding vectors
+             padding_idx=tokenizer.pad_token_id,   # index of the padding token used to equalize sequence lengths
+             max_norm=1                            # maximum norm of the embedding vectors; reduces embedding variance
+         )
+         # LSTM layer: a recurrent network able to capture long-range dependencies
+         self.lstm = nn.LSTM(
+             num_layers=4,        # 4 stacked LSTM layers for more depth
+             input_size=128,      # input size matching the embedding dimension
+             hidden_size=128,     # size of the LSTM hidden states
+             batch_first=True,    # the first input dimension is the batch size
+             dropout=0.1,         # dropout applied to the outputs of every LSTM layer except the last
+             bidirectional=True   # bidirectional LSTM to capture context from both directions
+         )
+         # Linear layer: a linear transformation that classifies the LSTM outputs
+         self.linear = nn.Linear(
+             in_features=256,     # input size
+             out_features=3       # three outputs for the three SNLI classes
+         )
+
+     def forward(self, input_ids):
+         # Turn word indices into embeddings
+         embed = self.embedding(input_ids)
+         # Run the embeddings through the LSTM
+         lstm_out = self.lstm(embed)  # lstm_out holds the outputs of every time step; the hidden states are not used here
+         return self.linear(lstm_out[0][:, 0])
+

+ def load_custom_model(model_path, model_type='lstm'):
+     if model_type == 'lstm':
+         model = LstmSNLI()
+     else:
+         model = RobertaSNLI()
+     print("")
+     model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))  # load the model weights from the file
+     model.eval()
+     return model
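A minimal usage sketch for the new LstmSNLI / load_custom_model path (not part of the commit), assuming model_lstm.pth from this commit and the roberta-base tokenizer are available; the example sentences are illustrative only:

import torch
from transformers import RobertaTokenizer
from models import load_custom_model

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = load_custom_model('model_lstm.pth', model_type='lstm')

premise = "A man is playing a guitar."       # illustrative inputs
hypothesis = "A person is making music."
text = f"{tokenizer.bos_token} {premise} {tokenizer.sep_token} {hypothesis} {tokenizer.eos_token}"
input_ids = tokenizer.encode(text, add_special_tokens=False, padding='max_length',
                             max_length=130, return_tensors='pt')

with torch.no_grad():
    logits = model(input_ids)    # tensor of shape (1, 3): one logit per SNLI class

print(logits.argmax().item())    # class index; app.py maps it to a label via label_dict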
requirements.txt CHANGED
@@ -1,4 +1,6 @@
  torch
  safetensors
  transformers
- streamlit
+ streamlit
+ plotly
+ bertviz
utils.py CHANGED
@@ -2,13 +2,13 @@ from safetensors.torch import load_model
  from transformers import RobertaTokenizer, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
  from transformers import GPT2TokenizerFast, GPT2ForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
  from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
-
+ import numpy as np



  def get_roberta():

-     model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-roberta-base')
+     model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-roberta-base', output_attentions=True)
      load_model(model, "roberta.safetensors")

      tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-roberta-base')
@@ -17,7 +17,7 @@ def get_roberta():

  def get_gpt():

-     model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=3)
+     model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=3, output_attentions=True)
      model.config.pad_token_id = model.config.eos_token_id
      load_model(model, "gpt.safetensors")

@@ -28,9 +28,15 @@ def get_gpt():

  def get_distilbert():

-     model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
+     model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3, output_attentions=True)
      load_model(model, "distilbert.safetensors")

      tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

-     return tokenizer, model
+     return tokenizer, model
+
+
+ def softmax(xx):
+     """Compute softmax values for each set of scores in x."""
+     x = xx.detach().numpy()[0]
+     return np.exp(x) / np.sum(np.exp(x), axis=0)
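For reference, a small sketch (not part of the commit) of how this softmax helper is used: app.py feeds each model's logits through it to obtain the class probabilities shown in the pie charts. The logit values below are made up.

import torch
from utils import softmax

logits = torch.tensor([[2.1, -0.3, 0.4]])  # shape (1, 3): one example, three classes
probas = softmax(logits)                   # NumPy array of three probabilities summing to 1
print(probas, probas.sum())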