import streamlit as st
from streamlit import components
from utils import get_roberta, get_gpt, get_distilbert, softmax
from models import load_custom_model
import torch
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from bertviz.neuron_view import show
from bertviz import model_view, head_view
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
st.set_page_config(page_title="Sentence Entailment",layout="wide")

with st.sidebar:
    st.title('Sentence Entailment')
    sentence1 = st.text_input('Premise')
    sentence2 = st.text_input('Hypothesis')
    btn = st.button("Submit")

    # RoBERTa has 12 layers and 12 attention heads (DistilBERT has only 6 layers; handled in its tab)
    n_layers = 12
    n_heads = 12
    col1, col2 = st.columns([1,1])
    with col1:
        #layer = st.slider('Layer', 0, n_layers-1)
        layer = st.number_input('Layer', min_value=0, max_value=n_layers-1)
    with col2:
        #head = st.slider('Head', 0, n_heads-1)
        head = st.number_input('Head', min_value=0, max_value=n_heads-1)

label_dict = {
    0 : 'entailment',
    1 : 'neutral',
    2 : 'contradiction'
}

if btn:

    preds_tab, roberta_tab, distilbert_tab, gpt_tab, lstm_tab = st.tabs([
        'Predictions', 
        'RoBERTa', 
        'DistilBERT', 
        'GPT', 
        'LSTM'
    ])

    # Get Roberta Output
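    # Assumption: get_roberta() returns a model loaded with output_attentions=True,
    # since roberta_outputs.attentions is visualized in the RoBERTa tab below.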
    roberta_tokenizer, roberta_model = get_roberta()
    roberta_input = roberta_tokenizer(
        sentence1, 
        sentence2, 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=512
    )
    roberta_outputs = roberta_model(**roberta_input)
    roberta_logits = roberta_outputs['logits']
    #roberta_attentions = roberta_outputs.attentions
    #roberta_tokens = roberta_tokenizer.convert_ids_to_tokens(roberta_input['input_ids'][0])
    #st.write('ROBERTA', label_dict[roberta_logits.argmax().item()])
    roberta_prediction = label_dict[roberta_logits.argmax().item()]
    roberta_probas = softmax(roberta_logits)

    distilbert_tokenizer, distilbert_model = get_distilbert()
    distilbert_input = distilbert_tokenizer(
        sentence1, 
        sentence2, 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=512
    )
    distilbert_output = distilbert_model(**distilbert_input)
    distilbert_logits = distilbert_output['logits']
    distilbert_prediction = label_dict[distilbert_logits.argmax().item()]
    distilbert_probas = softmax(distilbert_logits)

    gpt_tokenizer, gpt_model = get_gpt()
    gpt_input = gpt_tokenizer(
        sentence1 + ' [SEP] ' + sentence2,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt'
    )
    
    gpt_outputs = gpt_model(**gpt_input)
    gpt_logits = gpt_outputs['logits']
    gpt_prediction = label_dict[gpt_logits.argmax().item()]
    gpt_probas = softmax(gpt_logits)

    lstm_model = load_custom_model('model_lstm.pth', model_type='lstm')
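    # Assumption: load_custom_model rebuilds the LSTM architecture sketched in the
    # LSTM tab below and loads its trained weights from model_lstm.pth.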
    bos_token = roberta_tokenizer.bos_token  # beginning-of-sequence token
    sep_token = roberta_tokenizer.sep_token  # separator token
    eos_token = roberta_tokenizer.eos_token  # end-of-sequence token
    sentence = bos_token + ' ' + sentence1 + ' ' + sep_token + ' ' + sentence2 + ' ' + eos_token
    lstm_input = roberta_tokenizer.encode(sentence, add_special_tokens=False, padding='max_length', max_length=130, return_tensors="pt")

    with torch.no_grad():
        lstm_logits = lstm_model(lstm_input)
        lstm_prediction = label_dict[lstm_logits.argmax().item()]
        lstm_probas = softmax(lstm_logits)

    with preds_tab:

        col1, col2, col3, col4 = st.columns([1,1,1,1])
        with col1:
            # Pie RoBERTa probabilities
            fig = px.pie(title=f'RoBERTa : {roberta_prediction}', names=label_dict.values(), values=roberta_probas)
            fig.update_layout(margin=dict(t=100, l=0, r=0, b=0), showlegend=False)
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            # Pie DistilBERT probabilities
            fig = px.pie(title=f'DistilBERT : {distilbert_prediction}', names=label_dict.values(), values=distilbert_probas)
            fig.update_layout(margin=dict(t=100, l=0, r=0, b=0), showlegend=False)
            st.plotly_chart(fig, use_container_width=True)

        with col3:
            # Pie GPT probabilities
            fig = px.pie(title=f'GPT : {gpt_prediction}', names=label_dict.values(), values=gpt_probas)
            fig.update_layout(margin=dict(t=100, l=0, r=0, b=0), showlegend=False)
            st.plotly_chart(fig, use_container_width=True)

        with col4:
            # Pie LSTM probabilities
            fig = px.pie(title=f'LSTM : {lstm_prediction}', names=label_dict.values(), values=lstm_probas)
            fig.update_layout(margin=dict(t=100, l=0, r=0, b=0), showlegend=False)
            st.plotly_chart(fig, use_container_width=True)

    with roberta_tab:

        with st.expander('Why RoBERTa?'):
            st.write("""
                Compared to BERT, RoBERTa introduces several optimizations in the pre-training process, such as training with larger batch sizes, dropping the next sentence prediction (NSP) objective, and using a larger corpus. These modifications yielded significant improvements on several NLP benchmarks. RoBERTa excels at tasks that require a deep understanding of context and of the semantic relationships between sentences, which is essential here: we use SNLI and the objective is to determine the relationship (entailment, contradiction, neutral) between a premise and a hypothesis.
            """)

        attentions = roberta_outputs.attentions
        tokens = roberta_tokenizer.convert_ids_to_tokens(roberta_input['input_ids'][0])
        with st.expander('Model View'):
            st.write('Click on a cell for details')
            components.v1.html(
                model_view(
                    attention=attentions, 
                    tokens=tokens, 
                    html_action='return'
                )._repr_html_(), height=775, width=1000, scrolling=True)
            
        with st.expander('Attention at selected layer and head'):
            attention_matrix = attentions[layer][0, head].detach().numpy()
            separator_token = roberta_tokenizer.sep_token
            sep_token_index = tokens.index(separator_token) if separator_token in tokens else len(tokens) - 1
            tokens_a = tokens[1:sep_token_index]  # tokens of the premise (first sentence)
            tokens_b = tokens[sep_token_index + 1:-1]  # tokens of the hypothesis (second sentence)
            # Keep only the premise-to-hypothesis block of the attention matrix
            attention_matrix_adjusted = attention_matrix[1:sep_token_index, sep_token_index + 1:-1]
            df = pd.DataFrame(attention_matrix_adjusted)
            tokens_a = [tok.split('Ġ')[-1] for tok in tokens_a]  # strip RoBERTa's byte-level BPE 'Ġ' marker for display
            tokens_b = [tok.split('Ġ')[-1] for tok in tokens_b]
            df.index = tokens_a
            df.columns = tokens_b
            fig = px.imshow(df, text_auto=True)
            fig.update_layout(margin=dict(t=0,r=0,l=0,b=0))
            st.plotly_chart(fig)

    with distilbert_tab:
        with st.expander('Why DistilBERT?'):
            st.write("""DistilBERT represents a lightweight, optimized version of BERT, designed to deliver much of BERT's performance with a fraction of its computational resources. The knowledge of a pre-trained BERT model is "distilled" in DistilBERT, reducing the size of the model while retaining much of its learning capacity. This reduction in size translates into a significant acceleration in training and inference time, as well as a reduction in memory usage. DistilBERT is therefore a wise choice for a wide range of NLP tasks, offering an effective compromise between performance and efficiency.""")
        attentions = distilbert_output.attentions  # equivalent to distilbert_output[-1] when output_attentions=True
        tokens = distilbert_tokenizer.convert_ids_to_tokens(distilbert_input['input_ids'][0])
        with st.expander('Model View'):
            st.write('Click on a cell for details')
            if layer > 5:
                st.info('Please select a layer index of 5 or lower for DistilBERT (it has only 6 layers)')
            else:
                components.v1.html(
                    model_view(
                        attention=attentions, 
                        tokens=tokens, 
                        html_action='return'
                    )._repr_html_(), height=375, width=1000, scrolling=True)
                
        with st.expander('Attention at selected layer and head'):
            if layer > 5:
                st.info('Please select a layer index of 5 or lower for DistilBERT (it has only 6 layers)')
            else:
                attention_matrix = attentions[layer][0, head].detach().numpy()
                separator_token = distilbert_tokenizer.sep_token
                sep_token_index = tokens.index(separator_token) if separator_token in tokens else len(tokens) - 1
                tokens_a = tokens[1:sep_token_index]  # tokens of the premise (first sentence)
                tokens_b = tokens[sep_token_index + 1:-1]  # tokens of the hypothesis (second sentence)
                # Keep only the premise-to-hypothesis block of the attention matrix
                attention_matrix_adjusted = attention_matrix[1:sep_token_index, sep_token_index + 1:-1]
                df = pd.DataFrame(attention_matrix_adjusted)
                tokens_a = [tok.replace('##', '') for tok in tokens_a]  # DistilBERT uses WordPiece '##' prefixes, not RoBERTa's 'Ġ'
                tokens_b = [tok.replace('##', '') for tok in tokens_b]
                df.index = tokens_a
                df.columns = tokens_b
                fig = px.imshow(df, text_auto=True)
                fig.update_layout(margin=dict(t=0,r=0,l=0,b=0))
                st.plotly_chart(fig)

    with gpt_tab:
        with st.expander('Why GPT?'):
            st.write("""The use of GPT for sequence classification exploits its text generation capabilities for classification applications.Originally developed to generate text, GPT possesses a deep understanding of language that proves beneficial even for categorizing text.To adapt GPT to classification tasks, we perform model fine-tuning on our dataset. This tuning process enables GPT to efficiently link text sequences to defined categories, adjusting its internal weights to optimize label prediction from training data.""")
        attentions = gpt_outputs[-1]
        tokens = gpt_tokenizer.convert_ids_to_tokens(gpt_input['input_ids'][0])

        with st.expander('Visualizations'):
            st.warning('Not displayed for UX reasons (it causes heavy lag and crashes), but the same views as for RoBERTa and DistilBERT could in principle be shown, since GPT is a transformer model too.')

    with lstm_tab:
        with st.expander('Why LSTM?'):
            st.write("""We need a contextual analysis of word sequences in this premise-hypothesis problem.
                        LSTMs are designed to process data sequences by capturing long-term dependencies, making them suitable for this tp where context and word order are important.""")          
        
        with st.expander('Architecture'):
            st.write("""Embedding Layer: Converts word indices into dense vectors. Using an embedding layer is essential here to represent words in a vector space where semantic relationships can be learned. For the SNLI dataset in our case, where understanding the meaning of words in context is essential, this choice is consistent.""")
            st.write("""Bidirectional: Using a bidirectional LSTM allows the model to capture contextual information both before and after each word in the sequence, giving us a richer understanding of the overall meaning of the premise and hypothesis.""")
            st.write("""Number of LSTM layers: Having several LSTM layers enables the model to capture higher levels of semantic and syntactic abstraction. However, it's important to strike a balance to avoid overlearning and the training difficulties associated with deep networks. The choice of 6 layers gives us the best results""")


else:
    st.info('Enter a premise and a hypothesis, then click Submit')