File size: 2,494 Bytes
4547fcf
9c52068
 
 
4547fcf
 
9c52068
 
 
 
 
4547fcf
abc1971
 
76316b2
 
 
 
 
 
 
 
 
9c52068
4547fcf
a71f9d4
954178d
 
4547fcf
9423c98
 
 
5117017
 
abc1971
5117017
 
 
9c52068
 
 
 
2c2a191
9c52068
f336aa0
 
9c52068
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5117017
 
 
 
4547fcf
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import streamlit as st
import pandas as pd
import numpy as np

from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer
from bokeh.plotting import figure, output_notebook, show, save
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, HoverTool
from sklearn.manifold import TSNE


# NOTE(review): newer Streamlit releases deprecate st.cache in favor of
# st.cache_resource for objects such as models - confirm against the pinned
# Streamlit version before changing the decorator.
@st.cache
def load_model():
  """Return the sentence encoder, switched to evaluation mode.

  The function is wrapped in Streamlit's cache decorator so the model is
  not rebuilt on every script rerun.
  """
  encoder = SentenceTransformer(
      'hackathon-pln-es/bertin-roberta-base-finetuning-esnli'
  )
  encoder.eval()
  return encoder
      
@st.cache
def load_plot_data():
  """Load the reference data used for the embedding plot.

  Returns:
    A ``(embeddings, frame)`` tuple: the array stored in
    ``semeval2015-embs.npy`` and the table read from
    ``semeval2015-data.csv``. Both paths are relative to the working
    directory.
  """
  return (
      np.load('semeval2015-embs.npy'),
      pd.read_csv('semeval2015-data.csv'),
  )
    
# Page header: title followed by the introductory paragraphs, in order.
st.title("Sentence Embedding for Spanish with Bertin")
for paragraph in (
    "Sentence embedding for spanish trained on NLI. Used for Sentence Textual Similarity. Based on the model hackathon-pln-es/bertin-roberta-base-finetuning-esnli.",
    "Introduce two sentence to see their cosine similarity and a graph showing them in the embedding space.",
    "Authors: Anibal Pérez, Emilio Tomás Ariza, Lautaro Gesuelli y Mauricio Mazuecos.",
):
  st.write(paragraph)

# Free-text inputs for the two sentences to compare.
sent1 = st.text_area('Enter sentence 1')
sent2 = st.text_area('Enter sentence 2')

# Button handler: when both sentences are present, show their cosine
# similarity and plot them alongside the reference sentences in a 2-D t-SNE
# projection of the embedding space.
if st.button('Compute similarity'):
  if sent1 and sent2:
    model = load_model()
    encodings = model.encode([sent1, sent2])
    # cos_sim yields a 1x1 result for a single pair; pull out the scalar.
    sim = cos_sim(encodings[0], encodings[1]).numpy().tolist()[0][0]
    st.text(f'Cosine Similarity: {sim:.4f}')

    print('Generating visualization...')
    sentembs, data = load_plot_data()
    # The two query embeddings are stacked after the reference embeddings, so
    # they occupy the last two rows of the projection.
    X_embedded = TSNE(
        n_components=2,
        learning_rate='auto',
        init='random',
    ).fit_transform(np.concatenate([sentembs, encodings], axis=0))

    # DataFrame.append was removed in pandas 2.0; pd.concat adds the same two
    # rows (sentence 1, then sentence 2) and also returns a new frame, so the
    # frame held by the cache is not modified in place.
    query_rows = pd.DataFrame({
        'sent': [sent1, sent2],
        'color': ['#F0E442', '#D55E00'],
    })
    data = pd.concat([data, query_rows], ignore_index=True)
    data['x'] = X_embedded[:, 0]
    data['y'] = X_embedded[:, 1]

    source = ColumnDataSource(data)

    p = figure(title="Embeddings in space")
    p.circle(
      x='x',
      y='y',
      legend_label="Objects",
      color='color',
      fill_alpha=0.5,
      line_color="blue",
      size=14,
      source=source
    )
    # Show the sentence text when hovering over a point.
    p.add_tools(HoverTool(
      tooltips=[
          ('sent', '@sent')
      ],
      formatters={
          '@sent': 'printf'
      },
      mode='mouse'
    ))
    st.bokeh_chart(p, use_container_width=True)
  else:
    st.write('Missing a sentences')