File size: 2,802 Bytes
ed74deb
 
01714af
ed74deb
77b655a
01714af
4bd311a
01714af
1c9c58f
57da7b2
01714af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed74deb
01714af
 
 
 
 
 
 
 
 
 
 
3bba4cb
 
 
 
 
 
 
821e07e
3bba4cb
 
 
 
9485db4
 
821e07e
3bba4cb
57da7b2
 
 
 
 
 
b1c0da1
8e7eab6
3bba4cb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import torch

# Checkpoint fine-tuned on Cvent 2022 data; compared below against the base
# model and a 2019-2022 fine-tuned variant.
model_checkpoint = "vives/distilbert-base-uncased-finetuned-cvent-2022"
# output_hidden_states=True is required: pool_embeddings reads
# out["hidden_states"][-1] instead of the MLM logits.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint,output_hidden_states=True)
model_base = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased", output_hidden_states=True)
model_2019_2022 = AutoModelForMaskedLM.from_pretrained("vives/distilbert-base-uncased-finetuned-cvent-2019_2022",output_hidden_states=True)
# One tokenizer is shared by all three models (same distilbert-base-uncased vocab).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Streamlit inputs; empty until the user types something.
text1 = st.text_area("Enter first sentence")
text2 = st.text_area("Enter second sentence")

def concat_tokens(t1, t2):
  """Tokenize two sentences into one padded batch of shape (2, 128).

  Args:
    t1: first sentence (str).
    t2: second sentence (str).

  Returns:
    A dict-like BatchEncoding with 'input_ids' and 'attention_mask'
    tensors, each of shape (2, 128) — same layout the original manual
    encode-and-stack loop produced.
  """
  # The tokenizer's __call__ already handles batching, truncation,
  # padding and tensor conversion in one pass; no need to encode each
  # sentence separately and torch.stack the results.
  return tokenizer(
      [t1, t2],
      max_length=128,
      truncation=True,
      padding='max_length',
      return_tensors='pt',
  )

def pool_embeddings(out, tok):
  """Mean-pool the last hidden layer over non-padding tokens.

  Args:
    out: model output exposing out["hidden_states"]; the last entry is
      used, shape (batch, seq_len, hidden).
    tok: dict with 'attention_mask' of shape (batch, seq_len); 1 marks
      real tokens, 0 marks padding.

  Returns:
    Tensor of shape (batch, hidden): per-sentence average of the real
    token embeddings.
  """
  last_hidden = out["hidden_states"][-1]
  # Broadcast the (batch, seq) mask across the hidden dimension so
  # padding positions contribute zero to the sum.
  mask = tok['attention_mask'].unsqueeze(-1).expand(last_hidden.size()).float()
  token_sum = (last_hidden * mask).sum(dim=1)
  # Clamp avoids division by zero for an all-padding row.
  token_count = mask.sum(dim=1).clamp(min=1e-9)
  return token_sum / token_count
  
def _pair_similarity(m, tokens):
  """Run model `m` on the 2-sentence batch and return the cosine
  similarity (float) between the two mean-pooled sentence embeddings."""
  pooled = pool_embeddings(m(**tokens), tokens).detach().numpy()
  return cosine_similarity([pooled[0]], pooled[1:])[0][0]

# Only compute once both text areas are non-empty.
if text1 and text2:
  with torch.no_grad():
    tokens = concat_tokens(text1, text2)
    # Same tokenized batch scored by each model; the identical
    # similarity pipeline was previously copy-pasted three times.
    fine_tuned_out = _pair_similarity(model, tokens)
    base_out = _pair_similarity(model_base, tokens)
    fine_tuned_out2 = _pair_similarity(model_2019_2022, tokens)

    st.write(f">>>Similarity for fine-tuned (2019-2022) {fine_tuned_out2}")
    st.write(f">>>Similarity for fine-tuned (2022) {fine_tuned_out}")
    st.write(f">>>Similarity for base {base_out}")