from transformers import AutoModelForMaskedLM, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import torch

# Fine-tuned checkpoint under test, plus two reference models for comparison.
model_checkpoint = "vives/distilbert-base-uncased-finetuned-cvent-2022"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, output_hidden_states=True)
model_base = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased", output_hidden_states=True)
model_2019_2022 = AutoModelForMaskedLM.from_pretrained("vives/distilbert-base-uncased-finetuned-cvent-2019_2022", output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
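# Note: Streamlit reruns this whole script on every interaction, so the three
# model loads above repeat each time. On recent Streamlit versions, wrapping
# them in a loader function decorated with @st.cache_resource would cache the
# models across reruns; left as-is here to keep the original structure.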
text1 = st.text_area("Enter first sentence")
text2 = st.text_area("Enter second sentence")
def concat_tokens(t1, t2):
    tokens = {'input_ids': [], 'attention_mask': []}
    sentences = [t1, t2]
    for sentence in sentences:
        # encode each sentence and append to dictionary
        new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                           truncation=True, padding='max_length',
                                           return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    # reformat list of tensors into single tensor
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    return tokens
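# concat_tokens returns a batch of shape (2, 128): both sentences go through
# each model in a single forward pass, and pool_embeddings (below) turns the
# per-token outputs into one vector per sentence.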
def pool_embeddings(out, tok):
    # Mean-pool the last hidden layer over non-padding tokens: zero out the
    # padding positions, sum the rest, then divide by the real token count.
    embeddings = out["hidden_states"][-1]
    attention_mask = tok['attention_mask']
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_embeddings = embeddings * mask
    summed = torch.sum(masked_embeddings, 1)
    summed_mask = torch.clamp(mask.sum(1), min=1e-9)  # avoid division by zero
    mean_pooled = summed / summed_mask
    return mean_pooled
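# Design note: mean pooling is used here rather than the [CLS] / first-token
# embedding (hidden_states[-1][:, 0]); for masked-LM checkpoints that were not
# trained for sentence similarity, mean pooling tends to give more stable
# sentence vectors (as popularized by Sentence-BERT).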
if text1 and text2:
    with torch.no_grad():
        tokens = concat_tokens(text1, text2)
        # Fine-tuned (2022) model
        outputs = model(**tokens)
        mean_pooled = pool_embeddings(outputs, tokens).detach().numpy()
        fine_tuned_out = cosine_similarity(
            [mean_pooled[0]],
            mean_pooled[1:]
        )[0][0]
        # Base (not fine-tuned) model
        outputs_base = model_base(**tokens)
        mean_pooled_base = pool_embeddings(outputs_base, tokens).detach().numpy()
        base_out = cosine_similarity(
            [mean_pooled_base[0]],
            mean_pooled_base[1:]
        )[0][0]
        # Fine-tuned (2019-2022) model
        outputs_2019_2022 = model_2019_2022(**tokens)
        mean_pooled_2019_2022 = pool_embeddings(outputs_2019_2022, tokens).detach().numpy()
        fine_tuned_out2 = cosine_similarity(
            [mean_pooled_2019_2022[0]],
            mean_pooled_2019_2022[1:]
        )[0][0]
    st.write(f">>>Similarity for fine-tuned (2019-2022) {fine_tuned_out2}")
    st.write(f">>>Similarity for fine-tuned (2022) {fine_tuned_out}")
    st.write(f">>>Similarity for base {base_out}")