from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import torch

# Load the three models to compare: fine-tuned on 2022 data, the base model,
# and a model fine-tuned on 2019-2022 data. All return hidden states so we can
# mean-pool the last layer into sentence embeddings.
model_checkpoint = "vives/distilbert-base-uncased-finetuned-cvent-2022"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, output_hidden_states=True)
model_base = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased", output_hidden_states=True)
model_2019_2022 = AutoModelForMaskedLM.from_pretrained("vives/distilbert-base-uncased-finetuned-cvent-2019_2022", output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

text1 = st.text_area("Enter first sentence")
text2 = st.text_area("Enter second sentence")


def concat_tokens(t1, t2):
    """Tokenize the two sentences into a single padded batch of shape (2, 128)."""
    tokens = {'input_ids': [], 'attention_mask': []}
    sentences = [t1, t2]
    for sentence in sentences:
        # encode each sentence and append to dictionary
        new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                           truncation=True, padding='max_length',
                                           return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    # reformat list of tensors into single tensor
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    return tokens


def pool_embeddings(out, tok):
    """Mean-pool the last hidden state, using the attention mask to ignore padding."""
    embeddings = out["hidden_states"][-1]
    attention_mask = tok['attention_mask']
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_embeddings = embeddings * mask
    summed = torch.sum(masked_embeddings, 1)
    summed_mask = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = summed / summed_mask
    return mean_pooled


if text1 and text2:
    with torch.no_grad():
        tokens = concat_tokens(text1, text2)

        # fine-tuned on 2022 data
        outputs = model(**tokens)
        mean_pooled = pool_embeddings(outputs, tokens).detach().numpy()
        fine_tuned_out = cosine_similarity([mean_pooled[0]], mean_pooled[1:])[0][0]

        # base distilbert
        outputs_base = model_base(**tokens)
        mean_pooled_base = pool_embeddings(outputs_base, tokens).detach().numpy()
        base_out = cosine_similarity([mean_pooled_base[0]], mean_pooled_base[1:])[0][0]

        # fine-tuned on 2019-2022 data
        outputs_2019_2022 = model_2019_2022(**tokens)
        mean_pooled_2019_2022 = pool_embeddings(outputs_2019_2022, tokens).detach().numpy()
        fine_tuned_out2 = cosine_similarity([mean_pooled_2019_2022[0]], mean_pooled_2019_2022[1:])[0][0]

    st.write(f">>>Similarity for fine-tuned (2019-2022) {fine_tuned_out2}")
    st.write(f">>>Similarity for fine-tuned (2022) {fine_tuned_out}")
    st.write(f">>>Similarity for base {base_out}")