File size: 2,513 Bytes
faf09ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from warnings import filterwarnings
filterwarnings("ignore")
# Registry of the supported sentence-embedding checkpoints, keyed by the short
# display name. Each entry holds:
#   model_size - approximate download size, as a human-readable string
#   model_url  - repository id handed to the model/tokenizer loaders
#   efficiency - coarse, human-readable speed rating
#   chunk_size - configured number of tokens per chunk when long text is split
models_info = {
    short_name: {
        "model_size": size,
        "model_url": repo_id,
        "efficiency": rating,
        "chunk_size": 512,
    }
    for short_name, size, repo_id, rating in (
        ("MPNet-base-v2", "420MB", "sentence-transformers/all-mpnet-base-v2", "Moderate"),
        ("DistilRoBERTa-v1", "263MB", "sentence-transformers/all-distilroberta-v1", "High"),
        ("MiniLM-L12-v2", "118MB", "sentence-transformers/all-MiniLM-L12-v2", "High"),
        ("MiniLM-L6-v2", "82MB", "sentence-transformers/all-MiniLM-L6-v2", "Very High"),
    )
}

# Short names of every supported model, in registry order.
models = list(models_info)
class Description_Validator:
    """Score how semantically similar two text descriptions are.

    Each description is split into token chunks, every chunk is embedded with
    a SentenceTransformer model, the chunk embeddings are averaged into one
    vector, and two such vectors are compared with cosine similarity.
    """

    def __init__(self, model_name=None):
        """Load the embedding model and tokenizer registered as *model_name*.

        Args:
            model_name: A key of ``models_info``. ``None`` selects
                ``"DistilRoBERTa-v1"``.

        Raises:
            KeyError: If *model_name* is not a key of ``models_info``.
        """
        if model_name is None:
            model_name = "DistilRoBERTa-v1"
        try:
            self.model_info = models_info[model_name]
        except KeyError:
            # Same exception type as a plain lookup, but with an actionable message.
            raise KeyError(
                f"Unknown model {model_name!r}; expected one of {list(models_info)}"
            ) from None
        model_url = self.model_info["model_url"]
        self.model = SentenceTransformer(model_url)
        self.tokenizer = AutoTokenizer.from_pretrained(model_url)
        # The encoder truncates anything beyond its own max_seq_length (special
        # tokens included), so a larger configured chunk would silently lose
        # its tail. Cap the chunk size to what the encoder can actually consume.
        self.chunk_size = self._fit_chunk_size(
            self.model_info["chunk_size"],
            self.model.max_seq_length,
            self.tokenizer.num_special_tokens_to_add(pair=False),
        )

    @staticmethod
    def _fit_chunk_size(configured, max_seq_length, special_tokens):
        """Return the largest usable chunk size that is at most *configured*.

        Args:
            configured: Requested number of content tokens per chunk.
            max_seq_length: Encoder input limit in tokens, or ``None`` when
                the encoder reports no limit.
            special_tokens: Tokens the encoder adds around a single sequence.

        Returns:
            ``configured`` when no limit is reported; otherwise the smaller of
            ``configured`` and ``max_seq_length - special_tokens``, never
            below 1 so that chunking always makes progress.
        """
        if max_seq_length is None:
            return configured
        return max(1, min(configured, max_seq_length - special_tokens))

    def tokenize_and_chunk(self, text):
        """Tokenize *text* and split the token ids into consecutive chunks.

        Args:
            text: A single string to tokenize.

        Returns:
            A list of token-id lists, each at most ``self.chunk_size`` long.
            The list is empty when *text* produces no tokens.
        """
        # No special tokens and no truncation: the full text is wanted here,
        # and the encoder adds its own special tokens to each chunk later.
        token_ids = self.tokenizer(
            text, truncation=False, add_special_tokens=False
        )["input_ids"]
        step = self.chunk_size
        return [
            token_ids[start:start + step]
            for start in range(0, len(token_ids), step)
        ]

    def get_average_embedding(self, text):
        """Return the mean of the embeddings of all chunks of *text*.

        Args:
            text: A single string to embed.

        Returns:
            A 1-D array: the unweighted mean of the per-chunk embeddings.
            When *text* yields no tokens (e.g. the empty string), the
            embedding of *text* itself is returned instead of NaN.
        """
        chunk_texts = [
            self.tokenizer.decode(chunk)
            for chunk in self.tokenize_and_chunk(text)
        ]
        if not chunk_texts:
            # Averaging zero embeddings would produce NaN and make the later
            # cosine similarity fail; embed the raw text instead.
            chunk_texts = [text]
        # One batched call instead of one encode() per chunk.
        chunk_embeddings = self.model.encode(chunk_texts, show_progress_bar=False)
        return np.mean(np.asarray(chunk_embeddings), axis=0)

    def similarity_score(self, desc1, desc2):
        """Return the cosine similarity between two descriptions.

        Args:
            desc1: First description string.
            desc2: Second description string.

        Returns:
            The cosine similarity of the two averaged embeddings, as a scalar.
        """
        # cosine_similarity expects 2-D inputs of shape (n_samples, n_features).
        embedding1 = self.get_average_embedding(desc1).reshape(1, -1)
        embedding2 = self.get_average_embedding(desc2).reshape(1, -1)
        similarity = cosine_similarity(embedding1, embedding2)
        return similarity[0][0]