from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from functools import partial
from multiprocessing import Pool

# Load pre-trained model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load data (expected to be a list of dicts, each with a 'description' key,
# as used by the code below)
with open('data.pickle', 'rb') as file:
    data = pickle.load(file)

# Define a function to compute similarity for a pair of sentences
def compute_similarity(model, source_sentence, target_sentence):
    embedding_1 = model.encode(source_sentence, convert_to_tensor=True)
    embedding_2 = model.encode(target_sentence, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embedding_1, embedding_2)
    return similarity.item()
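
# Example usage of the helper above (hypothetical strings), kept commented out
# so the script's behaviour is unchanged:
# score = compute_similarity(model, "a red sports car", "a blue sedan")
# print(score)  # cosine similarity, roughly in [-1, 1]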

# Define a function to compute similarities between one source entry and every
# entry at or after it in the data (the matrix is symmetric, so earlier pairs
# are skipped)
def compute_similarities_for_source(model, source_sentence, data):
    source_index = data.index(source_sentence)
    similarities = [
        compute_similarity(model,
                           source_sentence['description'],
                           data[index]['description'])
        for index in tqdm(range(source_index, len(data)),
                          desc=f"Computing similarities for '{source_sentence['description']}'")
    ]
    return similarities

# Define a function to compute similarities for all entries in the data,
# farming the per-source work out to a process pool
def compute_similarities(model, data):
    with Pool() as pool:
        # Bind the model and the full data list so each worker only receives
        # the source entry from imap
        func = partial(compute_similarities_for_source, model, data=data)
        similarities = list(tqdm(pool.imap(func, data), total=len(data), desc="Computing similarities"))
    return similarities
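
# Optional (hypothetical) use of the multiprocessing path above; the batched
# encoding below is what actually produces the output and avoids re-encoding
# each sentence per pair, so this call stays commented out:
# pairwise_similarities = compute_similarities(model, data)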

# Embed all descriptions in a single batch and compute the full cosine-similarity matrix
descriptions = [source_sentence['description'] for source_sentence in data]
embeddings = model.encode(descriptions, convert_to_tensor=True)
matrix = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

# Save similarities to a CSV file; rows follow the same order as the columns
pd.DataFrame(matrix, columns=descriptions).to_csv('data.csv', index=False)
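
# A minimal follow-up sketch (assumptions: 'data.csv' was written by the code
# above and every description is unique): reload the matrix and list the five
# descriptions most similar to the first entry.
# sim = pd.read_csv('data.csv')
# sim.index = sim.columns  # rows follow column order, so relabel them
# query = sim.columns[0]
# print(sim[query].drop(labels=[query]).sort_values(ascending=False).head(5))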