Update Cosine_distance.py
Cosine_distance.py CHANGED (+15 -28)
@@ -1,10 +1,9 @@
 import pandas as pd
-from gensim.models import Word2Vec
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
 from nltk.tokenize import word_tokenize
-from …
-
+from transformers import AutoTokenizer, AutoModel
+import torch
 
 def recommend_papers(positive_csv_path, unlabelled_csv_path, n):
     # Load positive labelled data and unlabelled labelled data
@@ -15,37 +14,25 @@ def recommend_papers(positive_csv_path, unlabelled_csv_path, n):
     positive_labelled_info['text'] = positive_labelled_info['title'] + ' ' + positive_labelled_info['abstract']
     unlabelled_labelled['text'] = unlabelled_labelled['title'] + ' ' + unlabelled_labelled['abstract']
 
-    # …
-    …
-    …
+    # Tokenize text using the pre-trained Sentence Transformer model
+    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+    positive_labelled_info['input_ids'] = positive_labelled_info['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
+    unlabelled_labelled['input_ids'] = unlabelled_labelled['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
 
-    # …
-    …
+    # Load the pre-trained Sentence Transformer model
+    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
 
-    # …
-    …
-    tokenizer.fit_on_texts(positive_labelled_info['text'])
+    # Compute embeddings for positive labelled data
+    positive_labelled_info['embedding'] = positive_labelled_info['input_ids'].apply(lambda x: model(torch.tensor([x]))[0])
 
-    # …
-    positive_sequences = tokenizer.texts_to_sequences(positive_labelled_info['text'])
-    unlabelled_sequences = tokenizer.texts_to_sequences(unlabelled_labelled['text'])
-
-    # Pad or truncate sequences to a fixed length
-    max_length = 50  # Adjust as needed
-    padded_positive_sequences = pad_sequences(positive_sequences, maxlen=max_length, padding='post', truncating='post')
-    padded_unlabelled_sequences = pad_sequences(unlabelled_sequences, maxlen=max_length, padding='post', truncating='post')
-
-    # Compute centroid of positive examples' word embeddings
-    centroid = np.mean([word2vec_model.wv[token] for token in tokenizer.index_word.values() if token in word2vec_model.wv], axis=0)
+    # Compute centroid of positive examples' embeddings
+    centroid = np.mean(positive_labelled_info['embedding'].tolist(), axis=0)
 
     # Compute similarity scores for each document in the unlabelled labelled data
     similarity_scores = []
-    for …
-        …
-        …
-            similarity_score = cosine_similarity([centroid], [np.mean(embeddings, axis=0)])[0][0]
-        else:
-            similarity_score = 0  # Default similarity score for empty embeddings
+    for input_ids in unlabelled_labelled['input_ids']:
+        embedding = model(torch.tensor([input_ids]))[0].detach().numpy()
+        similarity_score = cosine_similarity([centroid], [embedding])[0][0]
         similarity_scores.append(similarity_score)
 
     # Add similarity scores to unlabelled labelled data
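One caveat worth flagging next to this diff: `model(torch.tensor([input_ids]))[0]` is the token-level `last_hidden_state`, a `(1, seq_len, 384)` tensor whose `seq_len` differs per paper, so averaging those tensors into a centroid with `np.mean(...)` (or handing one straight to `cosine_similarity`) does not produce fixed-size, comparable vectors. Sentence-transformers checkpoints such as all-MiniLM-L6-v2 are usually reduced to one 384-dimensional vector per text via attention-mask mean pooling. Below is a minimal sketch of that recipe, not the commit's method, assuming the dataframe names from the diff (`positive_labelled_info`, `unlabelled_labelled`); the `embed` helper is introduced here for illustration:

import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def embed(texts):
    # Batch-tokenize with padding and truncation so every sequence fits the
    # model's length limit and shares one length within the batch.
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state         # (batch, seq_len, 384)
    # Mean-pool over real tokens only; padding positions are zeroed by the mask.
    mask = encoded["attention_mask"].unsqueeze(-1).float()  # (batch, seq_len, 1)
    return ((hidden * mask).sum(dim=1) / mask.sum(dim=1)).numpy()  # (batch, 384)

positive_vecs = embed(positive_labelled_info['text'].tolist())
centroid = positive_vecs.mean(axis=0, keepdims=True)        # (1, 384)
unlabelled_vecs = embed(unlabelled_labelled['text'].tolist())
similarity_scores = cosine_similarity(centroid, unlabelled_vecs)[0]  # one score per paper

Batching through `tokenizer(...)` also enforces the model's 512-token limit via `truncation=True`, which the per-row `tokenizer.encode(x, add_special_tokens=True)` calls in the diff do not.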
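For completeness, the entry point would presumably be called like this (the CSV paths are hypothetical, and the tail of the function, which attaches the scores and selects the top n, is cut off above):

# Hypothetical file names; assumes recommend_papers returns the n highest-scoring papers.
top_papers = recommend_papers("positive_labelled.csv", "unlabelled_papers.csv", n=10)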