import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

|
def recommend_papers(positive_csv_path, unlabelled_csv_path, n):
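    """Rank unlabelled papers by cosine similarity to the centroid of the
    positively labelled papers' MiniLM embeddings and return the top n."""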
    positive_labelled_info = pd.read_csv(positive_csv_path)
    unlabelled_labelled = pd.read_csv(unlabelled_csv_path)

    # Combine title and abstract into a single text field per paper.
    positive_labelled_info['text'] = positive_labelled_info['title'] + ' ' + positive_labelled_info['abstract']
    unlabelled_labelled['text'] = unlabelled_labelled['title'] + ' ' + unlabelled_labelled['abstract']
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model.eval()  # inference only: disables dropout

    def embed(text):
        # Tokenize with truncation so long title+abstract strings stay
        # within the encoder's maximum input length.
        inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            last_hidden_state = model(**inputs).last_hidden_state  # (1, seq_len, hidden)
        # Mean-pool over the token axis to get one fixed-size vector per
        # document; the raw hidden states vary in sequence length and
        # cannot be averaged across documents directly. With a single,
        # unpadded sequence, a plain mean matches mask-weighted pooling.
        return last_hidden_state.mean(dim=1).squeeze(0).numpy()
    # Embed the positive papers and average them into a single centroid.
    positive_embeddings = np.vstack([embed(t) for t in positive_labelled_info['text']])
    centroid = positive_embeddings.mean(axis=0)

    # Score every unlabelled paper by cosine similarity to the centroid.
    unlabelled_embeddings = np.vstack([embed(t) for t in unlabelled_labelled['text']])
    similarity_scores = cosine_similarity(centroid.reshape(1, -1), unlabelled_embeddings)[0]

    unlabelled_labelled['similarity_score'] = similarity_scores

    recommended_documents = unlabelled_labelled.sort_values(by='similarity_score', ascending=False)
    return recommended_documents[['id', 'title', 'similarity_score']].head(n)
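

# A minimal usage sketch: the CSV filenames and n below are hypothetical,
# and both files are assumed to contain 'id', 'title', and 'abstract' columns.
if __name__ == "__main__":
    top_papers = recommend_papers("positive_papers.csv", "unlabelled_papers.csv", n=10)
    print(top_papers.to_string(index=False))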
|
|