import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

def recommend_papers(positive_csv_path, unlabelled_csv_path, n):
    # Load the positive-labelled data and the unlabelled data
    positive_labelled_info = pd.read_csv(positive_csv_path)
    unlabelled_labelled = pd.read_csv(unlabelled_csv_path)

    # Combine title and abstract into a single text field for both datasets
    positive_labelled_info['text'] = positive_labelled_info['title'].fillna('') + ' ' + positive_labelled_info['abstract'].fillna('')
    unlabelled_labelled['text'] = unlabelled_labelled['title'].fillna('') + ' ' + unlabelled_labelled['abstract'].fillna('')

    # Load the pre-trained Sentence Transformer tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model.eval()

    def embed(texts):
        # Tokenize with padding and truncation, then mean-pool the token
        # embeddings (masked by attention) into one fixed-size vector per document
        encoded = tokenizer(list(texts), padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            output = model(**encoded)
        token_embeddings = output.last_hidden_state
        mask = encoded['attention_mask'].unsqueeze(-1).float()
        summed = (token_embeddings * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return (summed / counts).numpy()

    # Compute embeddings for the positive-labelled documents
    positive_embeddings = embed(positive_labelled_info['text'])

    # Compute the centroid of the positive examples' embeddings
    centroid = np.mean(positive_embeddings, axis=0)

    # Compute cosine similarity between the centroid and each unlabelled document
    unlabelled_embeddings = embed(unlabelled_labelled['text'])
    similarity_scores = cosine_similarity([centroid], unlabelled_embeddings)[0]

    # Add similarity scores and sort by score in descending order
    unlabelled_labelled['similarity_score'] = similarity_scores
    recommended_documents = unlabelled_labelled.sort_values(by='similarity_score', ascending=False)

    # Return the top n recommended documents
    return recommended_documents[['id', 'title', 'similarity_score']].head(n)
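
# A minimal usage sketch (assumption: both CSV files contain 'id', 'title', and
# 'abstract' columns; the file paths below are placeholders, not files from the
# original project).
if __name__ == "__main__":
    top_papers = recommend_papers("positive_labelled.csv", "unlabelled.csv", n=10)
    print(top_papers)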