import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity


def recommend_papers(positive_csv_path, unlabelled_csv_path, n):
    """Rank unlabelled papers by similarity to positively labelled papers.

    A Word2Vec model is trained on the title + abstract text of the positive
    papers. The centroid of that model's vocabulary is then compared, by
    cosine similarity, with the mean word vector of each unlabelled paper.

    Args:
        positive_csv_path: Path to a CSV of positively labelled papers. Must
            contain 'title' and 'abstract' columns.
        unlabelled_csv_path: Path to a CSV of candidate papers. Must contain
            'id', 'title' and 'abstract' columns.
        n: Number of top-scoring papers to return.

    Returns:
        A DataFrame with columns 'id', 'title' and 'similarity_score' holding
        the ``n`` unlabelled papers with the highest score, best first.
        Papers that share no token with the positive vocabulary score 0.0.
    """
    positive_papers = pd.read_csv(positive_csv_path)
    unlabelled_papers = pd.read_csv(unlabelled_csv_path)

    for frame in (positive_papers, unlabelled_papers):
        # Missing titles/abstracts are read as NaN; without fillna the
        # concatenation yields NaN and word_tokenize raises on it.
        frame['text'] = (
            frame['title'].fillna('').astype(str)
            + ' '
            + frame['abstract'].fillna('').astype(str)
        )
        frame['tokenized_text'] = frame['text'].apply(word_tokenize)

    word2vec_model = Word2Vec(
        sentences=positive_papers['tokenized_text'].tolist(),
        vector_size=100,
        window=5,
        min_count=1,
        workers=4,
    )
    word_vectors = word2vec_model.wv

    # Only the first `max_length` in-vocabulary tokens of a paper are scored.
    max_length = 50

    # Centroid of every word learned from the positive papers. With
    # min_count=1 the vocabulary is exactly the set of distinct tokens
    # produced by word_tokenize on the positive corpus.
    centroid = np.mean(word_vectors.vectors, axis=0)

    similarity_scores = []
    for tokens in unlabelled_papers['tokenized_text']:
        # Look vectors up by *word*. Integer keys (e.g. Keras Tokenizer ids)
        # would be interpreted by KeyedVectors as positions in its own
        # vocabulary and return vectors of unrelated words.
        known_tokens = [token for token in tokens if token in word_vectors]
        known_tokens = known_tokens[:max_length]
        if known_tokens:
            document_vector = np.mean(
                [word_vectors[token] for token in known_tokens], axis=0
            )
            score = cosine_similarity([centroid], [document_vector])[0][0]
        else:
            # Nothing in common with the positive vocabulary.
            score = 0.0
        similarity_scores.append(score)

    unlabelled_papers['similarity_score'] = similarity_scores

    recommended_documents = unlabelled_papers.sort_values(
        by='similarity_score', ascending=False
    )
    return recommended_documents[['id', 'title', 'similarity_score']].head(n)