ssk3232 committed
Commit
f3a086d
1 Parent(s): 5e9beb0

Upload 2 files

Files changed (2)
  1. Cosine_distance.py +66 -0
  2. requirements (1).txt +7 -0
Cosine_distance.py ADDED
@@ -0,0 +1,66 @@
+ import pandas as pd
+ from gensim.models import Word2Vec
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+ import nltk
+ from nltk.tokenize import word_tokenize
+ from keras.preprocessing.text import Tokenizer
+ from keras.preprocessing.sequence import pad_sequences
+
+ # word_tokenize needs the punkt tokenizer data
+ nltk.download('punkt', quiet=True)
+
+ def recommend_papers(positive_csv_path, unlabelled_csv_path, n):
+     # Load the positive labelled data and the unlabelled data
+     positive_labelled_info = pd.read_csv(positive_csv_path)
+     unlabelled_labelled = pd.read_csv(unlabelled_csv_path)
+
+     # Combine title and abstract into a single text field for both datasets
+     positive_labelled_info['text'] = positive_labelled_info['title'] + ' ' + positive_labelled_info['abstract']
+     unlabelled_labelled['text'] = unlabelled_labelled['title'] + ' ' + unlabelled_labelled['abstract']
+
+     # Tokenize the combined text
+     positive_labelled_info['tokenized_text'] = positive_labelled_info['text'].apply(word_tokenize)
+     unlabelled_labelled['tokenized_text'] = unlabelled_labelled['text'].apply(word_tokenize)
+
+     # Train a Word2Vec model on the tokenized text of the positive labelled data
+     word2vec_model = Word2Vec(sentences=positive_labelled_info['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)
+
+     # Fit a Keras tokenizer on the positive texts only, so its vocabulary
+     # is exactly the vocabulary of the positive corpus
+     tokenizer = Tokenizer()
+     tokenizer.fit_on_texts(positive_labelled_info['text'])
+
+     # Convert text to sequences of integer indices
+     positive_sequences = tokenizer.texts_to_sequences(positive_labelled_info['text'])
+     unlabelled_sequences = tokenizer.texts_to_sequences(unlabelled_labelled['text'])
+
+     # Pad or truncate sequences to a fixed length
+     max_length = 50  # Adjust as needed
+     padded_positive_sequences = pad_sequences(positive_sequences, maxlen=max_length, padding='post', truncating='post')
+     padded_unlabelled_sequences = pad_sequences(unlabelled_sequences, maxlen=max_length, padding='post', truncating='post')
+
+     # Compute the centroid as the mean embedding of the positive vocabulary
+     centroid = np.mean([word2vec_model.wv[word] for word in tokenizer.index_word.values() if word in word2vec_model.wv], axis=0)
+
+     # Compute a similarity score for each document in the unlabelled data
+     similarity_scores = []
+     for sequence in padded_unlabelled_sequences:
+         # Map integer indices back to words before the embedding lookup
+         # (index 0 is padding; Keras indices do not match gensim's internal indices)
+         words = [tokenizer.index_word[idx] for idx in sequence if idx != 0]
+         embeddings = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
+         if len(embeddings) > 0:
+             similarity_score = cosine_similarity([centroid], [np.mean(embeddings, axis=0)])[0][0]
+         else:
+             similarity_score = 0  # Default score for documents with no in-vocabulary words
+         similarity_scores.append(similarity_score)
+
+     # Attach the similarity scores to the unlabelled data
+     unlabelled_labelled['similarity_score'] = similarity_scores
+
+     # Sort the unlabelled data by similarity score in descending order
+     recommended_documents = unlabelled_labelled.sort_values(by='similarity_score', ascending=False)
+
+     # Return the top n recommended documents
+     return recommended_documents[['id', 'title', 'similarity_score']].head(n)
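For reference, a minimal usage sketch (the file names here are hypothetical; both CSVs are assumed to carry the id, title, and abstract columns the function reads):

    from Cosine_distance import recommend_papers

    # Top 10 unlabelled papers ranked by cosine similarity to the positive centroid
    top_papers = recommend_papers('positive_labelled.csv', 'unlabelled.csv', n=10)
    print(top_papers.to_string(index=False))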
requirements (1).txt ADDED
@@ -0,0 +1,7 @@
+ joblib
+ scikit-learn
+ gensim
+ scipy
+ nltk
+ keras
+ tensorflow
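To recreate the environment, a sketch of the usual setup (no versions are pinned in the file; the punkt download mirrors the call added in Cosine_distance.py):

    pip install -r "requirements (1).txt"
    python -m nltk.downloader punkt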