ssk3232 committed on
Commit
6c9a7ab
1 Parent(s): 8782dc3

Update Cosine_distance.py

Files changed (1)
  1. Cosine_distance.py +15 -28
Cosine_distance.py CHANGED
@@ -1,10 +1,9 @@
 import pandas as pd
-from gensim.models import Word2Vec
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
 from nltk.tokenize import word_tokenize
-from keras.preprocessing.text import Tokenizer
-from keras.preprocessing.sequence import pad_sequences
+from transformers import AutoTokenizer, AutoModel
+import torch
 
 def recommend_papers(positive_csv_path, unlabelled_csv_path, n):
     # Load positive labelled data and unlabelled labelled data
@@ -15,37 +14,25 @@ def recommend_papers(positive_csv_path, unlabelled_csv_path, n):
     positive_labelled_info['text'] = positive_labelled_info['title'] + ' ' + positive_labelled_info['abstract']
     unlabelled_labelled['text'] = unlabelled_labelled['title'] + ' ' + unlabelled_labelled['abstract']
 
-    # Preprocess text and tokenize
-    positive_labelled_info['tokenized_text'] = positive_labelled_info['text'].apply(word_tokenize)
-    unlabelled_labelled['tokenized_text'] = unlabelled_labelled['text'].apply(word_tokenize)
+    # Tokenize text using the pre-trained Sentence Transformer model
+    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+    positive_labelled_info['input_ids'] = positive_labelled_info['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
+    unlabelled_labelled['input_ids'] = unlabelled_labelled['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
 
-    # Train Word2Vec model on the tokenized text of positive labelled data
-    word2vec_model = Word2Vec(sentences=positive_labelled_info['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)
+    # Load the pre-trained Sentence Transformer model
+    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
 
-    # Initialize tokenizer
-    tokenizer = Tokenizer()
-    tokenizer.fit_on_texts(positive_labelled_info['text'])
+    # Compute embeddings for positive labelled data
+    positive_labelled_info['embedding'] = positive_labelled_info['input_ids'].apply(lambda x: model(torch.tensor([x]))[0])
 
-    # Convert text to sequences of integer indices
-    positive_sequences = tokenizer.texts_to_sequences(positive_labelled_info['text'])
-    unlabelled_sequences = tokenizer.texts_to_sequences(unlabelled_labelled['text'])
-
-    # Pad or truncate sequences to a fixed length
-    max_length = 50 # Adjust as needed
-    padded_positive_sequences = pad_sequences(positive_sequences, maxlen=max_length, padding='post', truncating='post')
-    padded_unlabelled_sequences = pad_sequences(unlabelled_sequences, maxlen=max_length, padding='post', truncating='post')
-
-    # Compute centroid of positive examples' word embeddings
-    centroid = np.mean([word2vec_model.wv[token] for token in tokenizer.index_word.values() if token in word2vec_model.wv], axis=0)
+    # Compute centroid of positive examples' embeddings
+    centroid = np.mean(positive_labelled_info['embedding'].tolist(), axis=0)
 
     # Compute similarity scores for each document in the unlabelled labelled data
     similarity_scores = []
-    for sequence in padded_unlabelled_sequences:
-        embeddings = [word2vec_model.wv[token] for token in sequence if token in word2vec_model.wv]
-        if len(embeddings) > 0:
-            similarity_score = cosine_similarity([centroid], [np.mean(embeddings, axis=0)])[0][0]
-        else:
-            similarity_score = 0 # Default similarity score for empty embeddings
+    for input_ids in unlabelled_labelled['input_ids']:
+        embedding = model(torch.tensor([input_ids]))[0].detach().numpy()
+        similarity_score = cosine_similarity([centroid], [embedding])[0][0]
         similarity_scores.append(similarity_score)
 
     # Add similarity scores to unlabelled labelled data
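
Note on the new scoring path: model(torch.tensor([x]))[0] returns the model's last_hidden_state of shape (1, seq_len, 384), i.e. one vector per token rather than per document, so the np.mean centroid and the cosine_similarity call operate on ragged, unpooled outputs. Below is a minimal runnable sketch of the same centroid-plus-cosine-similarity idea, assuming attention-mask mean pooling (the usual pooling recipe for all-MiniLM-L6-v2) and assuming the positive_labelled_info and unlabelled_labelled DataFrames and n from recommend_papers are already in scope; the embed_texts helper and the batch size are illustrative, not part of the commit.

import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model.eval()

def embed_texts(texts, batch_size=32):
    # Hypothetical helper: one mean-pooled embedding per text, padding tokens masked out.
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            enc = tokenizer(texts[start:start + batch_size], padding=True,
                            truncation=True, return_tensors="pt")
            hidden = model(**enc).last_hidden_state                # (batch, seq_len, 384)
            mask = enc["attention_mask"].unsqueeze(-1).float()     # zero out padding positions
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1)  # mean over real tokens only
            chunks.append(pooled.cpu().numpy())
    return np.vstack(chunks)

# Centroid of the positive papers, then cosine similarity to every unlabelled paper.
positive_emb = embed_texts(positive_labelled_info['text'].tolist())
unlabelled_emb = embed_texts(unlabelled_labelled['text'].tolist())
centroid = positive_emb.mean(axis=0, keepdims=True)                # (1, 384)
unlabelled_labelled['similarity'] = cosine_similarity(centroid, unlabelled_emb)[0]
top_n = unlabelled_labelled.sort_values('similarity', ascending=False).head(n)  # n as in recommend_papers

Pooling each document to a single 384-dimensional vector keeps the centroid and the score computation as plain 2-D arrays, which is the input shape sklearn's cosine_similarity expects.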