# ssk / Cosine_distance.py
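"""Recommend unlabelled papers by cosine similarity to positive examples.

Each paper's title and abstract are embedded with the
sentence-transformers/all-MiniLM-L6-v2 model; the unlabelled papers are
then ranked by cosine similarity to the centroid of the positively
labelled papers' embeddings.
"""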
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch


def recommend_papers(positive_csv_path, unlabelled_csv_path, n):
    # Load the positively labelled data and the unlabelled data
    positive_labelled_info = pd.read_csv(positive_csv_path)
    unlabelled = pd.read_csv(unlabelled_csv_path)

    # Combine title and abstract into a single text field for each paper
    positive_labelled_info['text'] = positive_labelled_info['title'] + ' ' + positive_labelled_info['abstract']
    unlabelled['text'] = unlabelled['title'] + ' ' + unlabelled['abstract']

    # Load the pre-trained Sentence Transformer tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model.eval()

    def embed(text):
        # Tokenize with truncation so long abstracts fit the model's input
        # limit, then mean-pool the token embeddings (ignoring padding) to
        # obtain a single fixed-size sentence embedding.
        inputs = tokenizer(text, truncation=True, return_tensors="pt")
        with torch.no_grad():
            token_embeddings = model(**inputs).last_hidden_state
        mask = inputs['attention_mask'].unsqueeze(-1).float()
        pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
        return pooled.squeeze(0).numpy()

    # Compute the centroid of the positive examples' embeddings
    positive_embeddings = np.vstack([embed(t) for t in positive_labelled_info['text']])
    centroid = positive_embeddings.mean(axis=0)

    # Score each unlabelled document by cosine similarity to the centroid
    similarity_scores = [
        cosine_similarity([centroid], [embed(text)])[0][0]
        for text in unlabelled['text']
    ]

    # Attach the scores and sort in descending order of similarity
    unlabelled['similarity_score'] = similarity_scores
    recommended_documents = unlabelled.sort_values(by='similarity_score', ascending=False)

    # Return the top n recommended documents
    return recommended_documents[['id', 'title', 'similarity_score']].head(n)
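

# A minimal usage sketch. The CSV file names below are hypothetical
# placeholders; the inputs are assumed to have 'id', 'title', and
# 'abstract' columns, as the function requires.
if __name__ == "__main__":
    top_papers = recommend_papers(
        positive_csv_path="positive_labelled.csv",
        unlabelled_csv_path="unlabelled.csv",
        n=10,
    )
    print(top_papers)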