|
import pandas as pd |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.svm import OneClassSVM |
|
|
|
def _paper_text(papers):
    """Return one 'title abstract' string per row, with missing values as ''."""
    # NaN + str yields NaN, which TfidfVectorizer rejects as an invalid
    # document, so missing fields are replaced before concatenation.
    title = papers['title'].fillna('').astype(str)
    abstract = papers['abstract'].fillna('').astype(str)
    return title + ' ' + abstract


def select_top_n_papers(n: int, positive_csv_file, unlabelled_csv_file,
                        *, kernel='rbf', nu=0.7) -> pd.DataFrame:
    """Select the ``n`` unlabelled papers that best match the positive set.

    A TF-IDF vocabulary is learned from the title + abstract text of the
    positive papers and a one-class SVM is fitted on those vectors. The
    unlabelled papers are vectorized with the same vocabulary; only rows
    the model predicts as inliers (``predict == 1``) are kept, and they
    are ranked by their decision-function value, highest first.

    Args:
        n: Maximum number of papers to return. Values <= 0 give an empty
            result.
        positive_csv_file: CSV of known-positive papers; must contain
            ``title`` and ``abstract`` columns.
        unlabelled_csv_file: CSV of candidate papers; must contain ``id``,
            ``title``, ``abstract`` and ``date`` columns.
        kernel: Kernel passed to ``OneClassSVM``.
        nu: ``nu`` passed to ``OneClassSVM``.

    Returns:
        A DataFrame with the ``id``, ``title`` and ``date`` columns of at
        most ``n`` unlabelled rows, ordered from highest to lowest score.
        It is empty when ``n`` is not positive, when the unlabelled file
        has no rows, or when no row is predicted as an inlier.
    """
    output_columns = ['id', 'title', 'date']

    positive_papers = pd.read_csv(positive_csv_file)
    unlabelled_papers = pd.read_csv(unlabelled_csv_file)

    # Nothing to rank: avoid head() with a negative n (which would return
    # "all but the last |n|" rows) and avoid predict() on zero samples.
    if n <= 0 or unlabelled_papers.empty:
        return unlabelled_papers.head(0)[output_columns]

    # The vocabulary comes from the positive papers only; unlabelled text
    # is projected onto it. Text is kept in separate Series so the loaded
    # frames are not given an extra scratch column.
    vectorizer = TfidfVectorizer()
    X_pos = vectorizer.fit_transform(_paper_text(positive_papers))
    X_unlabelled = vectorizer.transform(_paper_text(unlabelled_papers))

    clf = OneClassSVM(kernel=kernel, nu=nu)
    clf.fit(X_pos)

    is_inlier = clf.predict(X_unlabelled) == 1
    # ravel() in case the installed scikit-learn returns a column vector.
    scores = pd.Series(clf.decision_function(X_unlabelled).ravel())

    # Rank inliers by score; mergesort is stable, so ties keep file order.
    ranked = scores[is_inlier].sort_values(ascending=False, kind='mergesort')
    top_positions = ranked.index[:n]

    return unlabelled_papers.iloc[top_positions][output_columns]