ssk / oneclass.py
ssk3232's picture
Update oneclass.py
b43b3fa verified
raw
history blame contribute delete
No virus
1.46 kB
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
def _combine_title_abstract(frame):
    """Return one text string per row: title and abstract joined by a space.

    Missing titles/abstracts are replaced with empty strings first, because
    string + NaN yields NaN in pandas and TfidfVectorizer does not accept
    NaN as a document.
    """
    title = frame['title'].fillna('').astype(str)
    abstract = frame['abstract'].fillna('').astype(str)
    return title + ' ' + abstract


def select_top_n_papers(n: int, positive_csv_file, unlabelled_csv_file,
                        nu: float) -> pd.DataFrame:
    """Pick the n unlabelled papers that look most like the positive examples.

    A TF-IDF vocabulary and a one-class SVM (RBF kernel) are fitted on the
    positive papers only. Unlabelled papers the model predicts as inliers
    (``predict == 1``) are ranked by their decision-function score, highest
    first, and the first ``n`` of that ranking are returned.

    Args:
        n: Number of rows to keep; passed to ``DataFrame.head`` after ranking.
        positive_csv_file: CSV (anything ``pd.read_csv`` accepts) of positive
            papers; must contain ``title`` and ``abstract`` columns.
        unlabelled_csv_file: CSV of candidate papers; must contain ``id``,
            ``title``, ``abstract`` and ``date`` columns.
        nu: ``nu`` parameter forwarded to ``OneClassSVM``.

    Returns:
        DataFrame with the ``id``, ``title`` and ``date`` columns of the
        selected papers, best-scoring first. May hold fewer than ``n`` rows
        (or none) when few papers are predicted positive.
    """
    output_columns = ['id', 'title', 'date']

    positive_papers = pd.read_csv(positive_csv_file)
    unlabelled_papers = pd.read_csv(unlabelled_csv_file)

    # Nothing to score: return an empty selection instead of handing the
    # model a zero-sample matrix, which scikit-learn rejects.
    if unlabelled_papers.empty:
        return unlabelled_papers[output_columns]

    # The vocabulary is learned from the positive papers only; the unlabelled
    # papers are projected onto that same vocabulary below.
    vectorizer = TfidfVectorizer()
    X_pos = vectorizer.fit_transform(_combine_title_abstract(positive_papers))

    clf = OneClassSVM(kernel='rbf', nu=nu)
    clf.fit(X_pos)

    X_unlabelled = vectorizer.transform(
        _combine_title_abstract(unlabelled_papers))

    # Keep the original inlier criterion (predict == 1) for membership ...
    is_positive = clf.predict(X_unlabelled) == 1
    candidates = unlabelled_papers.loc[is_positive]

    # ... but order the candidates by how far inside the learned boundary they
    # lie, so that "top n" means the n best rather than the first n rows of
    # the CSV. The stable sort keeps file order among equal scores.
    scores = clf.decision_function(X_unlabelled).ravel()[is_positive]
    ranking = (
        pd.Series(scores)
        .sort_values(ascending=False, kind='stable')
        .index.to_numpy()
    )

    return candidates.iloc[ranking].head(n)[output_columns]