import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM


def _combine_title_and_abstract(frame):
    """Return one text string per row: ``title + ' ' + abstract``.

    Missing titles or abstracts are treated as empty strings so that a
    single absent field does not turn the whole document into NaN, which
    the TF-IDF vectorizer refuses to process.
    """
    title = frame['title'].fillna('').astype(str)
    abstract = frame['abstract'].fillna('').astype(str)
    return title + ' ' + abstract


def select_top_n_papers(n, positive_csv_file, unlabelled_csv_file):
    """Select the ``n`` unlabelled papers that look most like the positives.

    A TF-IDF vocabulary and a one-class SVM are fitted on the positive
    papers only. Each unlabelled paper is then scored by the SVM; papers
    predicted as inliers (+1) are ranked by decision-function score, highest
    first, and the best ``n`` are returned.

    Args:
        n: Maximum number of papers to return.
        positive_csv_file: Path (or buffer) of a CSV holding the positive
            examples; must contain ``title`` and ``abstract`` columns.
        unlabelled_csv_file: Path (or buffer) of a CSV holding the candidate
            papers; must contain ``id``, ``title``, ``abstract`` and ``date``
            columns.

    Returns:
        A DataFrame with the columns ``id``, ``title`` and ``date`` for the
        selected papers, ordered from most to least similar to the positive
        set. Fewer than ``n`` rows are returned when fewer papers are
        predicted positive.
    """
    # Load the positive labelled and unlabelled data
    positive_papers = pd.read_csv(positive_csv_file)
    unlabelled_papers = pd.read_csv(unlabelled_csv_file)

    output_columns = ['id', 'title', 'date']

    # Nothing to score: the SVM cannot predict on a zero-row matrix.
    if unlabelled_papers.empty:
        return unlabelled_papers[output_columns]

    # Combine title and abstract for both datasets
    positive_text = _combine_title_and_abstract(positive_papers)
    unlabelled_text = _combine_title_and_abstract(unlabelled_papers)

    # Train a TF-IDF vectorizer on the positive labelled data only, so the
    # feature space reflects the vocabulary of known-relevant papers.
    vectorizer = TfidfVectorizer()
    X_pos = vectorizer.fit_transform(positive_text)

    # Train a one-class SVM model
    clf = OneClassSVM(kernel='rbf', nu=0.7)  # Adjust parameters as needed
    clf.fit(X_pos)

    # Transform unlabelled data using the same vectorizer
    X_unlabelled = vectorizer.transform(unlabelled_text)

    # Predict inlier/outlier and obtain a continuous score for ranking.
    predictions = clf.predict(X_unlabelled)
    scores = pd.Series(
        clf.decision_function(X_unlabelled),
        index=unlabelled_papers.index,
    )

    # Keep only papers predicted positive, then rank them by score. A stable
    # sort keeps the original file order among papers with equal scores.
    positive_scores = scores[predictions == 1]
    ranked_index = positive_scores.sort_values(
        ascending=False, kind='mergesort'
    ).head(n).index

    # Return IDs, titles and dates of the selected papers
    return unlabelled_papers.loc[ranked_index, output_columns]