File size: 1,461 Bytes
9439064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM

def _title_and_abstract(frame):
    """Return one text document per row: ``title`` and ``abstract`` joined by a space."""
    # A missing title or abstract would turn the whole concatenation into NaN,
    # which TfidfVectorizer rejects; treat a missing field as an empty string.
    title = frame['title'].fillna('').astype(str)
    abstract = frame['abstract'].fillna('').astype(str)
    return title + ' ' + abstract


def select_top_n_papers(n, positive_csv_file, unlabelled_csv_file):
    """Pick up to ``n`` unlabelled papers that best match the positive set.

    A TF-IDF vocabulary is fitted on the title + abstract text of the
    positive papers, a one-class SVM is trained on those vectors, and the
    unlabelled papers are then scored with the same vectorizer and model.
    Only papers the model predicts as inliers (``predict == 1``) are kept;
    they are ordered by the SVM decision score, highest first, so the
    result really is the "top" ``n`` rather than the first ``n`` in file
    order.

    Args:
        n: Maximum number of papers to return.
        positive_csv_file: CSV input accepted by ``pandas.read_csv``; must
            contain ``title`` and ``abstract`` columns.
        unlabelled_csv_file: CSV input accepted by ``pandas.read_csv``; must
            contain ``id``, ``title``, ``abstract`` and ``date`` columns.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``title`` and
        ``date`` for the selected papers, best score first. It is empty
        when the unlabelled file has no rows or no paper is predicted
        positive.
    """
    output_columns = ['id', 'title', 'date']

    # Load the positive labelled and unlabelled data
    positive_papers = pd.read_csv(positive_csv_file)
    unlabelled_papers = pd.read_csv(unlabelled_csv_file)

    # Nothing to score: scikit-learn refuses zero-sample input, so return an
    # empty frame that still has the expected columns.
    if unlabelled_papers.empty:
        return unlabelled_papers[output_columns].head(0)

    # Fit the TF-IDF vocabulary on the positive papers only, then reuse it
    # for the unlabelled papers so both live in the same feature space.
    vectorizer = TfidfVectorizer()
    X_pos = vectorizer.fit_transform(_title_and_abstract(positive_papers))
    X_unlabelled = vectorizer.transform(_title_and_abstract(unlabelled_papers))

    # Train a one-class SVM model
    clf = OneClassSVM(kernel='rbf', nu=0.7)  # Adjust parameters as needed
    clf.fit(X_pos)

    # Keep the papers predicted positive and attach their decision score so
    # they can be ranked.
    predicted_positive = clf.predict(X_unlabelled) == 1
    scores = clf.decision_function(X_unlabelled)
    candidates = unlabelled_papers.loc[predicted_positive, output_columns]
    ranked = candidates.assign(_score=scores[predicted_positive]).sort_values(
        '_score', ascending=False, kind='stable'  # stable: ties keep file order
    )

    # Return IDs, titles and dates of the n best-scoring papers
    return ranked.drop(columns='_score').head(n)