ssk3232 committed on
Commit
9439064
1 Parent(s): fba5923

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +52 -0
  2. oneclass.py +34 -0
  3. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import joblib
3
+ import pandas as pd
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.svm import OneClassSVM
6
+
7
# Restore the persisted one-class SVM and the TF-IDF vectorizer saved with it.
# NOTE: joblib.load unpickles the files, so only trusted artifacts belong here.
MODEL_PATH = 'one_class_svm_model.pkl'
VECTORIZER_PATH = 'tfidf_vectorizer.pkl'

model = joblib.load(MODEL_PATH)
vectorizer = joblib.load(VECTORIZER_PATH)
10
+
11
+ # Define a function for making predictions
12
def predict(n=5, retrain=False, positive_labelled_file=None, unlabelled_labelled_file=None):
    """Return up to ``n`` unlabelled papers that the model classifies as positive.

    Uses the module-level ``model`` and ``vectorizer``. The unlabelled data is
    always scored; the model is refitted first only when ``retrain`` is true
    and positive examples are supplied.

    Args:
        n: Maximum number of rows to return.
        retrain: Refit ``model`` on the positive examples before predicting.
        positive_labelled_file: CSV path or file-like object with ``title``
            and ``abstract`` columns. Only read when retraining.
        unlabelled_labelled_file: CSV path or file-like object with ``id``,
            ``title`` and ``abstract`` columns.

    Returns:
        A DataFrame with the ``id`` and ``title`` of the first ``n`` rows (in
        file order) predicted as ``1``, or ``None`` when no unlabelled file
        is given.
    """
    if unlabelled_labelled_file is None:
        return None

    unlabelled = pd.read_csv(unlabelled_labelled_file)
    # A missing title or abstract would turn the concatenation into NaN, which
    # the vectorizer rejects; treat missing fields as empty strings instead.
    unlabelled_text = unlabelled['title'].fillna('') + ' ' + unlabelled['abstract'].fillna('')

    if retrain and positive_labelled_file is not None:
        positives = pd.read_csv(positive_labelled_file)
        positive_text = positives['title'].fillna('') + ' ' + positives['abstract'].fillna('')
        # Reuse the already-fitted vectorizer so the feature space is unchanged.
        model.fit(vectorizer.transform(positive_text))

    predictions = model.predict(vectorizer.transform(unlabelled_text))

    # Keep only rows labelled 1; rows stay in file order and are not ranked
    # by decision score.
    return unlabelled.loc[predictions == 1, ['id', 'title']].head(n)
39
+
40
# Input widgets: result count, retrain toggle and the two CSV uploads
n_input = st.slider("Top N papers to return:", min_value=1, max_value=20, value=5)
retrain_input = st.checkbox("Retrain model?")
positive_labelled_file = st.file_uploader("Upload Positive Labelled Data:", type=['csv'])
unlabelled_labelled_file = st.file_uploader("Upload Unlabelled Labelled Data:", type=['csv'])

# Nothing is computed until both CSV files have been uploaded
if positive_labelled_file is None or unlabelled_labelled_file is None:
    st.info("Please upload the CSV files.")
else:
    # Run the prediction on the uploaded data and show the selected papers
    result = predict(
        n=n_input,
        retrain=retrain_input,
        positive_labelled_file=positive_labelled_file,
        unlabelled_labelled_file=unlabelled_labelled_file,
    )
    st.write(result)
oneclass.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.svm import OneClassSVM
4
+
5
def select_top_n_papers(n, positive_csv_file, unlabelled_csv_file, *, nu=0.7, kernel='rbf'):
    """Train a one-class SVM on positive papers and pick positives from unlabelled ones.

    A TF-IDF vectorizer is fitted on the combined title and abstract of the
    positive examples. The same vectorizer then transforms the unlabelled
    papers before they are classified.

    Args:
        n: Maximum number of rows to return.
        positive_csv_file: CSV path or file-like object with ``title`` and
            ``abstract`` columns.
        unlabelled_csv_file: CSV path or file-like object with ``id``,
            ``title``, ``abstract`` and ``date`` columns.
        nu: ``nu`` parameter passed to ``OneClassSVM``.
        kernel: ``kernel`` parameter passed to ``OneClassSVM``.

    Returns:
        A DataFrame with the ``id``, ``title`` and ``date`` of the first ``n``
        unlabelled rows (in file order) predicted as ``1``.
    """
    positives = pd.read_csv(positive_csv_file)
    unlabelled = pd.read_csv(unlabelled_csv_file)

    # A missing title or abstract would turn the concatenation into NaN, which
    # the vectorizer rejects; treat missing fields as empty strings instead.
    positive_text = positives['title'].fillna('') + ' ' + positives['abstract'].fillna('')
    unlabelled_text = unlabelled['title'].fillna('') + ' ' + unlabelled['abstract'].fillna('')

    # The vocabulary is learned from the positive examples only.
    vectorizer = TfidfVectorizer()
    X_pos = vectorizer.fit_transform(positive_text)

    clf = OneClassSVM(kernel=kernel, nu=nu)
    clf.fit(X_pos)

    predictions = clf.predict(vectorizer.transform(unlabelled_text))

    # Keep only rows labelled 1; rows stay in file order and are not ranked
    # by decision score.
    return unlabelled.loc[predictions == 1, ['id', 'title', 'date']].head(n)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ joblib
2
+ scikit-learn