Upload 3 files
Browse files- app.py +52 -0
- oneclass.py +34 -0
- requirements.txt +2 -0
app.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import joblib
|
3 |
+
import pandas as pd
|
4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
+
from sklearn.svm import OneClassSVM
|
6 |
+
|
7 |
+
# Restore the persisted one-class SVM and the vectorizer that was saved for it.
# Both artifacts are read from the working directory, model first.
model, vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in ('one_class_svm_model.pkl', 'tfidf_vectorizer.pkl')
)
|
10 |
+
|
11 |
+
# Define a function for making predictions
|
12 |
+
def _combine_title_abstract(frame):
    """Join each row's ``title`` and ``abstract`` into one text document.

    Missing values are replaced with empty strings first; otherwise a NaN on
    either side turns the whole concatenation into NaN, which the text
    vectorizer cannot process.
    """
    title = frame['title'].fillna('').astype(str)
    abstract = frame['abstract'].fillna('').astype(str)
    return title + ' ' + abstract


def predict(n=5, retrain=False, positive_labelled_file=None, unlabelled_labelled_file=None):
    """Return the ``n`` unlabelled papers the model rates as most positive.

    Args:
        n: Maximum number of papers to return.
        retrain: When true and ``positive_labelled_file`` is given, refit the
            module-level ``model`` on that data before scoring. When false,
            the model is used as loaded.
        positive_labelled_file: CSV path or file-like object with ``title``
            and ``abstract`` columns. Only read when retraining.
        unlabelled_labelled_file: CSV path or file-like object with ``id``,
            ``title`` and ``abstract`` columns to score.

    Returns:
        A DataFrame with columns ``id`` and ``title`` holding at most ``n``
        rows predicted as inliers, ordered from highest to lowest decision
        score. ``None`` when no unlabelled file is supplied.
    """
    if unlabelled_labelled_file is None:
        # Nothing to score; keep the historical "no result" return value.
        return None

    unlabelled = pd.read_csv(unlabelled_labelled_file)
    if unlabelled.empty:
        # The estimator raises on zero samples, so answer with an empty
        # selection instead (no refit happens in this case either).
        return unlabelled[['id', 'title']]

    # Retraining is optional: scoring below must run either way so the
    # already-loaded model is usable without ticking "retrain".
    if retrain and positive_labelled_file is not None:
        positives = pd.read_csv(positive_labelled_file)
        model.fit(vectorizer.transform(_combine_title_abstract(positives)))

    features = vectorizer.transform(_combine_title_abstract(unlabelled))
    accepted = model.predict(features) == 1

    # Rank the accepted rows by decision score so "top n" means the most
    # confident inliers rather than the first n rows of the file. The stable
    # sort keeps file order between equal scores.
    scores = pd.Series(model.decision_function(features).ravel(), index=unlabelled.index)
    ranked = scores[accepted].sort_values(ascending=False, kind='stable')
    return unlabelled.loc[ranked.index, ['id', 'title']].head(n)
|
39 |
+
|
40 |
+
# Input widgets, created in display order: result count, retrain toggle,
# then the two CSV uploads.
n_input = st.slider("Top N papers to return:", min_value=1, max_value=20, value=5)
retrain_input = st.checkbox("Retrain model?")
positive_labelled_file = st.file_uploader("Upload Positive Labelled Data:", type=['csv'])
unlabelled_labelled_file = st.file_uploader("Upload Unlabelled Labelled Data:", type=['csv'])

# Prompt for the uploads until both files are present; once they are, run
# the prediction and render whatever it returns.
uploads = (positive_labelled_file, unlabelled_labelled_file)
if any(upload is None for upload in uploads):
    st.info("Please upload the CSV files.")
else:
    st.write(
        predict(
            n_input,
            retrain_input,
            positive_labelled_file,
            unlabelled_labelled_file,
        )
    )
|
oneclass.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
3 |
+
from sklearn.svm import OneClassSVM
|
4 |
+
|
5 |
+
def _combine_title_abstract(frame):
    """Join each row's ``title`` and ``abstract`` into one text document.

    Missing values are replaced with empty strings first; otherwise a NaN on
    either side turns the whole concatenation into NaN, which the TF-IDF
    vectorizer cannot process.
    """
    title = frame['title'].fillna('').astype(str)
    abstract = frame['abstract'].fillna('').astype(str)
    return title + ' ' + abstract


def select_top_n_papers(n, positive_csv_file, unlabelled_csv_file):
    """Train a one-class SVM on positive papers and pick the best unlabelled ones.

    Args:
        n: Maximum number of papers to return.
        positive_csv_file: CSV path or file-like object with ``title`` and
            ``abstract`` columns; used to fit the vectorizer and the model.
        unlabelled_csv_file: CSV path or file-like object with ``id``,
            ``title``, ``abstract`` and ``date`` columns to score.

    Returns:
        A DataFrame with columns ``id``, ``title`` and ``date`` holding at
        most ``n`` rows predicted as inliers, ordered from highest to lowest
        decision score. Empty when there is nothing to score.
    """
    positives = pd.read_csv(positive_csv_file)
    unlabelled = pd.read_csv(unlabelled_csv_file)
    if unlabelled.empty:
        # The estimator raises on zero samples, so answer with an empty
        # selection instead of training a model that would never be used.
        return unlabelled[['id', 'title', 'date']]

    # The vocabulary is learned from the positive examples only.
    vectorizer = TfidfVectorizer()
    positive_features = vectorizer.fit_transform(_combine_title_abstract(positives))

    clf = OneClassSVM(kernel='rbf', nu=0.7)  # Adjust parameters as needed
    clf.fit(positive_features)

    # Unlabelled text must go through the same fitted vectorizer.
    unlabelled_features = vectorizer.transform(_combine_title_abstract(unlabelled))
    accepted = clf.predict(unlabelled_features) == 1

    # Rank the accepted rows by decision score so "top n" means the most
    # confident inliers rather than the first n rows of the file. The stable
    # sort keeps file order between equal scores.
    scores = pd.Series(
        clf.decision_function(unlabelled_features).ravel(),
        index=unlabelled.index,
    )
    ranked = scores[accepted].sort_values(ascending=False, kind='stable')
    return unlabelled.loc[ranked.index, ['id', 'title', 'date']].head(n)
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
joblib
pandas
scikit-learn
streamlit
|