|
import streamlit as st |
|
import joblib |
|
import pandas as pd |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.svm import OneClassSVM |
|
|
|
|
|
# Load the persisted estimator and its text vectorizer, in that order.
# NOTE: joblib.load unpickles the file contents, so these paths must only ever
# point at artefacts produced by a trusted training run.
model, vectorizer = (
    joblib.load(artefact_path)
    for artefact_path in ('one_class_svm_model.pkl', 'tfidf_vectorizer.pkl')
)
|
|
|
|
|
def _combine_title_abstract(frame):
    """Return one text string per row: the title and abstract joined by a space.

    Missing titles or abstracts are replaced by empty strings first. Without
    that, a single blank cell turns the concatenated text into NaN, which the
    vectorizer cannot tokenise.
    """
    titles = frame['title'].fillna('').astype(str)
    abstracts = frame['abstract'].fillna('').astype(str)
    return titles + ' ' + abstracts


def predict(n=5, retrain=False, positive_labelled_file=None, unlabelled_labelled_file=None):
    """Return the first ``n`` unlabelled papers that the model labels as positive.

    Args:
        n: Maximum number of rows to return.
        retrain: When true and ``positive_labelled_file`` is given, refit the
            module-level ``model`` on the positive examples before scoring.
            Ignored when no positive file is supplied.
        positive_labelled_file: Path or file-like object accepted by
            ``pd.read_csv``; must contain ``title`` and ``abstract`` columns.
        unlabelled_labelled_file: Path or file-like object accepted by
            ``pd.read_csv``; must contain ``id``, ``title`` and ``abstract``
            columns.

    Returns:
        A DataFrame with the ``id`` and ``title`` columns of the selected
        rows, in the order they appear in the unlabelled file (rows are not
        ranked by score). An empty frame with those two columns is returned
        when no unlabelled file is supplied.

    Raises:
        KeyError: If a required column is missing from either CSV.
    """
    # Nothing to score: keep the return type consistent instead of yielding None.
    if unlabelled_labelled_file is None:
        return pd.DataFrame(columns=['id', 'title'])

    # Retraining is optional; scoring below must run either way.
    if retrain and positive_labelled_file is not None:
        positive_labelled_info = pd.read_csv(positive_labelled_file)
        # The vectorizer is only applied here, never refitted, so the feature
        # space stays the one the persisted artefacts were built with.
        X_pos = vectorizer.transform(_combine_title_abstract(positive_labelled_info))
        # NOTE: this refits the shared module-level model in place.
        model.fit(X_pos)

    unlabelled_labelled = pd.read_csv(unlabelled_labelled_file)
    X_unlabelled = vectorizer.transform(_combine_title_abstract(unlabelled_labelled))
    predictions = model.predict(X_unlabelled)

    # Keep only rows labelled +1, then truncate to the requested count.
    positive_indices = predictions == 1
    top_n_positive_papers = unlabelled_labelled.loc[positive_indices].head(n)

    return top_n_positive_papers[['id', 'title']]
|
|
|
|
|
# --- Streamlit controls -------------------------------------------------------
n_input = st.slider("Top N papers to return:", min_value=1, max_value=20, value=5)
retrain_input = st.checkbox("Retrain model?")
positive_labelled_file = st.file_uploader("Upload Positive Labelled Data:", type=['csv'])
unlabelled_labelled_file = st.file_uploader("Upload Unlabelled Labelled Data:", type=['csv'])

# The unlabelled CSV is always required; the positive CSV is only needed when
# the user has asked for retraining.
unlabelled_missing = unlabelled_labelled_file is None
positive_missing = retrain_input and positive_labelled_file is None

if unlabelled_missing or positive_missing:
    st.info("Please upload the CSV files.")
else:
    result = predict(n_input, retrain_input, positive_labelled_file, unlabelled_labelled_file)
    # Guard against a missing or empty result rather than rendering a bare
    # ``None`` or an empty table.
    if result is None or result.empty:
        st.info("No papers to display.")
    else:
        st.write(result)
|
|