# NOTE: the original paste carried Hugging Face Spaces page chrome here
# ("Spaces:" header and two "Runtime error" banners) — scrape residue, not source code.
import gradio as gr | |
import pandas as pd | |
import pickle | |
import os | |
from docx import Document | |
from docx.shared import Inches | |
from docx.dml.color import ColorFormat | |
import sklearn | |
from lightgbm import LGBMClassifier | |
import numpy as np | |
import pandas as pd | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold | |
from imblearn.under_sampling import RandomUnderSampler | |
from sklearn.preprocessing import MinMaxScaler | |
from imblearn.over_sampling import SMOTE, BorderlineSMOTE | |
from imblearn.pipeline import Pipeline as imbpipeline | |
from sklearn.pipeline import Pipeline | |
from sklearn.model_selection import cross_val_score, cross_val_predict | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn import model_selection | |
from sklearn.neural_network import MLPClassifier | |
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier,GradientBoostingClassifier, VotingClassifier | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.svm import SVC | |
from sklearn.metrics import confusion_matrix | |
from sklearn.feature_selection import SequentialFeatureSelector | |
from sklearn.model_selection import GridSearchCV, StratifiedKFold | |
import docx | |
from docx.enum.dml import MSO_THEME_COLOR_INDEX | |
def add_hyperlink(paragraph, text, url):
    """Append a clickable hyperlink run to *paragraph* and return its w:hyperlink element.

    The URL is registered as an external relationship on the document part,
    and the link look (theme colour + underline) is applied by hand because
    the template may not define a Hyperlink character style.
    """
    # Register the external target in document.xml.rels and get its relation id.
    rel_id = paragraph.part.relate_to(
        url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True
    )

    # Build <w:hyperlink r:id="..."> wrapping a fresh <w:r> with an empty <w:rPr>.
    link_el = docx.oxml.shared.OxmlElement('w:hyperlink')
    link_el.set(docx.oxml.shared.qn('r:id'), rel_id)

    run_el = docx.oxml.shared.OxmlElement('w:r')
    run_props = docx.oxml.shared.OxmlElement('w:rPr')
    run_el.append(run_props)
    run_el.text = text
    link_el.append(run_el)

    # Attach the raw XML under a real Run so python-docx tracks it, then fake
    # the hyperlink styling manually (workaround for templates without the
    # Hyperlink style; the link won't turn purple after being followed).
    holder = paragraph.add_run()
    holder._r.append(link_el)
    holder.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    holder.font.underline = True
    return link_el
def savedoc(document, name):
    """Remove empty paragraphs from *document*, then save it to *name*.

    The report builders insert single-space spacer paragraphs deliberately;
    only paragraphs whose text is truly empty are stripped.
    """
    def delete_paragraph(paragraph):
        # Detach the underlying XML element from its parent node.
        p = paragraph._element
        p.getparent().remove(p)
        # Bug fix: the dead references must be cleared on the *paragraph*
        # proxy object — the original cleared them on the lxml element `p`,
        # which rejects arbitrary attribute assignment (AttributeError).
        paragraph._p = paragraph._element = None

    for para in document.paragraphs:
        # (`para.text != ' '` in the original was redundant: an empty string
        # can never equal a single space.)
        if para.text == '':
            delete_paragraph(para)
    document.save(name)
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, cohen_kappa_score, f1_score, recall_score, precision_score | |
def measures(predicted, y_test):
    """Return the classification accuracy of *predicted* against *y_test*.

    Parameters:
        predicted: array-like of predicted class labels.
        y_test: array-like of ground-truth class labels.

    Returns:
        float accuracy in [0, 1].

    Note: the original also computed precision, recall, F1 and the confusion
    matrix but discarded them; that dead work is removed here. Callers only
    ever consume the accuracy value.
    """
    return accuracy_score(y_test, predicted)
def greet(operation,filer):
    """Gradio handler for the course-drop prediction Space.

    Parameters:
        operation: "retrain" to rebuild and persist models from a labelled
            CSV; anything else (normally "predict") to predict drops for an
            unlabelled CSV using the newest saved model.
        filer: uploaded file object from the Gradio "file" input (its .name
            is the temp path), or None when nothing was submitted.

    Returns:
        (report_docx_path_or_None, log_text) matching the interface outputs.
        Any exception is caught and surfaced as the log text.
    """
    try:
        # NOTE(review): `filer is None` would be the idiomatic identity check.
        if filer == None:
            return None,"Invalid file submitted"
        import os
        coset = pd.read_csv(filer.name)
        # Drop any record with a missing field outright.
        coset = coset.dropna(how='any')
        # 'temp.docx' is the report template bundled with the Space — TODO confirm.
        document = Document('temp.docx')
        allowedcols = ['SID', 'TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'GRADE', 'CGPA', 'PROGRAM', 'PROGRAM.1']
        if operation == "retrain":
            # Training files carry no student-id column: drop 'SID' from the whitelist.
            allowedcols = allowedcols[1:]
            for col in coset.columns:
                if col not in allowedcols:
                    return None,str(col)+" is undefined column name, allowed columns for training are "+str(allowedcols)
            wanted = coset#.drop(columns=['SUBJECT','SID','CRSE_ID','COURSE','ROLE','GPA','INPUT','STATUS','GRADUATION TERM','CLASS #','COLLEGE','COLLEGE.1'])

            def termize(x):
                # Encode semester from the last digit of the term code:
                # ...1 -> 0, ...2 -> 1, anything else (e.g. summer) -> 2.
                if str(x)[-1] == "1":
                    return 0
                elif str(x)[-1] == "2":
                    return 1
                else:
                    return 2

            def shorten_major(x):
                # Collapse the verbose program title to a short major label.
                # NOTE(review): falls through to None for unmatched titles,
                # which numberize() would then happily encode — confirm inputs.
                if "Computer Science" in x:
                    return "CS"
                elif "Computer Information" in x:
                    return "CIS"
                elif "Artificial" in x:
                    return "AI"
                elif "Cyber" in x:
                    return "CYS"

            def binarize_grade(y):
                # Target label: 1 = withdrawal ('W' grades), 0 = completed,
                # -1 = unusable record (transfer/denied/no-pass/in-progress),
                # filtered out below.
                todrop = ['TR','DN','NP','IP']
                for element in todrop:
                    if element in y:
                        return -1
                if 'W' in y:
                    return 1
                else:
                    return 0

            wanted['PROGRAM.1'] = wanted['PROGRAM.1'].apply(shorten_major)
            wanted['GRADE'] = wanted['GRADE'].apply(binarize_grade)
            wanted['TERM'] = wanted['TERM'].apply(termize)
            # Remove rows flagged -1 (unusable grades).
            deleteRow = wanted[wanted['GRADE'] == -1].index
            wanted.drop(deleteRow, inplace=True)

            # Label-encode the categorical columns by order of first
            # appearance; these lists double as the id -> value lookup
            # tables persisted to "<n>=values.csv" for the predict path.
            majors = []
            catalog = []
            acad_prog = []
            instructor = []

            def numberize(y):
                if y not in majors:
                    majors.append(y)
                    return majors.index(y)
                else:
                    return majors.index(y)

            def catalogize(z):
                if z not in catalog:
                    catalog.append(z)
                    return catalog.index(z)
                else:
                    return catalog.index(z)

            def acadize(w):
                if w not in acad_prog:
                    acad_prog.append(w)
                    return acad_prog.index(w)
                else:
                    return acad_prog.index(w)

            def instructerize(w):
                if w not in instructor:
                    instructor.append(w)
                    return instructor.index(w)
                else:
                    return instructor.index(w)

            def removestring(w):
                # Strips a trailing character from alphanumeric codes.
                # NOTE(review): defined but never called in this version.
                if any(c.isalpha() for c in w):
                    return w[:-1]
                else:
                    return w

            wanted['PROGRAM.1'] = wanted['PROGRAM.1'].apply(numberize)
            wanted['CATALOG_NBR'] = wanted['CATALOG_NBR'].apply(catalogize)
            wanted['PROGRAM'] = wanted['PROGRAM'].apply(acadize)
            wanted['INSTRUCTOR_ID'] = wanted['INSTRUCTOR_ID'].apply(instructerize)

            document.add_paragraph(' ')
            document.add_heading('Retraining report', 0)
            document.add_paragraph('This report consists of the models retraining information on the new dataset with ('+str(len(coset))+') records')
            records = []
            X = wanted.drop(columns=['GRADE'])
            y = wanted['GRADE']
            # Oversample the minority class (withdrawals) before training.
            smote = BorderlineSMOTE(random_state = 11)
            X_smote, y_smote = smote.fit_resample(X, y)
            kf = StratifiedKFold(n_splits=10)
            # Candidates: three base learners plus a voting ensemble of the same three.
            models1 = [KNeighborsClassifier(leaf_size=10,metric='manhattan'),
                       RandomForestClassifier(max_depth=100),
                       LGBMClassifier(n_estimators=200, num_leaves=60),
                       VotingClassifier(estimators=[('knn',
                                                     KNeighborsClassifier(leaf_size=10,
                                                                          metric='manhattan')),
                                                    ('rf', RandomForestClassifier(max_depth=100)),('gm',LGBMClassifier(n_estimators=200, num_leaves=60))])]
            metrics = dict()
            for model in models1:
                # Fit on the full resampled set (this fitted object is what
                # gets pickled), then score via 10-fold cross-validated predictions.
                model.fit(X_smote,y_smote)
                preds = cross_val_predict(model, X_smote.values,y_smote.values, cv=kf, n_jobs=-1,);
                metrics[model] = measures(preds,y_smote.values)
                records.append(((str(type(model).__name__),str(metrics[model]))))
            document.add_paragraph(' ')
            records = tuple(records)
            # Accuracy summary table: one row per candidate model.
            table = document.add_table(rows=1, cols=2)
            hdr_cells = table.rows[0].cells
            hdr_cells[0].text = 'Name'
            hdr_cells[1].text = 'Accuracy'
            for ind,qty in records:
                paragraph = document.add_paragraph()
                row_cells = table.add_row().cells
                row_cells[0].text = str(ind)
                row_cells[1].text = str(qty)
            table.style = 'TableGrid'
            # Saved models are named "<n>=<ModelName>=<accuracy>.sav"; find the
            # highest existing <n> so the new model gets the next version number.
            dir_name = str(os.getcwd())
            test = os.listdir(dir_name)
            number = 0
            for item in test:
                if item.endswith(".sav") and int(item.split("=")[0]) >= number:
                    number = int(item.split("=")[0])
                    #os.remove(os.path.join(dir_name, item))
            # Keep the best model by cross-validated accuracy.
            acc = metrics[max(metrics, key=metrics.get)]
            model = max(metrics, key=metrics.get)
            number = number + 1
            filename = str(number)+"="+type(model).__name__+'='+str(acc)+'.sav'
            # Persist the encoding tables alongside the model so the predict
            # path can map raw CSV values to the ids used during training.
            datavalues = {"majors":str(majors),
                          'acad_prog':str(acad_prog),
                          'catalog':str(catalog),
                          'instructor':str(instructor)
                          }
            dfv = pd.DataFrame(datavalues,index=[0])
            dfv.to_csv(str(number)+"="+"values.csv")
            document.add_paragraph(" ")
            document.add_paragraph(type(model).__name__+' has been chosen as the prediction model for achieving an accuracy of '+str(acc)+'%')
            pickle.dump(model, open(filename, 'wb'))
            document.add_paragraph(" ")
            p = document.add_paragraph('For more like this contact us at ')
            add_hyperlink(p, 'contact@mustafasa.com', "contact@mustafasa.com")
            savedoc(document,'retraining_report.docx')
            #document.save('retraining_report.docx')
            return 'retraining_report.docx',str(type(model).__name__+' has been chosen as the prediction model for achieving an accuracy of '+str(round(acc*100,2))+'%')

        # ---- predict path (any operation other than "retrain") ----
        # Prediction input must not carry GRADE — that is what gets predicted.
        allowedcols.remove('GRADE')
        for col in coset.columns:
            if col not in allowedcols:
                return None,str(col)+" is undefined column name, allowed columns for prediction are "+str(allowedcols)
        majors = []
        catalog = []
        acad_prog = []
        instructor = []
        # Locate the saved model with the highest version prefix.
        dir_name = str(os.getcwd())
        test = os.listdir(dir_name)
        modelname = ""
        maxnum = 0
        for item in test:
            if item.endswith(".sav") and int(item.split("=")[0]) > maxnum:
                maxnum = int(item.split("=")[0])
                modelname = item
        if maxnum == 0:
            return None,"No model found, please use retrain operation to build one"
        # Reload the encoding tables written at training time. Each cell holds
        # a stringified Python list; parse it back into the matching list.
        # (The first CSV column is the unnamed index, which contains no "[",
        # so indexc only advances on real list cells.)
        dfv = pd.read_csv(str(maxnum)+"=values.csv")
        cols = [majors,acad_prog,catalog,instructor]
        indexc = 0
        for column in dfv.columns:
            if "[" in str(dfv[column][0]):
                l = dfv[column][0].replace("'",'')
                cols[indexc][:] = str(l).strip('][').split(', ')
                for i,e in enumerate(cols[indexc]):
                    cols[indexc][i] = e.replace(' ','')
                print(cols[indexc])
                indexc = indexc + 1
        #modelname = "VotingClassifier=0.95756598831352.sav"
        # pickle.load of a local .sav this app itself wrote earlier.
        loaded_model = pickle.load(open(modelname, 'rb'))
        droppers = 0
        total = 0
        document.add_paragraph(' ')
        document.add_heading('Subjects drop prediction report', 0)
        document.add_paragraph('This report consists of students who might potentially drop courses they currently are studying based on the supplied information')
        records = []
        for row in coset.iterrows():
            # iterrows() yields (index, Series); keep the Series.
            row = list(row)[1]
            semester = 1
            row['CATALOG_NBR'] = str(row['CATALOG_NBR']).replace(' ', '')
            row['TERM'] = str(row['TERM'])
            # NOTE(review): row['TERM'][-1] is a one-character string, so the
            # comparisons with the ints 2 and 5 are always False and semester
            # stays 1. Training (termize) also encodes terms as 0/1/2, not
            # 1/2/3 — looks like a bug; confirm the intended encoding.
            if row['TERM'][-1] == 2:
                semester = 2
            elif row['TERM'][-1] == 5:
                semester = 3
            c_id = catalog.index(str(row['CATALOG_NBR']))
            in_id = instructor.index(str(row['INSTRUCTOR_ID']))
            p_id = acad_prog.index(row['PROGRAM'])
            # NOTE(review): major ids are hard-coded here, while training
            # assigned them by first-appearance order (numberize); the saved
            # `majors` list would give the authoritative mapping — verify.
            major = 0
            x = row['PROGRAM.1']
            if "Computer Science" in x:
                major = 0
            elif "Computer Information" in x:
                major = 1
            elif "Artificial" in x:
                major = 3
            elif "Cyber" in x:
                major = 2
            gpa = row['CGPA']
            # Feature order matches training X: TERM, CATALOG_NBR,
            # INSTRUCTOR_ID, CGPA, PROGRAM, PROGRAM.1.
            prediction = loaded_model.predict([[semester,c_id,in_id,gpa,p_id,major]])[0]
            total = total + 1
            records.append((str(total),str(row['SID']),str(row['TERM']),str(row['CATALOG_NBR']),str(row['INSTRUCTOR_ID']),str(row['CGPA']),str(row['PROGRAM']),str(row['PROGRAM.1']),str(prediction)))
            if prediction == 1:
                droppers = droppers + 1
        document.add_paragraph(' ')
        records = tuple(records)
        # Per-record prediction table for the report.
        table = document.add_table(rows=1, cols=9)
        hdr_cells = table.rows[0].cells
        hdr_cells[0].text = 'Index'
        hdr_cells[1].text = 'Student ID'
        hdr_cells[2].text = 'Term'
        hdr_cells[3].text = 'Catalog ID'
        hdr_cells[4].text = 'Instructor ID'
        hdr_cells[5].text = 'Cummulative GPA'
        hdr_cells[6].text = 'Academic Program'
        hdr_cells[7].text = 'Major'
        hdr_cells[8].text = 'Possible Drop Prediction'
        for ind,qty, id1, desc, inst, cgpa,aprog,maj,pred in records:
            paragraph = document.add_paragraph()
            row_cells = table.add_row().cells
            row_cells[0].text = ind
            row_cells[1].text = str(qty)
            row_cells[2].text = id1
            row_cells[3].text = desc
            row_cells[4].text = inst
            row_cells[5].text = cgpa
            row_cells[6].text = aprog
            row_cells[7].text = maj
            # Render the 0/1 label as a human-readable Yes/No.
            if pred == "1":
                pred = "Yes"
            else:
                pred = "No"
            row_cells[8].text = pred
        table.style = 'TableGrid'
        #document.add_page_break()
        document.add_paragraph(" ")
        # modelname pieces after split: [version, ModelName, "<accuracy>.sav"];
        # the accuracy is parsed from the first six characters of the tail.
        modelname = modelname.split("=")
        lastpara = 'Out of '+str(total)+' records, it is predicted that '+str(droppers)+' courses might be withdrawn from (Prediction model name:'+modelname[1]+'/Accuracy: '+str(float(modelname[2][0:6])*100)+'%)'
        document.add_paragraph(lastpara)
        savedoc(document,'drop_prediction_report.docx')
        #document.save('drop_prediction_report.docx')
        return 'drop_prediction_report.docx', lastpara+" (Model no."+modelname[0]+")"
    except Exception as e:
        # Surface any failure in the UI log box instead of crashing the Space.
        return None,str(e)
# Wire the handler into the UI: a radio choosing the operation plus a file
# upload in, the generated .docx report plus a log line out.
operation_selector = gr.Radio(["predict", 'retrain'], value="predict")
report_output = gr.File(label='Report generated')
log_output = gr.Text(label='Log')
iface = gr.Interface(
    fn=greet,
    inputs=[operation_selector, "file"],
    outputs=[report_output, log_output],
    debug=True,
)
iface.launch()