"""Gradio app for course-drop prediction.

Two operations are exposed through a single callback (`greet`):

* ``retrain`` - fit several classifiers on an uploaded CSV, keep the most
  accurate one (pickled to ``<n>=<ModelName>=<accuracy>.sav``) together with
  the label vocabularies (``<n>=values.csv``) and write a Word report.
* ``predict`` - load the most recent saved model, score every row of the
  uploaded CSV and write a Word report listing the predictions.
"""
import os
import pickle

import docx
import gradio as gr
import numpy as np
import pandas as pd
import sklearn
from docx import Document
from docx.dml.color import ColorFormat
from docx.enum.dml import MSO_THEME_COLOR_INDEX
from docx.shared import Inches
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
from sklearn import model_selection
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    cohen_kappa_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Every column the app understands. 'SID' is only accepted for prediction and
# 'GRADE' only for training.
_ALLOWED_COLUMNS = ['SID', 'TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'GRADE',
                    'CGPA', 'PROGRAM', 'PROGRAM.1']

# Feature order fed to the model. Training and prediction must agree on it, so
# it is fixed here instead of depending on the column order of the CSV.
_FEATURE_COLUMNS = ['TERM', 'CATALOG_NBR', 'INSTRUCTOR_ID', 'CGPA', 'PROGRAM',
                    'PROGRAM.1']

# Names (and on-disk order) of the vocabularies stored in '<n>=values.csv'.
_VOCABULARY_NAMES = ('majors', 'acad_prog', 'catalog', 'instructor')

_MODEL_SUFFIX = ".sav"


def add_hyperlink(paragraph, text, url):
    """Append a clickable external hyperlink to *paragraph*.

    Args:
        paragraph: python-docx paragraph that receives the link.
        text: Visible link text.
        url: Link target (include the scheme, e.g. ``mailto:`` or ``https://``).

    Returns:
        The created ``w:hyperlink`` XML element.
    """
    # Register the target in document.xml.rels and get its relationship id.
    part = paragraph.part
    r_id = part.relate_to(
        url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True
    )

    # Build <w:hyperlink r:id="..."><w:r><w:rPr/>text</w:r></w:hyperlink>.
    hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
    hyperlink.set(docx.oxml.shared.qn('r:id'), r_id)
    new_run = docx.oxml.shared.OxmlElement('w:r')
    new_run.append(docx.oxml.shared.OxmlElement('w:rPr'))
    new_run.text = text
    hyperlink.append(new_run)

    # Host the hyperlink inside a fresh run of the paragraph.
    run = paragraph.add_run()
    run._r.append(hyperlink)

    # Workaround for templates lacking a hyperlink style (the link does not
    # turn purple after use). Drop this if the template defines that style.
    run.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    run.font.underline = True
    return hyperlink


def savedoc(document, name):
    """Remove every paragraph whose text is empty, then save *document* as *name*.

    Paragraphs consisting of a single space are kept; the reports use them as
    vertical spacers.
    """
    def delete_paragraph(paragraph):
        element = paragraph._element
        element.getparent().remove(element)
        # Detach the proxy from the removed XML node.
        paragraph._p = paragraph._element = None

    # document.paragraphs builds a new list on access, so removing elements
    # while looping over it is safe.
    for para in document.paragraphs:
        if para.text == '':
            delete_paragraph(para)
    document.save(name)


def measures(predicted, y_test):
    """Return the accuracy of *predicted* against the true labels *y_test*."""
    return accuracy_score(y_test, predicted)


def _termize(term):
    """Encode a term code by its last character: '1' -> 0, '2' -> 1, else 2."""
    last = str(term)[-1]
    if last == "1":
        return 0
    if last == "2":
        return 1
    return 2


def _shorten_major(name):
    """Map a major description to its short code, or None when unrecognised."""
    if "Computer Science" in name:
        return "CS"
    if "Computer Information" in name:
        return "CIS"
    if "Artificial" in name:
        return "AI"
    if "Cyber" in name:
        return "CYS"
    return None


def _binarize_grade(grade):
    """Return 1 for a withdrawal grade, 0 otherwise, -1 for rows to discard."""
    if any(marker in grade for marker in ('TR', 'DN', 'NP', 'IP')):
        return -1
    return 1 if 'W' in grade else 0


def _label_encode(series, vocabulary):
    """Replace each value by its first-appearance index, filling *vocabulary*."""
    positions = {}

    def encode(value):
        if value not in positions:
            positions[value] = len(vocabulary)
            vocabulary.append(value)
        return positions[value]

    return series.apply(encode)


def _normalize_key(value):
    """Canonical text form used to match CSV values against saved vocabularies."""
    return str(value).replace(' ', '')


def _parse_saved_list(cell):
    """Parse a vocabulary cell that was written as ``str(list)``."""
    text = str(cell).replace("'", '').strip('][')
    return [item.replace(' ', '') for item in text.split(', ')]


def _build_index(values):
    """Map each value to the position of its first occurrence in *values*."""
    index = {}
    for position, value in enumerate(values):
        index.setdefault(value, position)
    return index


def _lookup(index, value, column):
    """Return the training-time code of *value* or raise a descriptive error."""
    key = _normalize_key(value)
    if key not in index:
        raise ValueError(
            f"{column} value '{value}' was not seen during training, "
            "please retrain the model with data that includes it"
        )
    return index[key]


def _find_latest_model(directory):
    """Return ``(number, filename)`` of the newest ``<n>=...sav`` model file.

    Files ending in ``.sav`` whose name does not start with an integer are
    ignored. ``(0, "")`` is returned when no model exists.
    """
    latest_number, latest_name = 0, ""
    for item in os.listdir(directory):
        if not item.endswith(_MODEL_SUFFIX):
            continue
        try:
            number = int(item.split("=")[0])
        except ValueError:
            continue
        if number > latest_number:
            latest_number, latest_name = number, item
    return latest_number, latest_name


def _check_columns(columns, allowed, purpose):
    """Return an error message for unknown or missing columns, else None."""
    for col in columns:
        if col not in allowed:
            return (str(col) + " is undefined column name, allowed columns for "
                    + purpose + " are " + str(allowed))
    missing = [col for col in allowed if col not in columns]
    if missing:
        return ("Missing required columns for " + purpose + ": " + str(missing))
    return None


def _add_table(document, headers, rows):
    """Append a grid table with a header row followed by *rows*."""
    table = document.add_table(rows=1, cols=len(headers))
    for cell, title in zip(table.rows[0].cells, headers):
        cell.text = title
    for row in rows:
        for cell, value in zip(table.add_row().cells, row):
            cell.text = str(value)
    table.style = 'TableGrid'
    return table


def _retrain(coset, document):
    """Train the candidate models on *coset*, persist the best, write a report.

    Returns:
        ``(report_path, log_message)`` or ``(None, error_message)``.
    """
    allowedcols = _ALLOWED_COLUMNS[1:]  # 'SID' is not a training column
    error = _check_columns(list(coset.columns), allowedcols, "training")
    if error:
        return None, error

    wanted = coset
    wanted['PROGRAM.1'] = wanted['PROGRAM.1'].apply(_shorten_major)
    wanted['GRADE'] = wanted['GRADE'].apply(_binarize_grade)
    wanted['TERM'] = wanted['TERM'].apply(_termize)
    wanted.drop(wanted[wanted['GRADE'] == -1].index, inplace=True)

    # Vocabularies are saved next to the model so prediction can reproduce
    # exactly the same integer codes.
    majors, catalog, acad_prog, instructor = [], [], [], []
    wanted['PROGRAM.1'] = _label_encode(wanted['PROGRAM.1'], majors)
    wanted['CATALOG_NBR'] = _label_encode(wanted['CATALOG_NBR'], catalog)
    wanted['PROGRAM'] = _label_encode(wanted['PROGRAM'], acad_prog)
    wanted['INSTRUCTOR_ID'] = _label_encode(wanted['INSTRUCTOR_ID'], instructor)

    document.add_paragraph(' ')
    document.add_heading('Retraining report', 0)
    document.add_paragraph(
        'This report consists of the models retraining information on the new '
        'dataset with (' + str(len(wanted)) + ') records'
    )

    # Explicit column order: must match the feature vector built in _predict.
    X = wanted[_FEATURE_COLUMNS]
    y = wanted['GRADE']
    # NOTE(review): oversampling before cross-validation lets synthetic
    # samples leak across folds, so the reported accuracy is optimistic.
    smote = BorderlineSMOTE(random_state=11)
    X_smote, y_smote = smote.fit_resample(X, y)
    kf = StratifiedKFold(n_splits=10)
    models1 = [
        KNeighborsClassifier(leaf_size=10, metric='manhattan'),
        RandomForestClassifier(max_depth=100),
        LGBMClassifier(n_estimators=200, num_leaves=60),
        VotingClassifier(estimators=[
            ('knn', KNeighborsClassifier(leaf_size=10, metric='manhattan')),
            ('rf', RandomForestClassifier(max_depth=100)),
            ('gm', LGBMClassifier(n_estimators=200, num_leaves=60)),
        ]),
    ]

    metrics = {}
    records = []
    for model in models1:
        # The fitted instance is what gets pickled; cross_val_predict works on
        # clones and only provides the accuracy estimate.
        model.fit(X_smote, y_smote)
        preds = cross_val_predict(model, X_smote.values, y_smote.values,
                                  cv=kf, n_jobs=-1)
        metrics[model] = measures(preds, y_smote.values)
        records.append((type(model).__name__, str(metrics[model])))

    document.add_paragraph(' ')
    _add_table(document, ['Name', 'Accuracy'], records)

    model = max(metrics, key=metrics.get)
    acc = metrics[model]
    number = _find_latest_model(os.getcwd())[0] + 1
    filename = str(number) + "=" + type(model).__name__ + '=' + str(acc) + _MODEL_SUFFIX

    datavalues = {
        "majors": str(majors),
        'acad_prog': str(acad_prog),
        'catalog': str(catalog),
        'instructor': str(instructor),
    }
    pd.DataFrame(datavalues, index=[0]).to_csv(str(number) + "=" + "values.csv")

    summary = (type(model).__name__
               + ' has been chosen as the prediction model for achieving an accuracy of '
               + str(round(acc * 100, 2)) + '%')
    document.add_paragraph(" ")
    document.add_paragraph(summary)
    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)

    document.add_paragraph(" ")
    contact = document.add_paragraph('For more like this contact us at ')
    # An e-mail target needs the mailto: scheme to be a valid link.
    add_hyperlink(contact, 'contact@mustafasa.com', "mailto:contact@mustafasa.com")
    savedoc(document, 'retraining_report.docx')
    return 'retraining_report.docx', summary


def _load_vocabularies(path):
    """Read the saved vocabularies and return one lookup dict per name."""
    dfv = pd.read_csv(path)
    return {
        name: _build_index(_parse_saved_list(dfv[name][0]))
        for name in _VOCABULARY_NAMES
    }


def _describe_model(modelname):
    """Split ``<n>=<ModelName>=<accuracy>.sav`` into display strings."""
    parts = modelname.split("=")
    number = parts[0]
    label = parts[1] if len(parts) > 1 else "unknown"
    try:
        # Strip the extension before parsing so accuracies such as '1.0' work.
        accuracy = str(round(float(parts[2][:-len(_MODEL_SUFFIX)]) * 100, 2))
    except (IndexError, ValueError):
        accuracy = "unknown"
    return number, label, accuracy


def _predict(coset, document):
    """Score every row of *coset* with the newest model and write a report.

    Returns:
        ``(report_path, log_message)`` or ``(None, error_message)``.

    Raises:
        ValueError: a categorical value was not present in the training data.
    """
    allowedcols = [col for col in _ALLOWED_COLUMNS if col != 'GRADE']
    error = _check_columns(list(coset.columns), allowedcols, "prediction")
    if error:
        return None, error

    maxnum, modelname = _find_latest_model(os.getcwd())
    if maxnum == 0:
        return None, "No model found, please use retrain operation to build one"

    vocab = _load_vocabularies(str(maxnum) + "=values.csv")
    # SECURITY: pickle executes arbitrary code on load. Only model files
    # produced by this app's own retrain operation must live in this folder.
    with open(modelname, 'rb') as handle:
        loaded_model = pickle.load(handle)

    document.add_paragraph(' ')
    document.add_heading('Subjects drop prediction report', 0)
    document.add_paragraph(
        'This report consists of students who might potentially drop courses '
        'they currently are studying based on the supplied information'
    )

    features = []
    described = []
    for _, row in coset.iterrows():
        term = str(row['TERM'])
        catalog_nbr = str(row['CATALOG_NBR']).replace(' ', '')
        features.append([
            _termize(term),  # same encoding as used for training
            _lookup(vocab['catalog'], catalog_nbr, 'CATALOG_NBR'),
            _lookup(vocab['instructor'], row['INSTRUCTOR_ID'], 'INSTRUCTOR_ID'),
            row['CGPA'],
            _lookup(vocab['acad_prog'], row['PROGRAM'], 'PROGRAM'),
            _lookup(vocab['majors'], _shorten_major(row['PROGRAM.1']), 'PROGRAM.1'),
        ])
        described.append((str(row['SID']), term, catalog_nbr,
                          str(row['INSTRUCTOR_ID']), str(row['CGPA']),
                          str(row['PROGRAM']), str(row['PROGRAM.1'])))

    # One batched call instead of one predict() per row; skipped when the
    # upload has no usable rows because estimators reject empty input.
    predictions = loaded_model.predict(features) if features else []

    total = len(described)
    droppers = 0
    records = []
    for position, (details, prediction) in enumerate(zip(described, predictions), start=1):
        if prediction == 1:
            droppers += 1
        verdict = "Yes" if str(prediction) == "1" else "No"
        records.append((str(position),) + details + (verdict,))

    document.add_paragraph(' ')
    _add_table(
        document,
        ['Index', 'Student ID', 'Term', 'Catalog ID', 'Instructor ID',
         'Cummulative GPA', 'Academic Program', 'Major',
         'Possible Drop Prediction'],
        records,
    )
    document.add_paragraph(" ")

    model_number, model_label, model_accuracy = _describe_model(modelname)
    lastpara = ('Out of ' + str(total) + ' records, it is predicted that '
                + str(droppers) + ' courses might be withdrawn from '
                '(Prediction model name:' + model_label + '/Accuracy: '
                + model_accuracy + '%)')
    document.add_paragraph(lastpara)
    savedoc(document, 'drop_prediction_report.docx')
    return 'drop_prediction_report.docx', lastpara + " (Model no." + model_number + ")"


def greet(operation, filer):
    """Gradio callback: run the requested *operation* on the uploaded CSV.

    Args:
        operation: ``"retrain"`` to build a new model; any other value runs
            a prediction with the newest saved model.
        filer: Uploaded file (object exposing ``.name`` or a plain path), or
            None when nothing was submitted.

    Returns:
        ``(report_path, log_message)``; ``report_path`` is None on failure and
        ``log_message`` then carries the reason.
    """
    try:
        if filer is None:
            return None, "Invalid file submitted"
        # Gradio hands over either a temp-file wrapper or a path string,
        # depending on its version.
        path = getattr(filer, "name", filer)
        coset = pd.read_csv(path).dropna(how='any')
        document = Document('temp.docx')
        if operation == "retrain":
            return _retrain(coset, document)
        return _predict(coset, document)
    except Exception as e:
        # UI boundary: surface the failure in the log box instead of crashing.
        return None, str(e)


iface = gr.Interface(
    fn=greet,
    inputs=[gr.Radio(["predict", 'retrain'], value="predict"), "file"],
    outputs=[gr.File(label='Report generated'), gr.Text(label='Log')],
    debug=True,
)
iface.launch()