Spaces:
Runtime error
Runtime error
import gradio as gr | |
import pandas as pd | |
import pickle | |
import os | |
from docx import Document | |
from docx.shared import Inches | |
from docx.dml.color import ColorFormat | |
import sklearn | |
from lightgbm import LGBMClassifier | |
import numpy as np | |
import pandas as pd | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold | |
from imblearn.under_sampling import RandomUnderSampler | |
from sklearn.preprocessing import MinMaxScaler | |
from imblearn.over_sampling import SMOTE, BorderlineSMOTE | |
from imblearn.pipeline import Pipeline as imbpipeline | |
from sklearn.pipeline import Pipeline | |
from sklearn.model_selection import cross_val_score, cross_val_predict | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn import model_selection | |
from sklearn.neural_network import MLPClassifier | |
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier,GradientBoostingClassifier, VotingClassifier | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.svm import SVC | |
from sklearn.metrics import confusion_matrix | |
from sklearn.feature_selection import SequentialFeatureSelector | |
from sklearn.model_selection import GridSearchCV, StratifiedKFold | |
import docx | |
from docx.enum.dml import MSO_THEME_COLOR_INDEX | |
def add_hyperlink(paragraph, text, url):
    """Append a clickable external link to ``paragraph``.

    Args:
        paragraph: python-docx paragraph that receives the link.
        text: Visible text of the link.
        url: Target registered as an external hyperlink relationship.

    Returns:
        The ``w:hyperlink`` XML element that was created.
    """
    make_element = docx.oxml.shared.OxmlElement
    qualified = docx.oxml.shared.qn

    # Register the target on the paragraph's part; the returned relationship
    # id is what the w:hyperlink element points at.
    rel_id = paragraph.part.relate_to(
        url,
        docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
        is_external=True,
    )

    link = make_element('w:hyperlink')
    link.set(qualified('r:id'), rel_id)

    # The link wraps its own run (with an empty w:rPr) that carries the text.
    inner_run = make_element('w:r')
    inner_run.append(make_element('w:rPr'))
    inner_run.text = text
    link.append(inner_run)

    # Host the link inside a fresh run of the paragraph.
    host_run = paragraph.add_run()
    host_run._r.append(link)

    # Workaround for templates without a hyperlink style: colour and underline
    # the host run by hand (the link will not turn purple after being used).
    # Drop these two lines when the template provides a hyperlink style.
    host_run.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    host_run.font.underline = True
    return link
def savedoc(document, name):
    """Strip empty paragraphs from ``document`` and save it as ``name``.

    Only paragraphs whose text is exactly the empty string are removed;
    spacer paragraphs holding a single space (``' '``) are kept.

    Args:
        document: Document object exposing ``paragraphs`` and ``save``.
        name: Destination passed straight to ``document.save``.
    """
    # Walk a snapshot so that removing elements cannot disturb the iteration.
    for para in list(document.paragraphs):
        if para.text == '':
            element = para._element
            element.getparent().remove(element)
            # Clear the proxy's references so the detached XML is not reused.
            para._p = para._element = None
    document.save(name)
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, cohen_kappa_score, f1_score, recall_score, precision_score | |
def measures(predicted, y_test):
    """Return the accuracy of ``predicted`` against the true labels ``y_test``.

    Only accuracy is computed because it is the only value this function
    returns; computing further metrics here would add work and extra ways to
    fail without changing the result.

    Args:
        predicted: Predicted labels.
        y_test: Ground-truth labels, aligned with ``predicted``.

    Returns:
        The value of ``accuracy_score(y_test, predicted)``.
    """
    return accuracy_score(y_test, predicted)
# Vocabularies used by the prediction path: a raw CSV value is encoded as its
# position in the matching list.
# NOTE(review): these look like a dump of the encoders from one training run -
# confirm. The retrain path builds fresh first-seen encodings and does not
# persist them, so a retrained model only lines up with these lists if the new
# data produces the same first-seen order. TODO: store the encoders next to
# the model.
# NOTE(review): _KNOWN_MAJORS is not consulted; prediction uses
# _PREDICT_MAJOR_CODE, whose CS/CIS codes are swapped relative to this list -
# confirm which ordering the deployed model was trained with.
_KNOWN_MAJORS = ['CIS', 'CS', 'CYS', 'AI']
_KNOWN_CATALOG = ['222','223','220','251','252','311','313','314','301','413','315','321','322','325','290','310','411','412','414','415','425','421','422','423','424','444','511','512','513','516','524','521','522','517','162','102','132','122P','211','212','151','152','221','207','401','526','273','274','515','262','272','232','302','312','341','352','333','525','523','112','231','241','202','242','514','142','201','111','131','121','141','271','203','205','208','122','101','402','403','404','407','408','410','409','406','507','534','433','501','502','503','529','506','508','416','417','326','320','426','518','101N','207N','162N','142N','111N','351','341R','311R','361R','322R','332R','342R','312R','551','204','206','210','504','560','209']
_KNOWN_ACAD_PROG = ['CISFD', 'CSCFD', 'CSC2D', 'CSCMD', 'CIS2D', 'CISMD', 'CIS1D', 'CDFFD', 'CIS6D', 'CIS5D', 'AICFD', 'CDFMD', 'AICMD']
_KNOWN_INSTRUCTORS = [
    2235186531, 2165408699, 2145689266, 2135152745, 2194771260, 4883031977, 4920301432, 4913765767, 4920307238, 2139595378, 4920306734, 4890602415, 2231227000, 2129908065, 4920014122, 2130269265, 4852562299, 4956513533, 4992319499, 5021161517, 5001698233, 2156812428, 4822287888, 4920021772, 2185381526, 2136566892, 2198726144, 2136354172, 4921298316, 4953758866, 2129220549, 4954412399, 2137607920, 2273533376, 5003490316, 4925121550, 2137442755, 5007956497, 5032808504, 5043174000, 5012205890, 5048802701, 5007934582, 4965553369, 2193108841, 5034375380, 5018036932, 4775083712, 2177540220, 2190446772, 2281161940, 5042841852, 2194950255, 2253073385, 2279818469, 2280775317, 5037751586, 2226897545, 2220612254, 2270461075, 5002842423, 2174788639, 2201033491, 4998734283, 5048351213, 2184587834, 2241953656, 2247975430, 2279031534, 2280417257, 5048798913, 4843544176, 4840600404, 4880228474, 4844692118, 4810116057, 4841018244, 4857639249, 4875844031, 4308059859, 4955997289, 4565364037, 4920304031, 4935399429, 4964818982, 4960094672, 4956759643, 5006769072, 5003282600, 5034374855, 4965383274, 5048176530, 2251581958, 2177906213, 5042839625, 4823432797, 2234984479, 4878860232, 5089909144, 5111504685, 2292511023, 5147608189, 5102823255, 4471442437, 4603727617, 2198695213, 4624163190, 4312363731, 4718537496, 4883071390, 4741891814, 4603562365, 4623599851, 4808444621, 2175746189, 5042827891, 2299645941, 2139937238, 2159825301, 2193768846, 2244083117, 2204670572, 5148738831, 2132105991, 2140252153, 2191852492, 5149923257, 2133982707, 2188005486, 4912215897, 4948758684, 2145585515, 2211124646, 5046795282, 4744286056, 2248064163, 2245249731, 2190958765, 2286796460, 2305826215, 2141977896, 4723166278, 4742067952, 2144199492, 4701573536, 1385090162, 2245184895, 4721381658, 4808446202, 2152399610, 2159483449, 5106671984, 2157858618, 2287537336, 5205981215, 5131991031, 2232494366, 2242364376, 5091334876, 2248831606, 2259498731, 4960068758, 4964159309, 2274397962, 5003281896,
    4917452237, 4955999572, 5003283211, 4980137016, 2170441656, 2259290137, 2272914009, 5149823043, 2129319034, 2190674916, 2303226693, 4959653529, 4955997870, 4962271367, 4965557457, 2317276819, 5043313315, 2343812773, 2225658147, 4959653691, 5106519327, 4967680674, 5005320761, 5005325659, 4960818335, 4959654127, 5002482818, 2226462785, 4958613075, 2153112707, 2306001696, 4925123179, 4916649578, 5149821042, 5154843441, 4351078655, 5227475626, 2187240364, 4962564241, 4975046019, 4959653674, 2176022109, 5020110780, 5006958981, 2225580216, 4987694365, 2267908548, 5191887278, 5192897851, 2310970719, 5006959475, 4958853885, 5009301759, 4962923880, 2202947881, 2266040432, 2244188415, 4960824601, 4958615055, 5096908433, 4414909094, 5106126173, 4880689344, 5125473603, 4638090996, 5190881427, 2260909364, 2177690056, 2164903703, 4987812801, 4958336415, 5003281669, 2334889939, 2187509735, 5225189005, 5005319730, 5007932166, 5008877090, 5003282196, 5014498902, 2138244238, 4659359826, 5148593633, 4604742805, 5010986266, 2273306536, 5011501597, 5150339186, 2247605157, 5008876391, 2301035418, 4958859692, 5009299880, 5003282356, 2135224245, 5164774288, 2355004193, 5149822626, 4962270246, 4954819395, 4958337068, 5094362869, 5005518168, 2315957042, 5006712625, 4318114771, 4978669968, 2267320079, 2328816035, 4471894179, 4931513752, 2254309220, 2190729464, 2273848680, 2132597263, 4712730199, 2279529530, 4585763023, 5045439236, 5039097585, 4914790972, 4998903923, 5010722011, 5033545452, 2189136713, 4925386944, 2246753860, 4967484918, 5010380619, 4918583022, 5167824846, 5112272457, 2256071967, 4551315881, 471199705, 4841645734, 4989400226, 4782685963, 4303177714, 4333866027, 5019192050, 5001246698, 2280149457, 4828320414, 5002953984, 2247295594, 5049403173, 2297916846, 4981949621, 4915644607, 4886002071, 4909665968, 4964343999, 2282147596, 5025351391, 5150333690, 5046499865, 2313480256, 2306057218, 2141728698, 4652689317, 5049210369, 4962261171, 5002946072, 4962925881, 5048801917,
    4968282016, 5049210318, 4861313860, 2133940646, 5001245645, 5006151762, 5046795038, 5177326457, 4982758753, 4969506927, 4992345875, 5005319130, 4998344325, 4609286316, 4678011207, 5005205144, 2321632258, 2226184565, 4958335677, 4917071087, 4972185195, 5006955685, 5048800825, 5050532324, 4961680022, 5010706396, 2172272804, 4810952038, 5043134411, 5005320504, 5102169082, 4720186163, 2231386959, 5113419929, 4780192735, 4522246666, 4920006733, 2224456244, 5091183270, 4961080679, 5002293420, 4962576371, 4565120209, 4625705830, 2248884319, 5002574919, 5097813375, 2146039868, 2217503978, 2127283768, 4498752851, 4962267858, 2134885018, 4878546760, 5150325161, 2153866729, 5000093139, 5004630616, 5010746243, 5003488865, 2252281795, 2225864599, 4964158067, 5147734960, 2136924210, 2190891107, 5161862823, 5004177905, 5010382114, 5181025497, 8003128720, 4960708620, 2142197596, 5002952387, 4956934653, 5062687326, 4954413711, 2262570713, 4954411712, 5046642204, 5046666453, 4919033187, 5112369898, 5054270529, 4954413452, 5050400268, 5095301781, 4998935139, 5156370627, 5119649578, 5117621773, 5071143450, 5004457883, 5025389815, 5005396372, 5162503573, 4880588500, 5094691051, 5099020291, 5093233278, 2185589491, 5047982533, 5059111438, 2167629104, 4741380648, 5106051264, 5119649157, 2213909697, 2276903510, 2164183756, 5113457583, 2261109173, 2275834231, 2243662237, 5094930687, 5113432424, 5098166924, 4619333478, 4997070887, 5034154184, 2281560215, 5096891323, 5046868507, 4985889119, 5056524567, 2131337464, 5149322579, 2243161140, 2174268801, 5117622577, 5096413053, 4951938565, 5001073672, 5111264347, 4920306153, 2186186819, 2305272809, 2314082251, 2246745963, 2165982585, 2196609271, 5117620771, 5018036721, 2207999756, 4956759098, 4924472053, 2187506928, 2312861157, 5016856032, 5016062760, 5097052002, 4953337390, 4850384272, 5087918026, 2183432395, 4924662737, 5122552887, 5000354599, 5090641357, 5053036866, 5150260241, 5156293806, 9166701499, 9166702293, 9166702380, 9166701959,
    10170573509, 5148549128, 4972189772, 9166702346, 5115224618, 5151869175, 4961300697, 4907504397, 5151867702, 5054099271, 5115225352, 5117733787, 4982588324, 5113932629, 5028111707, 4954415645, 5168247358, 2175811405, 5115721557, 4921756814, 2218059533, 5098369463, 2140645483, 9166702486, 10026817957, 9174045369, 5090631374, 8954014242, 9166702048, 5195588103, 2258310937, 9166701361, 5149311575, 2127935283, 5146997917, 2145436765, 5191893070, 4924374442, 5148739656, 5059485811, 2141132522, 5042827342, 5227234698, 5128730319, 4923774008, 5113613805, 5115668709, 5148977378, 2174892273, 2193369730, 2285982919, 2169514451, 4769948794, 5205480490, 5151841575, 5150156332, 5149047110, 2244450075, 5117212718, 5175749131, 2225895520, 4918655685, 2282522276, 2137380690, 2229108753, 5192202684, 9703033479, 5099381161, 5156055821, 2320154877, 4770708459, 5103716144, 5104706271, 9173405212, 9173407755, 5159965699, 4977637758, 2277344763, 9166701904, 5149892443, 5161204780, 9175509385, 5010991124, 8966158996, 2280181075, 8764107066, 8666975530, 5191869593, 5192287506, 5192050287, 5122244642, 5191145822, 2161462170, 5145277752, 2132778192, 5149825397, 5153349509, 5204551227, 5190942345, 2232498178, 5191122098, 5191560083, 9166692133, 5191211749, 5153146583, 2198539253, 5261973429, 5191145311, 5163880700, 11297416012, 4808648447, 5163880731, 2233894352, 5190771693, 5207104056, 5152070193, 5099381057, 9405541718, 5171208320, 5111946478, 8849524903,
]

# Substring of the full major name -> short code, checked in this order.
_MAJOR_KEYWORDS = (
    ("Computer Science", "CS"),
    ("Computer Information", "CIS"),
    ("Artificial", "AI"),
    ("Cyber", "CYS"),
)
# Integer code the prediction path feeds to the model for each short major.
_PREDICT_MAJOR_CODE = {"CS": 0, "CIS": 1, "AI": 3, "CYS": 2}
# Grade markers whose rows are excluded from training.
_EXCLUDED_GRADES = ('TR', 'DN', 'NP', 'IP')
_MODEL_SUFFIX = '.sav'


def _encode_term(term):
    """Map a TERM value to a semester code from its last character (1->0, 2->1, else 2)."""
    last = str(term)[-1]
    if last == "1":
        return 0
    if last == "2":
        return 1
    return 2


def _shorten_major(name):
    """Return the short code for a full major name, or None when nothing matches."""
    for keyword, code in _MAJOR_KEYWORDS:
        if keyword in name:
            return code
    return None


def _binarize_grade(grade):
    """Return -1 for excluded grades, 1 when the grade contains 'W', else 0."""
    if any(marker in grade for marker in _EXCLUDED_GRADES):
        return -1
    return 1 if 'W' in grade else 0


def _encode_first_seen(series):
    """Replace each value of ``series`` by the order in which it first appeared."""
    codes = {}
    return series.apply(lambda value: codes.setdefault(value, len(codes)))


def _parse_model_filename(filename):
    """Split '<number>=<model name>=<accuracy>.sav' into its parts, or return None."""
    if not filename.endswith(_MODEL_SUFFIX):
        return None
    parts = filename[:-len(_MODEL_SUFFIX)].split("=")
    if len(parts) != 3:
        return None
    try:
        return int(parts[0]), parts[1], float(parts[2])
    except ValueError:
        return None


def _latest_model(dir_name):
    """Return (filename, number, model name, accuracy) of the newest model, or None."""
    best = None
    for item in os.listdir(dir_name):
        parsed = _parse_model_filename(item)
        if parsed is None or parsed[0] <= 0:
            continue
        if best is None or parsed[0] > best[1]:
            best = (item,) + parsed
    return best


def _add_table(document, headers, rows):
    """Append a grid table with one header row and one row per entry of ``rows``."""
    table = document.add_table(rows=1, cols=len(headers))
    for cell, title in zip(table.rows[0].cells, headers):
        cell.text = title
    for values in rows:
        for cell, value in zip(table.add_row().cells, values):
            cell.text = str(value)
    table.style = 'TableGrid'
    return table


def _retrain(coset, document):
    """Train the candidate models on ``coset``, save the best one and write the report."""
    wanted = coset.drop(columns=['SUBJECT','SID','CRSE_ID','COURSE','ROLE','GPA','INPUT','STATUS','GRADUATION TERM','CLASS #','COLLEGE','COLLEGE.1'])
    wanted['PROGRAM.1'] = wanted['PROGRAM.1'].apply(_shorten_major)
    wanted['GRADE'] = wanted['GRADE'].apply(_binarize_grade)
    wanted['TERM'] = wanted['TERM'].apply(_encode_term)
    wanted = wanted[wanted['GRADE'] != -1].copy()
    # Categorical columns become integer codes in order of first appearance.
    for column in ('PROGRAM.1', 'CATALOG_NBR', 'PROGRAM', 'INSTRUCTOR_ID'):
        wanted[column] = _encode_first_seen(wanted[column])

    document.add_paragraph(' ')
    document.add_heading('Retraining report', 0)
    document.add_paragraph('This report consists of the models retraining information on the new dataset with ('+str(len(coset))+') records')

    X = wanted.drop(columns=['GRADE'])
    y = wanted['GRADE']
    X_smote, y_smote = BorderlineSMOTE(random_state=11).fit_resample(X, y)
    kf = StratifiedKFold(n_splits=10)
    candidates = [
        KNeighborsClassifier(leaf_size=10, metric='manhattan'),
        RandomForestClassifier(max_depth=100),
        LGBMClassifier(n_estimators=200, num_leaves=60),
        VotingClassifier(estimators=[
            ('knn', KNeighborsClassifier(leaf_size=10, metric='manhattan')),
            ('rf', RandomForestClassifier(max_depth=100)),
            ('gm', LGBMClassifier(n_estimators=200, num_leaves=60)),
        ]),
    ]
    results = []
    for model in candidates:
        # Fit on everything so the winner can be saved as-is; the accuracy
        # comes from cross-validated predictions on the same resampled data.
        model.fit(X_smote, y_smote)
        preds = cross_val_predict(model, X_smote.values, y_smote.values, cv=kf, n_jobs=-1)
        results.append((model, measures(preds, y_smote.values)))

    document.add_paragraph(' ')
    _add_table(document, ['Name', 'Accuracy'],
               [(type(model).__name__, acc) for model, acc in results])

    # max() keeps the first candidate on ties.
    best_model, acc = max(results, key=lambda pair: pair[1])
    best_name = type(best_model).__name__
    latest = _latest_model(os.getcwd())
    number = (latest[1] if latest else 0) + 1
    filename = str(number)+"="+best_name+'='+str(acc)+'.sav'
    # `acc` is a 0-1 fraction; scale it before printing it next to '%'.
    summary = best_name+' has been chosen as the prediction model for achieving an accuracy of '+str(round(acc * 100, 2))+'%'

    document.add_paragraph(" ")
    document.add_paragraph(summary)
    with open(filename, 'wb') as handle:
        pickle.dump(best_model, handle)
    document.add_paragraph(" ")
    contact = document.add_paragraph('For more like this contact us at ')
    # An e-mail address needs the mailto: scheme to work as a link target.
    add_hyperlink(contact, 'contact@mustafasa.com', "mailto:contact@mustafasa.com")
    savedoc(document, 'retraining_report.docx')
    return 'retraining_report.docx', summary


def _predict(coset, document):
    """Predict course drops for every row of ``coset`` and write the report."""
    latest = _latest_model(os.getcwd())
    if latest is None:
        return None, "No model found, please use retrain operation to build one"
    model_file, model_number, model_label, model_accuracy = latest
    # pickle executes code while loading: only .sav files written by the
    # retrain operation should ever be present in the working directory.
    with open(model_file, 'rb') as handle:
        loaded_model = pickle.load(handle)

    document.add_paragraph(' ')
    document.add_heading('Subjects drop prediction report', 0)
    document.add_paragraph('This report consists of students who might potentially drop courses they current are studying based on the supplied information')

    rows = []
    features = []
    for _, row in coset.iterrows():
        # str() first: pandas reads an all-numeric column as integers.
        catalog_nbr = str(row['CATALOG_NBR']).replace(' ', '')
        term = str(row['TERM'])
        try:
            c_id = _KNOWN_CATALOG.index(catalog_nbr)
            in_id = _KNOWN_INSTRUCTORS.index(row['INSTRUCTOR_ID'])
            p_id = _KNOWN_ACAD_PROG.index(row['PROGRAM'])
        except ValueError:
            return None, ('Record for student '+str(row['SID'])
                          + ' references a catalog number, instructor or program'
                          + ' that the prediction model does not know')
        major = _PREDICT_MAJOR_CODE.get(_shorten_major(row['PROGRAM.1']), 0)
        # Same term encoding as the retrain operation.
        # NOTE(review): assumes the training columns are in this order - confirm.
        features.append([_encode_term(term), c_id, in_id, row['CGPA'], p_id, major])
        rows.append([str(len(rows) + 1), str(row['SID']), term, catalog_nbr,
                     str(row['INSTRUCTOR_ID']), str(row['CGPA']),
                     str(row['PROGRAM']), str(row['PROGRAM.1'])])

    # One batched call instead of one predict() per row.
    predictions = loaded_model.predict(features) if features else []
    droppers = 0
    for cells, prediction in zip(rows, predictions):
        will_drop = bool(prediction == 1)
        droppers += will_drop
        cells.append("Yes" if will_drop else "No")

    document.add_paragraph(' ')
    _add_table(document,
               ['Index', 'Student ID', 'Term', 'Catalog ID', 'Instructor ID',
                'Cummulative GPA', 'Academic Program', 'Major',
                'Possible Drop Prediction'],
               rows)
    document.add_paragraph(" ")
    lastpara = 'Out of '+str(len(rows))+' records, it is predicted that '+str(droppers)+' courses might be withdrawn from (Prediction model name:'+model_label+'/Accuracy: '+str(round(model_accuracy * 100, 2))+'%)'
    document.add_paragraph(lastpara)
    savedoc(document, 'drop_prediction_report.docx')
    return 'drop_prediction_report.docx', lastpara+" (Model no."+str(model_number)+")"


def greet(operation, filer):
    """Run the requested operation on an uploaded CSV and build a Word report.

    Args:
        operation: ``"retrain"`` trains the candidate models and saves the
            most accurate one; any other value runs drop prediction with the
            highest-numbered saved model.
        filer: Uploaded file object whose ``name`` is the CSV path, or None.

    Returns:
        ``(report_path, message)``. ``report_path`` is None when no report
        could be produced, and ``message`` then explains why.
    """
    if filer is None:
        return None, "Invalid file submitted"
    coset = pd.read_csv(filer.name)
    coset = coset.dropna(how='any')
    document = Document('temp.docx')
    if operation == "retrain":
        return _retrain(coset, document)
    return _predict(coset, document)
# Web UI: a radio button picks the operation and a file input supplies the
# CSV; greet() answers with the generated report file and a status message.
iface = gr.Interface(
    fn=greet,
    inputs=[gr.Radio(["predict", 'retrain'], value="predict"), "file"],
    outputs=["file", "text"],
)
# `debug` is an option of launch(), not of the Interface constructor.
iface.launch(debug=True)