# massock-sipe / module /data_viz.py
# Massock's picture
# Upload 20 files
# b7c5cb8
import streamlit as st
import plotly.express as px
import pandas as pd
from module.load_data import data
from module.dataset import create_data
import plotly.express as px
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import matplotlib.pyplot as plt
#plt.style.use('seaborn-dark-palette')
def grades(mark):
    """Translate a numeric mark into its letter grade.

    The bands are half-open intervals: [0, 5) -> 'U', [5, 9) -> 'E',
    [9, 12) -> 'D', [12, 15) -> 'C' and [15, 18) -> 'B'.  Any mark that
    falls in none of them yields 'A'; that covers 18 and above, but also
    negative and NaN marks, because 'A' is the catch-all result.
    """
    bands = (
        (0, 5, 'U'),
        (5, 9, 'E'),
        (9, 12, 'D'),
        (12, 15, 'C'),
        (15, 18, 'B'),
    )
    for lower, upper, letter in bands:
        if lower <= mark < upper:
            return letter
    return 'A'
def segmentation(data: pd.DataFrame, out_file: str = "images/tree.dot") -> pd.DataFrame:
    """Fit a decision tree predicting ``gender`` and report its feature importances.

    The tree is trained on the six sequence marks (``seq 1`` .. ``seq 6``),
    ``age`` and an integer-encoded ``form``.  As a side effect the fitted
    tree is exported in Graphviz DOT format to ``out_file``.

    Args:
        data: Frame holding the columns ``seq 1`` .. ``seq 6``, ``age``,
            ``form`` and ``gender``.  It is copied, never modified.
        out_file: Path of the DOT file to write.  Its parent directory is
            created when missing.

    Returns:
        A frame with one row per feature and the columns ``Feature`` and
        ``Gain`` (the tree's ``feature_importances_``).
    """
    import os  # local import: keeps this block self-contained

    df = data.copy()
    # The classifier needs numeric inputs, so each form label becomes an integer code.
    df['form'] = pd.factorize(df['form'])[0]
    cols = [f'seq {i + 1}' for i in range(6)] + ['age', 'form']

    # Fixed seed so repeated runs build the same tree.
    tree = DecisionTreeClassifier(random_state=31012020)
    tree.fit(df[cols], df['gender'])

    # export_graphviz does not create missing directories; do it here so a
    # fresh checkout without the output folder does not fail.
    parent = os.path.dirname(out_file)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # NOTE(review): class_names must follow the ascending order of the fitted
    # classes; this assumes gender is coded 'F' / 'M' -- confirm against the data.
    export_graphviz(tree, out_file=out_file, class_names=["Female", "Male"],
                    feature_names=cols, impurity=False, filled=True, rounded=True)

    return pd.DataFrame({"Feature": cols, "Gain": tree.feature_importances_})
def scatterplot(female=None, male=None, line_separator=None, x_axis=None, y_axis=None):
    """Scatter both gender groups and overlay the selected separator lines.

    Args:
        female: Frame of female students; must contain ``x_axis`` and ``y_axis``.
        male: Frame of male students; must contain ``x_axis`` and ``y_axis``.
        line_separator: Two-item sequence ``[x_values, y_values]``.  Each
            entry of ``x_values`` is drawn as a vertical line and each entry
            of ``y_values`` as a horizontal line.  ``None`` draws no lines.
        x_axis: Column plotted on the x-axis.
        y_axis: Column plotted on the y-axis.

    Returns:
        The matplotlib figure holding the plot.
    """
    if line_separator is None:
        x_cuts, y_cuts = [], []
    else:
        x_cuts, y_cuts = line_separator[0], line_separator[1]

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.scatter(female[x_axis], female[y_axis], marker='v', color='red', label="Girl", lw=1.2)
    ax.scatter(male[x_axis], male[y_axis], marker='o', color='blue', label="Boy", lw=1.2)
    ax.set_xlabel(x_axis)
    ax.set_ylabel(y_axis)
    ax.set_title("Decision line", fontweight="bold")

    # Span the lines over the data of BOTH groups: vertical lines up to the
    # largest y value, horizontal lines out to the largest x value.
    x_max = pd.concat([female[x_axis], male[x_axis]]).max()
    y_max = pd.concat([female[y_axis], male[y_axis]]).max()

    # vlines/hlines accept a whole sequence of positions, so one call per
    # direction draws every line exactly once.
    if len(x_cuts):
        ax.vlines(x_cuts, 0, y_max, color="black", linestyles="-.", lw=1.75)
    if len(y_cuts):
        ax.hlines(y_cuts, -1, x_max, color="black", linestyles="dashed", lw=1.75)

    ax.legend(loc="best", frameon=True, fancybox=True, shadow=True, title="Gender")
    return fig
# ---------------------------------------------------------------------------
# Module-level data preparation: everything below runs once at import time
# and is read by visualization().
# ---------------------------------------------------------------------------

# create_data comes from module.dataset; the frame it returns is read below
# through its "trimester 1" .. "trimester 3" columns.
df_trim = create_data(data)

# Headline numbers for the KPI row.
total_students = len(data)
average_age = data.age.mean()
gca1, gca2, gca3 = (df_trim[f"trimester {k}"].mean() for k in (1, 2, 3))
annual_gca = (gca1 + gca2 + gca3) / 3

# Per-gender subsets, reused for the averages and for the scatter plot.
female = data[data.gender == "F"]
male = data[data.gender == "M"]

# Mean mark per evaluation for each gender, and its change from one
# evaluation to the next.
evaluations = [f'seq {k}' for k in range(1, 7)]
sgca = pd.DataFrame(
    {'Male': male[evaluations].mean(), 'Female': female[evaluations].mean()},
    index=evaluations,
)
progression = sgca.diff()

# Head-counts per form and per gender.
form_counts = data.form.value_counts()
gender_counts = data.gender.value_counts()

# Per evaluation: number of students at or above 10 ('Passed') versus below.
pass_or_fail = pd.DataFrame({
    seq: data[seq]
    .apply(lambda mark: 'Passed' if mark >= 10.0 else 'Failed')
    .value_counts()
    .to_dict()
    for seq in evaluations
})

# Per evaluation: number of students in each letter grade.
student_grade = pd.DataFrame({
    seq: data[seq].apply(grades).value_counts().to_dict()
    for seq in evaluations
})

# Pre-built Plotly figures.
bar_polar = px.bar_polar(
    data_frame=sgca,
    r='Male',
    theta=sgca.index,
    color='Female',
    title='General Class Average (GCA) for each evaluation',
    barmode="overlay",
    width=1200,
    height=500,
)
pie_form = px.pie(
    data_frame=form_counts,
    names=form_counts.index,
    values=form_counts,
    title="Form",
    hole=0.25,
    width=500,
    height=500,
)
pie_gender = px.pie(
    data_frame=gender_counts,
    names=gender_counts.index,
    values=gender_counts,
    title="Gender",
    hole=0.25,
    width=500,
    height=500,
)

# segmentation() writes images/tree.dot as a side effect, so it has to run
# before that file is read back.
feature_importances = segmentation(data)
bar_feature_importance = px.bar(
    data_frame=feature_importances,
    x="Feature",
    y="Gain",
    title="Feature importances bar",
    width=500,
)
with open("images/tree.dot") as f:
    dot_graph = f.read()
def _render_kpis():
    """Render the row of six headline metrics."""
    with st.empty().container():
        kp1, kp2, kp3, kp4, kp5, kp6 = st.columns(6)
        kp1.metric(label="**:red[👬 Total students]**", value=total_students, delta="")
        kp2.metric(label=":red[**⏳ Average age**]", value=f"{average_age:.2f}", delta="")
        kp3.metric(label=":red[**✍ GCA-trimester 1**]", value=f"{gca1:.2f}/20", delta=0.0)
        # Deltas compare each trimester with the one before it.
        kp4.metric(label=":red[**✍ GCA-trimester 2**]", value=f"{gca2:.2f}/20",
                   delta=round(gca2 - gca1, 3))
        kp5.metric(label=":red[**✍ GCA-trimester 3**]", value=f"{gca3:.2f}/20",
                   delta=round(gca3 - gca2, 3))
        kp6.metric(label=":red[**🎓 Annual-GCA**]", value=f"{annual_gca:.2f}/20", delta="")


def _render_population_pies():
    """Render the gender and form pie charts side by side."""
    with st.empty().container():
        col1, col2 = st.columns(2)
        col1.plotly_chart(pie_gender)
        col1.caption('**We represent the gender of students in form 6e-5e (francophone section one of the cameroonian education system).**')
        col2.plotly_chart(pie_form)
        col2.caption(
            "**We represent the number of students in each form 6e and 5e**.\n"
            "**NB**: **Cameroon have two sub-education systems one is a francophone and a second is an anglosaxone.**\n"
        )


def _render_performance_tab(tab):
    """Fill the 'Performance' tab: averages, progression, pass/fail and grades."""
    tab.plotly_chart(bar_polar)
    tab.caption(
        "We represent the general class average of students in each sequence. Colors bar shows the marks\n"
        "for female gender and bar polar also shows the marks for male gender."
    )

    tab.subheader('Progression')
    tab.line_chart(progression)
    tab.caption(
        "This chart shows a progression of students during the six evaluations.\n"
        "We just make a difference between the previous sequence and the next sequence.\n"
        "The x-axis represent a sequence and the y-axis represent the growth of students."
    )

    tab.subheader('Passed or Failed')
    col1, col2 = tab.columns(2)
    col1.caption(':red[💃 passed or failed table 👉]')
    col1.dataframe(pass_or_fail)
    col1.caption(
        "\n"
        "- In left: We got this table to compute the number of students that the mark is >10 (passed) or <10 (failed) for\n"
        "each sequence.\n"
        "- In right: We plot this table. The chart explains how a student make some effort to succeed a ICT course.\n"
        "- In general, we can appreciate the effort of the students in the form 6e and 5e for each evaluation.\n"
    )
    col2.bar_chart(pass_or_fail.T)

    tab.subheader("Student grades")
    col3, col4 = tab.columns(2)
    col3.caption(':red[💃 grades table 👉]')
    # A grade nobody obtained in a sequence is missing from the counts; show it as 0.
    col3.dataframe(student_grade.fillna(0))
    col3.caption(
        "\n"
        "The student grade respect this decision:\n"
        "- U -> [0 - 5[; E -> [5 - 9[\n"
        "- D -> [9 - 12[; C -> [12 - 15[\n"
        "- B -> [15 - 18[; A -> [18 - 20[\n"
    )
    col4.bar_chart(student_grade.T)
    col4.caption('This bar chart shows the number of student in each grade for each sequence.')


def _render_distribution_tab(tab):
    """Fill the 'Distribution' tab: histogram, box plot and violin plot."""
    choices = evaluations + ['age']

    tab.subheader('evaluation and age distribution')
    tab.caption('Histogram')
    # The three select boxes share one label, so each needs its own key.
    var1 = tab.selectbox('Choose items', choices, key=6)
    fig1 = px.histogram(data_frame=data, x=var1, width=1200, height=500, opacity=0.75)
    tab.plotly_chart(fig1)

    col1, col2 = tab.columns(2)
    col1.caption('Boxplot')
    col2.caption('Violin')
    var2 = col1.selectbox('Choose items', choices, key=7)
    var3 = col2.selectbox('Choose items', choices, key=8)
    col1.plotly_chart(px.box(data_frame=data, y=var2, width=500, height=500))
    col2.plotly_chart(px.violin(data_frame=data, x=var3, width=500, height=500))


def _render_miscellaneous_tab(tab):
    """Fill the third tab: trimester scatter plots and the decision-tree section."""
    tab.subheader('Relation graph')
    # (x column, y column, chart title) for the three trimester pairings.
    pairings = (
        ("trimester 1", "trimester 2", "trimester 1 & 2"),
        ("trimester 2", "trimester 3", "trimester 3 & 2"),
        ("trimester 3", "trimester 1", "trimester 1 & 3"),
    )
    for column, (x_name, y_name, title) in zip(tab.empty().columns(3), pairings):
        column.plotly_chart(
            px.scatter(data_frame=df_trim, x=x_name, y=y_name,
                       width=350, height=350, title=title)
        )
    tab.caption(
        "\n"
        "The three charts shows the monotony of the relation function between the three trimesters.\n"
        "Each chart prove that the students for form 6e-5e francophone education systems at the Tebap college make considerably\n"
        "an effort to succeed an ICT's course.\n"
    )

    tab.subheader('Supervised segmentation with tree')
    tab.caption(
        "\n"
        "In this section, we are making a supervised segmentation to segment the population of student into subgroups\n"
        "that have different values for the target gender. To find a subgroups, we are using a tree structured model.\n"
        "We cannot make a classification here because our dataset is just a size equal to 46. Let's go! 💂\n"
    )
    hcol1, hcol2 = tab.empty().columns(2)
    hcol1.caption('**Feature importance table**👉')
    hcol1.dataframe(feature_importances)
    hcol1.caption(
        "\n"
        "The table shows the features that the tree structured model\n"
        "consider very importance for segmenting students\n"
        "population in to subgroups.\n"
    )
    hcol2.plotly_chart(bar_feature_importance)

    tab.subheader('Dot graph')
    tab.graphviz_chart(dot_graph)
    tab.caption(
        "\n"
        "Let's us interprete this graph.\n"
        "- node 0: the student that the mark in seq 6 <= 11.375 are female gender; the answer is False. In each node, we have\n"
        "a condition where the next node give an answers.\n"
        "Let's plot the line separating the region.\n"
    )

    bcol1, bcol2 = tab.columns(2)
    axis_options = ['seq 1', "seq 2", "seq 4", "seq 6"]
    thresholds = [1.25, 2.75, 3.125, 4.375, 5.0, 5.625, 11.375, 13.5]
    var1 = bcol1.selectbox("Choose x-axis (line[i])", axis_options)
    # Pass a real list (not a one-shot `reversed` iterator) as the options.
    var2 = bcol1.selectbox("Choose y-axis (line[i+1])", axis_options[::-1])
    line1 = bcol2.multiselect("Line separator for x-axis", thresholds)
    line2 = bcol2.multiselect("Line separator for y-axis", thresholds)
    tab.caption(
        "\n"
        "We have eight lines line 1 (node 0) to line 8 (node 12). We start to line 1 and line 2.\n"
    )

    gplot = scatterplot(female, male, line_separator=[line1, line2], x_axis=var1, y_axis=var2)
    tab.pyplot(gplot)
    # The script reruns on every widget interaction; close the figure once it
    # has been rendered so pyplot does not keep accumulating open figures.
    plt.close(gplot)


def visualization():
    """Render the whole dashboard page.

    Draws, top to bottom: the KPI row, the gender/form pie charts and three
    tabs (performance, distribution, miscellaneous).  All figures and tables
    come from the values prepared at module level; nothing is returned.
    """
    _render_kpis()
    _render_population_pies()

    tab1, tab2, tab3 = st.tabs([':orange[**Performance**]', ':orange[**Distribution**]', ':orange[**Miscelaneous**]'])
    _render_performance_tab(tab1)
    _render_distribution_tab(tab2)
    _render_miscellaneous_tab(tab3)