Spaces:

Massock
/

massock-sipe

Sleeping

App Files Files Community

massock-sipe / module /data_viz.py

Massock

Upload 20 files

b7c5cb8 over 1 year ago

raw

history blame contribute delete

9.83 kB

	import streamlit as st
	import plotly.express as px
	import pandas as pd
	from module.load_data import data
	from module.dataset import create_data
	import plotly.express as px
	from sklearn.tree import DecisionTreeClassifier, export_graphviz
	import matplotlib.pyplot as plt

	#plt.style.use('seaborn-dark-palette')

	def grades(mark):
	if mark >= 0 and mark < 5:
	return 'U'
	elif mark >= 5 and mark < 9:
	return 'E'
	elif mark >= 9 and mark < 12:
	return 'D'
	elif mark >= 12 and mark < 15:
	return 'C'
	elif mark >= 15 and mark < 18:
	return 'B'
	else:
	return 'A'

	def segmentation(data:pd.DataFrame):
	df = data.copy()
	factor = pd.factorize(df.form)
	df['form'] = factor[0]; definition=factor[1]

	cols = [f'seq {i+1}' for i in range(6)] + ['age','form']
	tree = DecisionTreeClassifier(random_state=31012020)
	tree.fit(df[cols], df['gender'])

	export_graphviz(tree, out_file="images/tree.dot", class_names=["Female", "Male"],
	feature_names=cols, impurity=False, filled=True,rounded=True)
	result = pd.DataFrame({"Feature":cols, "Gain":tree.feature_importances_})
	return result

	def scatterplot(female=None, male=None, line_separator=None, x_axis=None, y_axis=None):
	fig = plt.figure(figsize=(5,5))
	plt.scatter(female[x_axis], female[y_axis], marker='v', color='red', label="Girl", lw=1.2)
	plt.scatter(male[x_axis], male[y_axis], marker='o', color='blue', label="Boy",lw=1.2)
	plt.xlabel(x_axis); plt.ylabel(y_axis)
	plt.title("Decision line", fontweight="bold")
	for u in line_separator[0]:
	plt.vlines(line_separator[0], 0, female[y_axis].max(), color="black",
	linestyles="-.", lw=1.75)

	for v in line_separator[1] :
	plt.hlines(line_separator[1], -1, female[y_axis].max(), color="black",
	linestyles="dashed", lw=1.75)

	plt.legend(loc="best", frameon=True, fancybox=True, shadow=True, title="Gender")
	return fig


	df_trim = create_data(data)
	total_students = len(data)
	average_age = data.age.mean()
	gca1 = df_trim["trimester 1"].mean()
	gca2 = df_trim["trimester 2"].mean()
	gca3 = df_trim["trimester 3"].mean()
	annual_gca = (gca1+gca2+gca3)/3

	evaluations = ['seq 1', 'seq 2','seq 3','seq 4','seq 5','seq 6']

	sgca = pd.DataFrame(columns=['Male','Female'], index=evaluations)

	sgca['Male'] = data[data.gender == 'M'][evaluations].mean()
	sgca['Female'] = data[data.gender == 'F'][evaluations].mean()

	form_counts = data.form.value_counts()
	gender_counts = data.gender.value_counts()

	progression = sgca.diff()

	pass_or_fail =pd.DataFrame({seq:data[seq].apply(lambda x: 'Passed' if x >= 10.0 else 'Failed').value_counts().to_dict() for seq in evaluations})

	student_grade = pd.DataFrame({seq:data[seq].apply(grades).value_counts().to_dict() for seq in evaluations})

	##print(student_grade)

	bar_polar = px.bar_polar(data_frame=sgca, r='Male',
	theta=sgca.index, color='Female',
	title='General Class Average (GCA) for each evaluation',
	barmode="overlay",width=1200, height=500)
	pie_form = px.pie(data_frame=form_counts, names=form_counts.index, values=form_counts,
	title="Form", hole=0.25, width=500, height=500)
	pie_gender = px.pie(data_frame=gender_counts, names=gender_counts.index, values=gender_counts,
	title="Gender", hole=0.25, width=500, height=500)


	feature_importances = segmentation(data)

	bar_feature_importance = px.bar(data_frame=feature_importances, x="Feature", y="Gain",
	title="Feature importances bar", width=500)

	with open("images/tree.dot") as f:
	dot_graph = f.read()

	female = data[data.gender == "F"]
	male = data[data.gender == "M"]


	def visualization():
	placeholder1 = st.empty()

	with placeholder1.container():
	kp1, kp2, kp3, kp4, kp5, kp6 = st.columns(6)

	kp1.metric(
	label= ":red[👬 Total students]",
	value = total_students,
	delta = ""
	)

	kp2.metric(
	label= ":red[⏳ Average age]",
	value = f"{average_age:.2f}" ,
	delta = ""
	)

	kp3.metric(
	label=":red[✍ GCA-trimester 1]",
	value=f"{gca1:.2f}/20",
	delta = 0.0
	)
	kp4.metric(
	label=":red[✍ GCA-trimester 2]",
	value = f"{gca2:.2f}/20",
	delta = round(gca2-gca1, 3)
	)
	kp5.metric(
	label =":red[✍ GCA-trimester 3]",
	value = f"{gca3:.2f}/20",
	delta = round(gca3-gca2, 3)
	)
	kp6.metric(
	label =":red[🎓 Annual-GCA]",
	value = f"{annual_gca:.2f}/20",
	delta = ""
	)


	placeholder2 = st.empty()
	with placeholder2.container():
	col1, col2 = st.columns(2)
	col1.plotly_chart(pie_gender)
	col1.caption('We represent the gender of students in form 6e-5e (francophone section one of the cameroonian education system).')
	col2.plotly_chart(pie_form)
	col2.caption("""We represent the number of students in each form 6e and 5e.
	NB: Cameroon have two sub-education systems one is a francophone and a second is an anglosaxone.
	""")

	tab1, tab2, tab3 = st.tabs([':orange[Performance]', ':orange[Distribution]', ':orange[Miscelaneous]'])

	with tab1:
	tab1.plotly_chart(bar_polar)
	tab1.caption("""We represent the general class average of students in each sequence. Colors bar shows the marks
	for female gender and bar polar also shows the marks for male gender.""")
	tab1.subheader('Progression')
	tab1.line_chart(progression)
	tab1.caption("""This chart shows a progression of students during the six evaluations.
	We just make a difference between the previous sequence and the next sequence.
	The x-axis represent a sequence and the y-axis represent the growth of students.""")

	tab1.subheader('Passed or Failed')
	col1, col2 = tab1.columns(2)
	col1.caption(':red[💃 passed or failed table 👉]')
	col1.dataframe(pass_or_fail)
	col1.caption("""
	- In left: We got this table to compute the number of students that the mark is >10 (passed) or <10 (failed) for
	each sequence.
	- In right: We plot this table. The chart explains how a student make some effort to succeed a ICT course.
	- In general, we can appreciate the effort of the students in the form 6e and 5e for each evaluation.
	""")
	col2.bar_chart(pass_or_fail.T)

	tab1.subheader("Student grades")
	col3, col4 = tab1.columns(2)
	col3.caption(':red[💃 grades table 👉]')
	col3.dataframe(student_grade.fillna(0))
	col3.caption("""
	The student grade respect this decision:
	- U -> [0 - 5[; E -> [5 - 9[
	- D -> [9 - 12[; C -> [12 - 15[
	- B -> [15 - 18[; A -> [18 - 20[
	""")
	col4.bar_chart(student_grade.T)
	col4.caption('This bar chart shows the number of student in each grade for each sequence.')

	with tab2:
	tab2.subheader('evaluation and age distribution')
	tab2.caption('Histogram')
	var1 = tab2.selectbox('Choose items', evaluations+['age'], key=6)
	fig1 = px.histogram(data_frame=data, x=var1, width=1200, height=500, opacity=0.75)
	tab2.plotly_chart(fig1)
	col1, col2 = tab2.columns(2)
	col1.caption('Boxplot'); col2.caption('Violin')
	var2 = col1.selectbox('Choose items', evaluations+['age'], key=7)
	var3 = col2.selectbox('Choose items', evaluations+['age'], key=8)
	col1.plotly_chart(px.box(data_frame=data, y=var2, width=500, height=500))
	col2.plotly_chart(px.violin(data_frame=data, x=var3, width=500, height=500))


	with tab3:
	st.subheader('Relation graph')
	container = tab3.empty()
	vcol1, vcol2, vcol3 = container.columns(3)

	trim1 = px.scatter(data_frame=df_trim, x="trimester 1", y="trimester 2", width=350,
	height=350,title="trimester 1 & 2")
	vcol1.plotly_chart(trim1)

	trim2 = px.scatter(data_frame=df_trim, x="trimester 2", y="trimester 3", width=350, height=350,title="trimester 3 & 2")
	vcol2.plotly_chart(trim2)

	trim3 = px.scatter(data_frame=df_trim, x="trimester 3", y="trimester 1", width=350, height=350,title="trimester 1 & 3")
	vcol3.plotly_chart(trim3)

	tab3.caption("""
	The three charts shows the monotony of the relation function between the three trimesters.
	Each chart prove that the students for form 6e-5e francophone education systems at the Tebap college make considerably
	an effort to succeed an ICT's course.
	""")
	tab3.subheader('Supervised segmentation with tree')
	tab3.caption("""
	In this section, we are making a supervised segmentation to segment the population of student into subgroups
	that have different values for the target gender. To find a subgroups, we are using a tree structured model.
	We cannot make a classification here because our dataset is just a size equal to 46. Let's go! 💂
	""")
	block = tab3.empty()
	hcol1, hcol2 = block.columns(2)
	hcol1.caption('Feature importance table👉')
	hcol1.dataframe(feature_importances)
	hcol1.caption("""
	The table shows the features that the tree structured model
	consider very importance for segmenting students
	population in to subgroups.
	""")

	hcol2.plotly_chart(bar_feature_importance)
	tab3.subheader('Dot graph')
	tab3.graphviz_chart(dot_graph)
	tab3.caption("""
	Let's us interprete this graph.

	- node 0: the student that the mark in seq 6 <= 11.375 are female gender; the answer is False. In each node, we have
	a condition where the next node give an answers.

	Let's plot the line separating the region.
	""")

	bcol1, bcol2 = tab3.columns(2)


	var1 = bcol1.selectbox("Choose x-axis (line[i])", ['seq 1', "seq 2", "seq 4", "seq 6"])
	var2 = bcol1.selectbox("Choose y-axis (line[i+1])", reversed(['seq 1', "seq 2", "seq 4", "seq 6"]))

	line1 = bcol2.multiselect("Line separator for x-axis", [1.25, 2.75, 3.125, 4.375,5.0, 5.625, 11.375,13.5])
	line2 = bcol2.multiselect("Line separator for y-axis", [1.25, 2.75, 3.125, 4.375,5.0, 5.625, 11.375,13.5])
	tab3.caption("""
	We have eight lines line 1 (node 0) to line 8 (node 12). We start to line 1 and line 2.
	""")
	gplot = scatterplot(female, male, line_separator=[line1, line2], x_axis=var1, y_axis=var2)
	tab3.pyplot(gplot)