massock-sipe / module /dataset.py
Massock's picture
Upload 20 files
b7c5cb8
import streamlit as st
import numpy as np
import pandas as pd
from module.load_data import data
from scipy import stats
def create_data(data: pd.DataFrame) -> pd.DataFrame:
    """Build the trimester table from the six sequence columns.

    Each trimester column is the mean of two consecutive ``seq`` columns,
    rounded to two decimals; the ``age`` column is carried over unchanged.

    Args:
        data: frame providing the columns ``seq 1`` .. ``seq 6`` and ``age``.

    Returns:
        A frame with the columns ``trimester 1``, ``trimester 2``,
        ``trimester 3`` and ``age``, indexed like *data*.
    """
    # Which pair of sequence columns feeds each trimester.
    sequence_pairs = {
        "trimester 1": ("seq 1", "seq 2"),
        "trimester 2": ("seq 3", "seq 4"),
        "trimester 3": ("seq 5", "seq 6"),
    }
    columns = {
        trimester: ((data[first] + data[second]) / 2).round(2)
        for trimester, (first, second) in sequence_pairs.items()
    }
    columns["age"] = data["age"]
    return pd.DataFrame(columns)
def subdata(data: pd.DataFrame, items: list) -> "pd.Series | None":
    """Return the ``average_marks`` of the rows matching every filter in *items*.

    Args:
        data: frame containing an ``average_marks`` column plus every column
            named in *items*.
        items: list of ``(column, value)`` pairs; a row is kept only when
            ``data[column] == value`` holds for all pairs. Any number of
            pairs is accepted.

    Returns:
        The ``average_marks`` Series of the matching rows, or ``None`` when
        *items* is empty (no group has been defined).
    """
    if not items:
        # Callers use None to detect "nothing selected yet".
        return None

    conditions = [data[column] == value for column, value in items]
    mask = conditions[0]
    for condition in conditions[1:]:
        mask = mask & condition
    return data[mask]['average_marks']
def hypothesis_testing(group1: pd.Series, group2: pd.Series, label1: list, label2: list) -> str:
    """Run Welch's two-sample t-test and return a Markdown report.

    Args:
        group1: numeric observations of the first group.
        group2: numeric observations of the second group.
        label1: description of the first group, echoed in the report.
        label2: description of the second group, echoed in the report.

    Returns:
        A Markdown string. When ``p-value < 0.05`` it states the alternative
        hypothesis and includes the 95% confidence interval of the difference
        in means; otherwise it states the null hypothesis.
    """
    mean1, std1, nobs1 = group1.mean(), group1.std(), group1.count()
    mean2, std2, nobs2 = group2.mean(), group2.std(), group2.count()
    res = stats.ttest_ind_from_stats(mean1=mean1, std1=std1, nobs1=nobs1,
                                     mean2=mean2, std2=std2, nobs2=nobs2,
                                     equal_var=False)
    tvalue = res.statistic
    pvalue = res.pvalue
    diff = mean1 - mean2

    # Variance of each sample mean.
    var_mean1 = std1**2 / nobs1
    var_mean2 = std2**2 / nobs2
    std_error = np.sqrt(var_mean1 + var_mean2)

    # The interval half-width uses the critical value of the t distribution
    # (Welch-Satterthwaite degrees of freedom), not the observed statistic:
    # diff +/- tvalue * std_error would always collapse to (0, 2 * diff).
    dof = (var_mean1 + var_mean2)**2 / (
        var_mean1**2 / (nobs1 - 1) + var_mean2**2 / (nobs2 - 1)
    )
    t_crit = stats.t.ppf(0.975, dof)
    low_bound = diff - t_crit * std_error
    upper_bound = diff + t_crit * std_error

    text1 = f"""
:green[**Welch two-sample t-test**]
**data**: group1={label1} and group2={label2}
t = {tvalue:.4f}, diff = {diff:.4f}, p-value = {pvalue:.4f}
**alternative hypothesis**: true difference in means is not equal to 0
**95% confidence interval**: ({low_bound:.4f}, {upper_bound:.4f})
**mean of group1**: {mean1:.3f}
**mean of group2**: {mean2:.3f}
"""
    text2 = f"""
:green[**Welch two-sample t-test**]
**data**: group1={label1} and group2={label2}
t = {tvalue:.8f}, diff = {diff:.8f}, p-value = {pvalue:.8f}
**null hypothesis**: true difference in means is equal to 0
**mean of group1**: {mean1:.3f}
**mean of group2**: {mean2:.3f}
"""
    return text1 if pvalue < 0.05 else text2
def _numeric_corr(frame: pd.DataFrame) -> pd.DataFrame:
    """Return the correlation matrix of *frame*'s numeric columns, rounded to 2 decimals."""
    # Text columns such as gender/form make DataFrame.corr() raise on
    # pandas >= 2.0, so they are dropped explicitly first.
    return frame.select_dtypes(include="number").corr().round(2)


def _select_group(caption: str, keys: tuple = (None, None, None)) -> tuple:
    """Render the gender/form/age select boxes for one group.

    Args:
        caption: caption shown above the three select boxes.
        keys: Streamlit widget keys for the gender, form and age boxes, in
            that order (``None`` lets Streamlit derive the key).

    Returns:
        ``(label, group)`` where *label* is the list of chosen
        ``(column, value)`` pairs and *group* is what ``subdata`` returns
        for that label.
    """
    placeholder_option = 'Choose an option'
    st.caption(caption)
    gender_col, form_col, age_col = st.columns(3)
    gender = gender_col.selectbox('Gender', [placeholder_option, 'F', 'M'], key=keys[0])
    form = form_col.selectbox('Form', [placeholder_option, '6e', '5e'], key=keys[1])
    age = age_col.selectbox('Age', [placeholder_option] + sorted(data.age.unique().tolist()), key=keys[2])

    label = [
        (column, choice)
        for column, choice in (('gender', gender), ('form', form), ('age', age))
        if choice != placeholder_option
    ]
    group = subdata(data, label)
    if isinstance(group, pd.Series):
        form_col.write(group)
    return label, group


def eda():
    """Render the exploratory-data-analysis page with Streamlit.

    Shows descriptive statistics and correlations for the sequence marks and
    the derived trimester marks (overall and split by gender), the variance
    inflation factors of the trimester table, and an interactive section in
    which two groups are selected and compared with a Welch t-test.
    """
    df = create_data(data)
    corr = df.corr()
    # The VIFs are the diagonal of the inverse correlation matrix.
    vif = np.linalg.inv(corr.to_numpy()).diagonal()
    vifs = pd.Series(np.round(vif, 2), index=df.columns.tolist(), name="VIF")
    female = data[data.gender == 'F']
    male = data[data.gender == 'M']
    df_female = create_data(female)
    df_male = create_data(male)

    st.subheader('Sequence')
    placeholder1 = st.empty()
    with placeholder1.container():
        col1, col2 = st.columns(2)
        col1.caption(':red[**Descriptive statistics**]')
        col1.dataframe(round(data.describe(), 2))
        col2.caption(':red[**Correlation**]')
        col2.dataframe(_numeric_corr(data))

    st.subheader("Trimester")
    st.caption(':red[**Descriptive statistics**]')
    st.dataframe(round(df.describe(), 2), use_container_width=True)
    placeholder2 = st.empty()
    with placeholder2.container():
        col1, col2 = st.columns(2)
        col1.caption(':red[**Correlation**]')
        col1.dataframe(round(corr, 2))
        col2.caption(':red[**Variance Inflation Factor**]')
        col2.dataframe(vifs)

    # NOTE(review): the expander label appears to start with a mis-decoded
    # emoji; it is left untouched here - confirm the intended character.
    with st.expander("πŸ‘ Read more"):
        st.markdown("""
> 1. :orange[Descriptive statistics] help to define mean, standard deviation, minimun, maximun, median, etc..
such that we can summarize the dataset and discover the patterns.
> 2. :orange[Correlation] help to find the tendance or colinearity between two or more attributes in the dataset.
> 3. :orange[VIF] is a mesure of colinearity among predictor variables within a multiple regression.
>> 1. If outcome is 1, it's okay.
>> 2. If it is between 1 and 5, it show low to average colinearity, and above 5 generally means highly redundant
and variable should be dropped.
""")

    st.subheader('Sequence by gender')
    placeholder3 = st.empty()
    placeholder4 = st.empty()
    with placeholder3.container():
        col1, col2 = st.columns(2)
        col1.caption(':orange[**Descriptive statistics: Female**]')
        col1.dataframe(round(female.describe(), 2))
        col2.caption(':orange[**Descriptive statistics: Male**]')
        col2.dataframe(round(male.describe(), 2))
    with placeholder4.container():
        col1, col2 = st.columns(2)
        col1.caption(':orange[**Correlation: Female**]')
        col1.dataframe(_numeric_corr(female))
        col2.caption(':orange[**Correlation: Male**]')
        col2.dataframe(_numeric_corr(male))

    st.subheader('Trimester by gender')
    placeholder5 = st.empty()
    placeholder6 = st.empty()
    with placeholder5.container():
        col1, col2 = st.columns(2)
        col1.caption(':orange[**Descriptive statistics: Female**]')
        col1.dataframe(round(df_female.describe(), 2))
        col2.caption(':orange[**Descriptive statistics: Male**]')
        col2.dataframe(round(df_male.describe(), 2))
    with placeholder6.container():
        col1, col2 = st.columns(2)
        col1.caption(':orange[**Correlation: Female**]')
        col1.dataframe(round(df_female.corr(), 2))
        col2.caption(':orange[**Correlation: Male**]')
        col2.dataframe(round(df_male.corr(), 2))

    st.subheader('Assumption')
    # "\\mu" keeps the rendered text identical while avoiding the invalid
    # "\m" escape sequence in a non-raw string.
    st.markdown("""
In this section, you can make your assumption to know which group is best than other group in this ICT course.
For example: According to table trimester evaluation by gender, we have two groups (female students and male students).
By the observation,
> **Is it true that female students are best than male students in the ICT course for Tebap college?**
This question we allow us to compare the general annual class average (mean) for one group to other group. That's lead to
compute the difference between the two means. We have two hypothesis:
1. **Null hypothesis:** :blue[ true difference in means is equal to 0].
2. **Alternative hypothesis:** :blue[ true difference in means is not equal to 0]
**NB**: difference = $\\mu$(group1) - $\\mu$(group2)
We choose the right hypothesis as follows:
- **if p-value < 5% then we reject null hypothesis**
- **if p-value > 5% then we accept null hypothesis**
""")

    placeholder7 = st.empty()
    with placeholder7.container():
        col1, col2 = st.columns(2)
        with col1:
            label1, group1 = _select_group(':red[Create group A]')
        with col2:
            # Explicit keys keep these widgets distinct from group A's,
            # which share the same labels and options.
            label2, group2 = _select_group(':red[Create group B]', keys=(1, 2, 3))

    st.caption('Choose only one or two item(s) per group. For example group A = (F, 5e) or group B = (M)')
    if label1 and sorted(label1) == sorted(label2):
        st.info(f'You cannot create two different groups with same items: label1 = {label1} & label2 = {label2}.')
    if st.button('t-test'):
        if group1 is None or group2 is None:
            # subdata() returns None when nothing was selected for a group;
            # running the test then would fail with an AttributeError.
            st.warning('Select at least one item for each group before running the t-test.')
        else:
            res = hypothesis_testing(group1, group2, label1, label2)
            st.write(res)