Heizsenberg's picture
init
4d8779f
import io
import random

import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
from datasets import load_dataset
from PIL import Image
# Markdown summary rendered at the bottom of the page.
_INSIGHT_MARKDOWN = """
## Insight
1. dataset contains around 16.011 in 10 classes
2. class distribution generally spread evenly with few exceptions on `tomato_tomato_mosaic_virus` has lowest samples and `Tomato_YellowLeaf_curl_virus` having the largest samples, showing complexity in detecting the diseases and easier detection of tomato mosaic virus
3. the dataset images is on size (256x256) which needs to be rescaled for lower GPU load
4. several samples is shown from 10 different classes, showing both healthy and disease afflicted leaves
"""

# Number of columns in the "one sample per class" image grid.
_SAMPLE_GRID_COLUMNS = 3


def _open_image(image_record):
    """Open a PIL image from an undecoded image record of the dataframe.

    Args:
        image_record: Mapping taken from the ``image`` column. It is expected
            to expose a ``path`` entry and may also expose a ``bytes`` entry.

    Returns:
        The opened ``PIL.Image.Image``; the caller is responsible for closing
        it (it can be used as a context manager).
    """
    # Prefer embedded bytes when present: the stored path is not guaranteed
    # to point at a file that exists on the machine running the app.
    raw_bytes = image_record.get("bytes")
    if raw_bytes:
        return Image.open(io.BytesIO(raw_bytes))
    return Image.open(image_record["path"])


def run():
    """Render the tomato leaf EDA page.

    Loads the ``train`` split of ``Heizsenberg/leaf-image-dataset``, then
    shows a random sample of rows, the dataset totals, the class distribution
    plot, the size/mode of one random image, one sample image per class and a
    closing list of insights.
    """
    st.title('Tomato Leaf Health Classification')
    st.subheader("this page contains the EDA about tomato leaf health classification")
    st.write("the EDA will explore and analyse classifier tomato leaf health")

    # Fetch the dataset and attach a human-readable label column.
    train_split = load_dataset("Heizsenberg/leaf-image-dataset")["train"]
    label_names = train_split.features["label"].names
    dataset_df = train_split.to_pandas()
    dataset_df["label_name"] = dataset_df["label"].map(dict(enumerate(label_names)))

    st.write("sample from the dataframe")
    # Cap the sample size so a frame with fewer than 15 rows does not raise.
    st.write(dataset_df.sample(min(15, len(dataset_df))))

    st.write("content of the dataframe")
    st.write("Total images:", len(dataset_df))
    st.write("Total classes:", dataset_df["label"].nunique())

    st.write("Tomato Leaf Training dataset class distribution")
    dist_fig, dist_ax = plt.subplots(figsize=(10, 5))
    sns.countplot(
        data=dataset_df,
        x="label_name",
        order=dataset_df["label_name"].value_counts().index,
        ax=dist_ax,
    )
    # Configure the axes object directly instead of relying on pyplot's
    # implicit "current figure" state.
    dist_ax.tick_params(axis="x", rotation=90)
    dist_ax.set_title("Class Distribution")
    st.pyplot(dist_fig)
    # Streamlit reruns this function on every interaction; close the figure
    # so matplotlib does not keep accumulating open figures.
    plt.close(dist_fig)

    st.write("sample image size and mode")
    with _open_image(random.choice(dataset_df["image"].values)) as sample_img:
        st.write("Image size:", sample_img.size)
        st.write("Image mode:", sample_img.mode)

    st.write("sample from each classes")
    samples = dataset_df.groupby("label_name").sample(1, random_state=42)
    # Size the grid from the number of classes (ceil division) so no class is
    # silently dropped; 10 classes still yields the 4x3, 12x12-inch layout.
    n_rows = max(1, -(-len(samples) // _SAMPLE_GRID_COLUMNS))
    grid_fig, grid_axes = plt.subplots(
        n_rows,
        _SAMPLE_GRID_COLUMNS,
        figsize=(12, 3 * n_rows),
        squeeze=False,
    )
    cells = grid_axes.flatten()
    for cell, (_, row) in zip(cells, samples.iterrows()):
        with _open_image(row["image"]) as leaf_img:
            cell.imshow(leaf_img)
        cell.set_title(row["label_name"])
        cell.axis("off")
    # Hide the leftover cells so they do not render as empty axes frames.
    for spare_cell in cells[len(samples):]:
        spare_cell.axis("off")
    grid_fig.tight_layout()

    # Show inside Streamlit
    st.pyplot(grid_fig)
    plt.close(grid_fig)

    st.write(_INSIGHT_MARKDOWN)
# Script entry point: render the page only when this file is executed
# directly, not when it is imported by another module.
if __name__ == "__main__":
    run()