Spaces:

Heizsenberg
/

leaf_classification

Sleeping

App Files Files Community

leaf_classification / src /eda.py

Heizsenberg

init

4d8779f 27 days ago

raw

history blame contribute delete

2.82 kB

	import streamlit as st
	import seaborn as sns
	import matplotlib.pyplot as plt
	from PIL import Image
	from datasets import load_dataset
	import random

	def _open_image(record):
	    """Return a fully loaded PIL image for one cell of the ``image`` column.

	    Args:
	        record: Mapping taken from the dataframe's ``image`` column. The
	            caller's original code read only ``record['path']``; this helper
	            additionally prefers ``record['bytes']`` when that key holds data.
	            NOTE(review): assumes the cell is a dict shaped like
	            ``{'bytes': ..., 'path': ...}`` -- confirm against the dataset.

	    Returns:
	        An in-memory copy of the image, usable after the source is closed.
	    """
	    import io  # local import keeps the file's top-level import block untouched

	    raw = record.get("bytes")
	    source = io.BytesIO(raw) if raw else record["path"]
	    # The context manager closes the underlying file; copy() loads the pixel
	    # data first so the returned image does not depend on that file handle.
	    with Image.open(source) as handle:
	        return handle.copy()


	def run() -> None:
	    """Render the exploratory-data-analysis page for the tomato leaf dataset.

	    The page shows, in order: a title and intro text, a random sample of
	    dataframe rows, image/class counts, a class-distribution bar chart, the
	    size and mode of one random image, one sample image per class, and a
	    written summary of insights.
	    """
	    st.title('Tomato Leaf Health Classification')
	    st.subheader("this page contains the EDA about tomato leaf health classification")

	    st.write("the EDA will explore and analyse classifier tomato leaf health")

	    # Fetch the dataset and attach a human-readable class name to every row.
	    dataset_dict = load_dataset("Heizsenberg/leaf-image-dataset")
	    label_names = dataset_dict["train"].features["label"].names

	    dataset_df = dataset_dict['train'].to_pandas()
	    dataset_df["label_name"] = dataset_df["label"].map(dict(enumerate(label_names)))

	    st.write("sample from the dataframe")
	    # Cap the sample size so a dataframe with fewer than 15 rows does not raise.
	    st.write(dataset_df.sample(min(15, len(dataset_df))))

	    st.write("content of the dataframe")
	    st.write("Total images:", len(dataset_df))
	    st.write("Total classes:", dataset_df["label"].nunique())

	    st.write("Tomato Leaf Training dataset class distribution")
	    dist_fig, dist_ax = plt.subplots(figsize=(10, 5))
	    class_order = dataset_df["label_name"].value_counts().index
	    sns.countplot(data=dataset_df, x="label_name", order=class_order, ax=dist_ax)
	    # Style through the Axes object rather than pyplot's "current figure"
	    # global state, so the calls always target this chart.
	    dist_ax.tick_params(axis="x", labelrotation=90)
	    dist_ax.set_title("Class Distribution")

	    st.pyplot(dist_fig)
	    # pyplot keeps every figure it creates alive until it is closed; close it
	    # explicitly so repeated runs of this page do not accumulate figures.
	    plt.close(dist_fig)

	    st.write("sample image size and mode")
	    sample_record = random.choice(dataset_df["image"].values)
	    sample_img = _open_image(sample_record)

	    st.write("Image size:", sample_img.size)
	    st.write("Image mode:", sample_img.mode)

	    st.write("sample from each classes")
	    # One image per class; the fixed seed keeps the chosen images stable.
	    samples = dataset_df.groupby("label_name").sample(1, random_state=42)

	    # Size the grid from the number of classes (3 columns). For 10 classes
	    # this yields the same 4x3 layout and 12x12 figure as before, but no
	    # class is silently dropped if the dataset has more than 12.
	    n_cols = 3
	    n_rows = max(1, (len(samples) + n_cols - 1) // n_cols)
	    fig_samp, ax_samp = plt.subplots(
	        n_rows, n_cols, figsize=(12, 3 * n_rows), squeeze=False
	    )

	    # Hide the frame of every cell up front so cells without an image stay blank.
	    for cell in ax_samp.flat:
	        cell.axis("off")

	    for cell, (_, row) in zip(ax_samp.flat, samples.iterrows()):
	        cell.imshow(_open_image(row["image"]))
	        cell.set_title(row["label_name"])

	    fig_samp.tight_layout()

	    # Show inside Streamlit, then release the figure.
	    st.pyplot(fig_samp)
	    plt.close(fig_samp)

	    st.write("""
	    ## Insight

	    1. dataset contains around 16.011 in 10 classes
	    2. class distribution generally spread evenly with few exceptions on `tomato_tomato_mosaic_virus` has lowest samples and `Tomato_YellowLeaf_curl_virus` having the largest samples, showing complexity in detecting the diseases and easier detection of tomato mosaic virus
	    3. the dataset images is on size (256x256) which needs to be rescaled for lower GPU load
	    4. several samples is shown from 10 different classes, showing both healthy and disease afflicted leaves
	    """)


	# Render the page when this module is executed directly as a script.
	if __name__ == "__main__":
	    run()