|
import os |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from matplotlib import rcParams |
|
|
|
# --- Global configuration and accumulators for the EDA script ---------------

# Render the text of every figure in Times New Roman.
rcParams["font.family"] = "Times New Roman"

# Root folder that holds one sub-directory per class.
dataset_directory = "data/train/combined/Task 1/"

# All figures in this script are saved under docs/eda/. savefig does not
# create missing directories, so make sure the folder exists up front.
os.makedirs("docs/eda", exist_ok=True)

# Only sub-directories are class labels. Filtering here keeps stray entries
# such as a macOS ".DS_Store" file out of `sampled_images`, where they would
# otherwise end up with an empty sample figure of their own. Sorting makes
# the otherwise arbitrary os.listdir order reproducible between runs.
class_labels = sorted(
    entry
    for entry in os.listdir(dataset_directory)
    if os.path.isdir(os.path.join(dataset_directory, entry))
)

# Parallel lists filled during the scan: sample count and name of each class.
num_samples_per_class = []
class_labels_processed = []

# One row per image with its pixel height and width.
image_dimensions_df = pd.DataFrame(columns=["Height", "Width"])

# Example images per class (the scan keeps at most five), keyed by label.
sampled_images = {label: [] for label in class_labels}
|
|
|
|
|
# Walk every class directory once, recording the per-class sample count, the
# dimensions of each image, and the first five images of the class as
# examples for the sample figures.
dimension_rows = []
for label in class_labels:
    class_directory = os.path.join(dataset_directory, label)
    # Skip macOS metadata and any stray non-directory entry: only real class
    # folders can be listed.
    if label == ".DS_Store" or not os.path.isdir(class_directory):
        continue

    # List the directory a single time. Hidden files (e.g. a nested
    # ".DS_Store") are not samples and cannot be decoded as images.
    image_files = [
        name for name in os.listdir(class_directory) if not name.startswith(".")
    ]
    num_samples_per_class.append(len(image_files))
    class_labels_processed.append(label)

    for image_file in image_files:
        image = plt.imread(os.path.join(class_directory, image_file))
        # imread returns (H, W) for grayscale and (H, W, C) for colour
        # images, so read the first two axes instead of unpacking three.
        height, width = image.shape[:2]
        dimension_rows.append({"Height": height, "Width": width})

        # Keep only the first five images of each class as examples.
        if len(sampled_images[label]) < 5:
            sampled_images[label].append(image)

# Build the frame in one step. The frame is still empty at this point, and
# growing it row by row through the private DataFrame._append re-copies the
# whole frame for every image.
image_dimensions_df = pd.DataFrame(dimension_rows, columns=["Height", "Width"])
|
|
|
|
|
# Tabulate the per-class sample counts and draw them as a bar chart.
samples_by_class = {
    "Class Label": class_labels_processed,
    "Number of Samples": num_samples_per_class,
}
eda_data = pd.DataFrame(samples_by_class)

bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=eda_data, x="Class Label", y="Number of Samples", ax=bar_ax)
bar_ax.set_title("Number of Samples per Class")
# Tilt the class names so long labels do not overlap.
plt.setp(bar_ax.get_xticklabels(), rotation=45)
bar_ax.set_xlabel("Class Label")
bar_ax.set_ylabel("Number of Samples")
# Extra bottom margin leaves room for the rotated tick labels.
bar_fig.subplots_adjust(
    left=0.125, right=0.9, bottom=0.21, top=0.88, wspace=0.2, hspace=0.2
)
bar_fig.savefig("docs/eda/Number of Samples per Class.png")
plt.show()
|
|
|
|
|
# Scatter of image width against height; alpha makes overlapping sizes visible.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
image_widths = image_dimensions_df["Width"]
image_heights = image_dimensions_df["Height"]
scatter_ax.scatter(image_widths, image_heights, alpha=0.5)
scatter_ax.set_title("Distribution of Sample Sizes (Image Dimensions)")
scatter_ax.set_xlabel("Width (Pixels)")
scatter_ax.set_ylabel("Height (Pixels)")
scatter_fig.savefig("docs/eda/Distribution of Sample Sizes (Image Dimensions).png")
plt.show()
|
|
|
|
|
# One figure per class, showing the example images stored in `sampled_images`
# side by side. NOTE: despite the "Random Sample" wording in the title, the
# examples are simply the first images that were read for each class.
for label, images in sampled_images.items():
    # A label without any collected image (e.g. the skipped ".DS_Store"
    # entry) would only yield an empty figure and a blank PNG.
    if not images:
        continue

    sample_fig = plt.figure(figsize=(15, 5))
    sample_fig.suptitle(f"Random Sample of Images from Class: {label}")
    for i, image in enumerate(images, start=1):
        # A single row of five slots; at most five examples exist per class.
        sample_ax = sample_fig.add_subplot(1, 5, i)
        sample_ax.imshow(image)
        sample_ax.axis("off")
        sample_ax.set_title(f"Sample {i}")
    sample_fig.savefig(f"docs/eda/Random Sample of Images from Class {label}.png")
    plt.show()
    # Release the figure so many classes do not accumulate open figures.
    plt.close(sample_fig)
|
|
|
|
|
# Pairwise correlation between image height and width, shown as an annotated
# heatmap.
correlation_matrix = image_dimensions_df.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
heatmap_options = {"annot": True, "cmap": "coolwarm", "linewidths": 0.5}
sns.heatmap(correlation_matrix, ax=heatmap_ax, **heatmap_options)
heatmap_ax.set_title("Correlation Matrix of Image Dimensions")
heatmap_fig.savefig("docs/eda/Correlation Matrix of Image Dimensions.png")
plt.show()
|
|
|
|
|
# Histogram (20 bins) of image widths with a KDE curve overlaid.
width_fig, width_ax = plt.subplots(figsize=(10, 6))
width_values = image_dimensions_df["Width"]
sns.histplot(width_values, bins=20, kde=True, ax=width_ax)
width_ax.set_title("Distribution of Image Widths")
width_ax.set_xlabel("Width (Pixels)")
width_ax.set_ylabel("Frequency")
width_fig.savefig("docs/eda/Distribution of Image Widths.png")
plt.show()
|
|
|
|
|
# Histogram (20 bins) of image heights with a KDE curve overlaid.
height_fig, height_ax = plt.subplots(figsize=(10, 6))
height_values = image_dimensions_df["Height"]
sns.histplot(height_values, bins=20, kde=True, ax=height_ax)
height_ax.set_title("Distribution of Image Heights")
height_ax.set_xlabel("Height (Pixels)")
height_ax.set_ylabel("Frequency")
height_fig.savefig("docs/eda/Distribution of Image Heights.png")
plt.show()
|
|