# SpiralSense / eda.py
# cycool29's picture
# Update
# c45c4e1
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
# Render all figure text in Times New Roman.
rcParams["font.family"] = "Times New Roman"

# Define the directory where your dataset is located
dataset_directory = "data/train/combined/Task 1/"

# Every figure produced by this script is saved under docs/eda; create the
# folder up front so savefig() does not fail on a fresh checkout.
os.makedirs("docs/eda", exist_ok=True)

# Class labels are the sub-directories of the dataset directory. Plain files
# such as ".DS_Store" are not classes and are filtered out here, so they never
# get an (empty) entry in sampled_images. Sorting makes the class order
# reproducible, because os.listdir() returns entries in arbitrary order.
class_labels = sorted(
    entry
    for entry in os.listdir(dataset_directory)
    if os.path.isdir(os.path.join(dataset_directory, entry))
)

# Per-class sample counts and the matching labels (parallel lists)
num_samples_per_class = []
class_labels_processed = []

# One {"Height": ..., "Width": ...} record per image; the DataFrame is built
# once after the scan instead of being re-created for every appended row.
dimension_rows = []

# Up to five preview images per class
sampled_images = {label: [] for label in class_labels}

for label in class_labels:
    class_directory = os.path.join(dataset_directory, label)

    # List the directory once; ignore hidden files and anything that is not a
    # regular file so they are neither counted nor passed to imread().
    image_files = sorted(
        name
        for name in os.listdir(class_directory)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(class_directory, name))
    )
    num_samples_per_class.append(len(image_files))
    class_labels_processed.append(label)

    for image_file in image_files:
        image = plt.imread(os.path.join(class_directory, image_file))
        # Only the first two axes are needed; slicing also accepts 2-D
        # (grayscale) arrays, which have no channel axis to unpack.
        height, width = image.shape[:2]
        dimension_rows.append({"Height": height, "Width": width})

        # Keep the first five images of the class (in sorted file-name order)
        # for the preview figures. NOTE: this is not a random draw.
        if len(sampled_images[label]) < 5:
            sampled_images[label].append(image)

# DataFrame of image dimensions, one row per image
image_dimensions_df = pd.DataFrame(dimension_rows, columns=["Height", "Width"])
# Tabulate the per-class counts: one row per class label.
class_count_columns = {
    "Class Label": class_labels_processed,
    "Number of Samples": num_samples_per_class,
}
eda_data = pd.DataFrame(class_count_columns)

# Bar chart of the number of samples in each class.
bar_margins = dict(
    top=0.88, bottom=0.21, left=0.125, right=0.9, hspace=0.2, wspace=0.2
)
plt.figure(figsize=(10, 6))
sns.barplot(data=eda_data, x="Class Label", y="Number of Samples")
plt.xlabel("Class Label")
plt.ylabel("Number of Samples")
plt.xticks(rotation=45)  # tilt the class names on the x axis
plt.title("Number of Samples per Class")
# Extra bottom margin leaves room for the rotated tick labels.
plt.subplots_adjust(**bar_margins)
plt.savefig("docs/eda/Number of Samples per Class.png")
plt.show()
# Scatter plot of image sizes: one semi-transparent point per image, with the
# width on the x axis and the height on the y axis.
scatter_widths = image_dimensions_df["Width"]
scatter_heights = image_dimensions_df["Height"]

plt.figure(figsize=(10, 6))
plt.scatter(x=scatter_widths, y=scatter_heights, alpha=0.5)
plt.xlabel("Width (Pixels)")
plt.ylabel("Height (Pixels)")
plt.title("Distribution of Sample Sizes (Image Dimensions)")
plt.savefig("docs/eda/Distribution of Sample Sizes (Image Dimensions).png")
plt.show()
# Plot the stored preview images of each class, one figure per class.
# sampled_images maps a label to at most five images, so a 1x5 grid always
# has room for them.
for label, images in sampled_images.items():
    # Entries without any image (e.g. a ".DS_Store" key) would only produce
    # and save a blank figure, so skip them.
    if not images:
        continue

    preview_figure = plt.figure(figsize=(15, 5))
    plt.suptitle(f"Random Sample of Images from Class: {label}")
    for i, image in enumerate(images, start=1):
        plt.subplot(1, 5, i)
        plt.imshow(image)
        plt.axis("off")
        plt.title(f"Sample {i}")
    plt.savefig(f"docs/eda/Random Sample of Images from Class {label}.png")
    plt.show()
    # Release the figure so open figures do not accumulate across classes
    # when show() returns without destroying it.
    plt.close(preview_figure)
# Calculate and plot the correlation matrix for image dimensions.
# The columns are cast to float first: if the frame ended up with object
# dtype, some pandas versions treat the columns as non-numeric and leave them
# out of corr(), which would give an empty matrix.
correlation_matrix = image_dimensions_df.astype(float).corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Matrix of Image Dimensions")
plt.savefig("docs/eda/Correlation Matrix of Image Dimensions.png")
plt.show()
# Plot the distribution of image widths, then of image heights.
# Both figures are identical apart from the column they read, so they are
# drawn by one loop; titles, axis labels and output file names come out
# exactly as "Distribution of Image Widths" / "Distribution of Image Heights".
for dimension in ("Width", "Height"):
    plt.figure(figsize=(10, 6))
    # 20-bin histogram with a kernel density estimate overlaid.
    sns.histplot(image_dimensions_df[dimension], bins=20, kde=True)
    plt.title(f"Distribution of Image {dimension}s")
    plt.xlabel(f"{dimension} (Pixels)")
    plt.ylabel("Frequency")
    plt.savefig(f"docs/eda/Distribution of Image {dimension}s.png")
    plt.show()