Spaces:
Runtime error
Runtime error
File size: 4,053 Bytes
92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 c45c4e1 92bf372 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
# Use Times New Roman for every text element of the figures created below.
rcParams.update({"font.family": "Times New Roman"})
# ---------------------------------------------------------------------------
# Dataset scan
#
# Walks the dataset directory (one subdirectory per class) and produces the
# module-level values consumed by the plotting code further down:
#   class_labels            - names of the class subdirectories
#   class_labels_processed  - labels that were actually scanned (same order)
#   num_samples_per_class   - number of image files found for each label
#   image_dimensions_df     - one row per image with "Height" and "Width"
#   sampled_images          - label -> up to a few decoded example images
# ---------------------------------------------------------------------------

# Directory whose immediate subdirectories are the classes.
dataset_directory = "data/train/combined/Task 1/"

# Finder metadata files that must never be treated as a class or as an image.
ignored_entries = {".DS_Store"}

# How many decoded example images are kept in memory per class.
max_samples_per_class = 5

# os.listdir returns entries in arbitrary order, so sort them to keep the plots
# reproducible. Only real directories are kept: a stray file at this level
# would otherwise make the per-class os.listdir call below fail.
class_labels = sorted(
    entry
    for entry in os.listdir(dataset_directory)
    if entry not in ignored_entries
    and os.path.isdir(os.path.join(dataset_directory, entry))
)

num_samples_per_class = []
class_labels_processed = []
# (height, width) tuples; turned into a DataFrame once, after the scan, instead
# of growing a DataFrame row by row (quadratic, and relied on a private method).
dimension_records = []
sampled_images = {label: [] for label in class_labels}

for label in class_labels:
    class_directory = os.path.join(dataset_directory, label)
    image_files = sorted(
        name for name in os.listdir(class_directory) if name not in ignored_entries
    )
    num_samples_per_class.append(len(image_files))
    class_labels_processed.append(label)

    for image_file in image_files:
        image = plt.imread(os.path.join(class_directory, image_file))
        # Grayscale images decode to a 2-D array and colour images to a 3-D
        # one; the first two axes are height and width in both cases.
        height, width = image.shape[:2]
        dimension_records.append((height, width))

        # Keep the first few images of each class as preview examples. This is
        # a deterministic "first N" selection, not a random draw.
        if len(sampled_images[label]) < max_samples_per_class:
            sampled_images[label].append(image)

# Passing the column names explicitly keeps both columns present even when no
# image was found.
image_dimensions_df = pd.DataFrame(dimension_records, columns=["Height", "Width"])
# Tabulate the per-class counts so seaborn can address them by column name.
eda_data = pd.DataFrame(
    {
        "Class Label": class_labels_processed,
        "Number of Samples": num_samples_per_class,
    }
)

# Bar chart with one bar per class; bar height is that class's sample count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=eda_data, x="Class Label", y="Number of Samples", ax=ax)
ax.set_title("Number of Samples per Class")
# Tilt the class names so that long labels do not run into each other.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_xlabel("Class Label")
ax.set_ylabel("Number of Samples")
# The enlarged bottom margin leaves room for the rotated tick labels.
fig.subplots_adjust(
    left=0.125, right=0.9, bottom=0.21, top=0.88, wspace=0.2, hspace=0.2
)
fig.savefig("docs/eda/Number of Samples per Class.png")
plt.show()
# Scatter every image as a (width, height) point; the partial transparency
# makes heavily repeated sizes show up as darker spots.
fig, ax = plt.subplots(figsize=(10, 6))
widths = image_dimensions_df["Width"]
heights = image_dimensions_df["Height"]
ax.scatter(widths, heights, alpha=0.5)
ax.set_title("Distribution of Sample Sizes (Image Dimensions)")
ax.set_xlabel("Width (Pixels)")
ax.set_ylabel("Height (Pixels)")
fig.savefig("docs/eda/Distribution of Sample Sizes (Image Dimensions).png")
plt.show()
# Draw one figure per class showing the example images kept in sampled_images,
# laid out side by side in a single row, and save it under docs/eda/.
for label, images in sampled_images.items():
    # sampled_images has an entry for every directory listing entry, so some
    # lists can be empty (a non-class entry such as .DS_Store, or a class with
    # no images). Skip those instead of saving a blank figure.
    if not images:
        continue

    # Five columns as before; widen the grid only if more images were supplied,
    # so add_subplot never receives an index beyond the grid.
    n_columns = max(5, len(images))

    fig = plt.figure(figsize=(15, 5))
    fig.suptitle(f"Random Sample of Images from Class: {label}")
    for position, image in enumerate(images, start=1):
        ax = fig.add_subplot(1, n_columns, position)
        # cmap only affects 2-D (grayscale) arrays; imshow ignores it for
        # RGB(A) data, so colour images render exactly as before.
        ax.imshow(image, cmap="gray")
        ax.axis("off")
        ax.set_title(f"Sample {position}")
    fig.savefig(f"docs/eda/Random Sample of Images from Class {label}.png")
    plt.show()
    # Release the figure: on non-interactive backends show() does not close
    # it, so one figure per class would otherwise pile up in memory.
    plt.close(fig)
# Pairwise correlation between the "Height" and "Width" columns, rendered as an
# annotated heatmap.
correlation_matrix = image_dimensions_df.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=True,  # print the coefficient inside each cell
    cmap="coolwarm",
    linewidths=0.5,
)
ax.set_title("Correlation Matrix of Image Dimensions")
fig.savefig("docs/eda/Correlation Matrix of Image Dimensions.png")
plt.show()
# Histogram of image widths (20 bins) with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
image_widths = image_dimensions_df["Width"]
sns.histplot(image_widths, bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Image Widths")
ax.set_xlabel("Width (Pixels)")
ax.set_ylabel("Frequency")
fig.savefig("docs/eda/Distribution of Image Widths.png")
plt.show()
# Histogram of image heights (20 bins) with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
image_heights = image_dimensions_df["Height"]
sns.histplot(image_heights, bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Image Heights")
ax.set_xlabel("Height (Pixels)")
ax.set_ylabel("Frequency")
fig.savefig("docs/eda/Distribution of Image Heights.png")
plt.show()
|