File size: 4,053 Bytes
92bf372
 
 
 
 
 
 
c45c4e1
92bf372
 
c45c4e1
92bf372
 
 
 
 
 
 
 
 
c45c4e1
92bf372
 
 
 
 
 
 
 
 
 
 
c45c4e1
92bf372
 
 
 
 
c45c4e1
 
 
 
92bf372
 
 
 
 
c45c4e1
 
 
92bf372
 
 
c45c4e1
 
92bf372
c45c4e1
 
 
 
 
 
92bf372
 
 
 
c45c4e1
 
 
 
 
92bf372
 
 
 
 
c45c4e1
92bf372
 
 
c45c4e1
 
 
92bf372
 
 
 
 
c45c4e1
 
 
92bf372
 
 
 
c45c4e1
 
 
 
 
92bf372
 
 
 
c45c4e1
 
 
 
 
92bf372
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

rcParams["font.family"] = "Times New Roman"

# Define the directory where your dataset is located
dataset_directory = "data/train/combined/Task 1/"

# Create a list of class labels based on subdirectories in the dataset directory
class_labels = os.listdir(dataset_directory)

# Initialize lists to store data for EDA
num_samples_per_class = []
class_labels_processed = []

# Initialize an empty DataFrame to store image dimensions
image_dimensions_df = pd.DataFrame(columns=["Height", "Width"])

# Initialize a dictionary to store a random sample of images from each class
sampled_images = {label: [] for label in class_labels}

# Iterate through class labels and count the number of samples per class
for label in class_labels:
    if label != ".DS_Store":
        class_directory = os.path.join(dataset_directory, label)
        num_samples = len(os.listdir(class_directory))
        num_samples_per_class.append(num_samples)
        class_labels_processed.append(label)

        # Extract image dimensions and add them to the DataFrame
        for image_file in os.listdir(class_directory):
            image_path = os.path.join(class_directory, image_file)
            image = plt.imread(image_path)
            height, width, _ = image.shape
            image_dimensions_df = image_dimensions_df._append(
                {"Height": height, "Width": width}, ignore_index=True
            )

            # Randomly sample 5 images from each class for visualization
            if len(sampled_images[label]) < 5:
                sampled_images[label].append(image)

# Create a Pandas DataFrame for EDA
eda_data = pd.DataFrame(
    {"Class Label": class_labels_processed, "Number of Samples": num_samples_per_class}
)

# Plot the number of samples per class
plt.figure(figsize=(10, 6))
sns.barplot(x="Class Label", y="Number of Samples", data=eda_data)
plt.title("Number of Samples per Class")
plt.xticks(rotation=45)
plt.xlabel("Class Label")
plt.ylabel("Number of Samples")
plt.subplots_adjust(
    top=0.88, bottom=0.21, left=0.125, right=0.9, hspace=0.2, wspace=0.2
)
plt.savefig("docs/eda/Number of Samples per Class.png")
plt.show()

# Calculate and plot the distribution of sample sizes (image dimensions)
plt.figure(figsize=(10, 6))
plt.scatter(image_dimensions_df["Width"], image_dimensions_df["Height"], alpha=0.5)
plt.title("Distribution of Sample Sizes (Image Dimensions)")
plt.xlabel("Width (Pixels)")
plt.ylabel("Height (Pixels)")
plt.savefig("docs/eda/Distribution of Sample Sizes (Image Dimensions).png")
plt.show()

# Plot a random sample of images from each class
for label, images in sampled_images.items():
    plt.figure(figsize=(15, 5))
    plt.suptitle(f"Random Sample of Images from Class: {label}")
    for i, image in enumerate(images, start=1):
        plt.subplot(1, 5, i)
        plt.imshow(image)
        plt.axis("off")
        plt.title(f"Sample {i}")
    plt.savefig(f"docs/eda/Random Sample of Images from Class {label}.png")
    plt.show()

# Calculate and plot the correlation matrix for image dimensions
correlation_matrix = image_dimensions_df.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Matrix of Image Dimensions")
plt.savefig("docs/eda/Correlation Matrix of Image Dimensions.png")
plt.show()

# Plot the distribution of image widths
plt.figure(figsize=(10, 6))
sns.histplot(image_dimensions_df["Width"], bins=20, kde=True)
plt.title("Distribution of Image Widths")
plt.xlabel("Width (Pixels)")
plt.ylabel("Frequency")
plt.savefig("docs/eda/Distribution of Image Widths.png")
plt.show()

# Plot the distribution of image heights
plt.figure(figsize=(10, 6))
sns.histplot(image_dimensions_df["Height"], bins=20, kde=True)
plt.title("Distribution of Image Heights")
plt.xlabel("Height (Pixels)")
plt.ylabel("Frequency")
plt.savefig("docs/eda/Distribution of Image Heights.png")
plt.show()