jsulz HF staff committed on
Commit
34ae673
·
1 Parent(s): adbb8fc

needs cleanup, but most of the treemap is working correctly

Browse files
Files changed (1) hide show
  1. app.py +65 -33
app.py CHANGED
@@ -12,19 +12,17 @@ import numpy as np
12
  HF_API = HfApi()
13
 
14
 
15
def apply_power_scaling(sizes, exponent=0.2):
    """Apply custom power scaling to the sizes.

    Every size is raised to ``exponent``. A size of ``None`` is not dropped:
    it is filled in as ``0`` so the result stays aligned, entry for entry,
    with ``sizes``.

    Args:
        sizes: Iterable of numeric sizes; individual entries may be ``None``.
        exponent: Power applied to every non-``None`` size.

    Returns:
        A list containing one scaled value per entry of ``sizes``.
    """
    return [size**exponent if size is not None else 0 for size in sizes]
19
 
20
 
21
def count_chunks(sizes, chunk_size=64_000):
    """Count how many chunks each size occupies, always rounding up.

    Sizes are byte counts. The default chunk is 64,000 bytes, i.e. the
    "64KB" chunk is decimal (64 * 1000), not 65,536 bytes.

    Args:
        sizes: Iterable of sizes in bytes; individual entries may be ``None``.
        chunk_size: Number of bytes per chunk.

    Returns:
        A list of integer chunk counts, one per entry of ``sizes``. A
        ``None`` size is reported as ``0`` chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")
    # -(-a // b) is ceiling division; for integer sizes it stays exact no
    # matter how large the value is, unlike a float division followed by ceil.
    return [
        int(-(-size // chunk_size)) if size is not None else 0 for size in sizes
    ]
25
 
26
 
27
- def build_hierarchy(siblings):
28
  """Builds a hierarchical structure from the list of RepoSibling objects."""
29
  hierarchy = defaultdict(dict)
30
 
@@ -33,9 +31,9 @@ def build_hierarchy(siblings):
33
  size = sibling.lfs.size if sibling.lfs else sibling.size
34
 
35
  current_level = hierarchy
36
- for part in path_parts[:-1]: # Traverse directories
37
  current_level = current_level.setdefault(part, {})
38
- current_level[path_parts[-1]] = size # Assign size to the file
39
 
40
  return hierarchy
41
 
@@ -45,49 +43,60 @@ def calculate_directory_sizes(hierarchy):
45
  total_size = 0
46
 
47
  for key, value in hierarchy.items():
48
- if isinstance(value, dict): # Directory
49
- dir_size = calculate_directory_sizes(value) # Recursively calculate size
50
  hierarchy[key] = {
51
  "__size__": dir_size,
52
  **value,
53
- } # Add size to directory metadata
54
  total_size += dir_size
55
- else: # File
56
  total_size += value
57
 
58
  return total_size
59
 
60
 
61
def flatten_hierarchy_with_directory_sizes(hierarchy, root_name="Repository"):
    """Flatten a nested dictionary into Plotly-compatible treemap data with a defined root node.

    ``hierarchy`` maps names to either a file size or a nested dictionary
    (a directory). A directory may carry a precomputed ``"__size__"`` entry;
    when present it is used as the directory size, otherwise the size is
    summed from the directory's contents.

    The root total is computed here from the top-level entries. The input is
    never modified, and a hierarchy whose directories already carry
    ``"__size__"`` does not have those values counted a second time.

    NOTE(review): parents are referenced by bare label, so two entries with
    the same name in different directories are indistinguishable in the
    returned data - confirm whether callers need unique ids.

    Args:
        hierarchy: Nested mapping of names to sizes or sub-mappings.
        root_name: Label used for the synthetic root node.

    Returns:
        A ``(labels, parents, sizes)`` tuple of equally long lists. The root
        node comes first and has ``""`` as its parent.
    """
    labels = []
    parents = []
    sizes = []

    def entry_size(value):
        """Return the size of one entry without touching the hierarchy."""
        if isinstance(value, dict):
            if "__size__" in value:
                return value["__size__"]
            return sum(entry_size(child) for child in value.values())
        # A file whose size is unknown contributes nothing to the totals.
        return value if value is not None else 0

    # Recursively process the hierarchy
    def process_level(current_hierarchy, current_parent):
        for key, value in current_hierarchy.items():
            labels.append(key)
            parents.append(current_parent)
            if isinstance(value, dict):
                sizes.append(entry_size(value))
                # "__size__" is directory metadata, not a child node.
                children = {
                    name: child
                    for name, child in value.items()
                    if name != "__size__"
                }
                process_level(children, key)
            else:
                sizes.append(value)

    # Add the root node; it has no parent.
    labels.append(root_name)
    parents.append("")
    sizes.append(sum(entry_size(value) for value in hierarchy.values()))

    # Process the hierarchy
    process_level(hierarchy, root_name)

    return labels, parents, sizes
91
 
92
 
93
  def visualize_repo_treemap(r_info):
@@ -98,55 +107,78 @@ def visualize_repo_treemap(r_info):
98
  # Calculate directory sizes
99
  calculate_directory_sizes(hierarchy)
100
 
101
- # Flatten the hierarchy into Plotly-compatible format
102
- labels, parents, sizes = flatten_hierarchy_with_directory_sizes(hierarchy)
103
 
104
- # Apply the chosen scaling function for visualization
105
  scaled_sizes = apply_power_scaling(sizes)
106
 
107
  # Format the original sizes using the helper function
108
  formatted_sizes = [
109
- (
110
- format_repo_size(size) if size is not None else None
111
- ) # Format both files and directories
112
- for size in sizes
113
  ]
114
 
115
  chunks = count_chunks(sizes)
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  # Create the treemap
118
  fig = px.treemap(
119
  names=labels,
120
  parents=parents,
121
  values=scaled_sizes,
 
 
122
  title="Repo by Chunks",
123
  custom_data=[formatted_sizes, chunks],
 
 
124
  )
125
 
 
 
126
  # Add subtitle by updating the layout
127
  fig.update_layout(
128
  title={
129
  "text": "Repo File Size Treemap<br><span style='font-size:14px;'>Hover over each directory or file to see the size of the file and its number of chunks</span>",
130
- "x": 0.5, # Center the title and subtitle
131
  "xanchor": "center",
132
- }
 
133
  )
134
 
135
- # Customize the hover template to include directory sizes
136
  fig.update_traces(
137
  hovertemplate=(
138
- "<b>%{label}</b><br>" # File/Directory name
139
- "Size: %{customdata[0]}<br>" # Scaled size shown in treemap
140
- "# of Chunks: %{customdata[1]}" # Formatted size from custom data
141
  )
142
  )
143
  fig.update_traces(root_color="lightgrey")
144
- fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
145
 
146
  return fig
147
 
148
 
149
  def format_repo_size(r_size: int) -> str:
 
 
 
 
 
 
 
 
 
150
  units = {0: "B", 1: "KB", 2: "MB", 3: "GB", 4: "TB", 5: "PB"}
151
  order = 0
152
  while r_size >= 1024 and order < len(units) - 1:
 
12
  HF_API = HfApi()
13
 
14
 
15
def apply_power_scaling(sizes: list, exponent=0.2) -> list:
    """Raise every size to ``exponent``, substituting 0 for missing sizes.

    Args:
        sizes: Numeric sizes; individual entries may be ``None``.
        exponent: Power applied to each non-``None`` size.

    Returns:
        A list with one scaled value per input entry; ``None`` becomes ``0``.
    """
    scaled = []
    for raw in sizes:
        if raw is None:
            scaled.append(0)
        else:
            scaled.append(raw**exponent)
    return scaled
18
 
19
 
20
def count_chunks(sizes: list, chunk_size: int = 64_000) -> list:
    """Count the number of chunks each size occupies; always round up.

    Sizes are byte counts. The default chunk is 64,000 bytes, i.e. the
    "64KB" chunk is decimal (64 * 1000), not 65,536 bytes.

    Args:
        sizes: Sizes in bytes; individual entries may be ``None``.
        chunk_size: Number of bytes per chunk.

    Returns:
        A list of integer chunk counts, one per entry of ``sizes``. A
        ``None`` size is reported as ``0`` chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")
    # -(-a // b) is ceiling division; for integer sizes it stays exact no
    # matter how large the value is, unlike a float division followed by ceil.
    return [
        int(-(-size // chunk_size)) if size is not None else 0 for size in sizes
    ]
23
 
24
 
25
+ def build_hierarchy(siblings: list) -> dict:
26
  """Builds a hierarchical structure from the list of RepoSibling objects."""
27
  hierarchy = defaultdict(dict)
28
 
 
31
  size = sibling.lfs.size if sibling.lfs else sibling.size
32
 
33
  current_level = hierarchy
34
+ for part in path_parts[:-1]:
35
  current_level = current_level.setdefault(part, {})
36
+ current_level[path_parts[-1]] = size
37
 
38
  return hierarchy
39
 
 
43
  total_size = 0
44
 
45
  for key, value in hierarchy.items():
46
+ if isinstance(value, dict):
47
+ dir_size = calculate_directory_sizes(value)
48
  hierarchy[key] = {
49
  "__size__": dir_size,
50
  **value,
51
+ }
52
  total_size += dir_size
53
+ else:
54
  total_size += value
55
 
56
  return total_size
57
 
58
 
59
def build_full_path(current_parent, key):
    """Join ``key`` onto ``current_parent`` with a ``/`` separator.

    When ``current_parent`` is falsy (for example an empty string), ``key``
    is returned unchanged.
    """
    if not current_parent:
        return key
    return f"{current_parent}/{key}"
61
+
62
+
63
def flatten_hierarchy(hierarchy, root_name="Repository"):
    """Flatten a nested dictionary into Plotly-compatible treemap data with a defined root node.

    ``hierarchy`` maps names to either a file size or a nested dictionary
    (a directory). A directory may carry a precomputed ``"__size__"`` entry;
    when present it is used as the directory size, otherwise the size is
    summed from the directory's contents.

    The root total is computed here from the top-level entries. The input is
    never modified, and a hierarchy whose directories already carry
    ``"__size__"`` does not have those values counted a second time.

    Args:
        hierarchy: Nested mapping of names to sizes or sub-mappings.
        root_name: Label and id used for the synthetic root node.

    Returns:
        A ``(labels, parents, sizes, ids)`` tuple of equally long lists. The
        root node comes first, with ``""`` as its parent. Each id is the
        slash-joined path from the root, and each parent is the id of the
        enclosing node.
    """
    labels = []
    parents = []
    sizes = []
    ids = []

    def entry_size(value):
        """Return the size of one entry without touching the hierarchy."""
        if isinstance(value, dict):
            if "__size__" in value:
                return value["__size__"]
            return sum(entry_size(child) for child in value.values())
        # A file whose size is unknown contributes nothing to the totals.
        return value if value is not None else 0

    # Recursively process the hierarchy
    def process_level(current_hierarchy, current_parent):
        for key, value in current_hierarchy.items():
            full_path = build_full_path(current_parent, key)
            labels.append(key)
            parents.append(current_parent)
            ids.append(full_path)
            if isinstance(value, dict):
                # Handle directories
                sizes.append(entry_size(value))
                # "__size__" is directory metadata, not a child node.
                children = {
                    name: child
                    for name, child in value.items()
                    if name != "__size__"
                }
                process_level(children, full_path)
            else:
                # Handle files
                sizes.append(value)

    # Add the root node; it has no parent.
    labels.append(root_name)
    parents.append("")
    sizes.append(sum(entry_size(value) for value in hierarchy.values()))
    ids.append(root_name)

    # Process the hierarchy
    process_level(hierarchy, root_name)

    return labels, parents, sizes, ids
100
 
101
 
102
  def visualize_repo_treemap(r_info):
 
107
  # Calculate directory sizes
108
  calculate_directory_sizes(hierarchy)
109
 
110
+ # Flatten the hierarchy for Plotly
111
+ labels, parents, sizes, ids = flatten_hierarchy(hierarchy)
112
 
113
+ # Scale sizes for visualization
114
  scaled_sizes = apply_power_scaling(sizes)
115
 
116
  # Format the original sizes using the helper function
117
  formatted_sizes = [
118
+ (format_repo_size(size) if size is not None else None) for size in sizes
 
 
 
119
  ]
120
 
121
  chunks = count_chunks(sizes)
122
+ colors = scaled_sizes[:]
123
+ colors[0] = -1
124
+ max_value = max(scaled_sizes)
125
+ normalized_colors = [value / max_value if value > 0 else 0 for value in colors]
126
+
127
+ # Define the colorscale; mimics the plasma scale
128
+ colorscale = [
129
+ [0.0, "#0d0887"],
130
+ [0.5, "#bd3786"],
131
+ [1.0, "#f0f921"],
132
+ ]
133
 
134
  # Create the treemap
135
  fig = px.treemap(
136
  names=labels,
137
  parents=parents,
138
  values=scaled_sizes,
139
+ color=normalized_colors,
140
+ color_continuous_scale=colorscale,
141
  title="Repo by Chunks",
142
  custom_data=[formatted_sizes, chunks],
143
+ height=1000,
144
+ ids=ids,
145
  )
146
 
147
+ fig.update_traces(marker={"colors": ["lightgrey"] + normalized_colors[1:]})
148
+
149
  # Add subtitle by updating the layout
150
  fig.update_layout(
151
  title={
152
  "text": "Repo File Size Treemap<br><span style='font-size:14px;'>Hover over each directory or file to see the size of the file and its number of chunks</span>",
153
+ "x": 0.5,
154
  "xanchor": "center",
155
+ },
156
+ coloraxis_showscale=False,
157
  )
158
 
159
+ # Customize the hover template
160
  fig.update_traces(
161
  hovertemplate=(
162
+ "<b>%{label}</b><br>"
163
+ "Size: %{customdata[0]}<br>"
164
+ "# of Chunks: %{customdata[1]}"
165
  )
166
  )
167
  fig.update_traces(root_color="lightgrey")
 
168
 
169
  return fig
170
 
171
 
172
  def format_repo_size(r_size: int) -> str:
173
+ """
174
+ Convert a repository size in bytes to a human-readable string with appropriate units.
175
+
176
+ Args:
177
+ r_size (int): The size of the repository in bytes.
178
+
179
+ Returns:
180
+ str: The formatted size string with appropriate units (B, KB, MB, GB, TB, PB).
181
+ """
182
  units = {0: "B", 1: "KB", 2: "MB", 3: "GB", 4: "TB", 5: "PB"}
183
  order = 0
184
  while r_size >= 1024 and order < len(units) - 1: