Joschka Strueber committed on
Commit e64ca4e · 1 Parent(s): 5815cf9

[Add] cache loading data from hf

Files changed (2)
  1. src/dataloading.py +6 -5
  2. src/similarity.py +2 -2
src/dataloading.py CHANGED
@@ -3,6 +3,7 @@ import numpy as np
 from huggingface_hub import HfApi
 
 from functools import lru_cache
+from datasets.exceptions import DatasetNotFoundError
 
 
 def get_leaderboard_models():
@@ -17,9 +18,7 @@ def get_leaderboard_models():
         dataset_id = dataset.id
         try:
             # Check if the dataset can be loaded
-            print(dataset_id)
             check_gated = datasets.get_dataset_config_names(dataset_id)
-            print(check_gated)
             # Format: "open-llm-leaderboard/<provider>__<model_name>-details"
             model_part = dataset_id.split("/")[-1].replace("-details", "")
             if "__" in model_part:
@@ -27,7 +26,7 @@ def get_leaderboard_models():
                 models.append(f"{provider}/{model}")
             else:
                 models.append(model_part)
-        except Exception as e:
+        except DatasetNotFoundError as e:
             pass
 
     return sorted(models)
@@ -77,6 +76,7 @@ def filter_labels(doc):
     return labels
 
 
+
 def load_run_data(model_name, dataset_name):
     try:
         model_name = model_name.replace("/", "__")
@@ -104,6 +104,7 @@ def load_run_data(model_name, dataset_name):
     return log_probs, labels
 
 
-
-
+@lru_cache(maxsize=8)
+def load_run_data_cached(model_name, dataset_name):
+    return load_run_data(model_name, dataset_name)
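For context, a minimal sketch of the memoization pattern this commit introduces in src/dataloading.py: functools.lru_cache wraps the existing loader so repeated (model, dataset) lookups skip the Hub download. The loader body, model name, and dataset string below are illustrative placeholders, not the repository's actual implementation.

# Sketch (assumed names and values) of the caching added here:
# lru_cache memoizes up to 8 (model, dataset) results in memory.
from functools import lru_cache

def load_run_data(model_name, dataset_name):
    # Stand-in for the real loader, which pulls per-sample details from the
    # Hugging Face Hub and is therefore slow and network-bound.
    print(f"downloading {model_name} / {dataset_name} ...")
    return [0.1, 0.9], [1]

@lru_cache(maxsize=8)
def load_run_data_cached(model_name, dataset_name):
    # Identical string arguments return the cached (log_probs, labels) pair
    # without calling load_run_data again.
    return load_run_data(model_name, dataset_name)

load_run_data_cached("org/model-a", "leaderboard_mmlu")  # triggers a download
load_run_data_cached("org/model-a", "leaderboard_mmlu")  # served from the cache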
 
src/similarity.py CHANGED
@@ -2,7 +2,7 @@ import numpy as np
 
 from lmsim.metrics import Metrics, CAPA, EC
 
-from src.dataloading import load_run_data
+from src.dataloading import load_run_data_cached
 from src.utils import softmax, one_hot
 
 def load_data_and_compute_similarities(models: list[str], dataset: str, metric_name: str) -> np.array:
@@ -10,7 +10,7 @@ def load_data_and_compute_similarities(models: list[str], dataset: str, metric_n
     probs = []
     gts = []
     for model in models:
-        model_probs, model_gt = load_run_data(model, dataset)
+        model_probs, model_gt = load_run_data_cached(model, dataset)
         probs.append(model_probs)
         gts.append(model_gt)
 
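A hedged usage sketch of the updated call site in src/similarity.py: because each model's run now goes through the cached loader, recomputing similarities for an overlapping model selection re-downloads only the models not seen before. The model names, dataset string, and metric name below are illustrative, not values confirmed by this repository.

from src.similarity import load_data_and_compute_similarities

# First call downloads run data for both models.
sims_ab = load_data_and_compute_similarities(
    ["org/model-a", "org/model-b"], "leaderboard_mmlu", "CAPA")

# Second call reuses the cached data for model-a; only model-c is fetched.
sims_ac = load_data_and_compute_similarities(
    ["org/model-a", "org/model-c"], "leaderboard_mmlu", "CAPA")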