Commit · 7fdb5f5
Parent(s): b61f534

fix: show partial results even if some evaluations haven't finished
- app.py +1 -1
- src/leaderboard/read_evals.py +4 -1
- src/populate.py +4 -5
app.py CHANGED
@@ -65,7 +65,7 @@ def update_table(
 
 def apply_format_styling(df: pd.DataFrame) -> style.Styler:
     df = df.style.format(
-        {c: "{:.1f}" for c in BENCHMARK_COLS} | {AutoEvalColumn.average.name: "{:.2f}"}
+        {c: "{:.1f}" for c in BENCHMARK_COLS} | {AutoEvalColumn.average.name: "{:.2f}"}, na_rep="-"
     )
     return df
 
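
For context: pandas' Styler.format takes an na_rep argument that controls how missing values are rendered, so unfinished benchmark scores display as "-" instead of "nan". A minimal, self-contained sketch of that behavior (the column names are made up for illustration, not the project's real BENCHMARK_COLS):

    import pandas as pd

    # One model has an unfinished benchmark: None becomes NaN in the float column.
    df = pd.DataFrame({"Average": [71.256, 68.312], "BenchA": [70.1, 68.3], "BenchB": [72.4, None]})

    styler = df.style.format(
        {"BenchA": "{:.1f}", "BenchB": "{:.1f}", "Average": "{:.2f}"},
        na_rep="-",  # missing scores render as "-" instead of "nan"
    )
    print(styler.to_html())  # the NaN cell in BenchB is rendered as "-"
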
src/leaderboard/read_evals.py CHANGED
@@ -124,7 +124,10 @@ class EvalResult:
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            if task.value.benchmark in self.results.keys():
+                data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            else:
+                data_dict[task.value.col_name] = None
 
         return data_dict
 
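
The membership check keeps to_dict from raising a KeyError when a benchmark has no score yet; the same fallback can be expressed with dict.get, which returns None for missing keys. A standalone sketch (the results dict and benchmark names are invented for illustration):

    # Hypothetical per-model results; "benchB" has not finished evaluating yet.
    results = {"benchA": 70.1, "benchC": 64.8}
    benchmarks = ["benchA", "benchB", "benchC"]

    # Equivalent to the if/else in the diff: missing benchmarks map to None.
    data_dict = {name: results.get(name) for name in benchmarks}
    print(data_dict)  # {'benchA': 70.1, 'benchB': None, 'benchC': 64.8}
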
src/populate.py CHANGED
@@ -1,11 +1,12 @@
 import pandas as pd
 
-from src.display.formatting import has_no_nan_values
 from src.display.utils import AutoEvalColumn
-from src.leaderboard.read_evals import get_raw_eval_results
+from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(
+    results_path: str, requests_path: str, cols: list, benchmark_cols: list
+) -> tuple[list[EvalResult], pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
@@ -14,6 +15,4 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
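
Removing the has_no_nan_values filter is what actually keeps partially evaluated models on the board: pd.DataFrame.from_records turns the None scores produced by to_dict into NaN, the rows are no longer dropped, and the na_rep="-" styling in app.py fills in the gaps. A rough, self-contained sketch of the before/after behavior (records and column names are illustrative, not the project's real ones):

    import pandas as pd

    # One fully evaluated model and one with a benchmark still running.
    records = [
        {"Model": "model-a", "Average": 71.25, "BenchA": 70.1, "BenchB": 72.4},
        {"Model": "model-b", "Average": 68.30, "BenchA": 68.3, "BenchB": None},
    ]
    df = pd.DataFrame.from_records(records)
    df = df.sort_values(by=["Average"], ascending=False).round(decimals=2)

    # The removed filter behaved roughly like this and would have dropped model-b:
    #   df = df[df[["BenchA", "BenchB"]].notna().all(axis=1)]
    # Without it, model-b stays in the table with NaN in BenchB.
    print(df)
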