Update Leaderboard
- gen_table.py  +14 -4
- meta_data.py  +4 -2
gen_table.py
CHANGED
@@ -88,8 +88,12 @@ def BUILD_L2_DF(results, dataset):
     assert len(sub), dataset
     fields = list(sub[0][dataset].keys())

-    non_overall_fields = [x for x in fields if 'Overall' not in x]
-    overall_fields = [x for x in fields if 'Overall' in x]
+    if dataset == 'WeMath':
+        non_overall_fields = [x for x in fields if 'Score' in x]
+        overall_fields = [x for x in fields if 'Score' not in x]
+    else:
+        non_overall_fields = [x for x in fields if 'Overall' not in x]
+        overall_fields = [x for x in fields if 'Overall' in x]

     for m in results:
         item = results[m]
@@ -117,7 +121,11 @@ def BUILD_L2_DF(results, dataset):
     # Use the first 5 non-overall fields as required fields
     required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]

-    df = df.sort_values('Overall')
+    if 'Score (Strict)' in df:
+        df = df.sort_values('Score (Strict)')
+    else:
+        df = df.sort_values('Overall')
+
     df = df.iloc[::-1]

     check_box = {}
@@ -152,7 +160,9 @@ def generate_table(results):


         for d in DATASETS_ALL:
-            key_name = 'Overall'
+            key_name = 'Overall'
+            if d == 'WeMath':
+                key_name = 'Score (Strict)'
             if d in item:
                 val = float(item[d][key_name])
                 val = float(f'{val:.1f}')
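For context on what the first two hunks change: BUILD_L2_DF previously split columns on the substring 'Overall' and always sorted by the 'Overall' column, which does not exist for WeMath, whose headline metrics are 'Score (Strict)' and 'Score (Loose)'. Below is a minimal sketch of the new behaviour; the field names and scores are invented for illustration and are not taken from the leaderboard data.

```python
import pandas as pd

def split_fields(dataset, fields):
    # Mirrors the branch added to BUILD_L2_DF: WeMath columns are keyed on
    # 'Score', every other benchmark on 'Overall'.
    if dataset == 'WeMath':
        non_overall = [x for x in fields if 'Score' in x]
        overall = [x for x in fields if 'Score' not in x]
    else:
        non_overall = [x for x in fields if 'Overall' not in x]
        overall = [x for x in fields if 'Overall' in x]
    return overall, non_overall

# Hypothetical column sets for two benchmarks (placeholder names).
print(split_fields('WeMath', ['Score (Strict)', 'Score (Loose)', 'S1', 'S2']))
print(split_fields('MathVista', ['Overall', 'ALG', 'GEO', 'NUM']))

# Sort fallback from the second hunk: prefer 'Score (Strict)' when that column
# exists (the WeMath table), otherwise sort by 'Overall'. Note that
# `'col' in df` tests column membership in pandas.
df = pd.DataFrame({'Method': ['A', 'B'], 'Score (Strict)': [38.5, 41.2]})
df = df.sort_values('Score (Strict)' if 'Score (Strict)' in df else 'Overall')
df = df.iloc[::-1]  # highest score first, as in BUILD_L2_DF
```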
meta_data.py
CHANGED
@@ -22,13 +22,15 @@ We obtain all evaluation results based on the [VLMEvalKit](https://github.com/op
 2. MathVision: The Full test set of MathVision, around 3000 samples.
 3. MathVerse_MINI_Vision_Only: The Test Mini split of MathVerse, using the "Vision Only" mode, around 700 samples.
 4. DynaMath: The Full test set of DynaMath, around 5000 samples (501 original questions x 10 variants).
+5. WeMath: The Test Mini split of WeMath, around 1740 samples, we report "Score (Strict)" as the main metric.
+6. LogicVista: The Full test set of LogicVista, around 450 samples.

 To suggest new models or benchmarks for this leaderboard, please contact [email protected].
 """

 # CONSTANTS-FIELDS
-DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath']
-DATASETS_ESS = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath']
+DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'WeMath', 'LogicVista']
+DATASETS_ESS = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'WeMath', 'LogicVista']
 META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Org']
 MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
 MODEL_TYPE = ['OpenSource', 'API']
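With WeMath and LogicVista added to DATASETS_ALL, the per-dataset loop in generate_table has to pick the right headline metric for each benchmark, as the third gen_table.py hunk does. A minimal sketch of that selection, using an invented results dict (model names and scores are placeholders, not real leaderboard data):

```python
# Hypothetical results structure; values are placeholders, not real scores.
results = {
    'Model-A': {
        'MathVista': {'Overall': 63.2},
        'WeMath': {'Score (Strict)': 38.5, 'Score (Loose)': 55.1},
        'LogicVista': {'Overall': 44.0},
    },
}

DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'WeMath', 'LogicVista']

for m, item in results.items():
    row = {'Method': m}
    for d in DATASETS_ALL:
        # Same metric selection as the updated generate_table: WeMath reports
        # 'Score (Strict)', every other benchmark reports 'Overall'.
        key_name = 'Score (Strict)' if d == 'WeMath' else 'Overall'
        if d in item:
            row[d] = float(f"{float(item[d][key_name]):.1f}")
        else:
            row[d] = None  # benchmark not evaluated for this model
    print(row)
```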