KennyUTC committed
Commit 151fc9b · 1 Parent(s): 1b0ca05

Update Leaderboard

Files changed (2):
  1. gen_table.py +14 -4
  2. meta_data.py +4 -2
gen_table.py CHANGED

@@ -88,8 +88,12 @@ def BUILD_L2_DF(results, dataset):
     assert len(sub), dataset
     fields = list(sub[0][dataset].keys())
 
-    non_overall_fields = [x for x in fields if 'Overall' not in x]
-    overall_fields = [x for x in fields if 'Overall' in x]
+    if dataset == 'WeMath':
+        non_overall_fields = [x for x in fields if 'Score' in x]
+        overall_fields = [x for x in fields if 'Score' not in x]
+    else:
+        non_overall_fields = [x for x in fields if 'Overall' not in x]
+        overall_fields = [x for x in fields if 'Overall' in x]
 
     for m in results:
         item = results[m]
@@ -117,7 +121,11 @@ def BUILD_L2_DF(results, dataset):
     # Use the first 5 non-overall fields as required fields
     required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
 
-    df = df.sort_values('Overall')
+    if 'Score (Strict)' in df:
+        df = df.sort_values('Score (Strict)')
+    else:
+        df = df.sort_values('Overall')
+
     df = df.iloc[::-1]
 
     check_box = {}
@@ -152,7 +160,9 @@ def generate_table(results):
 
 
     for d in DATASETS_ALL:
-        key_name = 'Overall'
+        key_name = 'Overall'
+        if d == 'WeMath':
+            key_name = 'Score (Strict)'
         if d in item:
             val = float(item[d][key_name])
             val = float(f'{val:.1f}')
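For context on the new sort fallback: checking 'Score (Strict)' in df on a pandas DataFrame tests column labels, so only tables that actually carry WeMath's strict score take the first branch; every other dataset keeps sorting by 'Overall'. A minimal, self-contained sketch of that behavior, with invented column values (the real columns come from the per-dataset result files):

import pandas as pd

# Toy leaderboard tables; column names mirror the diff, scores are invented.
wemath_df = pd.DataFrame({'Method': ['A', 'B'], 'Score (Strict)': [41.2, 55.7]})
mathvista_df = pd.DataFrame({'Method': ['A', 'B'], 'Overall': [63.1, 58.4]})

def sort_leaderboard(df):
    # Same branching as the new BUILD_L2_DF code: "col in df" tests column labels.
    if 'Score (Strict)' in df:
        df = df.sort_values('Score (Strict)')
    else:
        df = df.sort_values('Overall')
    return df.iloc[::-1]  # ascending sort reversed, so the best score ends up on top

print(sort_leaderboard(wemath_df))     # B (55.7) listed above A (41.2)
print(sort_leaderboard(mathvista_df))  # A (63.1) listed above B (58.4)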
meta_data.py CHANGED

@@ -22,13 +22,15 @@ We obtain all evaluation results based on the [VLMEvalKit](https://github.com/op
 2. MathVision: The Full test set of MathVision, around 3000 samples.
 3. MathVerse_MINI_Vision_Only: The Test Mini split of MathVerse, using the "Vision Only" mode, around 700 samples.
 4. DynaMath: The Full test set of DynaMath, around 5000 samples (501 original questions x 10 variants).
+5. WeMath: The Test Mini split of WeMath, around 1740 samples, we report "Score (Strict)" as the main metric.
+6. LogicVista: The Full test set of LogicVista, around 450 samples.
 
 To suggest new models or benchmarks for this leaderboard, please contact [email protected].
 """
 
 # CONSTANTS-FIELDS
-DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath']
-DATASETS_ESS = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath']
+DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'WeMath', 'LogicVista']
+DATASETS_ESS = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'WeMath', 'LogicVista']
 META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Org']
 MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
 MODEL_TYPE = ['OpenSource', 'API']
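Putting the two files together: with WeMath and LogicVista appended to DATASETS_ALL, generate_table now reads 'Score (Strict)' for WeMath and 'Overall' for every other benchmark when assembling a model's row. A rough sketch under assumed data, where the per-model results dict and the missing-dataset handling are illustrative only, not taken from the repo:

DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'WeMath', 'LogicVista']

# Hypothetical per-model results entry; the numbers are invented.
item = {
    'MathVista': {'Overall': 63.1},
    'WeMath': {'Score (Strict)': 41.2, 'Score (Loose)': 55.7},
}

row = {}
for d in DATASETS_ALL:
    key_name = 'Overall'
    if d == 'WeMath':
        key_name = 'Score (Strict)'   # same override as in generate_table
    if d in item:
        val = float(item[d][key_name])
        row[d] = float(f'{val:.1f}')  # keep one decimal place, as in the diff
    else:
        row[d] = None                 # placeholder for unevaluated datasets (assumed handling)

print(row)  # {'MathVista': 63.1, ..., 'WeMath': 41.2, ..., 'LogicVista': None}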