cccjc committed on
Commit
2acbd8f
·
1 Parent(s): 2a2ba62

sync format with github repo exported results

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. static/eval_results/Default/Aquila_VL_2B/summary_results.json +2 -4
  2. static/eval_results/Default/Aria/summary_results.json +2 -4
  3. static/eval_results/Default/Claude_3.5/summary_results.json +2 -4
  4. static/eval_results/Default/Claude_3.5_new/summary_results.json +2 -4
  5. static/eval_results/Default/GPT_4o/summary_results.json +2 -4
  6. static/eval_results/Default/GPT_4o_mini/summary_results.json +2 -4
  7. static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json +2 -4
  8. static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json +2 -4
  9. static/eval_results/Default/Idefics3/summary_results.json +2 -4
  10. static/eval_results/Default/InternVL2_2B/summary_results.json +2 -4
  11. static/eval_results/Default/InternVL2_5_2B/summary_results.json +2 -4
  12. static/eval_results/Default/InternVL2_5_78B/summary_results.json +2 -4
  13. static/eval_results/Default/InternVL2_76B/summary_results.json +2 -4
  14. static/eval_results/Default/InternVL2_8B/summary_results.json +2 -4
  15. static/eval_results/Default/Llama_3_2_11B/summary_results.json +2 -4
  16. static/eval_results/Default/Mammoth_VL/summary_results.json +2 -4
  17. static/eval_results/Default/MiniCPM_v2.6/summary_results.json +2 -4
  18. static/eval_results/Default/NVLM/summary_results.json +2 -4
  19. static/eval_results/Default/Phi-3.5-vision/summary_results.json +2 -4
  20. static/eval_results/Default/Pixtral_12B/summary_results.json +2 -4
  21. static/eval_results/Default/Qwen2_VL_2B/summary_results.json +2 -4
  22. static/eval_results/Default/Qwen2_VL_72B/summary_results.json +2 -4
  23. static/eval_results/Default/Qwen2_VL_7B/summary_results.json +2 -4
  24. static/eval_results/Default/llava_onevision_72B/summary_results.json +2 -4
  25. static/eval_results/Default/llava_onevision_7B/summary_results.json +2 -4
  26. static/eval_results/SI/Aquila_VL_2B/summary_results.json +0 -2
  27. static/eval_results/SI/Aria/summary_results.json +0 -2
  28. static/eval_results/SI/Claude_3.5/summary_results.json +2 -4
  29. static/eval_results/SI/Claude_3.5_new/summary_results.json +2 -4
  30. static/eval_results/SI/GPT_4o/summary_results.json +2 -4
  31. static/eval_results/SI/GPT_4o_mini/summary_results.json +2 -4
  32. static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json +2 -4
  33. static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json +2 -4
  34. static/eval_results/SI/Idefics3/summary_results.json +0 -2
  35. static/eval_results/SI/InternVL2_2B/summary_results.json +0 -2
  36. static/eval_results/SI/InternVL2_76B/summary_results.json +0 -2
  37. static/eval_results/SI/InternVL2_8B/summary_results.json +0 -2
  38. static/eval_results/SI/Llama_3_2_11B/summary_results.json +0 -2
  39. static/eval_results/SI/MiniCPM_v2.6/summary_results.json +0 -2
  40. static/eval_results/SI/Molmo_72B/summary_results.json +0 -2
  41. static/eval_results/SI/Molmo_7B_D/summary_results.json +0 -2
  42. static/eval_results/SI/NVLM/summary_results.json +0 -2
  43. static/eval_results/SI/POINTS_15_7B/summary_results.json +0 -2
  44. static/eval_results/SI/POINTS_7B/summary_results.json +0 -2
  45. static/eval_results/SI/Phi-3.5-vision/summary_results.json +0 -2
  46. static/eval_results/SI/Pixtral_12B/summary_results.json +0 -2
  47. static/eval_results/SI/Qwen2_VL_2B/summary_results.json +0 -2
  48. static/eval_results/SI/Qwen2_VL_72B/summary_results.json +0 -2
  49. static/eval_results/SI/Qwen2_VL_7B/summary_results.json +0 -2
  50. static/eval_results/SI/SmolVLM/summary_results.json +0 -2
static/eval_results/Default/Aquila_VL_2B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.159970161379836,
7
- "micro_mean_score": 0.15844711671722148
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.24567572098570653,
13
- "micro_mean_score": 0.2704213241616509
14
  },
15
  "overall_score": 0.17100157004197775
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.159970161379836
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.24567572098570653
 
12
  },
13
  "overall_score": 0.17100157004197775
14
  },
static/eval_results/Default/Aria/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.289073788209904,
7
- "micro_mean_score": 0.2859007507765791
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.5103725263180767,
13
- "micro_mean_score": 0.5349957007738607
14
  },
15
  "overall_score": 0.31755778420402525
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.289073788209904
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.5103725263180767
 
12
  },
13
  "overall_score": 0.31755778420402525
14
  },
static/eval_results/Default/Claude_3.5/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.5040975742801586,
7
- "micro_mean_score": 0.5002259116666758
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.6373907158949892,
13
- "micro_mean_score": 0.6569647463456579
14
  },
15
  "overall_score": 0.5212541172602853
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.5040975742801586
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.6373907158949892
 
12
  },
13
  "overall_score": 0.5212541172602853
14
  },
static/eval_results/Default/Claude_3.5_new/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.5259191914020757,
7
- "micro_mean_score": 0.5230785894131227
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.6563419761104125,
13
- "micro_mean_score": 0.6724419604471196
14
  },
15
  "overall_score": 0.5427062825031487
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.5259191914020757
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.6563419761104125
 
12
  },
13
  "overall_score": 0.5427062825031487
14
  },
static/eval_results/Default/GPT_4o/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.5265030595065238,
7
- "micro_mean_score": 0.5236338521693411
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.6478225794744895,
13
- "micro_mean_score": 0.665391229578676
14
  },
15
  "overall_score": 0.5421184432647768
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.5265030595065238
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.6478225794744895
 
12
  },
13
  "overall_score": 0.5421184432647768
14
  },
static/eval_results/Default/GPT_4o_mini/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.40767494558789397,
7
- "micro_mean_score": 0.40431644154143376
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.586537827213665,
13
- "micro_mean_score": 0.6133276010318144
14
  },
15
  "overall_score": 0.43069690064863675
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.40767494558789397
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.586537827213665
 
12
  },
13
  "overall_score": 0.43069690064863675
14
  },
static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.4189319021967416,
7
- "micro_mean_score": 0.41567515414375245
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.5691365176285039,
13
- "micro_mean_score": 0.5987532244196045
14
  },
15
  "overall_score": 0.4382651695295427
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.4189319021967416
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.5691365176285039
 
12
  },
13
  "overall_score": 0.4382651695295427
14
  },
static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.4822473962867704,
7
- "micro_mean_score": 0.4764805563057179
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.5858190649927173,
13
- "micro_mean_score": 0.6104901117798793
14
  },
15
  "overall_score": 0.4955784031499121
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.4822473962867704
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.5858190649927173
 
12
  },
13
  "overall_score": 0.4955784031499121
14
  },
static/eval_results/Default/Idefics3/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.08956972487602757,
7
- "micro_mean_score": 0.08982225274252693
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.3210866162255635,
13
- "micro_mean_score": 0.35649183147033553
14
  },
15
  "overall_score": 0.11936892871309657
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.08956972487602757
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.3210866162255635
 
12
  },
13
  "overall_score": 0.11936892871309657
14
  },
static/eval_results/Default/InternVL2_2B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.13141974398938763,
7
- "micro_mean_score": 0.13063500716262516
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.23864417043743646,
13
- "micro_mean_score": 0.24901117798796224
14
  },
15
  "overall_score": 0.14522090778963154
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.13141974398938763
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.23864417043743646
 
12
  },
13
  "overall_score": 0.14522090778963154
14
  },
static/eval_results/Default/InternVL2_5_2B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.17806821966478364,
7
- "micro_mean_score": 0.17708809739236367
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.2738430375585404,
13
- "micro_mean_score": 0.2905417024935512
14
  },
15
  "overall_score": 0.19039567147289096
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.17806821966478364
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.2738430375585404
 
12
  },
13
  "overall_score": 0.19039567147289096
14
  },
static/eval_results/Default/InternVL2_5_78B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.44132952988532753,
7
- "micro_mean_score": 0.4397079059379812
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.5538024772749066,
13
- "micro_mean_score": 0.5776870163370592
14
  },
15
  "overall_score": 0.4558062458859664
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.44132952988532753
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.5538024772749066
 
12
  },
13
  "overall_score": 0.4558062458859664
14
  },
static/eval_results/Default/InternVL2_76B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.3562710424410931,
7
- "micro_mean_score": 0.35129859801162616
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.5192997443033639,
13
- "micro_mean_score": 0.5421324161650903
14
  },
15
  "overall_score": 0.3772549347599992
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.3562710424410931
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.5192997443033639
 
12
  },
13
  "overall_score": 0.3772549347599992
14
  },
static/eval_results/Default/InternVL2_8B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.25956581776451815,
7
- "micro_mean_score": 0.2546984460483302
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1165,
12
- "macro_mean_score": 0.3978571701460552,
13
- "micro_mean_score": 0.4108583690987125
14
  },
15
  "overall_score": 0.2773656948037259
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.25956581776451815
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1165,
11
+ "macro_mean_score": 0.3978571701460552
 
12
  },
13
  "overall_score": 0.2773656948037259
14
  },
static/eval_results/Default/Llama_3_2_11B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.15999641916771298,
7
- "micro_mean_score": 0.15809331016967038
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.3173342406187366,
13
- "micro_mean_score": 0.3487962166809973
14
  },
15
  "overall_score": 0.1802478219287358
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.15999641916771298
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.3173342406187366
 
12
  },
13
  "overall_score": 0.1802478219287358
14
  },
static/eval_results/Default/Mammoth_VL/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.264052880412689,
7
- "micro_mean_score": 0.2626894374387823
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.37992668750165337,
13
- "micro_mean_score": 0.40120378331900275
14
  },
15
  "overall_score": 0.27896733083008046
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.264052880412689
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.37992668750165337
 
12
  },
13
  "overall_score": 0.27896733083008046
14
  },
static/eval_results/Default/MiniCPM_v2.6/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.22955895202146906,
7
- "micro_mean_score": 0.22560399396899078
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.41728623355613875,
13
- "micro_mean_score": 0.43452278589853827
14
  },
15
  "overall_score": 0.2537218694467236
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.22955895202146906
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.41728623355613875
 
12
  },
13
  "overall_score": 0.2537218694467236
14
  },
static/eval_results/Default/NVLM/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.21589726765847422,
7
- "micro_mean_score": 0.21406043849932396
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.3478114310231307,
13
- "micro_mean_score": 0.3947549441100602
14
  },
15
  "overall_score": 0.23287631838857856
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.21589726765847422
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.3478114310231307
 
12
  },
13
  "overall_score": 0.23287631838857856
14
  },
static/eval_results/Default/Phi-3.5-vision/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.22995297916629392,
7
- "micro_mean_score": 0.22708502951025372
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.3947914647737769,
13
- "micro_mean_score": 0.42459157351676696
14
  },
15
  "overall_score": 0.2511698139474551
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.22995297916629392
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.3947914647737769
 
12
  },
13
  "overall_score": 0.2511698139474551
14
  },
static/eval_results/Default/Pixtral_12B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.31362045151669854,
7
- "micro_mean_score": 0.3100986209078182
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.4566234428542061,
13
- "micro_mean_score": 0.4870593293207223
14
  },
15
  "overall_score": 0.33202677713439754
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.31362045151669854
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.4566234428542061
 
12
  },
13
  "overall_score": 0.33202677713439754
14
  },
static/eval_results/Default/Qwen2_VL_2B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.20877163406364055,
7
- "micro_mean_score": 0.20561526268932287
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.3154302566225611,
13
- "micro_mean_score": 0.33856405846947557
14
  },
15
  "overall_score": 0.22249997162072932
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.20877163406364055
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.3154302566225611
 
12
  },
13
  "overall_score": 0.22249997162072932
14
  },
static/eval_results/Default/Qwen2_VL_72B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.4542376574527161,
7
- "micro_mean_score": 0.4501201906164793
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.5639771804231668,
13
- "micro_mean_score": 0.5835339638865004
14
  },
15
  "overall_score": 0.4683625465479226
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.4542376574527161
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.5639771804231668
 
12
  },
13
  "overall_score": 0.4683625465479226
14
  },
static/eval_results/Default/Qwen2_VL_7B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.3293449599230247,
7
- "micro_mean_score": 0.325331493515679
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1170,
12
- "macro_mean_score": 0.43955105763038577,
13
- "micro_mean_score": 0.45508547008546996
14
  },
15
  "overall_score": 0.34352990319228904
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.3293449599230247
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1170,
11
+ "macro_mean_score": 0.43955105763038577
 
12
  },
13
  "overall_score": 0.34352990319228904
14
  },
static/eval_results/Default/llava_onevision_72B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.2974368415462532,
7
- "micro_mean_score": 0.2956217833156672
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.4599484231632498,
13
- "micro_mean_score": 0.4850386930352536
14
  },
15
  "overall_score": 0.31835417383358944
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.2974368415462532
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.4599484231632498
 
12
  },
13
  "overall_score": 0.31835417383358944
14
  },
static/eval_results/Default/llava_onevision_7B/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
- "macro_mean_score": 0.21362697219149712,
7
- "micro_mean_score": 0.21073910058505504
8
  },
9
  "open": {
10
  "num_eval_tasks": 65,
11
  "num_eval_samples": 1163,
12
- "macro_mean_score": 0.33979975321921935,
13
- "micro_mean_score": 0.36474634565778147
14
  },
15
  "overall_score": 0.2298670331158574
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
+ "macro_mean_score": 0.21362697219149712
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 65,
10
  "num_eval_samples": 1163,
11
+ "macro_mean_score": 0.33979975321921935
 
12
  },
13
  "overall_score": 0.2298670331158574
14
  },
static/eval_results/SI/Aquila_VL_2B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.20770364903712493,
8
- "micro_mean_score": 0.20333142638522636,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.31474202723571276,
16
- "micro_mean_score": 0.3326568265682657,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.22197543279693666
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.20770364903712493,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.31474202723571276,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.22197543279693666
static/eval_results/SI/Aria/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.3178882776147889,
8
- "micro_mean_score": 0.3101511832828904,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.5137437248005172,
16
- "micro_mean_score": 0.5472939729397295,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.34400233723955265
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.3178882776147889,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.5137437248005172,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.34400233723955265
static/eval_results/SI/Claude_3.5/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
- "macro_mean_score": 0.520276385877485,
7
- "micro_mean_score": 0.5148202137998056
8
  },
9
  "open": {
10
  "num_eval_tasks": 42,
11
  "num_eval_samples": 813,
12
- "macro_mean_score": 0.6479684260295507,
13
- "micro_mean_score": 0.6801968019680197
14
  },
15
  "overall_score": 0.5373019912310938
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
+ "macro_mean_score": 0.520276385877485
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 42,
10
  "num_eval_samples": 813,
11
+ "macro_mean_score": 0.6479684260295507
 
12
  },
13
  "overall_score": 0.5373019912310938
14
  },
static/eval_results/SI/Claude_3.5_new/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
- "macro_mean_score": 0.5462752278980763,
7
- "micro_mean_score": 0.5417881438289601
8
  },
9
  "open": {
10
  "num_eval_tasks": 42,
11
  "num_eval_samples": 813,
12
- "macro_mean_score": 0.6764020657053476,
13
- "micro_mean_score": 0.6924969249692496
14
  },
15
  "overall_score": 0.5636254729390457
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
+ "macro_mean_score": 0.5462752278980763
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 42,
10
  "num_eval_samples": 813,
11
+ "macro_mean_score": 0.6764020657053476
 
12
  },
13
  "overall_score": 0.5636254729390457
14
  },
static/eval_results/SI/GPT_4o/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
- "macro_mean_score": 0.5529953662872719,
7
- "micro_mean_score": 0.5483479105928085
8
  },
9
  "open": {
10
  "num_eval_tasks": 42,
11
  "num_eval_samples": 813,
12
- "macro_mean_score": 0.6600228904804206,
13
- "micro_mean_score": 0.6801968019680197
14
  },
15
  "overall_score": 0.5672657028463584
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
+ "macro_mean_score": 0.5529953662872719
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 42,
10
  "num_eval_samples": 813,
11
+ "macro_mean_score": 0.6600228904804206
 
12
  },
13
  "overall_score": 0.5672657028463584
14
  },
static/eval_results/SI/GPT_4o_mini/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
- "macro_mean_score": 0.4431039098921726,
7
- "micro_mean_score": 0.43780369290573373
8
  },
9
  "open": {
10
  "num_eval_tasks": 42,
11
  "num_eval_samples": 813,
12
- "macro_mean_score": 0.595574663769726,
13
- "micro_mean_score": 0.6334563345633456
14
  },
15
  "overall_score": 0.46343334374251305
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
+ "macro_mean_score": 0.4431039098921726
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 42,
10
  "num_eval_samples": 813,
11
+ "macro_mean_score": 0.595574663769726
 
12
  },
13
  "overall_score": 0.46343334374251305
14
  },
static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
- "macro_mean_score": 0.43481964330318734,
7
- "micro_mean_score": 0.4297862001943635
8
  },
9
  "open": {
10
  "num_eval_tasks": 42,
11
  "num_eval_samples": 813,
12
- "macro_mean_score": 0.5787083135236054,
13
- "micro_mean_score": 0.6186961869618696
14
  },
15
  "overall_score": 0.4540047993325765
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
+ "macro_mean_score": 0.43481964330318734
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 42,
10
  "num_eval_samples": 813,
11
+ "macro_mean_score": 0.5787083135236054
 
12
  },
13
  "overall_score": 0.4540047993325765
14
  },
static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json CHANGED
@@ -3,14 +3,12 @@
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
- "macro_mean_score": 0.4914311038229404,
7
- "micro_mean_score": 0.48323615160349853
8
  },
9
  "open": {
10
  "num_eval_tasks": 42,
11
  "num_eval_samples": 813,
12
- "macro_mean_score": 0.5814975405131552,
13
- "micro_mean_score": 0.6174661746617466
14
  },
15
  "overall_score": 0.5034399620483024
16
  },
 
3
  "core": {
4
  "num_eval_tasks": 273,
5
  "num_eval_samples": 4116,
6
+ "macro_mean_score": 0.4914311038229404
 
7
  },
8
  "open": {
9
  "num_eval_tasks": 42,
10
  "num_eval_samples": 813,
11
+ "macro_mean_score": 0.5814975405131552
 
12
  },
13
  "overall_score": 0.5034399620483024
14
  },
static/eval_results/SI/Idefics3/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.08941182847569326,
8
- "micro_mean_score": 0.08779475233900695,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.3231434267517844,
16
- "micro_mean_score": 0.3618081180811809,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.12057604157917208
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.08941182847569326,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.3231434267517844,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.12057604157917208
static/eval_results/SI/InternVL2_2B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.12069001041308772,
8
- "micro_mean_score": 0.11842605219090299,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.28522459992910454,
16
- "micro_mean_score": 0.28886838868388687,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.14262795568189
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.12069001041308772,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.28522459992910454,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.14262795568189
static/eval_results/SI/InternVL2_76B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.3998616568018755,
8
- "micro_mean_score": 0.39149064302628933,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.554748737158244,
16
- "micro_mean_score": 0.5800738007380073,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.42051326751605805
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.3998616568018755,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.554748737158244,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.42051326751605805
static/eval_results/SI/InternVL2_8B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.27650612401825575,
8
- "micro_mean_score": 0.27119471729837735,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.39388373890935635,
16
- "micro_mean_score": 0.4045510455104551,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.29215647267040246
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.27650612401825575,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.39388373890935635,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.29215647267040246
static/eval_results/SI/Llama_3_2_11B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.20789144960796493,
8
- "micro_mean_score": 0.20163641703273802,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.3861125858565788,
16
- "micro_mean_score": 0.4130381303813038,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.2316542677744468
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.20789144960796493,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.3861125858565788,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.2316542677744468
static/eval_results/SI/MiniCPM_v2.6/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.23230765810722817,
8
- "micro_mean_score": 0.22684118052665975,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.4360655066213874,
16
- "micro_mean_score": 0.4588560885608856,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.2594753712424494
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.23230765810722817,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.4360655066213874,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.2594753712424494
static/eval_results/SI/Molmo_72B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4073,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.36480000609384927,
8
- "micro_mean_score": 0.36205779758110807,
9
  "missing_tasks": [
10
  "planning_screenshot_termes",
11
  "table_understanding",
@@ -17,7 +16,6 @@
17
  "num_eval_samples": 813,
18
  "num_not_eval_samples": 0,
19
  "macro_mean_score": 0.4465682063915481,
20
- "micro_mean_score": 0.4850553505535054,
21
  "missing_tasks": []
22
  },
23
  "overall_score": 0.3758072638262318
 
5
  "num_eval_samples": 4073,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.36480000609384927,
 
8
  "missing_tasks": [
9
  "planning_screenshot_termes",
10
  "table_understanding",
 
16
  "num_eval_samples": 813,
17
  "num_not_eval_samples": 0,
18
  "macro_mean_score": 0.4465682063915481,
 
19
  "missing_tasks": []
20
  },
21
  "overall_score": 0.3758072638262318
static/eval_results/SI/Molmo_7B_D/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4102,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.2098088446992518,
8
- "micro_mean_score": 0.20550929661464645,
9
  "missing_tasks": [
10
  "MMSoc_Misinformation_PolitiFact"
11
  ]
@@ -15,7 +14,6 @@
15
  "num_eval_samples": 813,
16
  "num_not_eval_samples": 0,
17
  "macro_mean_score": 0.35697926179118733,
18
- "micro_mean_score": 0.38936039360393604,
19
  "missing_tasks": []
20
  },
21
  "overall_score": 0.22949405972428777
 
5
  "num_eval_samples": 4102,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.2098088446992518,
 
8
  "missing_tasks": [
9
  "MMSoc_Misinformation_PolitiFact"
10
  ]
 
14
  "num_eval_samples": 813,
15
  "num_not_eval_samples": 0,
16
  "macro_mean_score": 0.35697926179118733,
 
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.22949405972428777
static/eval_results/SI/NVLM/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.32989872890926025,
8
- "micro_mean_score": 0.32315683713111915,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.4469349818134809,
16
- "micro_mean_score": 0.4881303813038132,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.34550356262982296
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.32989872890926025,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.4469349818134809,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.34550356262982296
static/eval_results/SI/POINTS_15_7B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.31355970638319003,
8
- "micro_mean_score": 0.30728203432446294,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.41331219301389166,
16
- "micro_mean_score": 0.42749077490774917,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.32686003793395024
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.31355970638319003,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.41331219301389166,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.32686003793395024
static/eval_results/SI/POINTS_7B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.25511317681632334,
8
- "micro_mean_score": 0.24927711632415062,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.30315625179016,
16
- "micro_mean_score": 0.3313653136531366,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.26151892014616823
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.25511317681632334,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.30315625179016,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.26151892014616823
static/eval_results/SI/Phi-3.5-vision/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.2561274958722834,
8
- "micro_mean_score": 0.2504214576875906,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.4272267419054576,
16
- "micro_mean_score": 0.445879458794588,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.2789407286767066
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.2561274958722834,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.4272267419054576,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.2789407286767066
static/eval_results/SI/Pixtral_12B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.3436942439614412,
8
- "micro_mean_score": 0.3373564384613738,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.4417271955536318,
16
- "micro_mean_score": 0.4845633456334564,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.3567653041737333
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.3436942439614412,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.4417271955536318,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.3567653041737333
static/eval_results/SI/Qwen2_VL_2B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.22787906973244856,
8
- "micro_mean_score": 0.2234748515064842,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.3509364634962041,
16
- "micro_mean_score": 0.3768757687576875,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.24428672223428263
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.22787906973244856,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.3509364634962041,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.24428672223428263
static/eval_results/SI/Qwen2_VL_72B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.4730536307784527,
8
- "micro_mean_score": 0.4659830915476831,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.5510079982505317,
16
- "micro_mean_score": 0.5826568265682657,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.48344754644139654
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.4730536307784527,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.5510079982505317,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.48344754644139654
static/eval_results/SI/Qwen2_VL_7B/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.3538656561495699,
8
- "micro_mean_score": 0.34581250459157137,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.4517429592549692,
16
- "micro_mean_score": 0.4730012300123002,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.3669159632302898
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.3538656561495699,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.4517429592549692,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.3669159632302898
static/eval_results/SI/SmolVLM/summary_results.json CHANGED
@@ -5,7 +5,6 @@
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.07348385181460795,
8
- "micro_mean_score": 0.0732694668402814,
9
  "missing_tasks": []
10
  },
11
  "open": {
@@ -13,7 +12,6 @@
13
  "num_eval_samples": 813,
14
  "num_not_eval_samples": 0,
15
  "macro_mean_score": 0.2427337975725658,
16
- "micro_mean_score": 0.2504920049200492,
17
  "missing_tasks": []
18
  },
19
  "overall_score": 0.09605051124900234
 
5
  "num_eval_samples": 4116,
6
  "num_not_eval_samples": 0,
7
  "macro_mean_score": 0.07348385181460795,
 
8
  "missing_tasks": []
9
  },
10
  "open": {
 
12
  "num_eval_samples": 813,
13
  "num_not_eval_samples": 0,
14
  "macro_mean_score": 0.2427337975725658,
 
15
  "missing_tasks": []
16
  },
17
  "overall_score": 0.09605051124900234