Update README.md
README.md CHANGED
@@ -94,24 +94,16 @@ The website [https://swallow-llm.github.io/](https://swallow-llm.github.io/) pro
-|Model|coding|extraction|humanities|math|reasoning|roleplay|stem|writing|JMTAvg|
+| Model | coding | extraction | humanities | math | reasoning | roleplay | stem | writing | JMT Avg |
 |---|---|---|---|---|---|---|---|---|---|
-| KARAKURI LM 70B Chat v0.1 | 0.2804 | 0.5862 | 0.624 | 0.2934 | 0.4183 | 0.553 | 0.4859 | 0.5964 | 0.4797 |
-| Swallow-70b-instruct-v0.1 | 0.303 | 0.55 | 0.565 | 0.3483 | 0.305 | 0.542 | 0.4916 | 0.463 | 0.446 |
-| Llama 3 70B Instruct | 0.5969 | 0.841 | 0.712 | 0.4481 | 0.4884 | 0.7117 | 0.651 | 0.69 | 0.6424 |
+| Qwen2-72B-Instruct | 0.5699 | 0.7858 | 0.8222 | 0.5096 | 0.7032 | 0.7963 | 0.7728 | 0.8223 | 0.7228 |
+| Qwen2.5-72B-Instruct | 0.7060 | 0.7866 | 0.8122 | 0.6968 | 0.6536 | 0.8301 | 0.8060 | 0.7841 | 0.7594 |
+| Llama 3 70B Instruct | 0.5969 | 0.8410 | 0.7120 | 0.4481 | 0.4884 | 0.7117 | 0.6510 | 0.6900 | 0.6424 |
 | Llama 3.1 70B Instruct | 0.5252 | 0.7846 | 0.7086 | 0.5063 | 0.6979 | 0.6888 | 0.6402 | 0.6653 | 0.6521 |
 | Llama 3 Youko 70B Instruct | 0.6632 | 0.8387 | 0.8108 | 0.4655 | 0.7013 | 0.7778 | 0.7544 | 0.7662 | 0.7222 |
-| Llama
-| Llama 3
-| Llama 3 Swallow 70B Instruct | 0.
-| Llama 3.1 Swallow 70B Instruct | 0.5676 | 0.7859 | 0.749 | 0.5437 | 0.6383 | 0.687 | 0.6121 | 0.654 | 0.6547 |
-| Qwen2-72B-Instruct | 0.5699 | 0.7858 | 0.8222 | 0.5096 | 0.7032 | 0.7963 | 0.7728 | 0.8223 | 0.7228 |
-| Qwen2.5-72B-Instruct | 0.706 | 0.7866 | 0.8122 | 0.6968 | 0.6536 | 0.8301 | 0.806 | 0.7841 | 0.7594 |
-| Mixtral-8x22B-Instruct-v0.1 | 0.5061 | 0.7454 | 0.5978 | 0.4772 | 0.476 | 0.542 | 0.4679 | 0.6244 | 0.5546 |
-| Llama 3.1 405B Instruct (deepinfra API) | 0.6464 | 0.8218 | 0.715 | 0.5313 | 0.6447 | 0.716 | 0.6737 | 0.677 | 0.6782 |
+| Llama 3 heron brain 70B v0.3 | 0.3762 | 0.7892 | 0.7274 | 0.5589 | 0.5070 | 0.6662 | 0.6880 | 0.6996 | 0.6266 |
+| Llama 3 Swallow 70B Instruct | 0.5269 | 0.7250 | 0.5690 | 0.4669 | 0.6121 | 0.6238 | 0.5533 | 0.5698 | 0.5809 |
+| Llama 3.1 Swallow 70B Instruct | 0.5676 | 0.7859 | 0.7490 | 0.5437 | 0.6383 | 0.6870 | 0.6121 | 0.6540 | 0.6547 |
 | GPT-3.5 (gpt-3.5-turbo-0125) | 0.6851 | 0.7641 | 0.7414 | 0.5522 | 0.5128 | 0.7104 | 0.6266 | 0.7361 | 0.6661 |
-| GPT-4o (gpt-4o-2024-05-13) | 0.7296 | 0.
+| GPT-4o (gpt-4o-2024-05-13) | 0.7296 | 0.8540 | 0.8646 | 0.6641 | 0.6661 | 0.8274 | 0.8184 | 0.8085 | 0.7791 |

 ## Evaluation Benchmarks
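The JMT Avg column looks like the unweighted mean of the eight per-category scores; the diff itself does not state the averaging rule, so treat that as an inference from the numbers. A minimal sketch that re-derives a few of the reported averages from the updated table:

```python
# Sanity-check: JMT Avg as the unweighted mean of the eight category
# scores (coding, extraction, humanities, math, reasoning, roleplay,
# stem, writing). The averaging rule is inferred, not stated in the diff.
rows = {
    "Qwen2.5-72B-Instruct":           ([0.7060, 0.7866, 0.8122, 0.6968, 0.6536, 0.8301, 0.8060, 0.7841], 0.7594),
    "Llama 3.1 Swallow 70B Instruct": ([0.5676, 0.7859, 0.7490, 0.5437, 0.6383, 0.6870, 0.6121, 0.6540], 0.6547),
    "GPT-4o (gpt-4o-2024-05-13)":     ([0.7296, 0.8540, 0.8646, 0.6641, 0.6661, 0.8274, 0.8184, 0.8085], 0.7791),
}

for model, (scores, reported) in rows.items():
    mean = sum(scores) / len(scores)
    # Compare with a tolerance of half a unit in the fourth decimal place,
    # since the table rounds to four decimals.
    ok = abs(mean - reported) < 5e-5
    print(f"{model}: mean={mean:.4f}, reported={reported:.4f}, match={ok}")
```

For these rows the plain mean reproduces the reported JMT Avg to four decimal places, which supports reading the column as a simple average over the eight categories.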