Spaces:

TIGER-Lab
/

MMLU-Pro

Running on CPU Upgrade

App Files Files Community

ubowang committed on May 20, 2024

Commit

17ec649

verified ·

1 Parent(s): 653a1dc

Update utils.py

Browse files

Files changed (1) hide show

utils.py +14 -18

utils.py CHANGED Viewed

@@ -18,8 +18,8 @@ MODEL_INFO = [
     "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
 DATA_TITLE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number',
-                  'number', 'number', 'number', 'number', 'number', 'number', 'number',
-                  'number', 'number']
 SUBMISSION_NAME = "mmlu_pro_leaderboard_submission"
 SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
@@ -27,10 +27,11 @@ CSV_DIR = "./mmlu_pro_leaderboard_submission/results.csv"
 COLUMN_NAMES = MODEL_INFO
-LEADERBORAD_INTRODUCTION = """# MMLU-Pro Leaderboard
-    MMLU-Pro dataset, a more robust and challenging massive multi-task understanding dataset tailored to more \
-    rigorously benchmark large language models' capabilities. This dataset contains 12K \
-    complex questions across various disciplines.
     """
 TABLE_INTRODUCTION = """
@@ -50,23 +51,20 @@ SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction
 ```json
 {
-    "Model": "[NAME]",
-    "Repo": "https://huggingface.co/[MODEL_NAME],"
-    "Overall": 56.7,
-    "Biology": 23.4,
-    "Business": 45.6,
     ...,
-    "Other: 56.7"
 }
 ```
-After submitting, you can click the "Refresh" button to see the updated leaderboard(it may takes few seconds).
 """
 def get_df():
-    print("HF_TOKEN", HF_TOKEN)
-    print("SUBMISSION_URL", SUBMISSION_URL)
     repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
     repo.git_pull()
     df = pd.read_csv(CSV_DIR)
@@ -81,7 +79,7 @@ def add_new_eval(
         return "Error! Empty file!"
     upload_data = json.loads(input_file)
-    data_row = [f'[{upload_data["Model"]}]({upload_data["Repo"]})', upload_data['Overall']]
     for subject in SUBJECTS:
         data_row += [upload_data[subject]]
@@ -109,5 +107,3 @@ def add_new_eval(
 def refresh_data():
     return get_df()

     "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
 DATA_TITLE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number',
+                   'number', 'number', 'number', 'number', 'number', 'number', 'number',
+                   'number', 'number']
 SUBMISSION_NAME = "mmlu_pro_leaderboard_submission"
 SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
 COLUMN_NAMES = MODEL_INFO
+LEADERBOARD_INTRODUCTION = """# MMLU-Pro Leaderboard
+MMLU-Pro dataset, a more robust and challenging massive multi-task understanding dataset tailored to more rigorously benchmark large language models' capabilities. This dataset contains 12K complex questions across various disciplines. The following are the accuracies of various models evaluated on MMLU-Pro.
+We invite you to use our dataset available at [https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro). If you want to reproduce our results or evaluate your own models on MMLU-Pro, please check out our evaluation scripts at [https://github.com/TIGER-AI-Lab/MMLU-Pro](https://github.com/TIGER-AI-Lab/MMLU-Pro).
     """
 TABLE_INTRODUCTION = """
 ```json
 {
+    "Model": "[MODEL_NAME]",
+    "Overall": 0.5678,
+    "Biology": 0.1234,
+    "Business": 0.4567,
     ...,
+    "Other": 0.3456
 }
 ```
+After submitting, you can click the "Refresh" button to see the updated leaderboard (it may take a few seconds).
 """
 def get_df():
     repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
     repo.git_pull()
     df = pd.read_csv(CSV_DIR)
         return "Error! Empty file!"
     upload_data = json.loads(input_file)
+    data_row = [f'{upload_data["Model"]}', upload_data['Overall']]
     for subject in SUBJECTS:
         data_row += [upload_data[subject]]
 def refresh_data():
     return get_df()