# MMLU-Pro / utils.py
import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository
HF_TOKEN = os.environ.get("HF_TOKEN")
SUBJECTS = ["Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
            "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
MODEL_INFO = [
    "Models",
    "Overall",
    "Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
    "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
DATA_TITLE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number',
                   'number', 'number', 'number', 'number', 'number', 'number', 'number',
                   'number', 'number']
SUBMISSION_NAME = "mmlu_pro_leaderboard_submission"
SUBMISSION_URL = "https://huggingface.co/datasets/TIGER-Lab/" + SUBMISSION_NAME
CSV_DIR = os.path.join(SUBMISSION_NAME, "results.csv")
COLUMN_NAMES = MODEL_INFO
LEADERBOARD_INTRODUCTION = """# MMLU-Pro Leaderboard
MMLU-Pro is a more robust and challenging massive multi-task understanding dataset, designed to benchmark large language models' capabilities more rigorously. It contains 12K complex questions across various disciplines. The following table reports the accuracy of each model evaluated on MMLU-Pro.
We invite you to use our dataset available at [https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro). If you want to reproduce our results or evaluate your own models on MMLU-Pro, please check out our evaluation scripts at [https://github.com/TIGER-AI-Lab/MMLU-Pro](https://github.com/TIGER-AI-Lab/MMLU-Pro).
"""
TABLE_INTRODUCTION = """
"""
LEADERBOARD_INFO = """
We list the information of the datasets used below:<br>
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r""""""
SUBMIT_INTRODUCTION = """# Submit to the MMLU-Pro Leaderboard
## ⚠ Please note that you need to submit a JSON file in the following format:
```json
{
    "Model": "[MODEL_NAME]",
    "Overall": 0.5678,
    "Biology": 0.1234,
    "Business": 0.4567,
    ...,
    "Other": 0.3456
}
```
After submitting, you can click the "Refresh" button to see the updated leaderboard (it may take a few seconds).
"""
def get_df():
    """Pull the latest submissions and return the sorted leaderboard table."""
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                      use_auth_token=HF_TOKEN, repo_type="dataset")
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    df = df.sort_values(by=['Overall'], ascending=False)
    return df[COLUMN_NAMES]
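# Sketch under assumptions: this file does not build the UI itself, but the
# constants above are shaped for a gradio Dataframe, so the app presumably
# wires them up roughly like this (the helper name is hypothetical).
def _leaderboard_table():
    return gr.components.Dataframe(
        value=get_df(),
        headers=COLUMN_NAMES,
        datatype=DATA_TITLE_TYPE,
        interactive=False,
    )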
def add_new_eval(
    input_file,
):
    if input_file is None:
        return "Error! Empty file!"
    # input_file is expected to hold the raw JSON content of the submission.
    upload_data = json.loads(input_file)
    data_row = [upload_data['Model'], upload_data['Overall']]
    for subject in SUBJECTS:
        data_row.append(upload_data[subject])
    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN, repo_type="dataset")
    submission_repo.git_pull()
    # Collect the model names that are already on the leaderboard (first CSV column).
    already_submitted = []
    with open(CSV_DIR, mode='r') as file:
        reader = csv.reader(file, delimiter=',')
        for row in reader:
            already_submitted.append(row[0])
    if data_row[0] not in already_submitted:
        with open(CSV_DIR, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(data_row)
        submission_repo.push_to_hub()
        return 'Submission Successful'
    else:
        return 'The entry already exists'
def refresh_data():
    return get_df()
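# Minimal local smoke test (an assumption, not from the original file): requires
# HF_TOKEN to be set and the submission dataset to be reachable.
if __name__ == "__main__":
    print(refresh_data().head())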