from dataclasses import dataclass
from enum import Enum
@dataclass
class Task:
    """Describe one task whose score is shown on the leaderboard.

    Attributes:
        benchmark: Task key as it appears in the results JSON file.
        metric: Metric key as it appears in the results JSON file.
        col_name: Name to display for this task in the leaderboard.
    """

    benchmark: str
    metric: str
    col_name: str
# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    """Tasks selected for the leaderboard.

    Each member's value is a ``Task`` built from three positional arguments:
    the task key in the JSON file, the metric key in the JSON file, and the
    name to display in the leaderboard.
    """

    # NOTE(review): these entries look like question-answering benchmarks,
    # while TITLE in this file names a time-series forecasting leaderboard.
    # Confirm that this task list is still the intended one.

    # Disabled example entry, kept for reference:
    # task0 = Task("boolq", "acc", "BoolQA")
    task1 = Task("trivia", "EM", "TriviaQA")
    task2 = Task("truthfulqa", "EM", "TruthfulQA")
    task3 = Task("popqa", "acc", "PopQA")
    task4 = Task("hpqa", "EM", "HotpotQA")
    task5 = Task("nq", "EM", "Natural Questions")
    task6 = Task("2wiki", "EM", "2WikiMultiHop")
    task7 = Task("musique", "EM", "MuSiQue")
    # Disabled example entries. `task1` below reuses a member name that is
    # already taken above, so rename it before re-enabling.
    # task0 = Task("anli_r1", "acc", "ANLI")
    # task1 = Task("logiqa", "acc_norm", "LogiQA")
# Few-shot count; change this value to match your own few-shot setting.
NUM_FEWSHOT = 0
# ---------------------------------------------------
# Leaderboard name. The text is wrapped in a leading and a trailing newline.
TITLE = "\nGIFT-Eval Time Series Forecasting Leaderboard\n"
# What does your leaderboard evaluate?
# Short description of the GIFT-Eval benchmark (dataset count, number of
# series and data points, domains, frequencies, prediction lengths).
# NOTE(review): the text still starts with a "[Placeholder]" marker; confirm
# whether that marker is meant to be shown to users.
# NOTE(review): the capitals in "General TIme Series ForecasTing" presumably
# spell out the GIFT acronym and are not a typo; verify before "fixing".
INTRODUCTION_TEXT = """
[Placeholder] We introduce the General TIme Series ForecasTing Model Evaluation, GIFT-Eval,
a pioneering benchmark aimed at promoting evaluation across diverse datasets.
GIFT-Eval encompasses 28 datasets over 144,000 time series and 177 million data
points, spanning seven domains, 10 frequencies, multivariate inputs, and prediction lengths ranging from short to long-term forecasts.
"""
# Which evaluations are you running? How can people reproduce what you have?
# Markdown text with step-by-step instructions and shell snippets.
# NOTE(review): the instructions describe the ContextualBench LLM workflow,
# while TITLE in this file names the GIFT-Eval leaderboard; confirm which
# benchmark this section is supposed to document.
# Plain string on purpose: the text has no placeholders, so no f-prefix.
LLM_BENCHMARKS_TEXT = """
How It Works
To participate in the ContextualBench leaderboard, follow these steps to evaluate your Large Language Model (LLM) using the ContextualBench framework:
Clone the Repository: Start by cloning the ContextualBench GitHub repository to your local machine using the following command:
```bash
git clone https://github.com/SalesforceAIResearch/SFR-RAG
```
Navigate to the Directory: Move into the cloned repository's directory:
```bash
cd ContextualBench
```
Install Dependencies: Install all necessary dependencies by executing:
```bash
pip install -r requirements.txt
```
Prepare Your Model and Dataset: Set up your model and dataset according to the guidelines provided in the repository's documentation.
Run the Evaluation Script: Execute the evaluation script to generate outputs for your model on the specified dataset:
```bash
python run.py [dataset_name]
```
Collect and Format Outputs: Gather the outputs generated for each dataset and format them according to the leaderboard submission guidelines.
Submit Your Results: Email the formatted outputs to the author's email address for evaluation. Our team will assess the performance and update the leaderboard accordingly.
Reproducibility
Ensuring reproducibility is a key aspect of the ContextualBench leaderboard.
By following the standardized steps outlined above, participants can consistently reproduce evaluation results. This process not only facilitates fair comparisons across different models but also encourages transparency and reliability in model assessments. Participants are encouraged to adhere strictly to the submission guidelines to ensure their results are accurately reflected on the leaderboard.
"""
# Evaluation-queue text; currently nothing but a single newline.
EVALUATION_QUEUE_TEXT = "\n"
# Label and BibTeX snippet offered to users who want to cite these results.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# The entry type is `@inproceedings` because the record carries `booktitle`
# (a workshop venue) and no `journal`; under `@article` the standard BibTeX
# styles ignore `booktitle`, so the venue would be lost from the citation.
CITATION_BUTTON_TEXT = r"""
@inproceedings{
aksu2024gifteval,
title={{GIFT}-Eval: A Benchmark for General Time Series Forecasting Model Evaluation},
author={Taha Aksu and Gerald Woo and Juncheng Liu and Xu Liu and Chenghao Liu and Silvio Savarese and Caiming Xiong and Doyen Sahoo},
booktitle={NeurIPS Workshop on Time Series in the Age of Large Models},
year={2024},
url={https://openreview.net/forum?id=Z2cMOOANFX}
}
"""