3v324v23 committed on
Commit b2d95df · 1 Parent(s): 86751a4
app.py CHANGED
@@ -10,7 +10,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
10
  from huggingface_hub import HfApi
11
  from transformers import AutoConfig
12
 
13
- from src.auto_leaderboard.get_model_metadata import apply_metadata
14
  from src.assets.text_content import *
15
  from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
16
  from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
@@ -82,11 +82,11 @@ def get_leaderboard_df():
82
  print("Pulling evaluation results for the leaderboard.")
83
  eval_results_private.git_pull()
84
 
85
- all_data = get_eval_results_dicts(IS_PUBLIC)
86
 
87
- # if not IS_PUBLIC:
88
- all_data.append(gpt4_values)
89
- all_data.append(gpt35_values)
90
 
91
  all_data.append(baseline)
92
  apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
@@ -227,9 +227,13 @@ def add_new_eval(
227
  os.makedirs(OUT_DIR, exist_ok=True)
228
  out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
229
 
230
  # Check for duplicate submission
231
  if out_path.split("eval-queue/")[1].lower() in requested_models:
232
- return styled_warning("This model has been already submitted.")
233
 
234
  with open(out_path, "w") as f:
235
  f.write(json.dumps(eval_entry))
@@ -290,7 +294,30 @@ def filter_items(df, leaderboard_table, query):
290
  if AutoEvalColumn.model_type_symbol.name in leaderboard_table.columns:
291
  filtered_df = df[(df[AutoEvalColumn.model_type_symbol.name] == query)]
292
  else:
293
- return leaderboard_table.columns
294
  return filtered_df[leaderboard_table.columns]
295
 
296
  def change_tab(query_param):
@@ -306,6 +333,10 @@ def change_tab(query_param):
306
  else:
307
  return gr.Tabs.update(selected=0)
308
 
309
 
310
  demo = gr.Blocks(css=custom_css)
311
  with demo:
@@ -328,20 +359,46 @@ with demo:
328
  show_label=False,
329
  elem_id="search-bar",
330
  )
331
- filter_columns = gr.Radio(
332
- label="⏚ Filter model types",
333
- choices = [
334
- "all",
335
- ModelType.PT.to_str(),
336
- ModelType.FT.to_str(),
337
- ModelType.IFT.to_str(),
338
- ModelType.RL.to_str(),
339
- ],
340
- value="all",
341
- elem_id="filter-columns"
342
- )
343
  leaderboard_table = gr.components.Dataframe(
344
- value=leaderboard_df[[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value+ [AutoEvalColumn.dummy.name]],
345
  headers=[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value + [AutoEvalColumn.dummy.name],
346
  datatype=TYPES,
347
  max_rows=None,
@@ -363,8 +420,11 @@ with demo:
363
  [hidden_leaderboard_table_for_search, leaderboard_table, search_bar],
364
  leaderboard_table,
365
  )
366
- shown_columns.change(select_columns, [hidden_leaderboard_table_for_search, shown_columns], leaderboard_table)
367
- filter_columns.change(filter_items, [hidden_leaderboard_table_for_search, leaderboard_table, filter_columns], leaderboard_table)
368
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
369
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
370
 
@@ -432,6 +492,6 @@ with demo:
432
  )
433
 
434
  scheduler = BackgroundScheduler()
435
- scheduler.add_job(restart_space, "interval", seconds=1800)
436
  scheduler.start()
437
  demo.queue(concurrency_count=40).launch()
 
10
  from huggingface_hub import HfApi
11
  from transformers import AutoConfig
12
 
13
+ from src.auto_leaderboard.get_model_metadata import apply_metadata, DO_NOT_SUBMIT_MODELS
14
  from src.assets.text_content import *
15
  from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
16
  from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
 
82
  print("Pulling evaluation results for the leaderboard.")
83
  eval_results_private.git_pull()
84
 
85
+ all_data = get_eval_results_dicts()
86
 
87
+ if not IS_PUBLIC:
88
+ all_data.append(gpt4_values)
89
+ all_data.append(gpt35_values)
90
 
91
  all_data.append(baseline)
92
  apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
 
227
  os.makedirs(OUT_DIR, exist_ok=True)
228
  out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
229
 
230
+ # Check if the model has been forbidden:
231
+ if out_path.split("eval-queue/")[1] in DO_NOT_SUBMIT_MODELS:
232
+ return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
233
+
234
  # Check for duplicate submission
235
  if out_path.split("eval-queue/")[1].lower() in requested_models:
236
+ return styled_warning("This model has been already submitted.")
237
 
238
  with open(out_path, "w") as f:
239
  f.write(json.dumps(eval_entry))
 
294
  if AutoEvalColumn.model_type_symbol.name in leaderboard_table.columns:
295
  filtered_df = df[(df[AutoEvalColumn.model_type_symbol.name] == query)]
296
  else:
297
+ return filtered_df[leaderboard_table.columns]
298
+ return filtered_df[leaderboard_table.columns]
299
+
300
+ def filter_items_size(df, leaderboard_table, query):
301
+ numeric_intervals = {
302
+ "all": None,
303
+ "< 1B": (0, 1),
304
+ "~3B": (1, 5),
305
+ "~7B": (6, 11),
306
+ "~13B": (12, 15),
307
+ "~35B": (16, 55),
308
+ "60B+": (55, 1000)
309
+ }
310
+
311
+ if query == "all":
312
+ return df[leaderboard_table.columns]
313
+
314
+ numeric_interval = numeric_intervals[query]
315
+
316
+ if AutoEvalColumn.params.name in leaderboard_table.columns:
317
+ params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors='coerce')
318
+ filtered_df = df[params_column.between(*numeric_interval)]
319
+ else:
320
+ return filtered_df[leaderboard_table.columns]
321
  return filtered_df[leaderboard_table.columns]
322
 
323
  def change_tab(query_param):
 
333
  else:
334
  return gr.Tabs.update(selected=0)
335
 
336
+ def update_filter_type(input_type, shown_columns):
337
+ shown_columns.append(AutoEvalColumn.params.name)
338
+ return gr.update(visible=(input_type == 'types')), gr.update(visible=(input_type == 'sizes')), shown_columns
339
+
340
 
341
  demo = gr.Blocks(css=custom_css)
342
  with demo:
 
359
  show_label=False,
360
  elem_id="search-bar",
361
  )
362
+ with gr.Box(elem_id="box-filter"):
363
+ filter_type = gr.Dropdown(
364
+ label="⏚ Filter model",
365
+ choices=["types", "sizes"], value="types",
366
+ interactive=True,
367
+ elem_id="filter_type"
368
+ )
369
+ filter_columns = gr.Radio(
370
+ label="⏚ Filter model types",
371
+ show_label=False,
372
+ choices = [
373
+ "all",
374
+ ModelType.PT.to_str(),
375
+ ModelType.FT.to_str(),
376
+ ModelType.IFT.to_str(),
377
+ ModelType.RL.to_str(),
378
+ ],
379
+ value="all",
380
+ elem_id="filter-columns"
381
+ )
382
+ filter_columns_size = gr.Radio(
383
+ label="⏚ Filter model sizes",
384
+ show_label=False,
385
+ choices = [
386
+ "all",
387
+ "< 1B",
388
+ "~3B",
389
+ "~7B",
390
+ "~13B",
391
+ "~35B",
392
+ "60B+"
393
+ ],
394
+ value="all",
395
+ visible=False,
396
+ interactive=True,
397
+ elem_id="filter-columns-size"
398
+ )
399
+
400
  leaderboard_table = gr.components.Dataframe(
401
+ value=leaderboard_df[[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value + [AutoEvalColumn.dummy.name]],
402
  headers=[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value + [AutoEvalColumn.dummy.name],
403
  datatype=TYPES,
404
  max_rows=None,
 
420
  [hidden_leaderboard_table_for_search, leaderboard_table, search_bar],
421
  leaderboard_table,
422
  )
423
+
424
+ filter_type.change(update_filter_type,inputs=[filter_type, shown_columns],outputs=[filter_columns, filter_columns_size, shown_columns],queue=False).then(select_columns, [hidden_leaderboard_table_for_search, shown_columns], leaderboard_table, queue=False)
425
+ shown_columns.change(select_columns, [hidden_leaderboard_table_for_search, shown_columns], leaderboard_table, queue=False)
426
+ filter_columns.change(filter_items, [hidden_leaderboard_table_for_search, leaderboard_table, filter_columns], leaderboard_table, queue=False)
427
+ filter_columns_size.change(filter_items_size, [hidden_leaderboard_table_for_search, leaderboard_table, filter_columns_size], leaderboard_table, queue=False)
428
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
429
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
430
 
 
492
  )
493
 
494
  scheduler = BackgroundScheduler()
495
+ scheduler.add_job(restart_space, "interval", seconds=900)
496
  scheduler.start()
497
  demo.queue(concurrency_count=40).launch()
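For illustration only (not part of this commit): a standalone sketch of the size-bucket filtering that the new `filter_items_size` helper performs, using a toy DataFrame. The `#Params (B)` column name is an assumption standing in for `AutoEvalColumn.params.name`.

```python
import pandas as pd

# Toy leaderboard frame; "#Params (B)" is an assumed stand-in for AutoEvalColumn.params.name.
df = pd.DataFrame({
    "Model": ["tiny-model", "seven-b-model", "seventy-b-model"],
    "#Params (B)": [0.3, 7.0, 70.0],
})

# Same bucket boundaries as the new filter_items_size helper in app.py.
numeric_intervals = {
    "all": None,
    "< 1B": (0, 1),
    "~3B": (1, 5),
    "~7B": (6, 11),
    "~13B": (12, 15),
    "~35B": (16, 55),
    "60B+": (55, 1000),
}

def filter_by_size(df: pd.DataFrame, query: str) -> pd.DataFrame:
    if query == "all":
        return df
    low, high = numeric_intervals[query]
    # Non-numeric entries (e.g. missing sizes) become NaN and are filtered out.
    params = pd.to_numeric(df["#Params (B)"], errors="coerce")
    return df[params.between(low, high)]

print(filter_by_size(df, "~7B"))  # keeps only the 7B row
```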
src/assets/css_html_js.py CHANGED
@@ -1,11 +1,4 @@
1
  custom_css = """
2
- #changelog-text {
3
- font-size: 16px !important;
4
- }
5
-
6
- #changelog-text h2 {
7
- font-size: 18px !important;
8
- }
9
 
10
  .markdown-text {
11
  font-size: 16px !important;
@@ -75,6 +68,38 @@ table th:first-child {
75
  #scale-logo .download {
76
  display: none;
77
  }
78
  """
79
 
80
  get_window_url_params = """
@@ -83,4 +108,4 @@ get_window_url_params = """
83
  url_params = Object.fromEntries(params);
84
  return url_params;
85
  }
86
- """
 
1
  custom_css = """
2
 
3
  .markdown-text {
4
  font-size: 16px !important;
 
68
  #scale-logo .download {
69
  display: none;
70
  }
71
+ #filter_type{
72
+ border: 0;
73
+ padding-left: 0;
74
+ padding-top: 0;
75
+ }
76
+ #filter_type label {
77
+ display: flex;
78
+ }
79
+ #filter_type label > span{
80
+ margin-top: var(--spacing-lg);
81
+ margin-right: 0.5em;
82
+ }
83
+ #filter_type label > .wrap{
84
+ width: 103px;
85
+ }
86
+ #filter_type label > .wrap .wrap-inner{
87
+ padding: 2px;
88
+ }
89
+ #filter_type label > .wrap .wrap-inner input{
90
+ width: 1px
91
+ }
92
+ #filter-columns{
93
+ border:0;
94
+ padding:0;
95
+ }
96
+ #filter-columns-size{
97
+ border:0;
98
+ padding:0;
99
+ }
100
+ #box-filter > .form{
101
+ border: 0
102
+ }
103
  """
104
 
105
  get_window_url_params = """
 
108
  url_params = Object.fromEntries(params);
109
  return url_params;
110
  }
111
+ """
src/assets/scale-hf-logo.png CHANGED

Git LFS Details

  • SHA256: 11a263a1abe4c7c9cf022cbe052dc567dcea164bdfbc111299aae3270e992934
  • Pointer size: 132 Bytes
  • Size of remote file: 1.88 MB

src/assets/text_content.py CHANGED
@@ -1,76 +1,31 @@
1
  from ..auto_leaderboard.model_metadata_type import ModelType
2
 
3
- CHANGELOG_TEXT = f"""
4
- ## [2023-06-19]
5
- - Added model type column
6
- - Hid revision and 8bit columns since all models are the same atm
7
-
8
- ## [2023-06-16]
9
- - Refactored code base
10
- - Added new columns: number of parameters, hub likes, license
11
-
12
- ## [2023-06-13]
13
- - Adjust description for TruthfulQA
14
-
15
- ## [2023-06-12]
16
- - Add Human & GPT-4 Evaluations
17
-
18
- ## [2023-06-05]
19
- - Increase concurrent thread count to 40
20
- - Search models on ENTER
21
-
22
- ## [2023-06-02]
23
- - Add a typeahead search bar
24
- - Use webhooks to automatically spawn a new Space when someone opens a PR
25
- - Start recording `submitted_time` for eval requests
26
- - Limit AutoEvalColumn max-width
27
-
28
- ## [2023-05-30]
29
- - Add a citation button
30
- - Simplify Gradio layout
31
-
32
- ## [2023-05-29]
33
- - Auto-restart every hour for the latest results
34
- - Sync with the internal version (minor style changes)
35
-
36
- ## [2023-05-24]
37
- - Add a baseline that has 25.0 for all values
38
- - Add CHANGELOG
39
-
40
- ## [2023-05-23]
41
- - Fix a CSS issue that made the leaderboard hard to read in dark mode
42
-
43
- ## [2023-05-22]
44
- - Display a success/error message after submitting evaluation requests
45
- - Reject duplicate submission
46
- - Do not display results that have incomplete results
47
- - Display different queues for jobs that are RUNNING, PENDING, FINISHED status
48
-
49
- ## [2023-05-15]
50
- - Fix a typo: from "TruthQA" to "QA"
51
-
52
- ## [2023-05-10]
53
- - Fix a bug that prevented auto-refresh
54
-
55
- ## [2023-05-10]
56
- - Release the leaderboard to public
57
- """
58
-
59
  TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
60
 
61
  INTRODUCTION_TEXT = f"""
62
- 📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released.
63
-
64
- 🤗 Anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as the original LLaMa release.
65
 
66
- Other cool benchmarks for LLMs are developed at HuggingFace, go check them out: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance benchmarks](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
 
67
  """
68
 
69
  LLM_BENCHMARKS_TEXT = f"""
70
  # Context
71
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
72
 
73
- 📈 We evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
74
 
75
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
76
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
@@ -80,38 +35,13 @@ With the plethora of large language models (LLMs) and chatbots being released we
80
  For all these evaluations, a higher score is a better score.
81
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
82
 
83
- # Some good practices before submitting a model
84
-
85
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
86
- ```python
87
- from transformers import AutoConfig, AutoModel, AutoTokenizer
88
- config = AutoConfig.from_pretrained("your model name", revision=revision)
89
- model = AutoModel.from_pretrained("your model name", revision=revision)
90
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
91
- ```
92
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
93
-
94
- Note: make sure your model is public!
95
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
96
-
97
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
98
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
99
-
100
- ### 3) Make sure your model has an open license!
101
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
102
-
103
- ### 4) Fill up your model card
104
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
105
-
106
- # Reproducibility and details
107
-
108
- ### Details and logs
109
  You can find:
110
  - detailed numerical results in the `results` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/results
111
  - details on the input/outputs for the models in the `details` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/details
112
  - community queries and running status in the `requests` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/requests
113
 
114
- ### Reproducibility
115
  To reproduce our results, here is the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
116
  `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
117
  ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
@@ -125,29 +55,48 @@ The tasks and few shots parameters are:
125
  - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
126
  - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
127
 
128
- ### Quantization
129
  To get more information about quantization, see:
130
  - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
131
  - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
132
 
133
- ### Icons
134
- {ModelType.PT.to_str(" : ")} model
135
- {ModelType.FT.to_str(" : ")} model
136
- {ModelType.IFT.to_str(" : ")} model
137
- {ModelType.RL.to_str(" : ")} model
138
- If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
139
 
140
 
141
- # In case of model failure
142
  If your model is displayed in the `FAILED` category, its execution stopped.
143
  Make sure you have followed the above steps first.
144
  If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
145
-
146
- """
147
-
148
- EVALUATION_QUEUE_TEXT = f"""
149
- # Evaluation Queue for the 🤗 Open LLM Leaderboard
150
- These models will be automatically evaluated on the 🤗 cluster.
151
  """
152
 
153
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -216,4 +165,4 @@ CITATION_BUTTON_TEXT = r"""
216
  eprint={2109.07958},
217
  archivePrefix={arXiv},
218
  primaryClass={cs.CL}
219
- }"""
 
1
  from ..auto_leaderboard.model_metadata_type import ModelType
2
 
3
  TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
4
 
5
  INTRODUCTION_TEXT = f"""
6
+ 📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
 
 
7
 
8
+ 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
9
+ The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
10
  """
11
 
12
  LLM_BENCHMARKS_TEXT = f"""
13
  # Context
14
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
15
 
16
+ ## Icons
17
+ {ModelType.PT.to_str(" : ")} model
18
+ {ModelType.FT.to_str(" : ")} model
19
+ {ModelType.IFT.to_str(" : ")} model
20
+ {ModelType.RL.to_str(" : ")} model
21
+ If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
22
+
23
+ 🏴‍☠️ indicates that this model has been flagged by the community, and should probably be ignored! Clicking the icon will redirect you to the discussion about the model.
24
+ (For ex, the model was trained on the evaluation data, and is therefore cheating on the leaderboard.)
25
+
26
+ ## How it works
27
+
28
+ 📈 We evaluate models on 4 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
29
 
30
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
31
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 
35
  For all these evaluations, a higher score is a better score.
36
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
37
 
38
+ ## Details and logs
39
  You can find:
40
  - detailed numerical results in the `results` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/results
41
  - details on the input/outputs for the models in the `details` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/details
42
  - community queries and running status in the `requests` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/requests
43
 
44
+ ## Reproducibility
45
  To reproduce our results, here is the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
46
  `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
47
  ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
 
55
  - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
56
  - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
57
 
58
+ ## Quantization
59
  To get more information about quantization, see:
60
  - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
61
  - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
62
 
63
+ ## More resources
64
+ If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/179)!
65
+ We also gather cool resources from the community, other teams, and other labs [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)!
66
+ """
 
 
67
 
68
+ EVALUATION_QUEUE_TEXT = f"""
69
+ # Evaluation Queue for the 🤗 Open LLM Leaderboard
70
+
71
+ Models added here will be automatically evaluated on the 🤗 cluster.
72
 
73
+ ## Some good practices before submitting a model
74
+
75
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
76
+ ```python
77
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
78
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
79
+ model = AutoModel.from_pretrained("your model name", revision=revision)
80
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
81
+ ```
82
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
83
+
84
+ Note: make sure your model is public!
85
+ Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
86
+
87
+ ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
88
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
89
+
90
+ ### 3) Make sure your model has an open license!
91
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
92
+
93
+ ### 4) Fill up your model card
94
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card
95
+
96
+ ## In case of model failure
97
  If your model is displayed in the `FAILED` category, its execution stopped.
98
  Make sure you have followed the above steps first.
99
  If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
100
  """
101
 
102
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
165
  eprint={2109.07958},
166
  archivePrefix={arXiv},
167
  primaryClass={cs.CL}
168
+ }"""
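The submission tips in EVALUATION_QUEUE_TEXT ask for weights in safetensors format but do not show the conversion; a minimal sketch, assuming a model that already loads with the AutoClasses check above ("your model name" and the output directory are placeholders):

```python
from transformers import AutoModel, AutoTokenizer

# Placeholders: replace with your own repo id and output directory.
model = AutoModel.from_pretrained("your model name")
tokenizer = AutoTokenizer.from_pretrained("your model name")

# safe_serialization=True writes model.safetensors instead of pytorch_model.bin.
model.save_pretrained("converted-model", safe_serialization=True)
tokenizer.save_pretrained("converted-model")
# The converted folder can then be uploaded back to the Hub before submitting.
```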
src/auto_leaderboard/get_model_metadata.py CHANGED
@@ -1,9 +1,14 @@
1
  import re
2
  import os
3
  from typing import List
 
4
 
5
- from src.utils_display import AutoEvalColumn
6
- from src.auto_leaderboard.model_metadata_type import get_model_type
 
7
 
8
  from huggingface_hub import HfApi
9
  import huggingface_hub
@@ -11,7 +16,7 @@ api = HfApi(token=os.environ.get("H4_TOKEN", None))
11
 
12
 
13
  def get_model_infos_from_hub(leaderboard_data: List[dict]):
14
- for model_data in leaderboard_data:
15
  model_name = model_data["model_name_for_query"]
16
  try:
17
  model_info = api.model_info(model_name)
@@ -21,12 +26,6 @@ def get_model_infos_from_hub(leaderboard_data: List[dict]):
21
  model_data[AutoEvalColumn.likes.name] = None
22
  model_data[AutoEvalColumn.params.name] = get_model_size(model_name, None)
23
  continue
24
- except Exception:
25
- print("Repo fetch error!", model_name)
26
- model_data[AutoEvalColumn.license.name] = None
27
- model_data[AutoEvalColumn.likes.name] = None
28
- model_data[AutoEvalColumn.params.name] = None
29
- continue
30
 
31
  model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
32
  model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
@@ -57,6 +56,68 @@ def get_model_size(model_name, model_info):
57
  return None
58
 
59
 
60
  def apply_metadata(leaderboard_data: List[dict]):
 
61
  get_model_type(leaderboard_data)
62
  get_model_infos_from_hub(leaderboard_data)
 
 
1
  import re
2
  import os
3
+ import glob
4
+ import json
5
+ import os
6
  from typing import List
7
+ from tqdm import tqdm
8
 
9
+ from src.utils_display import AutoEvalColumn, model_hyperlink
10
+ from src.auto_leaderboard.model_metadata_type import ModelType, model_type_from_str, MODEL_TYPE_METADATA
11
+ from src.auto_leaderboard.model_metadata_flags import FLAGGED_MODELS, DO_NOT_SUBMIT_MODELS
12
 
13
  from huggingface_hub import HfApi
14
  import huggingface_hub
 
16
 
17
 
18
  def get_model_infos_from_hub(leaderboard_data: List[dict]):
19
+ for model_data in tqdm(leaderboard_data):
20
  model_name = model_data["model_name_for_query"]
21
  try:
22
  model_info = api.model_info(model_name)
 
26
  model_data[AutoEvalColumn.likes.name] = None
27
  model_data[AutoEvalColumn.params.name] = get_model_size(model_name, None)
28
  continue
 
30
  model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
31
  model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
 
56
  return None
57
 
58
 
59
+ def get_model_type(leaderboard_data: List[dict]):
60
+ for model_data in leaderboard_data:
61
+ request_files = os.path.join("eval-queue", model_data["model_name_for_query"] + "_eval_request_*" + ".json")
62
+ request_files = glob.glob(request_files)
63
+
64
+ # Select correct request file (precision)
65
+ request_file = ""
66
+ if len(request_files) == 1:
67
+ request_file = request_files[0]
68
+ elif len(request_files) > 1:
69
+ request_files = sorted(request_files, reverse=True)
70
+ for tmp_request_file in request_files:
71
+ with open(tmp_request_file, "r") as f:
72
+ req_content = json.load(f)
73
+ if req_content["status"] == "FINISHED" and req_content["precision"] == model_data["Precision"].split(".")[-1]:
74
+ request_file = tmp_request_file
75
+
76
+ if request_file == "":
77
+ model_data[AutoEvalColumn.model_type.name] = ""
78
+ model_data[AutoEvalColumn.model_type_symbol.name] = ""
79
+ continue
80
+
81
+ try:
82
+ with open(request_file, "r") as f:
83
+ request = json.load(f)
84
+ is_delta = request["weight_type"] != "Original"
85
+ except Exception:
86
+ is_delta = False
87
+
88
+ try:
89
+ with open(request_file, "r") as f:
90
+ request = json.load(f)
91
+ model_type = model_type_from_str(request["model_type"])
92
+ model_data[AutoEvalColumn.model_type.name] = model_type.value.name
93
+ model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol #+ ("🔺" if is_delta else "")
94
+ except KeyError:
95
+ if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
96
+ model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[model_data["model_name_for_query"]].value.name
97
+ model_data[AutoEvalColumn.model_type_symbol.name] = MODEL_TYPE_METADATA[model_data["model_name_for_query"]].value.symbol #+ ("🔺" if is_delta else "")
98
+ else:
99
+ model_data[AutoEvalColumn.model_type.name] = ModelType.Unknown.value.name
100
+ model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.Unknown.value.symbol
101
+
102
+ def flag_models(leaderboard_data:List[dict]):
103
+ for model_data in leaderboard_data:
104
+ if model_data["model_name_for_query"] in FLAGGED_MODELS:
105
+ issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
106
+ issue_link = model_hyperlink(FLAGGED_MODELS[model_data["model_name_for_query"]], f"See discussion #{issue_num}")
107
+ model_data[AutoEvalColumn.model.name] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
108
+
109
+ def remove_forbidden_models(leaderboard_data: List[dict]):
110
+ indices_to_remove = []
111
+ for ix, model in enumerate(leaderboard_data):
112
+ if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
113
+ indices_to_remove.append(ix)
114
+
115
+ for ix in reversed(indices_to_remove):
116
+ leaderboard_data.pop(ix)
117
+ return leaderboard_data
118
+
119
  def apply_metadata(leaderboard_data: List[dict]):
120
+ leaderboard_data = remove_forbidden_models(leaderboard_data)
121
  get_model_type(leaderboard_data)
122
  get_model_infos_from_hub(leaderboard_data)
123
+ flag_models(leaderboard_data)
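A toy illustration (not from the commit) of why the new `remove_forbidden_models` pops indices in reverse: deleting from the highest index down never shifts an index that still has to be removed.

```python
# Toy stand-ins for leaderboard entries and DO_NOT_SUBMIT_MODELS.
leaderboard_data = [
    {"model_name_for_query": "org/keep-a"},
    {"model_name_for_query": "org/forbidden"},
    {"model_name_for_query": "org/keep-b"},
]
forbidden = {"org/forbidden"}

indices_to_remove = [
    ix for ix, model in enumerate(leaderboard_data)
    if model["model_name_for_query"] in forbidden
]

# Popping in reverse keeps the remaining indices valid.
for ix in reversed(indices_to_remove):
    leaderboard_data.pop(ix)

print([m["model_name_for_query"] for m in leaderboard_data])
# ['org/keep-a', 'org/keep-b']
```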
src/auto_leaderboard/load_results.py CHANGED
@@ -91,7 +91,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
91
 
92
  eval_results = []
93
  for benchmark, metric in zip(BENCHMARKS, METRICS):
94
- accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
95
  if accs.size == 0:
96
  continue
97
  mean_acc = np.mean(accs) * 100.0
@@ -102,7 +102,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
102
  return result_key, eval_results
103
 
104
 
105
- def get_eval_results(is_public) -> List[EvalResult]:
106
  json_filepaths = []
107
 
108
  for root, dir, files in os.walk("eval-results"):
@@ -135,7 +135,7 @@ def get_eval_results(is_public) -> List[EvalResult]:
135
  return eval_results
136
 
137
 
138
- def get_eval_results_dicts(is_public=True) -> List[Dict]:
139
- eval_results = get_eval_results(is_public)
140
 
141
  return [e.to_dict() for e in eval_results]
 
91
 
92
  eval_results = []
93
  for benchmark, metric in zip(BENCHMARKS, METRICS):
94
+ accs = np.array([v.get(metric, 0) for k, v in data["results"].items() if benchmark in k])
95
  if accs.size == 0:
96
  continue
97
  mean_acc = np.mean(accs) * 100.0
 
102
  return result_key, eval_results
103
 
104
 
105
+ def get_eval_results() -> List[EvalResult]:
106
  json_filepaths = []
107
 
108
  for root, dir, files in os.walk("eval-results"):
 
135
  return eval_results
136
 
137
 
138
+ def get_eval_results_dicts() -> List[Dict]:
139
+ eval_results = get_eval_results()
140
 
141
  return [e.to_dict() for e in eval_results]
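A toy check (made-up sub-task scores) of the aggregation change above: `v.get(metric, 0)` lets a results file that is missing the metric for one sub-task contribute 0 instead of raising a `KeyError`.

```python
import numpy as np

benchmark, metric = "hendrycksTest", "acc"

# Made-up dict in the same shape as data["results"];
# the second sub-task deliberately has no "acc" key.
results = {
    "hendrycksTest-anatomy": {"acc": 0.52},
    "hendrycksTest-astronomy": {"acc_norm": 0.61},
}

# The old v[metric] lookup would raise KeyError on the second entry.
accs = np.array([v.get(metric, 0) for k, v in results.items() if benchmark in k])
mean_acc = np.mean(accs) * 100.0
print(mean_acc)  # (0.52 + 0) / 2 * 100 = 26.0
```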
src/auto_leaderboard/model_metadata_flags.py ADDED
@@ -0,0 +1,12 @@
1
+ # Models which have been flagged by users as being problematic for a reason or another
2
+ # (Model name to forum discussion link)
3
+ FLAGGED_MODELS = {
4
+ "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
5
+ "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
6
+ "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
7
+ }
8
+
9
+ # Models which have been requested by orgs to not be submitted on the leaderboard
10
+ DO_NOT_SUBMIT_MODELS = [
11
+ "Voicelab/trurl-2-13b", # trained on MMLU
12
+ ]
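A small usage sketch of the new flag list: the discussion number that `flag_models` displays is just the last path segment of the stored URL.

```python
FLAGGED_MODELS = {
    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
}

model_name = "Voicelab/trurl-2-13b"
if model_name in FLAGGED_MODELS:
    issue_link = FLAGGED_MODELS[model_name]
    issue_num = issue_link.split("/")[-1]
    print(f"{model_name} has been flagged! See discussion #{issue_num}")
    # Voicelab/trurl-2-13b has been flagged! See discussion #202
```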
src/auto_leaderboard/model_metadata_type.py CHANGED
@@ -1,35 +1,26 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
- import glob
4
- import json
5
- import os
6
- from typing import Dict, List
7
 
8
- from ..utils_display import AutoEvalColumn
9
 
10
  @dataclass
11
  class ModelInfo:
12
  name: str
13
  symbol: str # emoji
14
 
15
- model_type_symbols = {
16
- "fine-tuned": "🔶",
17
- "pretrained": "🟢",
18
- "RL-tuned": "🟦",
19
- "instruction-tuned": "⭕",
20
- }
21
 
22
  class ModelType(Enum):
23
  PT = ModelInfo(name="pretrained", symbol="🟢")
24
  FT = ModelInfo(name="fine-tuned", symbol="🔶")
25
  IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
26
  RL = ModelInfo(name="RL-tuned", symbol="🟦")
 
27
 
28
  def to_str(self, separator = " "):
29
  return f"{self.value.symbol}{separator}{self.value.name}"
30
 
31
 
32
- TYPE_METADATA: Dict[str, ModelType] = {
33
  'notstoic/PygmalionCoT-7b': ModelType.IFT,
34
  'aisquared/dlite-v1-355m': ModelType.IFT,
35
  'aisquared/dlite-v1-1_5b': ModelType.IFT,
@@ -211,10 +202,10 @@ TYPE_METADATA: Dict[str, ModelType] = {
211
  'dvruette/oasst-gpt-neox-20b-1000-steps': ModelType.FT,
212
  'dvruette/llama-13b-pretrained-dropout': ModelType.PT,
213
  'dvruette/llama-13b-pretrained': ModelType.PT,
214
- 'dvruette/llama-13b-pretrained-sft-epoch-1': ModelType.PT,
215
- 'dvruette/llama-13b-pretrained-sft-do2': ModelType.PT,
216
  'dvruette/oasst-gpt-neox-20b-3000-steps': ModelType.FT,
217
- 'dvruette/oasst-pythia-12b-pretrained-sft': ModelType.PT,
218
  'dvruette/oasst-pythia-6.9b-4000-steps': ModelType.FT,
219
  'dvruette/gpt-neox-20b-full-precision': ModelType.FT,
220
  'dvruette/oasst-llama-13b-1000-steps': ModelType.FT,
@@ -547,33 +538,14 @@ TYPE_METADATA: Dict[str, ModelType] = {
547
  }
548
 
549
 
550
- def get_model_type(leaderboard_data: List[dict]):
551
- for model_data in leaderboard_data:
552
- # Todo @clefourrier once requests are connected with results
553
- # Stored information
554
- request_file = os.path.join("eval-queue", model_data["model_name_for_query"] + "_eval_request_*" + ".json")
555
- request_file = glob.glob(request_file)
556
-
557
- if len(request_file) == 0:
558
- model_data[AutoEvalColumn.model_type.name] = ""
559
- model_data[AutoEvalColumn.model_type_symbol.name] = ""
560
- continue
561
-
562
- request_file = request_file[0]
563
-
564
- try:
565
- with open(request_file, "r") as f:
566
- request = json.load(f)
567
- is_delta = request["weight_type"] != "Original"
568
- except Exception:
569
- is_delta = False
570
 
571
- try:
572
- with open(request_file, "r") as f:
573
- request = json.load(f)
574
- model_type = request["model_type"]
575
- model_data[AutoEvalColumn.model_type.name] = model_type
576
- model_data[AutoEvalColumn.model_type_symbol.name] = model_type_symbols[model_type] + ("🔺" if is_delta else "")
577
- except Exception:
578
- model_data[AutoEvalColumn.model_type.name] = "Unknown, add type to request file!"
579
- model_data[AutoEvalColumn.model_type_symbol.name] = "?"
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
+ from typing import Dict
4
 
 
5
 
6
  @dataclass
7
  class ModelInfo:
8
  name: str
9
  symbol: str # emoji
10
 
 
 
12
  class ModelType(Enum):
13
  PT = ModelInfo(name="pretrained", symbol="🟢")
14
  FT = ModelInfo(name="fine-tuned", symbol="🔶")
15
  IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
16
  RL = ModelInfo(name="RL-tuned", symbol="🟦")
17
+ Unknown = ModelInfo(name="Unknown, add type to request file!", symbol="?")
18
 
19
  def to_str(self, separator = " "):
20
  return f"{self.value.symbol}{separator}{self.value.name}"
21
 
22
 
23
+ MODEL_TYPE_METADATA: Dict[str, ModelType] = {
24
  'notstoic/PygmalionCoT-7b': ModelType.IFT,
25
  'aisquared/dlite-v1-355m': ModelType.IFT,
26
  'aisquared/dlite-v1-1_5b': ModelType.IFT,
 
202
  'dvruette/oasst-gpt-neox-20b-1000-steps': ModelType.FT,
203
  'dvruette/llama-13b-pretrained-dropout': ModelType.PT,
204
  'dvruette/llama-13b-pretrained': ModelType.PT,
205
+ 'dvruette/llama-13b-pretrained-sft-epoch-1': ModelType.FT,
206
+ 'dvruette/llama-13b-pretrained-sft-do2': ModelType.FT,
207
  'dvruette/oasst-gpt-neox-20b-3000-steps': ModelType.FT,
208
+ 'dvruette/oasst-pythia-12b-pretrained-sft': ModelType.FT,
209
  'dvruette/oasst-pythia-6.9b-4000-steps': ModelType.FT,
210
  'dvruette/gpt-neox-20b-full-precision': ModelType.FT,
211
  'dvruette/oasst-llama-13b-1000-steps': ModelType.FT,
 
538
  }
539
 
540
 
541
+ def model_type_from_str(type):
542
+ if "fine-tuned" in type or "🔶" in type:
543
+ return ModelType.FT
544
+ if "pretrained" in type or "🟢" in type:
545
+ return ModelType.PT
546
+ if "RL-tuned" in type or "🟦" in type:
547
+ return ModelType.RL
548
+ if "instruction-tuned" in type or "⭕" in type:
549
+ return ModelType.IFT
550
+ return ModelType.Unknown
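A quick illustrative check of the new `model_type_from_str` fallback chain (assumes it is run from the repository root so the `src` package is importable): either the type name or its emoji is enough, and anything unrecognised maps to `ModelType.Unknown`.

```python
# Assumes the repo layout in this commit; run from the repository root.
from src.auto_leaderboard.model_metadata_type import ModelType, model_type_from_str

assert model_type_from_str("fine-tuned") is ModelType.FT
assert model_type_from_str("🟢 : pretrained") is ModelType.PT        # emoji or name both match
assert model_type_from_str("RL-tuned") is ModelType.RL
assert model_type_from_str("⭕ : instruction-tuned") is ModelType.IFT
assert model_type_from_str("something else") is ModelType.Unknown    # fallback
```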
src/init.py CHANGED
@@ -1,8 +1,6 @@
1
  import os
2
  from huggingface_hub import Repository
3
 
4
- H4_TOKEN = os.environ.get("H4_TOKEN", None)
5
-
6
 
7
  def get_all_requested_models(requested_models_dir):
8
  depth = 1
@@ -20,28 +18,23 @@ def load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, QUEUE_PATH, RESULTS_PATH):
20
  eval_results_repo = None
21
  requested_models = None
22
 
23
- if H4_TOKEN:
24
- print("Pulling evaluation requests and results.")
25
-
26
- eval_queue_repo = Repository(
27
- local_dir=QUEUE_PATH,
28
- clone_from=QUEUE_REPO,
29
- use_auth_token=H4_TOKEN,
30
- repo_type="dataset",
31
- )
32
- eval_queue_repo.git_pull()
33
-
34
- eval_results_repo = Repository(
35
- local_dir=RESULTS_PATH,
36
- clone_from=RESULTS_REPO,
37
- use_auth_token=H4_TOKEN,
38
- repo_type="dataset",
39
- )
40
- eval_results_repo.git_pull()
41
-
42
- requested_models = get_all_requested_models("eval-queue")
43
- else:
44
- print("No HuggingFace token provided. Skipping evaluation requests and results.")
45
 
46
  return eval_queue_repo, requested_models, eval_results_repo
47
 
 
1
  import os
2
  from huggingface_hub import Repository
3
 
 
 
4
 
5
  def get_all_requested_models(requested_models_dir):
6
  depth = 1
 
18
  eval_results_repo = None
19
  requested_models = None
20
 
21
+ print("Pulling evaluation requests and results.")
22
+
23
+ eval_queue_repo = Repository(
24
+ local_dir=QUEUE_PATH,
25
+ clone_from=QUEUE_REPO,
26
+ repo_type="dataset",
27
+ )
28
+ eval_queue_repo.git_pull()
29
+
30
+ eval_results_repo = Repository(
31
+ local_dir=RESULTS_PATH,
32
+ clone_from=RESULTS_REPO,
33
+ repo_type="dataset",
34
+ )
35
+ eval_results_repo.git_pull()
36
+
37
+ requested_models = get_all_requested_models("eval-queue")
38
 
39
  return eval_queue_repo, requested_models, eval_results_repo
40
 
src/utils_display.py CHANGED
@@ -1,4 +1,9 @@
 
1
  from dataclasses import dataclass
2
 
3
  # These classes are for user facing column names, to avoid having to change them
4
  # all around the code when a modif is needed
@@ -84,10 +89,24 @@ def make_clickable_model(model_name):
84
  link = KOALA_LINK
85
  elif model_name == "oasst-12b":
86
  link = OASST_LINK
87
- #else:
88
- # link = MODEL_PAGE
89
-
90
- return model_hyperlink(link, model_name)
91
 
92
  def styled_error(error):
93
  return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
 
1
+ import os
2
  from dataclasses import dataclass
3
+ from huggingface_hub import HfApi
4
+
5
+ API = HfApi()
6
+
7
 
8
  # These classes are for user facing column names, to avoid having to change them
9
  # all around the code when a modif is needed
 
89
  link = KOALA_LINK
90
  elif model_name == "oasst-12b":
91
  link = OASST_LINK
92
+
93
+ details_model_name = model_name.replace('/', '__')
94
+ details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
95
+
96
+ if not bool(os.getenv("DEBUG", "False")):
97
+ # We only add these checks when not debugging, as they are extremely slow
98
+ print(f"details_link: {details_link}")
99
+ try:
100
+ check_path = list(API.list_files_info(repo_id=f"open-llm-leaderboard/details_{details_model_name}",
101
+ paths="README.md",
102
+ repo_type="dataset"))
103
+ print(f"check_path: {check_path}")
104
+ except Exception as err:
105
+ # No details repo for this model
106
+ print(f"No details repo for this model: {err}")
107
+ return model_hyperlink(link, model_name)
108
+
109
+ return model_hyperlink(link, model_name) + ' ' + model_hyperlink(details_link, "📑")
110
 
111
  def styled_error(error):
112
  return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
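A toy example (placeholder repo id) of the details-dataset naming convention used by the new `make_clickable_model` link: slashes in the model id become double underscores.

```python
# Placeholder repo id, for illustration only.
model_name = "org/some-model"
details_model_name = model_name.replace("/", "__")
details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
print(details_link)
# https://huggingface.co/datasets/open-llm-leaderboard/details_org__some-model
```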