import os

import pandas as pd
from huggingface_hub import hf_hub_download

from .utils import process_quantization_scheme, process_arch

LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
HF_TOKEN = os.environ.get("HF_TOKEN", None)

COLUMNS_MAPPING = {
    "Model": "Model 🤗",
    "experiment_name": "Experiment 🧪",
    # primary measurements
    "forward.latency(s)": "Prefill (s)",
    "decode.throughput(tokens/s)": "Decode (tokens/s)",
    "generate.max_memory_allocated(MB)": "Memory (MB)",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
    # deployment settings
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "DType 📥",
    "optimization": "Optimization 🛠️",
    "quantization": "Quantization 🗜️",
    # additional information
    "Size": "Params (B)",
    "Arch": "Architecture 🏛️",
    "Score": "Open LLM Score (%)",
    "generate.latency(s)": "End-to-End (s)",
    "generate.throughput(tokens/s)": "End-to-End (tokens/s)",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
    "generate.max_memory_used(MB)": "Used Memory (MB)",
}

SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
# best models first: higher score and decode throughput are better (descending),
# lower prefill latency is better (ascending)
SORTING_ASCENDING = [False, False, True]


def get_llm_df():
    # download the Open LLM Leaderboard scores and model metadata
    # from the llm-perf dataset repo, then load the CSV
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename="open-llm.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    llm_df = pd.read_csv("dataset/open-llm.csv")

    return llm_df


def get_perf_df(machine: str = "hf-dgx-01"):
    # download the performance report benchmarked on the given machine
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=f"{machine}/perf-report.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    perf_df = pd.read_csv(f"dataset/{machine}/perf-report.csv")

    return perf_df


def get_llm_perf_df(machine: str = "hf-dgx-01"):
    # join the Open LLM Leaderboard scores with the machine's performance report
    llm_df = get_llm_df()
    perf_df = get_perf_df(machine=machine)
    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")

    # sanity check: every benchmark used the same input shapes and generation
    # length, so the reported numbers are comparable across models
    assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
    assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
    assert llm_perf_df["benchmark.new_tokens"].nunique() == 1

    # convert energy from kWh/token to tokens/kWh; missing values are filled
    # with a sentinel of 1 kWh/token so the integer cast succeeds
    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
        1 / llm_perf_df["generate.energy_consumption(kWh/token)"].fillna(1)
    ).astype(int)
    # mask the sentinel rows (exactly 1 token/kWh) back to NA
    llm_perf_df.loc[
        llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
        "generate.energy_consumption(tokens/kWh)",
    ] = pd.NA

    # collapse the two backend optimization flags into a single label
    llm_perf_df["optimization"] = llm_perf_df[
        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
    ].apply(
        lambda x: (
            "BetterTransformer"
            if x["backend.to_bettertransformer"]
            else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None")
        ),
        axis=1,
    )

    # collapse the quantization config columns into a single label
    llm_perf_df["quantization"] = llm_perf_df[
        [
            "backend.quantization_scheme",
            "backend.quantization_config.bits",
            "backend.quantization_config.version",
            "backend.quantization_config.load_in_4bit",
            "backend.quantization_config.load_in_8bit",
            "backend.quantization_config.exllama_config.version",
        ]
    ].apply(process_quantization_scheme, axis=1)

    # prettify experiment names: drop the common "pytorch+cuda+" prefix, drop
    # the dtype for quantized experiments, and normalize the remaining labels
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].str.replace("pytorch+cuda+", "", regex=False)
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(
        lambda x: x.replace("float16+", "").replace("float32+", "").replace("bfloat16+", "") if "bit" in x else x
    )
    for old, new in [
        ("bnb-4bit", "BnB-4bit"),
        ("bnb-8bit", "BnB-8bit"),
        ("awq-4bit", "AWQ-4bit"),
        ("gptq-4bit", "GPTQ-4bit"),
        ("bettertransformer", "SDPA"),
        ("flash-attention-v2", "FA-v2"),
    ]:
        llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].str.replace(old, new, regex=False)

    # map model architectures to their display names
    llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)

    # keep only the displayed columns, in display order, and rename them
    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)

    llm_perf_df.sort_values(
        by=SORTING_COLUMNS,
        ascending=SORTING_ASCENDING,
        inplace=True,
    )

    return llm_perf_df
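

# A minimal usage sketch (illustrative, not part of the upstream module).
# Because of the relative import above, it must run in package context, e.g.
# `python -m <package>.<this_module>`; it assumes network access to the Hub,
# and HF_TOKEN is only required for gated/private access.
if __name__ == "__main__":
    df = get_llm_perf_df(machine="hf-dgx-01")
    print(df.head(10))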