llm-perf-leaderboard

Running

File size: 4,623 Bytes

import gradio as gr
import pandas as pd
import plotly.express as px


EXLLAMA_DATA = [
    # open llm
    "Model 🤗",
    "Arch 🏛️",
    "DType 📥",
    "Backend 🏭",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType 📥",
    "Backend 🏭",
    "Quantization 🗜️",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) Exllama",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) Exllama",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) Exllama",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]


def get_exllama_df(llm_perf_df):
    copy_df = llm_perf_df.copy()
    # seperate vanilla GPTQ experiments from Exllama experiments
    gptq_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit")]
    exllamav1_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
    exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
    # merge the three dataframes
    exllamav1_df = pd.merge(
        gptq_df,
        exllamav1_df,
        on=["Model 🤗"],
        suffixes=["", " Exllama"],
    )
    exllamav2_df = pd.merge(
        gptq_df,
        exllamav2_df,
        on=["Model 🤗"],
        suffixes=["", " Exllama"],
    )
    # concat the two dataframes row-wise
    exllama_df = pd.concat([exllamav1_df, exllamav2_df])
    exllama_df["Quantization 🗜️"] = exllama_df["Quantization 🗜️ Exllama"]
    # compute speedups
    exllama_df["Prefill Latency Speedup (%)"] = (
        (exllama_df["Prefill Latency (s)"] / exllama_df["Prefill Latency (s) Exllama"]) * 100
    ).round(2) - 100
    exllama_df["Decode Throughput Speedup (%)"] = (
        (exllama_df["Decode Throughput (tokens/s) Exllama"] / exllama_df["Decode Throughput (tokens/s)"]) * 100
    ).round(2) - 100
    # filter speedups > 1000%
    exllama_df = exllama_df[exllama_df["Prefill Latency Speedup (%)"] < 1000]
    exllama_df = exllama_df[exllama_df["Decode Throughput Speedup (%)"] < 1000]

    return exllama_df


def get_exllama_decode_fig(llm_perf_df):
    exllama_df = get_exllama_df(llm_perf_df)
    # plot
    decode_fig = px.box(
        exllama_df,
        x="Arch 🏛️",
        y="Decode Throughput Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=EXLLAMA_DATA,
        color="Quantization 🗜️ Exllama",
        points="all",
    )
    # add hover data
    decode_fig.update_traces(
        hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(EXLLAMA_DATA)])
    )
    # add layout
    decode_fig.update_layout(
        title={
            "text": "Decode Throughput Speedup per Architecture",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return decode_fig


def get_exllama_prefill_fig(llm_perf_df):
    exllama_df = get_exllama_df(llm_perf_df)
    # plot
    prefill_fig = px.box(
        exllama_df,
        x="Arch 🏛️",
        y="Prefill Latency Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=EXLLAMA_DATA,
        color="Quantization 🗜️ Exllama",
        points="all",
    )
    # add hover data
    prefill_fig.update_traces(
        hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(EXLLAMA_DATA)])
    )
    # add layout
    prefill_fig.update_layout(
        title={
            "text": "Prefill Latency Speedup per Architecture",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return prefill_fig


def create_exllama_plots(llm_perf_df):
    # descriptive text
    gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
    # get figures
    prefill_fig = get_exllama_prefill_fig(llm_perf_df)
    decode_fig = get_exllama_decode_fig(llm_perf_df)

    # create plots
    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
    decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)

    return prefill_plot, decode_plot