File size: 4,623 Bytes
2460b35 08604d0 2460b35 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import gradio as gr
import pandas as pd
import plotly.express as px
# Columns pulled into each point's `custom_data` and echoed in the hover box
# (one hover line per entry, in this order). Fix: the original list repeated
# "DType π₯" and "Backend π", which duplicated those columns in custom_data
# and showed the same value twice in every hover tooltip.
EXLLAMA_DATA = [
    # open llm
    "Model π€",
    "Arch ποΈ",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType π₯",
    "Backend π",
    "Quantization ποΈ",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) Exllama",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) Exllama",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) Exllama",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]
def get_exllama_df(llm_perf_df):
    """Pair vanilla GPTQ runs with their ExllamaV1/V2 runs and compute speedups.

    For every model, the plain "GPTQ.4bit" experiment is merged with the
    "GPTQ.4bit+ExllamaV1" and "GPTQ.4bit+ExllamaV2" experiments (Exllama
    columns get the " Exllama" suffix), the two pairings are stacked
    row-wise, and prefill/decode speedup percentages are derived. Rows with
    a speedup of 1000% or more are dropped as outliers.
    """
    df = llm_perf_df.copy()

    # split the experiments by quantization scheme
    vanilla = df[df["Quantization ποΈ"] == "GPTQ.4bit"]
    v1 = df[df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV1"]
    v2 = df[df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV2"]

    # pair every vanilla run with the Exllama run on the same model, then stack
    paired = [
        pd.merge(vanilla, variant, on=["Model π€"], suffixes=["", " Exllama"])
        for variant in (v1, v2)
    ]
    exllama_df = pd.concat(paired)

    # report the Exllama variant as the row's quantization scheme
    exllama_df["Quantization ποΈ"] = exllama_df["Quantization ποΈ Exllama"]

    # speedup in percent relative to the vanilla GPTQ run
    # (latency improves when it shrinks, throughput when it grows)
    prefill_ratio = exllama_df["Prefill Latency (s)"] / exllama_df["Prefill Latency (s) Exllama"]
    decode_ratio = exllama_df["Decode Throughput (tokens/s) Exllama"] / exllama_df["Decode Throughput (tokens/s)"]
    exllama_df["Prefill Latency Speedup (%)"] = (prefill_ratio * 100).round(2) - 100
    exllama_df["Decode Throughput Speedup (%)"] = (decode_ratio * 100).round(2) - 100

    # drop outliers with speedups of 1000% or more
    keep = (exllama_df["Prefill Latency Speedup (%)"] < 1000) & (
        exllama_df["Decode Throughput Speedup (%)"] < 1000
    )
    return exllama_df[keep]
def get_exllama_decode_fig(llm_perf_df):
    """Box plot of the decode-throughput speedup of Exllama kernels per architecture."""
    exllama_df = get_exllama_df(llm_perf_df)

    fig = px.box(
        exllama_df,
        x="Arch ποΈ",
        y="Decode Throughput Speedup (%)",
        color="Quantization ποΈ Exllama",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=EXLLAMA_DATA,
        points="all",
    )
    # one "<b>column:</b> value" line per custom-data column in the hover box
    hover_lines = [
        f"<b>{column}:</b> %{{customdata[{index}]}}" for index, column in enumerate(EXLLAMA_DATA)
    ]
    fig.update_traces(hovertemplate="<br>".join(hover_lines))
    fig.update_layout(
        title=dict(
            text="Decode Throughput Speedup per Architecture",
            y=0.95,
            x=0.5,
            xanchor="center",
            yanchor="top",
        ),
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return fig
def get_exllama_prefill_fig(llm_perf_df):
    """Box plot of the prefill-latency speedup of Exllama kernels per architecture."""
    exllama_df = get_exllama_df(llm_perf_df)

    fig = px.box(
        exllama_df,
        x="Arch ποΈ",
        y="Prefill Latency Speedup (%)",
        color="Quantization ποΈ Exllama",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=EXLLAMA_DATA,
        points="all",
    )
    # one "<b>column:</b> value" line per custom-data column in the hover box
    hover_lines = [
        f"<b>{column}:</b> %{{customdata[{index}]}}" for index, column in enumerate(EXLLAMA_DATA)
    ]
    fig.update_traces(hovertemplate="<br>".join(hover_lines))
    fig.update_layout(
        title=dict(
            text="Prefill Latency Speedup per Architecture",
            y=0.95,
            x=0.5,
            xanchor="center",
            yanchor="top",
        ),
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return fig
def create_exllama_plots(llm_perf_df):
    """Render the Exllama section: a hover hint plus the prefill and decode box plots."""
    # descriptive text
    gr.HTML("π Hover over the points π for additional information.", elem_id="text")
    # build both figures from the same performance dataframe
    figures = (
        get_exllama_prefill_fig(llm_perf_df),
        get_exllama_decode_fig(llm_perf_df),
    )
    # wrap each figure in a gradio Plot component
    prefill_plot, decode_plot = (
        gr.components.Plot(value=fig, elem_id="plot", show_label=False) for fig in figures
    )
    return prefill_plot, decode_plot
|