name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

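  # A manual run can also be started from the command line; a sketch, assuming an
  # authenticated `gh` CLI (the inputs mirror the workflow_dispatch block above):
  #   gh workflow run Benchmark -f gpu-series=Standard_NC4as_T4_v3 -f duration=10m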
  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

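# Only one bench per workflow/ref/SHA runs at a time; a newer run cancels any in-flight one.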
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3
      N_USERS: 8
      DURATION: 10m

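    # Bench phi-2 in three ftypes; only the q4_0 run posts the PR comment (see pr_comment_enabled)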
    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

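    # Run for: a manual dispatch targeting the T4 series, the nightly schedule and
    # pushes to master on ggerganov's repository, and every pull_request_target event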
    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || (
        github.event_name == 'schedule'
        && github.ref_name == 'master'
        && github.repository_owner == 'ggerganov'
      )
      || github.event_name == 'pull_request_target'
      || (
        github.event_name == 'push'
        && github.event.ref == 'refs/heads/master'
        && github.repository_owner == 'ggerganov'
      )
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          # Prefer an explicitly requested SHA, then the PR head, then the pushed commit
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

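      # All bench tooling (requirements.txt, bench.py, script.js) lives in examples/server/bench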
      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

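      # Prometheus collects metrics during the bench (config in examples/server/bench/prometheus.yml);
      # block until it accepts connections on 9090, its default port, before moving on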
      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

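      # A Go toolchain is only needed to compile a custom k6 binary with the SSE extension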
      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          # builds k6 at ref "master" with the SSE extension linked in, producing ./k6
          xk6 build master \
            --with github.com/phymbert/xk6-sse

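      # CUDA architecture 75 matches the T4 (compute capability 7.5) this job runs on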
      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
              -DCMAKE_CUDA_ARCHITECTURES=75 \
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
          cmake --build build --config Release -j $(nproc) --target llama-server

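      # ShareGPT conversations serve as the prompt corpus; the file is removed after
      # the bench so it does not leak into the uploaded artifacts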
      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

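      # bench.py fetches the model from the HF repo, launches llama-server with it,
      # and drives the server through the k6 scenario in script.js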
      - name: Server bench
        id: server_bench
        env:
          HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch $HEAD_REF \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
              --hf-repo ggml-org/models \
              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
              --batch-size 2048 \
              --ubatch-size 256 \
              --ctx-size 16384 \
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048

          # Expose the bench results (BENCH_RESULTS, BENCH_ITERATIONS, ...) to later steps
          cat results.github.env >> $GITHUB_ENV

          # Remove the dataset so it does not end up in the uploaded artifact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

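      # Publish plots, raw results and logs; one artifact per model/ftype combination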
      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

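      # BENCH_RESULTS is one of the values exported from results.github.env in the bench step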
      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/[email protected]
        continue-on-error: true
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

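      # Multiline values go into $GITHUB_ENV with the heredoc-style syntax GitHub
      # documents for environment files: NAME<<EOF ... EOF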
      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

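      # imgur_urls is a JSON array of uploaded image links; continue-on-error keeps
      # a failed or skipped upload from failing the whole job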
      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_0=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

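      # message-id makes reruns update the existing comment instead of posting a new one;
      # the inlined mermaid blocks keep the data readable if the imgur upload failed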
      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS }} iterations** 🚀

            </p>

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request: avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_0 }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>

            </details>

            </details>