
Merge pull request #301 from makaveli10/upgrade_tensorrt
Upgrade tensorrt_llm==0.15.0.
zoq authored Dec 3, 2024
2 parents a55b99c + 49e232b commit 19c05c8
Showing 6 changed files with 220 additions and 92 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -133,12 +133,16 @@ client(hls_url="http://as-hls-ww-live.akamaized.net/pool_904/live/ww/bbc_1xtra/b
docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it ghcr.io/collabora/whisperlive-tensorrt

# Build small.en engine
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization

# Run server with small.en
python3 run_server.py --port 9090 \
--backend tensorrt \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en"
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16"
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int8"
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int4"
```

- CPU
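The three `--trt_model_path` lines above are alternatives, one per build precision; pass exactly one, matching the engine you built. A minimal sketch serving the int8 engine:

```bash
# Serve the int8 engine built above; pass a single --trt_model_path.
python3 run_server.py --port 9090 \
    --backend tensorrt \
    --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int8"
```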
16 changes: 6 additions & 10 deletions TensorRT_whisper.md
@@ -1,17 +1,11 @@
# WhisperLive-TensorRT
We have only tested the TensorRT backend in Docker, so we recommend Docker for a smooth TensorRT backend setup.
**Note**: We use `tensorrt_llm==0.9.0`
**Note**: We use `tensorrt_llm==0.15.0.dev2024111200`

## Installation
- Install [docker](https://docs.docker.com/engine/install/)
- Install [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)

- Clone this repo.
```bash
git clone https://github.com/collabora/WhisperLive.git
cd WhisperLive
```

- Run WhisperLive TensorRT in docker
```bash
docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it ghcr.io/collabora/whisperlive-tensorrt:latest
@@ -21,7 +15,9 @@ docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it g
- We build `small.en` and `small` multilingual TensorRT engines as examples below. The script logs the path of the directory containing the Whisper TensorRT engine; we need that model path to run the server.
```bash
# convert small.en
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization

# convert small multilingual model
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small
@@ -32,11 +28,11 @@ bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small
# Run English only model
python3 run_server.py --port 9090 \
--backend tensorrt \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en"
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16"

# Run Multilingual model
python3 run_server.py --port 9090 \
--backend tensorrt \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small" \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \
--trt_multilingual
```
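The updated build script (see scripts/build_whisper_tensorrt.sh below) also understands the new `large-v3-turbo` checkpoint (alias `turbo`) and an optional weight-only precision argument. A sketch, assuming the output directory follows the script's `whisper_<model>_<precision>` naming:

```bash
# Build the multilingual large-v3-turbo engine with int8 weight-only quantization.
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples turbo int8

# Serve it; turbo is multilingual, so keep --trt_multilingual.
python3 run_server.py --port 9090 \
    --backend tensorrt \
    --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_large-v3-turbo_int8" \
    --trt_multilingual
```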
10 changes: 5 additions & 5 deletions docker/Dockerfile.tensorrt
@@ -1,15 +1,16 @@
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 AS base
FROM nvidia/cuda:12.5.1-runtime-ubuntu22.04 AS base

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y \
python3.10 python3-pip openmpi-bin libopenmpi-dev git wget \
python3.10 python3-pip openmpi-bin libopenmpi-dev git git-lfs wget \
&& rm -rf /var/lib/apt/lists/*

FROM base AS devel
RUN pip3 install --no-cache-dir -U tensorrt_llm==0.10.0 --extra-index-url https://pypi.nvidia.com
RUN pip3 install --no-cache-dir -U tensorrt_llm==0.15.0.dev2024111200 --extra-index-url https://pypi.nvidia.com
WORKDIR /app
RUN git clone -b v0.10.0 --depth 1 https://github.com/NVIDIA/TensorRT-LLM.git && \
RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && cd TensorRT-LLM && \
git checkout c629546ce429623c8a163633095230154a6f0574 && cd ../ && \
mv TensorRT-LLM/examples ./TensorRT-LLM-examples && \
rm -rf TensorRT-LLM

@@ -24,7 +25,6 @@ RUN apt update && bash setup.sh && rm setup.sh

COPY requirements/server.txt .
RUN pip install --no-cache-dir -r server.txt && rm server.txt
RUN pip install -U huggingface_hub tokenizers==0.19.0
COPY whisper_live ./whisper_live
COPY scripts/build_whisper_tensorrt.sh .
COPY run_server.py .
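To rebuild the image locally with the new CUDA base and `tensorrt_llm` pin, something like the following should work from the repository root (the tag is illustrative):

```bash
# Rebuild the TensorRT image after changing docker/Dockerfile.tensorrt.
docker build -f docker/Dockerfile.tensorrt -t whisperlive-tensorrt:local .
```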
57 changes: 51 additions & 6 deletions scripts/build_whisper_tensorrt.sh
@@ -38,12 +38,24 @@ download_and_build_model() {
"large-v3" | "large")
model_url="https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt"
;;
"large-v3-turbo" | "turbo")
model_url="https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt"
;;
*)
echo "Invalid model name: $model_name"
exit 1
;;
esac

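# Normalize the "turbo" alias to its full name so downstream file and directory names use large-v3-turbo.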
if [ "$model_name" == "turbo" ]; then
model_name="large-v3-turbo"
fi

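# Engines compute in float16 (see the plugin flags below); weights can optionally be quantized to int8/int4.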
local inference_precision="float16"
local weight_only_precision="${2:-float16}"
local max_beam_width=4
local max_batch_size=1

echo "Downloading $model_name..."
# wget --directory-prefix=assets "$model_url"
# echo "Download completed: ${model_name}.pt"
@@ -54,11 +66,43 @@ download_and_build_model() {
echo "${model_name}.pt already exists in assets directory."
fi

local output_dir="whisper_${model_name//./_}"
local sanitized_model_name="${model_name//./_}"
local checkpoint_dir="whisper_${sanitized_model_name}_weights_${weight_only_precision}"
local output_dir="whisper_${sanitized_model_name}_${weight_only_precision}"
echo "$output_dir"
echo "Running build script for $model_name with output directory $output_dir"
python3 build.py --output_dir "$output_dir" --use_gpt_attention_plugin --use_gemm_plugin --use_bert_attention_plugin --enable_context_fmha --model_name "$model_name"
echo "Whisper $model_name TensorRT engine built."
echo "Converting model weights for $model_name..."
python3 convert_checkpoint.py \
$( [[ "$weight_only_precision" == "int8" || "$weight_only_precision" == "int4" ]] && echo "--use_weight_only --weight_only_precision $weight_only_precision" ) \
--output_dir "$checkpoint_dir" --model_name "$model_name"

echo "Building encoder for $model_name..."
trtllm-build \
--checkpoint_dir "${checkpoint_dir}/encoder" \
--output_dir "${output_dir}/encoder" \
--moe_plugin disable \
--enable_xqa disable \
--max_batch_size "$max_batch_size" \
--gemm_plugin disable \
--bert_attention_plugin "$inference_precision" \
--max_input_len 3000 \
--max_seq_len 3000

echo "Building decoder for $model_name..."
trtllm-build \
--checkpoint_dir "${checkpoint_dir}/decoder" \
--output_dir "${output_dir}/decoder" \
--moe_plugin disable \
--enable_xqa disable \
--max_beam_width "$max_beam_width" \
--max_batch_size "$max_batch_size" \
--max_seq_len 200 \
--max_input_len 14 \
--max_encoder_input_len 3000 \
--gemm_plugin "$inference_precision" \
--bert_attention_plugin "$inference_precision" \
--gpt_attention_plugin "$inference_precision"

echo "TensorRT LLM engine built for $model_name."
echo "========================================="
echo "Model is located at: $(pwd)/$output_dir"
}
@@ -70,8 +114,9 @@ fi

tensorrt_examples_dir="$1"
model_name="${2:-small.en}"
weight_only_precision="${3:-float16}" # Default to float16 if not provided

cd $1/whisper
cd $tensorrt_examples_dir/whisper
pip install --no-deps -r requirements.txt

download_and_build_model "$model_name"
download_and_build_model "$model_name" "$weight_only_precision"
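For reference, all three positional arguments together (the second and third default to `small.en` and `float16`):

```bash
# examples dir, model name, weight-only precision
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4
# The script echoes the final engine location, e.g.
# /app/TensorRT-LLM-examples/whisper/whisper_small_en_int4
```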
4 changes: 2 additions & 2 deletions whisper_live/client.py
@@ -112,9 +112,9 @@ def process_segments(self, segments):
for i, seg in enumerate(segments):
if not text or text[-1] != seg["text"]:
text.append(seg["text"])
if i == len(segments) - 1 and not seg["completed"]:
if i == len(segments) - 1 and not seg.get("completed", False):
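# .get() with a default avoids a KeyError when a backend omits the "completed" key.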
self.last_segment = seg
elif (self.server_backend == "faster_whisper" and seg["completed"] and
elif (self.server_backend == "faster_whisper" and seg.get("completed", False) and
(not self.transcript or
float(seg['start']) >= float(self.transcript[-1]['end']))):
self.transcript.append(seg)