
Merge pull request #301 from makaveli10/upgrade_tensorrt
Upgrade tensorrt_llm==0.15.0.
zoq authored Dec 3, 2024
2 parents a55b99c + 49e232b commit 19c05c8
Showing 6 changed files with 220 additions and 92 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -133,12 +133,16 @@ client(hls_url="http://as-hls-ww-live.akamaized.net/pool_904/live/ww/bbc_1xtra/b
docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it ghcr.io/collabora/whisperlive-tensorrt

# Build small.en engine
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization

# Run server with small.en
python3 run_server.py --port 9090 \
--backend tensorrt \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en"
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16"
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int8"
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int4"
```

- CPU
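The three `--trt_model_path` lines above are alternatives, one per build precision; pass exactly one, matching the engine you built. A minimal sketch serving the int8 engine:

```bash
# Serve the int8 engine built above; pass a single --trt_model_path.
python3 run_server.py --port 9090 \
    --backend tensorrt \
    --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int8"
```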
16 changes: 6 additions & 10 deletions TensorRT_whisper.md
@@ -1,17 +1,11 @@
# WhisperLive-TensorRT
We have only tested the TensorRT backend in Docker, so we recommend Docker for a smooth TensorRT backend setup.
**Note**: We use `tensorrt_llm==0.9.0`
**Note**: We use `tensorrt_llm==0.15.0.dev2024111200`

## Installation
- Install [docker](https://docs.docker.com/engine/install/)
- Install [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)

- Clone this repo.
```bash
git clone https://github.com/collabora/WhisperLive.git
cd WhisperLive
```

- Run WhisperLive TensorRT in docker
```bash
docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it ghcr.io/collabora/whisperlive-tensorrt:latest
@@ -21,7 +15,9 @@ docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it g
- We build `small.en` and `small` multilingual TensorRT engines as examples below. The script logs the path of the directory containing the Whisper TensorRT engine; we need that model path to run the server.
```bash
# convert small.en
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization

# convert small multilingual model
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small
@@ -32,11 +28,11 @@ bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small
# Run English only model
python3 run_server.py --port 9090 \
--backend tensorrt \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en"
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16"

# Run Multilingual model
python3 run_server.py --port 9090 \
--backend tensorrt \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small" \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \
--trt_multilingual
```
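The updated build script (see scripts/build_whisper_tensorrt.sh below) also understands the new `large-v3-turbo` checkpoint (alias `turbo`) and an optional weight-only precision argument. A sketch, assuming the output directory follows the script's `whisper_<model>_<precision>` naming:

```bash
# Build the multilingual large-v3-turbo engine with int8 weight-only quantization.
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples turbo int8

# Serve it; turbo is multilingual, so keep --trt_multilingual.
python3 run_server.py --port 9090 \
    --backend tensorrt \
    --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_large-v3-turbo_int8" \
    --trt_multilingual
```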
10 changes: 5 additions & 5 deletions docker/Dockerfile.tensorrt
@@ -1,15 +1,16 @@
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 AS base
FROM nvidia/cuda:12.5.1-runtime-ubuntu22.04 AS base

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y \
python3.10 python3-pip openmpi-bin libopenmpi-dev git wget \
python3.10 python3-pip openmpi-bin libopenmpi-dev git git-lfs wget \
&& rm -rf /var/lib/apt/lists/*

FROM base AS devel
RUN pip3 install --no-cache-dir -U tensorrt_llm==0.10.0 --extra-index-url https://pypi.nvidia.com
RUN pip3 install --no-cache-dir -U tensorrt_llm==0.15.0.dev2024111200 --extra-index-url https://pypi.nvidia.com
WORKDIR /app
RUN git clone -b v0.10.0 --depth 1 https://github.com/NVIDIA/TensorRT-LLM.git && \
RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && cd TensorRT-LLM && \
git checkout c629546ce429623c8a163633095230154a6f0574 && cd ../ && \
mv TensorRT-LLM/examples ./TensorRT-LLM-examples && \
rm -rf TensorRT-LLM

@@ -24,7 +25,6 @@ RUN apt update && bash setup.sh && rm setup.sh

COPY requirements/server.txt .
RUN pip install --no-cache-dir -r server.txt && rm server.txt
RUN pip install -U huggingface_hub tokenizers==0.19.0
COPY whisper_live ./whisper_live
COPY scripts/build_whisper_tensorrt.sh .
COPY run_server.py .
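To rebuild the image locally with the new CUDA base and `tensorrt_llm` pin, something like the following should work from the repository root (the tag is illustrative):

```bash
# Rebuild the TensorRT image after changing docker/Dockerfile.tensorrt.
docker build -f docker/Dockerfile.tensorrt -t whisperlive-tensorrt:local .
```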
57 changes: 51 additions & 6 deletions scripts/build_whisper_tensorrt.sh
@@ -38,12 +38,24 @@ download_and_build_model() {
"large-v3" | "large")
model_url="https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt"
;;
"large-v3-turbo" | "turbo")
model_url="https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt"
;;
*)
echo "Invalid model name: $model_name"
exit 1
;;
esac

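# Normalize the "turbo" alias to its full name so downstream file and directory names use large-v3-turbo.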
if [ "$model_name" == "turbo" ]; then
model_name="large-v3-turbo"
fi

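# Engines compute in float16 (see the plugin flags below); weights can optionally be quantized to int8/int4.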
local inference_precision="float16"
local weight_only_precision="${2:-float16}"
local max_beam_width=4
local max_batch_size=1

echo "Downloading $model_name..."
# wget --directory-prefix=assets "$model_url"
# echo "Download completed: ${model_name}.pt"
@@ -54,11 +66,43 @@ download_and_build_model() {
echo "${model_name}.pt already exists in assets directory."
fi

local output_dir="whisper_${model_name//./_}"
local sanitized_model_name="${model_name//./_}"
local checkpoint_dir="whisper_${sanitized_model_name}_weights_${weight_only_precision}"
local output_dir="whisper_${sanitized_model_name}_${weight_only_precision}"
echo "$output_dir"
echo "Running build script for $model_name with output directory $output_dir"
python3 build.py --output_dir "$output_dir" --use_gpt_attention_plugin --use_gemm_plugin --use_bert_attention_plugin --enable_context_fmha --model_name "$model_name"
echo "Whisper $model_name TensorRT engine built."
echo "Converting model weights for $model_name..."
python3 convert_checkpoint.py \
$( [[ "$weight_only_precision" == "int8" || "$weight_only_precision" == "int4" ]] && echo "--use_weight_only --weight_only_precision $weight_only_precision" ) \
--output_dir "$checkpoint_dir" --model_name "$model_name"

echo "Building encoder for $model_name..."
trtllm-build \
--checkpoint_dir "${checkpoint_dir}/encoder" \
--output_dir "${output_dir}/encoder" \
--moe_plugin disable \
--enable_xqa disable \
--max_batch_size "$max_batch_size" \
--gemm_plugin disable \
--bert_attention_plugin "$inference_precision" \
--max_input_len 3000 \
--max_seq_len 3000

echo "Building decoder for $model_name..."
trtllm-build \
--checkpoint_dir "${checkpoint_dir}/decoder" \
--output_dir "${output_dir}/decoder" \
--moe_plugin disable \
--enable_xqa disable \
--max_beam_width "$max_beam_width" \
--max_batch_size "$max_batch_size" \
--max_seq_len 200 \
--max_input_len 14 \
--max_encoder_input_len 3000 \
--gemm_plugin "$inference_precision" \
--bert_attention_plugin "$inference_precision" \
--gpt_attention_plugin "$inference_precision"

echo "TensorRT LLM engine built for $model_name."
echo "========================================="
echo "Model is located at: $(pwd)/$output_dir"
}
@@ -70,8 +114,9 @@ fi

tensorrt_examples_dir="$1"
model_name="${2:-small.en}"
weight_only_precision="${3:-float16}" # Default to float16 if not provided

cd $1/whisper
cd $tensorrt_examples_dir/whisper
pip install --no-deps -r requirements.txt

download_and_build_model "$model_name"
download_and_build_model "$model_name" "$weight_only_precision"
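For reference, all three positional arguments together (the second and third default to `small.en` and `float16`):

```bash
# examples dir, model name, weight-only precision
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4
# The script echoes the final engine location, e.g.
# /app/TensorRT-LLM-examples/whisper/whisper_small_en_int4
```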
4 changes: 2 additions & 2 deletions whisper_live/client.py
@@ -112,9 +112,9 @@ def process_segments(self, segments):
for i, seg in enumerate(segments):
if not text or text[-1] != seg["text"]:
text.append(seg["text"])
if i == len(segments) - 1 and not seg["completed"]:
if i == len(segments) - 1 and not seg.get("completed", False):
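# .get() with a default avoids a KeyError when a backend omits the "completed" key.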
self.last_segment = seg
elif (self.server_backend == "faster_whisper" and seg["completed"] and
elif (self.server_backend == "faster_whisper" and seg.get("completed", False) and
(not self.transcript or
float(seg['start']) >= float(self.transcript[-1]['end']))):
self.transcript.append(seg)