Merge pull request #296 from makaveli10/upgrade_faster_whisper

Upgrade faster-whisper 1.1.0rc0.
collabora · Nov 19, 2024 · 446fc6e · 446fc6e
2 parents 0e89573 + a1650ea
commit 446fc6e
Show file tree

Hide file tree

Showing 4 changed files with 1,223 additions and 309 deletions.
diff --git a/requirements/server.txt b/requirements/server.txt
@@ -1,4 +1,4 @@
-faster-whisper==1.0.1
+faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/8f01aee36b562e6be537e0341cdd40dc8bed33a7.tar.gz
 websockets
 onnxruntime==1.16.0
 numba
@@ -9,5 +9,4 @@ scipy
 jiwer
 evaluate
 numpy<2
-tiktoken==0.3.3
-openai-whisper==20231117
+openai-whisper==20240930
diff --git a/whisper_live/client.py b/whisper_live/client.py
@@ -259,7 +259,9 @@ def write_srt_file(self, output_path="output.srt"):
 
         """
         if self.server_backend == "faster_whisper":
-            if (self.last_segment) and self.transcript[-1]["text"] != self.last_segment["text"]:
+            if not self.transcript and self.last_segment is not None:
+                self.transcript.append(self.last_segment)
+            elif self.last_segment and self.transcript[-1]["text"] != self.last_segment["text"]:
                 self.transcript.append(self.last_segment)
             utils.create_srt_file(self.transcript, output_path)
 

diff --git a/whisper_live/server.py b/whisper_live/server.py
@@ -417,7 +417,7 @@ def __init__(self, client_uid, websocket):
         self.prev_out = ''
         self.t_start = None
         self.exit = False
-        self.same_output_threshold = 0
+        self.same_output_count = 0
         self.show_prev_out_thresh = 5   # if pause(no output from whisper) show previous output for 5 seconds
         self.add_pause_thresh = 3       # add a blank to segment list as a pause(no speech) for 3 seconds
         self.transcript = []
@@ -780,17 +780,21 @@ def __init__(self, websocket, task="transcribe", device=None, language=None, cli
         super().__init__(client_uid, websocket)
         self.model_sizes = [
             "tiny", "tiny.en", "base", "base.en", "small", "small.en",
-            "medium", "medium.en", "large-v2", "large-v3",
+            "medium", "medium.en", "large-v2", "large-v3", "distil-small.en",
+            "distil-medium.en", "distil-large-v2", "distil-large-v3",
+            "large-v3-turbo", "turbo"
         ]
+
         if not os.path.exists(model):
             self.model_size_or_path = self.check_valid_model(model)
         else:
             self.model_size_or_path = model
         self.language = "en" if self.model_size_or_path.endswith("en") else language
         self.task = task
         self.initial_prompt = initial_prompt
-        self.vad_parameters = vad_parameters or {"threshold": 0.5}
+        self.vad_parameters = vad_parameters or {"onset": 0.5}
         self.no_speech_thresh = 0.45
+        self.same_output_threshold = 10
 
         device = "cuda" if torch.cuda.is_available() else "cpu"
         if device == "cuda":
@@ -1048,7 +1052,7 @@ def update_segments(self, segments, duration):
         last_segment = None
 
         # process complete segments
-        if len(segments) > 1:
+        if len(segments) > 1 and segments[-1].no_speech_prob <= self.no_speech_thresh:
             for i, s in enumerate(segments[:-1]):
                 text_ = s.text
                 self.text.append(text_)
@@ -1062,7 +1066,7 @@ def update_segments(self, segments, duration):
                 self.transcript.append(self.format_segment(start, end, text_, completed=True))
                 offset = min(duration, s.end)
 
-        # only process the segments if it satisfies the no_speech_thresh
+        # only process the last segment if it satisfies the no_speech_thresh
         if segments[-1].no_speech_prob <= self.no_speech_thresh:
             self.current_out += segments[-1].text
             last_segment = self.format_segment(
@@ -1072,14 +1076,15 @@ def update_segments(self, segments, duration):
                 completed=False
             )
 
-        # if same incomplete segment is seen multiple times then update the offset
-        # and append the segment to the list
         if self.current_out.strip() == self.prev_out.strip() and self.current_out != '':
-            self.same_output_threshold += 1
+            self.same_output_count += 1
+            time.sleep(0.1)     # wait for some voice activity just in case there is an unitended pause from the speaker for better punctuations.
         else:
-            self.same_output_threshold = 0
-
-        if self.same_output_threshold > 5:
+            self.same_output_count = 0
+
+        # if same incomplete segment is seen multiple times then update the offset
+        # and append the segment to the list
+        if self.same_output_count > self.same_output_threshold:
             if not len(self.text) or self.text[-1].strip().lower() != self.current_out.strip().lower():
                 self.text.append(self.current_out)
                 self.transcript.append(self.format_segment(
@@ -1090,7 +1095,7 @@ def update_segments(self, segments, duration):
                 ))
             self.current_out = ''
             offset = duration
-            self.same_output_threshold = 0
+            self.same_output_count = 0
             last_segment = None
         else:
             self.prev_out = self.current_out