Skip to content

Commit

Permalink
Merge pull request #296 from makaveli10/upgrade_faster_whisper
Browse files Browse the repository at this point in the history
Upgrade faster-whisper 1.1.0rc0.
  • Loading branch information
zoq authored Nov 19, 2024
2 parents 0e89573 + a1650ea commit 446fc6e
Show file tree
Hide file tree
Showing 4 changed files with 1,223 additions and 309 deletions.
5 changes: 2 additions & 3 deletions requirements/server.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
faster-whisper==1.0.1
faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/8f01aee36b562e6be537e0341cdd40dc8bed33a7.tar.gz
websockets
onnxruntime==1.16.0
numba
Expand All @@ -9,5 +9,4 @@ scipy
jiwer
evaluate
numpy<2
tiktoken==0.3.3
openai-whisper==20231117
openai-whisper==20240930
4 changes: 3 additions & 1 deletion whisper_live/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,9 @@ def write_srt_file(self, output_path="output.srt"):
"""
if self.server_backend == "faster_whisper":
if (self.last_segment) and self.transcript[-1]["text"] != self.last_segment["text"]:
if not self.transcript and self.last_segment is not None:
self.transcript.append(self.last_segment)
elif self.last_segment and self.transcript[-1]["text"] != self.last_segment["text"]:
self.transcript.append(self.last_segment)
utils.create_srt_file(self.transcript, output_path)

Expand Down
29 changes: 17 additions & 12 deletions whisper_live/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def __init__(self, client_uid, websocket):
self.prev_out = ''
self.t_start = None
self.exit = False
self.same_output_threshold = 0
self.same_output_count = 0
self.show_prev_out_thresh = 5 # if there is a pause (no output from whisper), show the previous output for 5 seconds
self.add_pause_thresh = 3 # add a blank to the segment list as a pause (no speech) for 3 seconds
self.transcript = []
Expand Down Expand Up @@ -780,17 +780,21 @@ def __init__(self, websocket, task="transcribe", device=None, language=None, cli
super().__init__(client_uid, websocket)
self.model_sizes = [
"tiny", "tiny.en", "base", "base.en", "small", "small.en",
"medium", "medium.en", "large-v2", "large-v3",
"medium", "medium.en", "large-v2", "large-v3", "distil-small.en",
"distil-medium.en", "distil-large-v2", "distil-large-v3",
"large-v3-turbo", "turbo"
]

if not os.path.exists(model):
self.model_size_or_path = self.check_valid_model(model)
else:
self.model_size_or_path = model
self.language = "en" if self.model_size_or_path.endswith("en") else language
self.task = task
self.initial_prompt = initial_prompt
self.vad_parameters = vad_parameters or {"threshold": 0.5}
self.vad_parameters = vad_parameters or {"onset": 0.5}
self.no_speech_thresh = 0.45
self.same_output_threshold = 10

device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
Expand Down Expand Up @@ -1048,7 +1052,7 @@ def update_segments(self, segments, duration):
last_segment = None

# process complete segments
if len(segments) > 1:
if len(segments) > 1 and segments[-1].no_speech_prob <= self.no_speech_thresh:
for i, s in enumerate(segments[:-1]):
text_ = s.text
self.text.append(text_)
Expand All @@ -1062,7 +1066,7 @@ def update_segments(self, segments, duration):
self.transcript.append(self.format_segment(start, end, text_, completed=True))
offset = min(duration, s.end)

# only process the segments if it satisfies the no_speech_thresh
# only process the last segment if it satisfies the no_speech_thresh
if segments[-1].no_speech_prob <= self.no_speech_thresh:
self.current_out += segments[-1].text
last_segment = self.format_segment(
Expand All @@ -1072,14 +1076,15 @@ def update_segments(self, segments, duration):
completed=False
)

# if same incomplete segment is seen multiple times then update the offset
# and append the segment to the list
if self.current_out.strip() == self.prev_out.strip() and self.current_out != '':
self.same_output_threshold += 1
self.same_output_count += 1
time.sleep(0.1) # wait for some voice activity in case there is an unintended pause from the speaker, for better punctuation.
else:
self.same_output_threshold = 0

if self.same_output_threshold > 5:
self.same_output_count = 0

# if same incomplete segment is seen multiple times then update the offset
# and append the segment to the list
if self.same_output_count > self.same_output_threshold:
if not len(self.text) or self.text[-1].strip().lower() != self.current_out.strip().lower():
self.text.append(self.current_out)
self.transcript.append(self.format_segment(
Expand All @@ -1090,7 +1095,7 @@ def update_segments(self, segments, duration):
))
self.current_out = ''
offset = duration
self.same_output_threshold = 0
self.same_output_count = 0
last_segment = None
else:
self.prev_out = self.current_out
Expand Down
Loading

0 comments on commit 446fc6e

Please sign in to comment.