Upgrade faster-whisper 1.1.0rc0 #296

Merged · 4 commits · Nov 19, 2024
5 changes: 2 additions & 3 deletions requirements/server.txt
@@ -1,4 +1,4 @@
-faster-whisper==1.0.1
+faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/8f01aee36b562e6be537e0341cdd40dc8bed33a7.tar.gz
 websockets
 onnxruntime==1.16.0
 numba
@@ -9,5 +9,4 @@ scipy
 jiwer
 evaluate
 numpy<2
-tiktoken==0.3.3
-openai-whisper==20231117
+openai-whisper==20240930
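Since the requirement now points at a specific SYSTRAN/faster-whisper commit instead of a PyPI release, it can be worth confirming at runtime which build actually got installed. A minimal sketch, assuming the pinned commit reports a 1.1.x version string:

```python
# Post-install sanity check (run after `pip install -r requirements/server.txt`);
# the exact version string reported by the pinned commit is an assumption, not taken from this PR.
import faster_whisper
from faster_whisper import WhisperModel

print(faster_whisper.__version__)  # expect a 1.1.x pre-release rather than 1.0.1

# The constructor arguments the server relies on still exist in the pinned build.
model = WhisperModel("tiny", device="cpu", compute_type="int8")
```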
4 changes: 3 additions & 1 deletion whisper_live/client.py
@@ -259,7 +259,9 @@ def write_srt_file(self, output_path="output.srt"):
 
         """
         if self.server_backend == "faster_whisper":
-            if (self.last_segment) and self.transcript[-1]["text"] != self.last_segment["text"]:
+            if not self.transcript and self.last_segment is not None:
+                self.transcript.append(self.last_segment)
+            elif self.last_segment and self.transcript[-1]["text"] != self.last_segment["text"]:
                 self.transcript.append(self.last_segment)
             utils.create_srt_file(self.transcript, output_path)
 
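The client-side change guards the case where a session ends with only an in-progress segment: `self.transcript` is still empty, so the old `self.transcript[-1]` lookup raised an `IndexError` before the SRT file could be written. A standalone sketch of the new guard (the `finalize_transcript` helper and the segment dict are illustrative, not part of the codebase):

```python
from typing import Optional

def finalize_transcript(transcript: list, last_segment: Optional[dict]) -> list:
    """Illustrative restatement of the guard added to write_srt_file in this PR."""
    if not transcript and last_segment is not None:
        # nothing was completed yet; keep the lone in-progress segment
        transcript.append(last_segment)
    elif last_segment and transcript[-1]["text"] != last_segment["text"]:
        # avoid duplicating the final segment if it is already the last entry
        transcript.append(last_segment)
    return transcript

# e.g. a session that ended before any segment was marked complete:
print(finalize_transcript([], {"start": "0.000", "end": "1.200", "text": "hello"}))
```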
29 changes: 17 additions & 12 deletions whisper_live/server.py
@@ -417,7 +417,7 @@ def __init__(self, client_uid, websocket):
         self.prev_out = ''
         self.t_start = None
         self.exit = False
-        self.same_output_threshold = 0
+        self.same_output_count = 0
         self.show_prev_out_thresh = 5  # if pause(no output from whisper) show previous output for 5 seconds
         self.add_pause_thresh = 3  # add a blank to segment list as a pause(no speech) for 3 seconds
         self.transcript = []
@@ -780,17 +780,21 @@ def __init__(self, websocket, task="transcribe", device=None, language=None, cli
         super().__init__(client_uid, websocket)
         self.model_sizes = [
             "tiny", "tiny.en", "base", "base.en", "small", "small.en",
-            "medium", "medium.en", "large-v2", "large-v3",
+            "medium", "medium.en", "large-v2", "large-v3", "distil-small.en",
+            "distil-medium.en", "distil-large-v2", "distil-large-v3",
+            "large-v3-turbo", "turbo"
         ]
 
         if not os.path.exists(model):
             self.model_size_or_path = self.check_valid_model(model)
         else:
             self.model_size_or_path = model
         self.language = "en" if self.model_size_or_path.endswith("en") else language
         self.task = task
         self.initial_prompt = initial_prompt
-        self.vad_parameters = vad_parameters or {"threshold": 0.5}
+        self.vad_parameters = vad_parameters or {"onset": 0.5}
+        self.no_speech_thresh = 0.45
+        self.same_output_threshold = 10
 
         device = "cuda" if torch.cuda.is_available() else "cpu"
         if device == "cuda":
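The new defaults follow the pinned faster-whisper build: its VAD options take `onset` rather than `threshold` for the speech-probability cutoff, and the no-speech filter and repeated-output limit become explicit attributes instead of hard-coded values. A rough sketch of how the first two settings are expected to be used (a simplified stand-in for the server's transcribe call, not a copy of it); the repeated-output limit is exercised in the sketch after the `update_segments` hunks below:

```python
from faster_whisper import WhisperModel

# Assumed values mirroring the diff above; "audio.wav" and the model size are placeholders.
vad_parameters = {"onset": 0.5}   # the pinned build names the VAD speech threshold "onset"
no_speech_thresh = 0.45           # segments above this no-speech probability are skipped

model = WhisperModel("small", device="cpu", compute_type="int8")
segments, info = model.transcribe(
    "audio.wav",
    vad_filter=True,
    vad_parameters=vad_parameters,
)
kept = [s for s in segments if s.no_speech_prob <= no_speech_thresh]
print(f"kept {len(kept)} segments")
```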
@@ -1048,7 +1052,7 @@ def update_segments(self, segments, duration):
         last_segment = None
 
         # process complete segments
-        if len(segments) > 1:
+        if len(segments) > 1 and segments[-1].no_speech_prob <= self.no_speech_thresh:
             for i, s in enumerate(segments[:-1]):
                 text_ = s.text
                 self.text.append(text_)
@@ -1062,7 +1066,7 @@
                 self.transcript.append(self.format_segment(start, end, text_, completed=True))
                 offset = min(duration, s.end)
 
-        # only process the segments if it satisfies the no_speech_thresh
+        # only process the last segment if it satisfies the no_speech_thresh
         if segments[-1].no_speech_prob <= self.no_speech_thresh:
             self.current_out += segments[-1].text
             last_segment = self.format_segment(
@@ -1072,14 +1076,15 @@
                 completed=False
             )
 
-        # if same incomplete segment is seen multiple times then update the offset
-        # and append the segment to the list
         if self.current_out.strip() == self.prev_out.strip() and self.current_out != '':
-            self.same_output_threshold += 1
+            self.same_output_count += 1
+            time.sleep(0.1)  # wait for some voice activity in case there is an unintended pause from the speaker, for better punctuation
         else:
-            self.same_output_threshold = 0
-
-        if self.same_output_threshold > 5:
+            self.same_output_count = 0
+
+        # if same incomplete segment is seen multiple times then update the offset
+        # and append the segment to the list
+        if self.same_output_count > self.same_output_threshold:
             if not len(self.text) or self.text[-1].strip().lower() != self.current_out.strip().lower():
                 self.text.append(self.current_out)
                 self.transcript.append(self.format_segment(
@@ -1090,7 +1095,7 @@
                 ))
             self.current_out = ''
             offset = duration
-            self.same_output_threshold = 0
+            self.same_output_count = 0
             last_segment = None
         else:
             self.prev_out = self.current_out
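Across the `update_segments` hunks, the consecutive-repeat counter is renamed to `same_output_count` so it no longer collides with the configurable limit `same_output_threshold`, a short `time.sleep` gives the speaker a chance to resume before the text is finalized, and the no-speech filter is applied before completed segments are emitted. A simplified, self-contained sketch of the repeat-handling idea (the `RepeatTracker` class is illustrative only, not code from this PR):

```python
class RepeatTracker:
    """Simplified sketch of the repeated-partial-output handling in update_segments."""

    def __init__(self, same_output_threshold: int = 10):
        self.same_output_threshold = same_output_threshold
        self.same_output_count = 0
        self.prev_out = ""

    def should_force_complete(self, current_out: str) -> bool:
        # count consecutive iterations that produced the same non-empty partial text
        if current_out.strip() == self.prev_out.strip() and current_out != "":
            self.same_output_count += 1
        else:
            self.same_output_count = 0
        self.prev_out = current_out
        # once the limit is exceeded, the server promotes the partial segment to a
        # completed one and advances the timestamp offset past it
        if self.same_output_count > self.same_output_threshold:
            self.same_output_count = 0
            return True
        return False


tracker = RepeatTracker(same_output_threshold=3)
flags = [tracker.should_force_complete("Hello") for _ in range(5)]
print(flags)  # [False, False, False, False, True]
```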