Spaces:
Sleeping
Sleeping
When the language is set to Chinese,
Simplified Chinese is now converted to Traditional Chinese using the conversion provided by zhconv.
When the "--merge_subtitle_with_sources" argument is enabled,
the subtitle language code is inserted before the extension of the merged video file name, such as .en, .zh, .jp, etc.
The format selector used when downloading YouTube videos now includes the filter [vcodec^=avc1].
- app.py +21 -12
- requirements-fasterWhisper.txt +2 -1
- requirements-whisper.txt +2 -1
- requirements.txt +2 -1
- src/download.py +1 -1
- src/vad.py +1 -1
app.py
CHANGED
@@ -19,6 +19,7 @@ from src.hooks.subTaskProgressListener import SubTaskProgressListener
|
|
19 |
from src.hooks.whisperProgressHook import create_progress_listener_handle
|
20 |
from src.languages import _TO_LANGUAGE_CODE
|
21 |
from src.languages import get_language_names
|
|
|
22 |
from src.modelCache import ModelCache
|
23 |
from src.prompts.jsonPromptStrategy import JsonPromptStrategy
|
24 |
from src.prompts.prependPromptStrategy import PrependPromptStrategy
|
@@ -38,6 +39,7 @@ from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
|
|
38 |
from src.whisper.whisperFactory import create_whisper_container
|
39 |
|
40 |
import shutil
|
|
|
41 |
|
42 |
# Configure more application defaults in config.json5
|
43 |
|
@@ -102,14 +104,11 @@ class WhisperTranscriber:
|
|
102 |
vad, vadMergeWindow, vadMaxMergeSize,
|
103 |
word_timestamps: bool = False, highlight_words: bool = False,
|
104 |
progress=gr.Progress()):
|
105 |
-
decodeOptions = dict(word_timestamps=word_timestamps)
|
106 |
-
if languageName == "Chinese":
|
107 |
-
decodeOptions.update(initial_prompt="繁體: ")
|
108 |
-
self.app_config.vad_initial_prompt_mode = "prepend_all_segments"
|
109 |
|
110 |
vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, self.app_config.vad_padding, self.app_config.vad_prompt_window, self.app_config.vad_initial_prompt_mode)
|
111 |
|
112 |
-
return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
|
|
|
113 |
|
114 |
# Entry function for the full tab
|
115 |
def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
@@ -143,10 +142,6 @@ class WhisperTranscriber:
|
|
143 |
else:
|
144 |
temperature = [temperature]
|
145 |
|
146 |
-
if languageName == "Chinese":
|
147 |
-
initial_prompt = "繁體: " + initial_prompt
|
148 |
-
self.app_config.vad_initial_prompt_mode = "prepend_all_segments"
|
149 |
-
|
150 |
vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode)
|
151 |
|
152 |
return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
|
@@ -163,7 +158,8 @@ class WhisperTranscriber:
|
|
163 |
sources = self.__get_source(urlData, multipleFiles, microphoneData)
|
164 |
|
165 |
try:
|
166 |
-
|
|
|
167 |
selectedModel = modelName if modelName is not None else "base"
|
168 |
|
169 |
model = create_whisper_container(whisper_implementation=self.app_config.whisper_implementation,
|
@@ -256,6 +252,18 @@ class WhisperTranscriber:
|
|
256 |
return download, text, vtt
|
257 |
|
258 |
finally:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
# Cleanup source
|
260 |
if self.deleteUploadedFiles:
|
261 |
for source in sources:
|
@@ -266,13 +274,14 @@ class WhisperTranscriber:
|
|
266 |
srt_path = source_download[0]
|
267 |
save_path = os.path.join(self.app_config.output_dir, source.source_name)
|
268 |
save_without_ext, ext = os.path.splitext(save_path)
|
269 |
-
|
|
|
270 |
|
271 |
#ffmpeg -i "input.mp4" -i "input.srt" -c copy -c:s mov_text output.mp4
|
272 |
input_file = ffmpeg.input(source.source_path)
|
273 |
input_srt = ffmpeg.input(srt_path)
|
274 |
out = ffmpeg.output(input_file, input_srt, output_with_srt, vcodec='copy', acodec='copy', scodec='mov_text')
|
275 |
-
outRsult = out.run()
|
276 |
except Exception as e:
|
277 |
# Ignore error - it's just a cleanup
|
278 |
print("Error merge subtitle with source file: \n" + source.source_path + ", \n" + str(e), outRsult)
|
|
|
19 |
from src.hooks.whisperProgressHook import create_progress_listener_handle
|
20 |
from src.languages import _TO_LANGUAGE_CODE
|
21 |
from src.languages import get_language_names
|
22 |
+
from src.languages import get_language_from_name
|
23 |
from src.modelCache import ModelCache
|
24 |
from src.prompts.jsonPromptStrategy import JsonPromptStrategy
|
25 |
from src.prompts.prependPromptStrategy import PrependPromptStrategy
|
|
|
39 |
from src.whisper.whisperFactory import create_whisper_container
|
40 |
|
41 |
import shutil
|
42 |
+
import zhconv
|
43 |
|
44 |
# Configure more application defaults in config.json5
|
45 |
|
|
|
104 |
vad, vadMergeWindow, vadMaxMergeSize,
|
105 |
word_timestamps: bool = False, highlight_words: bool = False,
|
106 |
progress=gr.Progress()):
|
|
|
|
|
|
|
|
|
107 |
|
108 |
vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, self.app_config.vad_padding, self.app_config.vad_prompt_window, self.app_config.vad_initial_prompt_mode)
|
109 |
|
110 |
+
return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
|
111 |
+
word_timestamps=word_timestamps, highlight_words=highlight_words, progress=progress)
|
112 |
|
113 |
# Entry function for the full tab
|
114 |
def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
|
|
142 |
else:
|
143 |
temperature = [temperature]
|
144 |
|
|
|
|
|
|
|
|
|
145 |
vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode)
|
146 |
|
147 |
return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
|
|
|
158 |
sources = self.__get_source(urlData, multipleFiles, microphoneData)
|
159 |
|
160 |
try:
|
161 |
+
langObj = get_language_from_name(languageName)
|
162 |
+
selectedLanguage = languageName.lower() if languageName is not None and len(languageName) > 0 else None
|
163 |
selectedModel = modelName if modelName is not None else "base"
|
164 |
|
165 |
model = create_whisper_container(whisper_implementation=self.app_config.whisper_implementation,
|
|
|
252 |
return download, text, vtt
|
253 |
|
254 |
finally:
|
255 |
+
if languageName == "Chinese":
|
256 |
+
for file_path in source_download:
|
257 |
+
try:
|
258 |
+
with open(file_path, "r+", encoding="utf-8") as source:
|
259 |
+
content = source.read()
|
260 |
+
content = zhconv.convert(content, "zh-tw")
|
261 |
+
source.seek(0)
|
262 |
+
source.write(content)
|
263 |
+
except Exception as e:
|
264 |
+
# Ignore error - it's just a cleanup
|
265 |
+
print("Error converting Traditional Chinese with download source file: \n" + file_path + ", \n" + str(e))
|
266 |
+
|
267 |
# Cleanup source
|
268 |
if self.deleteUploadedFiles:
|
269 |
for source in sources:
|
|
|
274 |
srt_path = source_download[0]
|
275 |
save_path = os.path.join(self.app_config.output_dir, source.source_name)
|
276 |
save_without_ext, ext = os.path.splitext(save_path)
|
277 |
+
lang_ext = "." + langObj.code if langObj is not None else ""
|
278 |
+
output_with_srt = save_without_ext + lang_ext + ext
|
279 |
|
280 |
#ffmpeg -i "input.mp4" -i "input.srt" -c copy -c:s mov_text output.mp4
|
281 |
input_file = ffmpeg.input(source.source_path)
|
282 |
input_srt = ffmpeg.input(srt_path)
|
283 |
out = ffmpeg.output(input_file, input_srt, output_with_srt, vcodec='copy', acodec='copy', scodec='mov_text')
|
284 |
+
outRsult = out.run(overwrite_output=True)
|
285 |
except Exception as e:
|
286 |
# Ignore error - it's just a cleanup
|
287 |
print("Error merge subtitle with source file: \n" + source.source_path + ", \n" + str(e), outRsult)
|
requirements-fasterWhisper.txt
CHANGED
@@ -6,4 +6,5 @@ yt-dlp
|
|
6 |
json5
|
7 |
torch
|
8 |
torchaudio
|
9 |
-
more_itertools
|
|
|
|
6 |
json5
|
7 |
torch
|
8 |
torchaudio
|
9 |
+
more_itertools
|
10 |
+
zhconv
|
requirements-whisper.txt
CHANGED
@@ -6,4 +6,5 @@ gradio==3.36.0
|
|
6 |
yt-dlp
|
7 |
torchaudio
|
8 |
altair
|
9 |
-
json5
|
|
|
|
6 |
yt-dlp
|
7 |
torchaudio
|
8 |
altair
|
9 |
+
json5
|
10 |
+
zhconv
|
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ yt-dlp
|
|
6 |
json5
|
7 |
torch
|
8 |
torchaudio
|
9 |
-
more_itertools
|
|
|
|
6 |
json5
|
7 |
torch
|
8 |
torchaudio
|
9 |
+
more_itertools
|
10 |
+
zhconv
|
src/download.py
CHANGED
@@ -29,7 +29,7 @@ def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = N
|
|
29 |
destinationDirectory = mkdtemp()
|
30 |
|
31 |
ydl_opts = {
|
32 |
-
"format": "bestaudio/best" if onlyAudio else "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best",
|
33 |
'paths': {
|
34 |
'home': destinationDirectory
|
35 |
}
|
|
|
29 |
destinationDirectory = mkdtemp()
|
30 |
|
31 |
ydl_opts = {
|
32 |
+
"format": "bestaudio/best" if onlyAudio else "bestvideo[ext=mp4][vcodec^=avc1]+bestaudio[ext=m4a]/best",
|
33 |
'paths': {
|
34 |
'home': destinationDirectory
|
35 |
}
|
src/vad.py
CHANGED
@@ -204,7 +204,7 @@ class AbstractTranscription(ABC):
|
|
204 |
detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None
|
205 |
|
206 |
print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
|
207 |
-
segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
|
208 |
|
209 |
perf_start_time = time.perf_counter()
|
210 |
|
|
|
204 |
detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None
|
205 |
|
206 |
print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
|
207 |
+
segment_duration, "expanded: ", segment_expand_amount, ", prompt: ", segment_prompt, ", detected language: ", detected_language)
|
208 |
|
209 |
perf_start_time = time.perf_counter()
|
210 |
|