Spaces:
Sleeping
Sleeping
drop
committed on
Commit
•
e6af331
1
Parent(s):
ec5cb89
Update app.py
Browse files
app.py
CHANGED
@@ -25,6 +25,7 @@ from src.prompts.jsonPromptStrategy import JsonPromptStrategy
|
|
25 |
from src.prompts.prependPromptStrategy import PrependPromptStrategy
|
26 |
from src.source import AudioSource, get_audio_source_collection
|
27 |
from src.vadParallel import ParallelContext, ParallelTranscription
|
|
|
28 |
|
29 |
# External programs
|
30 |
import ffmpeg
|
@@ -496,23 +497,25 @@ class WhisperTranscriber:
|
|
496 |
def write_result(self, result: dict, source_name: str, output_dir: str, highlight_words: bool = False):
|
497 |
if not os.path.exists(output_dir):
|
498 |
os.makedirs(output_dir)
|
499 |
-
|
500 |
text = result["text"]
|
|
|
|
|
501 |
language = result["language"] if "language" in result else None
|
502 |
languageMaxLineWidth = self.__get_max_line_width(language)
|
503 |
-
|
504 |
print("Max line width " + str(languageMaxLineWidth))
|
505 |
vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
|
506 |
srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
|
507 |
json_result = json.dumps(result, indent=4, ensure_ascii=False)
|
508 |
-
|
509 |
output_files = []
|
510 |
-
output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"))
|
511 |
-
output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"))
|
512 |
-
output_files.append(self.__create_file(text, output_dir, source_name + "-transcript.txt"))
|
513 |
-
output_files.append(self.__create_file(json_result, output_dir, source_name + "-result.json"))
|
514 |
-
|
515 |
-
return output_files, text, vtt
|
516 |
|
517 |
def clear_cache(self):
|
518 |
self.model_cache.clear()
|
|
|
25 |
from src.prompts.prependPromptStrategy import PrependPromptStrategy
|
26 |
from src.source import AudioSource, get_audio_source_collection
|
27 |
from src.vadParallel import ParallelContext, ParallelTranscription
|
28 |
+
from src.supress import clean_string, garbage_list
|
29 |
|
30 |
# External programs
|
31 |
import ffmpeg
|
|
|
497 |
def write_result(self, result: dict, source_name: str, output_dir: str, highlight_words: bool = False):
|
498 |
if not os.path.exists(output_dir):
|
499 |
os.makedirs(output_dir)
|
500 |
+
|
501 |
text = result["text"]
|
502 |
+
clean_text = clean_string(text, garbage_list)
|
503 |
+
|
504 |
language = result["language"] if "language" in result else None
|
505 |
languageMaxLineWidth = self.__get_max_line_width(language)
|
506 |
+
|
507 |
print("Max line width " + str(languageMaxLineWidth))
|
508 |
vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
|
509 |
srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
|
510 |
json_result = json.dumps(result, indent=4, ensure_ascii=False)
|
511 |
+
|
512 |
output_files = []
|
513 |
+
output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"))
|
514 |
+
output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"))
|
515 |
+
output_files.append(self.__create_file(clean_text, output_dir, source_name + "-transcript.txt")) # Use clean_text here
|
516 |
+
output_files.append(self.__create_file(json_result, output_dir, source_name + "-result.json"))
|
517 |
+
|
518 |
+
return output_files, clean_text, vtt
|
519 |
|
520 |
def clear_cache(self):
|
521 |
self.model_cache.clear()
|