drop committed on
Commit
e6af331
1 Parent(s): ec5cb89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -9
app.py CHANGED
@@ -25,6 +25,7 @@ from src.prompts.jsonPromptStrategy import JsonPromptStrategy
25
  from src.prompts.prependPromptStrategy import PrependPromptStrategy
26
  from src.source import AudioSource, get_audio_source_collection
27
  from src.vadParallel import ParallelContext, ParallelTranscription
 
28
 
29
  # External programs
30
  import ffmpeg
@@ -496,23 +497,25 @@ class WhisperTranscriber:
496
  def write_result(self, result: dict, source_name: str, output_dir: str, highlight_words: bool = False):
497
  if not os.path.exists(output_dir):
498
  os.makedirs(output_dir)
499
-
500
  text = result["text"]
 
 
501
  language = result["language"] if "language" in result else None
502
  languageMaxLineWidth = self.__get_max_line_width(language)
503
-
504
  print("Max line width " + str(languageMaxLineWidth))
505
  vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
506
  srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
507
  json_result = json.dumps(result, indent=4, ensure_ascii=False)
508
-
509
  output_files = []
510
- output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"));
511
- output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"));
512
- output_files.append(self.__create_file(text, output_dir, source_name + "-transcript.txt"));
513
- output_files.append(self.__create_file(json_result, output_dir, source_name + "-result.json"));
514
-
515
- return output_files, text, vtt
516
 
517
  def clear_cache(self):
518
  self.model_cache.clear()
 
25
  from src.prompts.prependPromptStrategy import PrependPromptStrategy
26
  from src.source import AudioSource, get_audio_source_collection
27
  from src.vadParallel import ParallelContext, ParallelTranscription
28
+ from src.supress import clean_string, garbage_list
29
 
30
  # External programs
31
  import ffmpeg
 
497
  def write_result(self, result: dict, source_name: str, output_dir: str, highlight_words: bool = False):
498
  if not os.path.exists(output_dir):
499
  os.makedirs(output_dir)
500
+
501
  text = result["text"]
502
+ clean_text = clean_string(text, garbage_list)
503
+
504
  language = result["language"] if "language" in result else None
505
  languageMaxLineWidth = self.__get_max_line_width(language)
506
+
507
  print("Max line width " + str(languageMaxLineWidth))
508
  vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
509
  srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
510
  json_result = json.dumps(result, indent=4, ensure_ascii=False)
511
+
512
  output_files = []
513
+ output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"))
514
+ output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"))
515
+ output_files.append(self.__create_file(clean_text, output_dir, source_name + "-transcript.txt")) # Use clean_text here
516
+ output_files.append(self.__create_file(json_result, output_dir, source_name + "-result.json"))
517
+
518
+ return output_files, clean_text, vtt
519
 
520
  def clear_cache(self):
521
  self.model_cache.clear()