Spaces:

StarPigeon
/

ViDove

Sleeping

@@ -52,6 +52,8 @@ punctuation_dict = {
     },
 }
 class SrtSegment(object):
     def __init__(self, src_lang, tgt_lang, *args) -> None:
         self.src_lang = src_lang
@@ -150,11 +152,19 @@ class SrtSegment(object):
 class SrtScript(object):
-    def __init__(self, src_lang, tgt_lang, segments) -> None:
         self.src_lang = src_lang
         self.tgt_lang = tgt_lang
         self.segments = [SrtSegment(self.src_lang, self.tgt_lang, seg) for seg in segments]
     @classmethod
     def parse_from_srt_file(cls, src_lang, tgt_lang, path: str):
         with open(path, 'r', encoding="utf-8") as f:
@@ -429,6 +439,12 @@ class SrtScript(object):
     def correct_with_force_term(self):
         ## force term correction
         logging.info("performing force term correction")
         # load term dictionary
         with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
             term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
@@ -478,6 +494,12 @@ class SrtScript(object):
     def spell_check_term(self):
         logging.info("performing spell check")
         import enchant
         dict = enchant.Dict('en_US')
         term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')

     },
 }
+dict_path = "./domain_dict"
 class SrtSegment(object):
     def __init__(self, src_lang, tgt_lang, *args) -> None:
         self.src_lang = src_lang
 class SrtScript(object):
+    def __init__(self, src_lang, tgt_lang, segments, domain="General") -> None:
+        # Build one SrtSegment per entry of `segments` and record the domain.
+        # `domain` defaults to "General", for which no domain path is checked.
+        self.domain = domain
         self.src_lang = src_lang
         self.tgt_lang = tgt_lang
         self.segments = [SrtSegment(self.src_lang, self.tgt_lang, seg) for seg in segments]
+        if self.domain != "General":
+            # A non-General domain is looked up as a path under `dict_path`.
+            if os.path.exists(f"{dict_path}/{self.domain}"):
+                # TODO: load dictionary
+                ...
+            else:
+                # Only logged: construction continues without raising.
+                logging.error(f"domain {self.domain} doesn't exist")
     @classmethod
     def parse_from_srt_file(cls, src_lang, tgt_lang, path: str):
         with open(path, 'r', encoding="utf-8") as f:
     def correct_with_force_term(self):
+        # Force-term correction step. Only the opening of the method is
+        # visible in this hunk; the remainder is unchanged.
         ## force term correction
         logging.info("performing force term correction")
+        # check domain
+        if self.domain == "General":
+            logging.info("General domain could not perform correct_with_force_term. skip this step.")
+            # Return early: a bare `pass` is a no-op and would fall through,
+            # running the correction that the log message says is skipped.
+            return
         # load term dictionary
         with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
             term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
     def spell_check_term(self):
+        # Spell-check step backed by pyenchant. Only the opening of the
+        # method is visible in this hunk; the remainder is unchanged.
         logging.info("performing spell check")
+        # check domain
+        if self.domain == "General":
+            logging.info("General domain could not perform spell_check_term. skip this step.")
+            # Return early: a bare `pass` is a no-op and would fall through,
+            # running the spell check that the log message says is skipped.
+            return
         import enchant
         dict = enchant.Dict('en_US')
         term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')

src/task.py CHANGED Viewed

@@ -157,7 +157,7 @@ class Task:
             # after get the transcript, release the gpu resource
             torch.cuda.empty_cache()
-        self.SRT_Script = SrtScript(self.source_lang, self.target_lang, transcript['segments'])
         # save the srt script to local
         self.SRT_Script.write_srt_file_src(src_srt_path)

             # after get the transcript, release the gpu resource
             torch.cuda.empty_cache()
+        self.SRT_Script = SrtScript(self.source_lang, self.target_lang, transcript['segments'], self.field)
         # save the srt script to local
         self.SRT_Script.write_srt_file_src(src_srt_path)