Spaces:

StarPigeon
/

ViDove

Sleeping

App Files Files Community

Eason Lu committed on Mar 22, 2023

Commit

22b6efb

•

1 Parent(s): c7ce724

TODO: timestamps still need debugging

Browse files

Former-commit-id: c8b5a96d8beab6123216fef6e59b9d828904dba9

Files changed (4) hide show

.gitignore +1 -0
SRT.py +94 -11
__pycache__/srt2ass.cpython-38.pyc +0 -0
pipeline.py +99 -73

.gitignore CHANGED Viewed

@@ -1,6 +1,7 @@
 /downloads
 /results
 .DS_Store
 test.py
 test.srt
 test.txt

 /downloads
 /results
 .DS_Store
+/__pycache__
 test.py
 test.srt
 test.txt

SRT.py CHANGED Viewed

@@ -3,14 +3,31 @@ import os
 import whisper
 class SRT_segment(object):
-    def __init__(self, segment) -> None:
-        self.start_time_str = str(0)+str(timedelta(seconds=int(segment['start'])))+',000'
-        self.end_time_str = str(0)+str(timedelta(seconds=int(segment['end'])))+',000'
-        self.segment_id = segment['id']+1
-        self.source_text = segment['text']
-        self.duration = f"{self.start_time_str} --> {self.end_time_str}"
-        self.translation = ""
 class SRT_script():
     def __init__(self, segments) -> None:
@@ -18,13 +35,79 @@ class SRT_script():
         for seg in segments:
             srt_seg = SRT_segment(seg)
             self.segments.append(srt_seg)
-    def get_source_only():
-        # return a string
         pass
-    def write_srt_file(path:str):
         # write srt file to path
         pass

 import whisper
 class SRT_segment(object):
+    def __init__(self, *args) -> None:
+        if isinstance(args[0], dict):
+            segment = args[0]
+            self.start_time_str = str(0)+str(timedelta(seconds=int(segment['start'])))+',000'
+            self.end_time_str = str(0)+str(timedelta(seconds=int(segment['end'])))+',000'
+            self.segment_id = segment['id']+1
+            self.source_text = segment['text']
+            self.duration = f"{self.start_time_str} --> {self.end_time_str}"
+            self.translation = ""
+        elif isinstance(args[0], list):
+            self.segment_id = args[0][0]
+            self.source_text = args[0][2]
+            self.duration = args[0][1]
+            self.start_time_str = self.duration.split("-->")[0]
+            self.end_time_str = self.duration.split("-->")[1]
+            self.translation = ""
+    def __str__(self) -> str:
+        return  f'{self.segment_id}\n{self.duration}\n{self.source_text}\n\n'
+    def get_trans_str(self) -> str:
+        return f'{self.segment_id}\n{self.duration}\n{self.translation}\n\n'
+    def get_bilingual_str(self) -> str:
+        return f'{self.segment_id}\n{self.duration}\n{self.source_text}\n{self.translation}\n\n'
 class SRT_script():
     def __init__(self, segments) -> None:
         for seg in segments:
             srt_seg = SRT_segment(seg)
             self.segments.append(srt_seg)
+    @classmethod
+    def parse_from_srt_file(cls, path:str):
+        with open(path, 'r', encoding="utf-8") as f:
+            script_lines = f.read().splitlines()
+        segments = []
+        for i in range(len(script_lines)):
+            if i % 4 == 0:
+                segments.append(list(script_lines[i:i+4]))
+        return cls(segments)
+    def set_translation(self, translate:str, id_range:tuple):
+        start_seg_id = id_range[0]
+        end_seg_id = id_range[1]
+        lines = translate.split('\n\n')
+        print(id_range)
+        print(translate)
+        # print(len(translate))
+        for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id]):
+            seg.translation = lines[i]
         pass
+    def get_source_only(self):
+        # return a string with pure source text
+        result = ""
+        for seg in self.segments:
+            result+=f'{seg.source_text}\n\n'
+        return result
+    def reform_src_str(self):
+        result = ""
+        for seg in self.segments:
+            result += str(seg)
+        return result
+    def reform_trans_str(self):
+        result = ""
+        for seg in self.segments:
+            result += seg.get_trans_str()
+        return result
+    def form_bilingual_str(self):
+        result = ""
+        for seg in self.segments:
+            result += seg.get_bilingual_str()
+        return result
+    def write_srt_file_src(self, path:str):
         # write srt file to path
+        with open(path, "w", encoding='utf-8') as f:
+            f.write(self.reform_src_str())
         pass
+    def write_srt_file_translate(self, path:str):
+        with open(path, "w", encoding='utf-8') as f:
+            f.write(self.reform_trans_str())
+        pass
+    def write_srt_file_bilingual(self, path:str):
+        with open(path, "w", encoding='utf-8') as f:
+            f.write(self.form_bilingual_str())
+        pass
+    def correct_with_force_term():
+        # force term correction
+        pass

__pycache__/srt2ass.cpython-38.pyc DELETED Viewed

Binary file (13.9 kB)

pipeline.py CHANGED Viewed

@@ -4,6 +4,8 @@ import argparse
 import os
 import whisper
 from tqdm import tqdm
 parser = argparse.ArgumentParser()
 parser.add_argument("--link", help="youtube video link here", default=None, type=str, required=False)
@@ -84,96 +86,120 @@ if not os.path.exists(f'{RESULT_PATH}/{VIDEO_NAME}'):
 # Instead of using the script_en variable directly, we'll use script_input
 srt_file_en = args.srt_file
 if srt_file_en is not None:
-    with open(srt_file_en, 'r', encoding='utf-8') as f:
-        script_input = f.read()
 else:
     # use whisper to perform speech-to-text and save the result as <video name>_en.srt under RESULT_PATH.
     srt_file_en = "{}/{}/{}_en.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
     if not os.path.exists(srt_file_en):
         # use OpenAI API for transcribe
         # transcript = openai.Audio.transcribe("whisper-1", audio_file)
         # use local whisper model
-        model = whisper.load_model("base") # using base model in local machine (may use large model on our server)
         transcript = model.transcribe(audio_path)
         #Write SRT file
-        from whisper.utils import WriteSRT
-        with open(srt_file_en, 'w', encoding="utf-8") as srt:
-            writer = WriteSRT(RESULT_PATH)
-            writer.write_result(transcript, srt)
-    # split the video script(open ai prompt limit: about 5000)
-    with open(srt_file_en, 'r', encoding='utf-8') as f:
-        script_en = f.read()
-        script_input = script_en
 if not args.only_srt:
     from srt2ass import srt2ass
     assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
     print('ASS subtitle saved as: ' + assSub_en)
-# force translate the starcraft2 term into chinese according to the dict
-# TODO: shortcut translation i.e. VA, ob
-# TODO: variety of translation
-from csv import reader
-import re
-# read dict
-with open("finetune_data/dict.csv",'r', encoding='utf-8') as f:
-  csv_reader = reader(f)
-  term_dict = {rows[0]:rows[1] for rows in csv_reader}
-def clean_timestamp(lines):
-  new_lines = []
-  strinfo = re.compile('[0-9]+\n.{25},[0-9]{3}')    # note: use 4 backslashes (\\\\) to stand for a single \
-  new_lines = strinfo.sub('_-_', lines)
-  print(new_lines)
-  return new_lines
-ready_lines = re.sub('\n', '\n ', script_input)
-ready_words = ready_lines.split(" ")
-i = 0
-while i < len(ready_words):
-  word = ready_words[i]
-  if word[-2:] == ".\n" :
-    if word[:-2].lower() in term_dict :
-      new_word = word.replace(word[:-2], term_dict.get(word[:-2].lower())) + ' '
-      ready_words[i] = new_word
-    else :
-      word += ' '
-      ready_words[i] = word
-  elif word.lower() in term_dict :
-      new_word = word.replace(word,term_dict.get(word.lower())) + ' '
-      ready_words[i] = new_word
-  else :
-    word += " "
-    ready_words[i]= word
-  i += 1
-script_input_withForceTerm = re.sub('\n ', '\n', "".join(ready_words))
 # Split the video script by sentences and create chunks within the token limit
-n_threshold = 1000  # Token limit for the GPT-3 model
-script_split = script_input_withForceTerm.split('\n')
-script_arr = []
-script = ""
-for sentence in script_split:
-    if len(script) + len(sentence) + 1 <= n_threshold:
-        script += sentence + '\n'
-    else:
         script_arr.append(script.strip())
-        script = sentence + '\n'
-if script.strip():
-    script_arr.append(script.strip())
 # Translate and save
-for s in tqdm(script_arr):
     # using chatgpt model
     if model_name == "gpt-3.5-turbo":
         # print(s + "\n")
@@ -187,9 +213,8 @@ for s in tqdm(script_arr):
             ],
             temperature=0.15
         )
-        with open(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", 'a+') as f:
-            f.write(response['choices'][0]['message']['content'].strip())
-            f.write("\n")
     if model_name == "text-davinci-003":
         prompt = f"Please help me translate this into Chinese:\n\n{s}\n\n"
@@ -203,10 +228,11 @@ for s in tqdm(script_arr):
             frequency_penalty=0.0,
             presence_penalty=0.0
         )
-        with open(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", 'a+') as f:
-            f.write(response['choices'][0]['text'].strip())
-            f.write("\n")
 if not args.only_srt:
     assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")

 import os
 import whisper
 from tqdm import tqdm
+from SRT import SRT_script
+import stable_whisper
 parser = argparse.ArgumentParser()
 parser.add_argument("--link", help="youtube video link here", default=None, type=str, required=False)
 # Instead of using the script_en variable directly, we'll use script_input
 srt_file_en = args.srt_file
 if srt_file_en is not None:
+    # with open(srt_file_en, 'r', encoding='utf-8') as f:
+    #     script_input = f.read()
+    srt = SRT_script.parse_from_srt_file(srt_file_en)
+    script_input = srt.get_source_only()
 else:
     # use whisper to perform speech-to-text and save the result as <video name>_en.srt under RESULT_PATH.
     srt_file_en = "{}/{}/{}_en.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
     if not os.path.exists(srt_file_en):
         # use OpenAI API for transcribe
         # transcript = openai.Audio.transcribe("whisper-1", audio_file)
         # use local whisper model
+        # model = whisper.load_model("base") # using base model in local machine (may use large model on our server)
+        # transcript = model.transcribe(audio_path)
+        # use stable-whisper
+        model = stable_whisper.load_model('base')
         transcript = model.transcribe(audio_path)
+        transcript.to_srt_vtt(srt_file_en)
+        transcript = transcript.to_dict()
+        srt = SRT_script(transcript['segments']) # read segments to SRT class
+        script_input = srt.get_source_only()
         #Write SRT file
+        # from whisper.utils import WriteSRT
+        # with open(srt_file_en, 'w', encoding="utf-8") as f:
+        #     writer = WriteSRT(RESULT_PATH)
+        #     writer.write_result(transcript, f)
+    else:
+        srt = SRT_script.parse_from_srt_file(srt_file_en)
+        script_input = srt.get_source_only()
 if not args.only_srt:
     from srt2ass import srt2ass
     assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
     print('ASS subtitle saved as: ' + assSub_en)
+# # force translate the starcraft2 term into chinese according to the dict
+# # TODO: shortcut translation i.e. VA, ob
+# # TODO: variety of translation
+# from csv import reader
+# import re
+# # read dict
+# with open("finetune_data/dict.csv",'r', encoding='utf-8') as f:
+#   csv_reader = reader(f)
+#   term_dict = {rows[0]:rows[1] for rows in csv_reader}
+# def clean_timestamp(lines):
+#   new_lines = []
+#   strinfo = re.compile('[0-9]+\n.{25},[0-9]{3}')    # note: use 4 backslashes (\\\\) to stand for a single \
+#   new_lines = strinfo.sub('_-_', lines)
+#   print(new_lines)
+#   return new_lines
+# ready_lines = re.sub('\n', '\n ', script_input)
+# ready_words = ready_lines.split(" ")
+# i = 0
+# while i < len(ready_words):
+#   word = ready_words[i]
+#   if word[-2:] == ".\n" :
+#     if word[:-2].lower() in term_dict :
+#       new_word = word.replace(word[:-2], term_dict.get(word[:-2].lower())) + ' '
+#       ready_words[i] = new_word
+#     else :
+#       word += ' '
+#       ready_words[i] = word
+#   elif word.lower() in term_dict :
+#       new_word = word.replace(word,term_dict.get(word.lower())) + ' '
+#       ready_words[i] = new_word
+#   else :
+#     word += " "
+#     ready_words[i]= word
+#   i += 1
+# script_input_withForceTerm = re.sub('\n ', '\n', "".join(ready_words))
+srt.correct_with_force_term()
 # Split the video script by sentences and create chunks within the token limit
def script_split(script_in, chunk_size = 1000):
    """Group a blank-line-separated script into chunks for translation.

    Args:
        script_in: source text in which consecutive segments are separated
            by a blank line; a trailing blank line after the last segment is
            accepted and ignored.
        chunk_size: soft upper bound on the length of one chunk, measured in
            characters (not tokens). A single segment longer than this still
            becomes a chunk of its own.

    Returns:
        A pair ``(script_arr, range_arr)`` of equal length. ``script_arr[i]``
        is the stripped text of chunk ``i`` and ``range_arr[i]`` is the
        ``(first_id, last_id)`` pair of 1-based, inclusive segment ids that
        the chunk covers.
    """
    sentences = script_in.split('\n\n')
    # Text that ends with a blank line splits into one extra empty piece that
    # is not a segment; dropping it keeps the ids correct whether or not the
    # input carries that terminator.
    if sentences and sentences[-1] == '':
        sentences.pop()

    script_arr = []
    range_arr = []
    chunk = []
    chunk_len = 0
    start = 1
    for seg_id, sentence in enumerate(sentences, start=1):
        # Only close a chunk that already holds something, so an oversized
        # first segment never produces an empty chunk.
        if chunk and chunk_len + len(sentence) + 1 > chunk_size:
            text = '\n\n'.join(chunk).strip()
            if text:
                script_arr.append(text)
                range_arr.append((start, seg_id - 1))
            chunk = []
            chunk_len = 0
            start = seg_id
        chunk.append(sentence)
        chunk_len += len(sentence) + 2  # segment plus its blank-line separator

    text = '\n\n'.join(chunk).strip()
    if text:
        script_arr.append(text)
        range_arr.append((start, len(sentences)))
    return script_arr, range_arr
+script_arr, range_arr = script_split(script_input)
 # Translate and save
+for s, range in tqdm(zip(script_arr, range_arr)):
+    print(s)
     # using chatgpt model
     if model_name == "gpt-3.5-turbo":
         # print(s + "\n")
             ],
             temperature=0.15
         )
+        translate = response['choices'][0]['message']['content'].strip()
     if model_name == "text-davinci-003":
         prompt = f"Please help me translate this into Chinese:\n\n{s}\n\n"
             frequency_penalty=0.0,
             presence_penalty=0.0
         )
+        translate = response['choices'][0]['text'].strip()
+    srt.set_translation(translate, range)
+srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
 if not args.only_srt:
     assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")