CanYing0913 committed
Commit 7d74f8e • 1 Parent(s): cba75b6

Update srt.py and file hierarchy


Former-commit-id: d36b43736cb3447da3e26e3caef1e351bf431dc3

doc/Installation.md ADDED
@@ -0,0 +1,7 @@
+ ### **Recommended:**
+ We recommend configuring your environment using [mamba](https://pypi.org/project/mamba/). The following packages are required:
+ ```
+ openai
+ openai-whisper
+
+ ```
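As a quick check that the environment is usable, both listed packages should import cleanly; a minimal sketch (assuming the mamba environment is active):

```python
# Sanity check for the two packages listed in Installation.md.
import openai    # OpenAI API client
import whisper   # provided by the openai-whisper package

# List the Whisper model sizes available for download.
print(whisper.available_models())
```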
doc/struct.md ADDED
@@ -0,0 +1,7 @@
+ # Structure of Repository
+ ```
+ ├── doc                # Documentation for the repository.
+ ├────── struct.md      # Document of repository structure.
+ ├── finetune_data      #
+ └── README.md
+ ```
pipeline.py CHANGED
@@ -3,10 +3,10 @@ from pytube import YouTube
 import argparse
 import os
 from tqdm import tqdm
-from SRT import SRT_script
+from srt_util.srt import SrtScript
 import stable_whisper
 import whisper
-from srt2ass import srt2ass
+from srt_util.srt2ass import srt2ass
 
 import subprocess
 
@@ -85,7 +85,7 @@ def get_sources(args, download_path, result_path, video_name):
 def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file = None, whisper_model = 'large', method = "stable"):
     # Instead of using the script_en variable directly, we'll use script_input
     if srt_file_en is not None:
-        srt = SRT_script.parse_from_srt_file(srt_file_en)
+        srt = SrtScript.parse_from_srt_file(srt_file_en)
     else:
         # using whisper to perform speech-to-text and save it in <video name>_en.txt under RESULT PATH.
         srt_file_en = "{}/{}/{}_en.srt".format(result_path, video_name, video_name)
@@ -115,10 +115,10 @@ def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file = None, whisper_model = 'large', method = "stable"):
         else:
             raise ValueError("invalid speech to text method")
 
-        srt = SRT_script(transcript['segments']) # read segments to SRT class
+        srt = SrtScript(transcript['segments']) # read segments to SRT class
 
     else:
-        srt = SRT_script.parse_from_srt_file(srt_file_en)
+        srt = SrtScript.parse_from_srt_file(srt_file_en)
     return srt_file_en, srt
 
 # Split the video script by sentences and create chunks within the token limit
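For reference, a minimal sketch of how the relocated modules are imported after this change (the `.srt` path below is a hypothetical example):

```python
# New import paths introduced by this commit: SRT.py and srt2ass.py
# now live in the srt_util package.
from srt_util.srt import SrtScript
from srt_util.srt2ass import srt2ass

# Build an SrtScript either from an existing .srt file (hypothetical path) ...
srt = SrtScript.parse_from_srt_file("results/demo/demo_en.srt")
# ... or directly from whisper transcription segments, as in get_srt_class:
# srt = SrtScript(transcript['segments'])
```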
srt_util/__init__.py ADDED
File without changes
SRT.py → srt_util/srt.py RENAMED
@@ -7,7 +7,7 @@ from datetime import timedelta
 import openai
 
 
-class SRT_segment(object):
+class SrtSegment(object):
     def __init__(self, *args) -> None:
         if isinstance(args[0], dict):
             segment = args[0]
@@ -63,28 +63,23 @@ class SRT_segment(object):
         self.end = seg.end
         self.end_ms = seg.end_ms
         self.duration = f"{self.start_time_str} --> {self.end_time_str}"
-        pass
 
     def __add__(self, other):
         """
         Merge the segment seg with the current segment, and return the new constructed segment.
         No in-place modification.
+        This is used for '+' operator.
         :param other: Another segment that is strictly next to added segment.
         :return: new segment of the two sub-segments
         """
         # assert other.start_ms == self.end_ms, f"cannot merge discontinuous segments."
         result = deepcopy(self)
-        result.source_text += f' {other.source_text}'
-        result.translation += f' {other.translation}'
-        result.end_time_str = other.end_time_str
-        result.end = other.end
-        result.end_ms = other.end_ms
-        result.duration = f"{self.start_time_str} --> {self.end_time_str}"
+        result.merge_seg(other)
         return result
 
-    def remove_trans_punc(self):
+    def remove_trans_punc(self) -> None:
         """
-        remove punctuations in translation text
+        remove CN punctuations in translation text
         :return: None
         """
         punc_cn = "，。！？"
@@ -101,12 +96,9 @@ class SRT_segment(object):
         return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'
 
 
-class SRT_script():
+class SrtScript(object):
     def __init__(self, segments) -> None:
-        self.segments = []
-        for seg in segments:
-            srt_seg = SRT_segment(seg)
-            self.segments.append(srt_seg)
+        self.segments = [SrtSegment(seg) for seg in segments]
 
     @classmethod
     def parse_from_srt_file(cls, path: str):
@@ -114,13 +106,12 @@ class SRT_script():
             script_lines = [line.rstrip() for line in f.readlines()]
 
         segments = []
-        for i in range(len(script_lines)):
-            if i % 4 == 0:
-                segments.append(list(script_lines[i:i + 4]))
+        for i in range(0, len(script_lines), 4):
+            segments.append(list(script_lines[i:i + 4]))
 
         return cls(segments)
 
-    def merge_segs(self, idx_list) -> SRT_segment:
+    def merge_segs(self, idx_list) -> SrtSegment:
         """
         Merge entire segment list to a single segment
         :param idx_list: List of index to merge
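The rewritten loop steps directly by 4 instead of testing `i % 4 == 0` on every index; both group the flat `.srt` lines into (index, timestamp, text, blank) records. A worked example on toy data:

```python
# Flat lines of a two-entry .srt file (toy data).
script_lines = [
    "1", "00:00:00,000 --> 00:00:02,000", "Hello world.", "",
    "2", "00:00:02,000 --> 00:00:04,000", "Second line.", "",
]
segments = [script_lines[i:i + 4] for i in range(0, len(script_lines), 4)]
# segments -> [['1', '00:00:00,000 --> 00:00:02,000', 'Hello world.', ''],
#              ['2', '00:00:02,000 --> 00:00:04,000', 'Second line.', '']]
```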
@@ -145,6 +136,7 @@ class SRT_script():
         """
         merge_list = []  # a list of indices that should be merged e.g. [[0], [1, 2, 3, 4], [5, 6], [7]]
         sentence = []
+        # Get each entire sentence of distinct segments, fill indices to merge_list
         for i, seg in enumerate(self.segments):
             if seg.source_text[-1] in ['.', '!', '?'] and len(seg.source_text) > 10 and 'vs.' not in seg.source_text:
                 sentence.append(i)
@@ -153,6 +145,7 @@ class SRT_script():
             else:
                 sentence.append(i)
 
+        # Reconstruct segments, each with an entire sentence
        segments = []
        for idx_list in merge_list:
            segments.append(self.merge_segs(idx_list))
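A minimal sketch of the sentence-grouping pass these new comments describe, run on toy data (the bookkeeping that flushes `sentence` into `merge_list` is approximated, since the hunk only shows part of the loop):

```python
texts = ["Hello there.", "this is", "a long sentence that ends here."]
merge_list, sentence = [], []
for i, text in enumerate(texts):
    sentence.append(i)
    # A segment closes a sentence if it ends with terminal punctuation,
    # is reasonably long, and is not a 'vs.' abbreviation.
    if text[-1] in ['.', '!', '?'] and len(text) > 10 and 'vs.' not in text:
        merge_list.append(sentence)
        sentence = []
# merge_list -> [[0], [1, 2]]
```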
@@ -327,14 +320,14 @@ class SRT_script():
         seg1_dict['text'] = src_seg1
         seg1_dict['start'] = start_seg1
         seg1_dict['end'] = end_seg1
-        seg1 = SRT_segment(seg1_dict)
+        seg1 = SrtSegment(seg1_dict)
         seg1.translation = trans_seg1
 
         seg2_dict = {}
         seg2_dict['text'] = src_seg2
         seg2_dict['start'] = start_seg2
         seg2_dict['end'] = end_seg2
-        seg2 = SRT_segment(seg2_dict)
+        seg2 = SrtSegment(seg2_dict)
         seg2.translation = trans_seg2
 
         result_list = []
@@ -386,7 +379,7 @@ class SRT_script():
         ## force term correction
 
         # load term dictionary
-        with open("./finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
+        with open("../finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
             term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
 
         # change term
@@ -455,7 +448,7 @@ class SRT_script():
         pos = uncover(word)[1]
         new_word = word
         if arg == 0: # term translate mode
-            with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
+            with open("../finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
                 term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
             if real_word in term_enzh_dict:
                 new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
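The two `dict_enzh.csv` opens above switch to `../finetune_data/` because srt.py now lives inside `srt_util/`, one level below the data directory. A minimal sketch of the loading pattern (assuming each CSV row is an English term followed by its Chinese translation):

```python
from csv import reader

# Hypothetical stand-in path; the repo uses ../finetune_data/dict_enzh.csv.
with open("dict_enzh.csv", 'r', encoding='utf-8') as f:
    # One row per term pair: {english_term: chinese_term}
    term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
```

Note that a `../` path is resolved against the process working directory, not the module's location, so this relative path only works when the pipeline runs from a subdirectory of the repo root.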
srt2ass.py → srt_util/srt2ass.py RENAMED
File without changes