File size: 4,102 Bytes
2f9df49 d89fd8e 2f9df49 d89fd8e 2f9df49 d89fd8e 2f9df49 d7cb368 2f9df49 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import io
import sys
import gradio as gr
import srt
import jiwer
from dataclasses import dataclass
from dataclasses_json import dataclass_json
from datetime import timedelta
@dataclass_json
@dataclass
class ZHTW_Sub:
    """One subtitle segment carrying paired ``zh`` and ``tw`` text.

    Instances are mutable: the merge helpers in this file extend ``end`` and
    append to ``zh`` / ``tw`` in place when neighbouring segments are joined.
    ``dataclass_json`` adds JSON (de)serialisation helpers on top of the
    plain dataclass.
    """
    start: timedelta  # time at which the segment begins
    end: timedelta  # time at which the segment ends
    zh: str  # the "zh" half of the text (presumably Mandarin Chinese -- confirm)
    tw: str  # the "tw" half of the text (presumably Taiwanese -- confirm)
def read_srt(p):
    """Read the subtitle file at path *p* and return its parsed entries.

    Args:
        p: Path of the ``.srt`` file to read.

    Returns:
        list: The subtitle objects produced by ``srt.parse``, in file order.
    """
    # Decode explicitly instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere and would garble or reject
    # CJK subtitle text. "utf-8-sig" reads plain UTF-8 as well and drops a
    # leading byte-order mark so it is not handed to the parser.
    with open(p, encoding="utf-8-sig") as f:
        return list(srt.parse(f.read()))
def _merge_adjacent(subs, should_merge):
    """Fold each entry into its predecessor whenever *should_merge* says so.

    ``should_merge(prev, cur)`` is evaluated against the already-merged
    predecessor, so a chain of qualifying entries collapses into one.
    Merging extends ``prev.end`` to ``cur.end`` and appends ``cur.zh`` /
    ``cur.tw`` to the predecessor's text, separated by a single space.

    The list is updated in place (and also returned) in a single pass,
    instead of calling ``list.pop(i)`` once per merged entry.
    """
    if len(subs) < 2:
        return subs

    kept = [subs[0]]
    for cur in subs[1:]:
        prev = kept[-1]
        if should_merge(prev, cur):
            prev.end = cur.end
            prev.zh += f" {cur.zh}"
            prev.tw += f" {cur.tw}"
        else:
            kept.append(cur)

    # Slice assignment keeps the caller's list object, matching the in-place
    # behaviour callers may rely on.
    subs[:] = kept
    return subs


def merge_sub(subs):
    """Merge entries that are exactly back-to-back in time.

    Two neighbours are joined when the earlier one's ``end`` equals the
    later one's ``start``.

    Args:
        subs: List of objects with ``start``, ``end``, ``zh`` and ``tw``
            attributes. Modified in place.

    Returns:
        The same list object, after merging.
    """
    return _merge_adjacent(subs, lambda prev, cur: prev.end == cur.start)


def merge_sub2(subs, delta):
    """Merge entries separated by a gap of at most *delta*.

    Two neighbours are joined unless ``cur.start - prev.end`` is greater
    than *delta*.

    Args:
        subs: List of objects with ``start``, ``end``, ``zh`` and ``tw``
            attributes. Modified in place.
        delta: Largest gap that still triggers a merge; must be comparable
            with ``start - end``.

    Returns:
        The same list object, after merging.
    """
    return _merge_adjacent(
        subs, lambda prev, cur: cur.start - prev.end <= delta
    )
def filter_sub(subs):
    """Split raw subtitles into tw/zh pairs and log every rejected entry.

    Each subtitle's ``content`` is expected to hold the ``tw`` text followed
    by the ``zh`` text, either separated by a single ``|`` or, without a
    bar, as an even number of whitespace-separated tokens (first half
    ``tw``, second half ``zh``).

    Handling per entry:

    * content containing ``#`` is logged and skipped;
    * content containing a newline is logged and skipped, and the next
      accepted entry is folded into the previously accepted one (its text
      is appended and ``end`` is extended) instead of starting a new one;
    * content that cannot be split into exactly two halves is logged and
      skipped;
    * a pair whose ``jiwer.cer(tw, zh)`` is greater than 1 is logged but
      still kept.

    Args:
        subs: Iterable of objects with ``content``, ``start`` and ``end``
            attributes (such as the entries returned by ``read_srt``).

    Returns:
        tuple: ``(new_subs, buffer)`` where ``new_subs`` is a list of
        ``ZHTW_Sub`` and ``buffer`` is an ``io.StringIO`` holding the log.
    """
    buffer = io.StringIO()

    def log(*args):
        # Write straight into the private buffer. Swapping sys.stdout would
        # affect the whole process and would stay swapped if anything below
        # raised before it was restored.
        print(*args, file=buffer)

    new_subs = []
    carry_next = False
    for s in subs:
        content = s.content

        if '#' in content:
            log('註:標記', s.start, s.end, content)
            continue

        if '\n' in content:
            log('修:分行', '\\n', s.start, content)
            carry_next = True
            continue

        if '|' in content:
            parts = content.split('|')
            # Exactly one bar is required. A parity check would let 4, 6, ...
            # fields through and then fail on the two-value unpacking.
            if len(parts) != 2:
                log('修:多槓', parts)
                continue
            tw, zh = (part.strip() for part in parts)
        else:
            words = content.split()
            if len(words) % 2 != 0:
                log('修:不均', s.start, s.end, words)
                continue
            mid = len(words) // 2
            tw, zh = ' '.join(words[:mid]), ' '.join(words[mid:])

        if jiwer.cer(tw, zh) > 1:
            log('註:差距', s.start, s.end, 'tw:', tw, 'zh:', zh)

        if carry_next and new_subs:
            # Fold this entry into the last accepted one.
            prev = new_subs[-1]
            prev.zh += f" {zh}"
            prev.tw += f" {tw}"
            prev.end = s.end
        else:
            # Also reached when a carry is pending but nothing has been
            # accepted yet, so there is no earlier entry to extend.
            new_subs.append(ZHTW_Sub(s.start, s.end, zh, tw))
        carry_next = False

    return new_subs, buffer
def update_yield():
    """Create a logger that accumulates lines and returns the full log.

    Returns:
        A function ``update_print(inp, *more)``. Each call converts its
        arguments with ``str``, joins them with single spaces (as ``print``
        would), stores the result as one new line in a history private to
        this logger, and returns the whole history joined by newlines.
    """
    buffer = []

    def update_print(inp, *more):
        # Extra positional values are accepted because callers in this file
        # pass several values in one call.
        buffer.append(' '.join(str(item) for item in (inp, *more)))
        return '\n'.join(buffer)

    return update_print
def parse_srt(file):
    """Validate an uploaded .srt file, yielding the growing log after each step.

    This is a generator: every yielded value is the complete log text so
    far, so a consumer can simply display the latest value.

    Steps logged, in order: the file path, the number of parsed subtitles,
    the messages collected by ``filter_sub``, the number of accepted entries,
    the number of entries left after ``merge_sub``, the index and details of
    every merged entry longer than 30 seconds, and finally the summed
    duration of all merged entries.

    Args:
        file: Object whose ``name`` attribute is the path of the uploaded
            file, or ``None`` when nothing has been uploaded.

    Yields:
        str: The accumulated log text.
    """
    if file is None:
        # A value passed to ``return`` inside a generator never reaches the
        # consumer, so the message has to be yielded.
        yield "No file uploaded."
        return

    upd = update_yield()
    yield upd(file.name)

    subs = read_srt(file.name)
    yield upd(len(subs))

    new_subs, logs = filter_sub(subs)
    yield upd(logs.getvalue())
    yield upd(len(new_subs))

    new_subs = merge_sub(new_subs)
    yield upd(len(new_subs))

    total_dur = 0
    for i, it in enumerate(new_subs):
        duration = (it.end - it.start).total_seconds()
        if duration > 30:
            yield upd(i)
            # Formatted into a single string so the call does not depend on
            # the logger accepting more than one argument.
            yield upd(f"{it.end.total_seconds()} {duration} {it.tw}")
        total_dur += duration

    yield upd("可用時長 " + str(timedelta(seconds=int(total_dur))))
# Build the web UI: a heading, then one column holding the .srt upload widget
# and the text box that shows the parsing log.
with gr.Blocks() as demo:
    gr.Markdown("## SRT File Validator")
    with gr.Column():
        # Only files with the .srt extension are offered for upload.
        file_input = gr.File(label="Upload .srt File", file_types=[".srt"])
        output_log = gr.Textbox(label="Parsing Log", lines=10, max_lines=120)
    # Run parse_srt whenever the uploaded file changes and send its output
    # to the log box.
    file_input.change(fn=parse_srt, inputs=file_input, outputs=output_log)
# Start the Gradio server as soon as this module is executed.
demo.launch()
|