ViDove / srt2ass.py
Eason Lu
replace whisper api to whisper lib
04ae3b4
raw
history blame
17 kB
# -*- coding: utf-8 -*-
#
# python-srt2ass: https://github.com/ewwink/python-srt2ass
# by: ewwink
# modified by: 一堂宁宁 Lenshyuu227
# original srt2ass: https://github.com/Ayanaminn/N46Whisper/blob/main/srt2ass.py
# modified by: Myth
import sys
import os
import regex as re
import codecs
def fileopen(input_file):
# use correct codec to encode the input file
encodings = ["utf-32", "utf-16", "utf-8", "cp1252", "gb2312", "gbk", "big5"]
srt_src = ''
for enc in encodings:
try:
with codecs.open(input_file, mode="r", encoding=enc) as fd:
# return an instance of StreamReaderWriter
srt_src = fd.read()
break
except:
# print enc + ' failed'
continue
return [srt_src, enc]
def srt2ass(input_file,sub_style, is_split, split_method):
if '.ass' in input_file:
return input_file
if not os.path.isfile(input_file):
print(input_file + ' not exist')
return
src = fileopen(input_file)
srt_content = src[0]
# encoding = src[1] # Will not encode so do not need to pass codec para
src = ''
utf8bom = ''
if u'\ufeff' in srt_content:
srt_content = srt_content.replace(u'\ufeff', '')
utf8bom = u'\ufeff'
srt_content = srt_content.replace("\r", "")
lines = [x.strip() for x in srt_content.split("\n") if x.strip()]
subLines = ''
dlgLines = '' # dialogue line
lineCount = 0
output_file = '.'.join(input_file.split('.')[:-1])
output_file += '.ass'
for ln in range(len(lines)):
line = lines[ln]
if line.isdigit() and re.match('-?\d\d:\d\d:\d\d', lines[(ln+1)]):
if dlgLines:
subLines += dlgLines + "\n"
dlgLines = ''
lineCount = 0
continue
else:
if re.match('-?\d\d:\d\d:\d\d', line):
line = line.replace('-0', '0')
if sub_style =='default':
dlgLines += 'Dialogue: 0,' + line + ',default,,0,0,0,,'
elif sub_style =='ikedaCN':
dlgLines += 'Dialogue: 0,' + line + ',池田字幕1080p,,0,0,0,,'
elif sub_style == 'sugawaraCN':
dlgLines += 'Dialogue: 0,' + line + ',中字 1080P,,0,0,0,,'
elif sub_style == 'kaedeCN':
dlgLines += 'Dialogue: 0,' + line + ',den SR红色,,0,0,0,,'
elif sub_style == 'taniguchiCN':
dlgLines += 'Dialogue: 0,' + line + ',正文_1080P,,0,0,0,,'
elif sub_style == 'asukaCN':
dlgLines += 'Dialogue: 0,' + line + ',DEFAULT1,,0,0,0,,'
else:
if lineCount < 2:
dlg_string = line
if is_split == "Yes" and split_method == 'Modest':
# do not split if space proceed and followed by non-ASC-II characters
# do not split if space followed by word that less than 5 characters
split_string = re.sub(r'(?<=[^\x00-\x7F])\s+(?=[^\x00-\x7F])(?=\w{5})', r'|', dlg_string)
# print(split_string)
if len(split_string.split('|')) > 1:
dlgLines += (split_string.replace('|', "(adjust_required)\n" + dlgLines)) + "(adjust_required)"
else:
dlgLines += line
elif is_split == "Yes" and split_method == 'Aggressive':
# do not split if space proceed and followed by non-ASC-II characters
# split at all the rest spaces
split_string = re.sub(r'(?<=[^\x00-\x7F])\s+(?=[^\x00-\x7F])', r'|', dlg_string)
if len(split_string.split('|')) > 1:
dlgLines += (split_string.replace('|',"(adjust_required)\n" + dlgLines)) + "(adjust_required)"
else:
dlgLines += line
else:
dlgLines += line
else:
dlgLines += "\n" + line
lineCount += 1
ln += 1
subLines += dlgLines + "\n"
subLines = re.sub(r'\d(\d:\d{2}:\d{2}),(\d{2})\d', '\\1.\\2', subLines)
subLines = re.sub(r'\s+-->\s+', ',', subLines)
# replace style
# subLines = re.sub(r'<([ubi])>', "{\\\\\g<1>1}", subLines)
# subLines = re.sub(r'</([ubi])>', "{\\\\\g<1>0}", subLines)
# subLines = re.sub(r'<font\s+color="?#(\w{2})(\w{2})(\w{2})"?>', "{\\\\c&H\\3\\2\\1&}", subLines)
# subLines = re.sub(r'</font>', "", subLines)
if sub_style == 'default':
head_name = 'head_str_default'
elif sub_style == 'ikedaCN':
head_name = 'head_str_ikeda'
elif sub_style == 'sugawaraCN':
head_name = 'head_str_sugawara'
elif sub_style == 'kaedeCN':
head_name = 'head_str_kaede'
elif sub_style == "taniguchiCN":
head_name = 'head_str_taniguchi'
elif sub_style == 'asukaCN':
head_name = 'head_str_asuka'
head_str = STYLE_DICT.get(head_name)
output_str = utf8bom + head_str + '\n' + subLines
# encode again for head string
output_str = output_str.encode('utf8')
with open(output_file, 'wb') as output:
output.write(output_str)
output_file = output_file.replace('\\', '\\\\')
output_file = output_file.replace('/', '//')
return output_file
# if len(sys.argv) > 1:
# for name in sys.argv[1:]:
# srt2ass(name,sub_style=)
STYLE_DICT = {
'head_str_default':'''[Script Info]
; This is an Advanced Sub Station Alpha v4+ script.
; The script is generated by N46Whisper
Title:
ScriptType: v4.00+
Collisions: Normal
PlayDepth: 0
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: default,Meiryo,90,&H00FFFFFF,&H00FFFFFF,&H00000000,&H00050506,-1,0,0,0,100,100,5,0,1,3.5,0,2,135,135,10,1
[Events]
Format: Layer, Start, End, Style, Actor, MarginL, MarginR, MarginV, Effect, Text''',
'head_str_ikeda': '''[Script Info]
; This is an Advanced Sub Station Alpha v4+ script.
; The script is generated by N46Whisper
Title:
ScriptType: v4.00+
Collisions: Normal
PlayDepth: 0
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: SubStyle,Arial,20,&H0300FFFF,&H00FFFFFF,&H00000000,&H02000000,-1,0,0,0,100,100,0,0,3,2,0,2,10,10,10,1
Style: 池田字幕1080p,思源黑体,71,&H00FFFFFF,&H000000FF,&H00008A11,&H00000000,-1,0,0,0,100,100,1.49999,0,1,1.99999,1,2,8,8,5,1
Style: 池田字幕1080p - 不透明背景,思源黑体,71,&H00FFFFFF,&H000000FF,&H64202021,&H00000000,-1,0,0,0,100,100,1.49999,0,3,1.99999,0,2,8,8,5,1
Style: staff1080p,思源黑体,55,&H00FFFFFF,&H00FFFFFF,&H34000000,&H00000000,-1,0,0,0,100,100,3,0,1,2.5,0,7,16,13,4,1
Style: 注释1080p,思源宋体 CN,55,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,-1,0,0,0,100,100,0,0,1,2,1,8,10,10,10,1
Style: 多美左上遮罩,Arial,48,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,0,0,1,8,8,11,1
Style: 多美紫色遮罩,Arial,48,&H00F05384,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,0,0,1,8,8,11,1
Style: 多美紫色屏字,仓耳渔阳体 W03,86,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,0,0,8,8,8,11,1
Style: 多美右上屏字,方正兰亭圆_GBK_细,60,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,94,100,6,0,1,0,3,9,8,45,100,1
Style: 屏字-黑,汉仪正圆-55S,71,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,0,0,8,8,8,11,1
[Events]
Format: Layer, Start, End, Style, Actor, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:00.30,0:00:03.00,staff1080p,,0,0,0,,{'''+r'''\fad(150,300)}特蕾纱熊猫观察会'''+r'''\N片源:'''+r'''\N翻译:'''+r'''\N时间:'''+r'''\N校压:''',
'head_str_sugawara':'''[Script Info]
; This is an Advanced Sub Station Alpha v4+ script.
; The script is generated by N46Whisper
Title:
ScriptType: v4.00+
Collisions: Normal
PlayDepth: 0
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: 中字 1080P,思源黑体 CN Medium,90,&H00FFFFFF,&H00FFFFFF,&H008F51CA,&H00A860F2,-1,0,0,0,100,100,5,0,1,3.5,0,2,135,135,10,1
Style: staff 1080P,思源宋体 CN Medium,70,&H00FFFFFF,&H000000FF,&H008F51CA,&H00000000,0,0,0,0,100,100,0,0,1,4,2,7,10,10,10,1
Style: 标注 1080P,思源黑体 CN Medium,70,&H00FFFFFF,&HFFFFFFFF,&H00000000,&H7F000000,-1,0,0,0,100,100,0,0,1,3,1.5,8,0,0,15,1
Style: 中字 720P,思源黑体 CN Medium,60,&H00FFFFFF,&H00FFFFFF,&H008F51CA,&H00A860F2,-1,0,0,0,100,100,5,0,1,3,0,2,135,135,10,1
Style: staff 720P,思源宋体 CN Medium,50,&H00FFFFFF,&H000000FF,&H008F51CA,&H00000000,0,0,0,0,100,100,0,0,1,3,2,7,10,10,10,1
Style: 标注 720P,思源黑体 CN Medium,50,&H00FFFFFF,&HFFFFFFFF,&H00000000,&H7F000000,-1,0,0,0,100,100,0,0,1,3,1.5,8,0,0,15,1
Style: staff msg,思源宋体 CN Medium,25,&H00FFFFFF,&H000000FF,&H008F51CA,&H00000000,0,0,0,0,100,100,0,0,1,4,2,7,10,10,10,1
Style: 中字 msg,思源黑体 CN Medium,25,&H00FFFFFF,&H00FFFFFF,&H008F51CA,&H00A860F2,-1,0,0,0,100,100,5,0,1,4,0,2,135,135,10,1
Style: 标注 msg,思源黑体 CN Medium,25,&H00FFFFFF,&HFFFFFFFF,&H00000000,&H7F000000,-1,0,0,0,100,100,0,0,1,3,1.5,8,0,0,15,1
Style: 歌词日语 1080P,Swei Spring Sugar CJKtc,60,&H00FFFFFF,&H000000FF,&H009B46A5,&H5A9B46A5,0,0,0,0,100,100,0,0,1,2,0,2,10,10,30,1
Style: 歌词中文 1080P,Swei Spring Sugar CJKtc,90,&H00FFFFFF,&H000000FF,&H009B46A5,&H5F9B46A5,-1,0,0,0,100,100,0,0,1,2,0,2,10,10,100,1
Style: 歌词中文 720P,Swei Spring Sugar CJKtc,60,&H00FFFFFF,&H000000FF,&H009B46A5,&H5F9B46A5,-1,0,0,0,100,100,0,0,1,2,0,2,10,10,70,1
Style: 歌词日语 720P,Swei Spring Sugar CJKtc,40,&H00FFFFFF,&H000000FF,&H009B46A5,&H5A9B46A5,0,0,0,0,100,100,0,0,1,1,0,2,10,10,15,1
[Events]
Format: Layer, Start, End, Style, Actor, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:00.00,0:00:05.24,staff 1080P,,0,0,0,,{'''+r'''\fad(1200,50)\pos(15.2,0.4)}菅原咲月字幕组'''+r'''\N片源:'''+r'''\N翻译:'''+r'''\N时间:'''+r'''\N校压:''',
'head_str_kaede':'''[Script Info]
; This is an Advanced Sub Station Alpha v4+ script.
; The script is generated by N46Whisper
Title:
ScriptType: v4.00+
Collisions: Normal
PlayDepth: 0
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: staff,微软雅黑,60,&H00FFFFFF,&H00923782,&H0076137B,&H00540D67,-1,0,0,0,100,100,0,0,1,3,0,7,15,15,15,1
Style: den SR红色,微软雅黑,70,&H0AFFFFFF,&H004B4B9E,&H322828E0,&H640A0A72,-1,0,0,0,100,100,0,0,1,3,0,2,15,15,70,1
Style: 注释,微软雅黑,68,&H00FFFFFF,&H000000FF,&H3D000000,&H00FFFFFF,-1,0,0,0,100,100,0,0,1,4.5,0,8,23,23,23,1
Style: 红色,微软雅黑,75,&H00FFFFFF,&H000000FF,&H004243CB,&H00000000,-1,0,0,0,100,100,0,0,1,3,0,2,15,15,15,1
Style: den - 中文歌词,微软雅黑,70,&H0AFFFFFF,&H004B4B9E,&H322828E0,&H640A0A72,-1,0,0,0,100,100,0,0,1,3,0,2,15,15,70,1
Style: den - 日文歌词,微软雅黑,50,&H0AFFFFFF,&H00F9F9F9,&H32000001,&H640A0A72,-1,0,0,0,100,100,0,0,1,1,0,2,15,15,9,1
[Events]
Format: Layer, Start, End, Style, Actor, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:00.00,0:00:05.00,staff,,0,0,0,,{'''+r'''\fad(300,300)}「三番目の楓」'''+r'''\N片源:'''+r'''\N翻译:'''+r'''\N时间:'''+r'''\N校压:''',
'head_str_taniguchi':'''[Script Info]
; This is an Advanced Sub Station Alpha v4+ script.
; The script is generated by N46Whisper
Title:
ScriptType: v4.00+
Collisions: Normal
PlayDepth: 0
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,2,2,2,10,10,10,1
Style: 正文_1080P,思源黑体 CN Bold,75,&H00FFFFFF,&H000000FF,&H0077234B,&HA00000FF,-1,0,0,0,100,100,3,0,1,3,2,2,10,10,15,1
Style: staff_1080P,思源宋体 CN Heavy,60,&H00FFFFFF,&H000000FF,&H0077234B,&HA00000FF,-1,0,0,0,100,100,2,0,1,2,1,7,30,10,30,1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:01.00,0:00:10.00,staff_1080P,,0,0,0,,{'''+r'''\fad(300,1000)}泪痣愛季応援団 '''+r'''\N源:'''+r'''\N制作:
Dialogue: 0,0:00:08.95,0:03:29.40,staff_1080P,,0,0,0,,{'''+r'''\fad(1000,1000)'''+r'''\pos(30,30)'''+r'''\bord0'''+r'''\shad0'''+r'''\c&HFFFFFF&'''+r'''\1a&H3C&}泪痣愛季応援団
Dialogue: 0,0:00:00.00,0:00:05.00,正文_1080P,,0,0,0,,谷口爱季字幕组''',
'head_str_asuka':'''[Script Info]
; The script is generated by N46Whisper
; http://www.aegisub.org/
Title: Default Aegisub file
ScriptType: v4.00+
WrapStyle: 0
ScaledBorderAndShadow: yes
YCbCr Matrix: None
[Aegisub Project Garbage]
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,2,2,2,10,10,10,1
Style: DEFAULT1,微软雅黑,65,&H00FFFFFF,&HF08B581A,&H007E672E,&H0084561E,-1,0,0,0,100,100,0,0,1,2,1,2,20,20,5,1
Style: STAFF,Microsoft YaHei,50,&H00FFFFFF,&HF08B581A,&H007E672E,&HF084561E,-1,0,0,0,100,100,0,0,1,2.5,3,7,30,30,3,134
Style: 名单1,方正粗倩_GBK,45,&H00E7D793,&H00E9C116,&H004C3F00,&H0016161D,-1,0,0,0,100,100,0,0,1,3,2,2,10,10,10,1
Style: 名单2,方正粗黑简体,45,&H00FAF9EC,&H00493F15,&H008A4D1F,&H000A0A0B,-1,0,0,0,100,100,0,0,1,3,1.5,2,10,10,10,1
Style: 中文歌词,方正粗黑简体,50,&H00FFFFFF,&HF0000000,&H00000000,&H96000000,-1,0,0,0,100,100,0,0,1,1.5,2,2,10,10,4,134
Style: 日文歌词,方正粗黑简体,40,&H00FFFFFF,&HF0000000,&H00000000,&H96000000,-1,0,0,0,100,100,0,0,1,1.5,2,2,10,10,10,134
Style: 屏幕字/注释,微软雅黑,50,&H00FFFFFF,&HF0000000,&H00000000,&H96000000,-1,0,0,0,100,100,0,0,1,1.5,2,2,10,10,10,134
Style: purple1,文鼎特圆简,26,&H00670067,&H00FFFFFF,&H00FFFFFF,&H00FFFFFF,0,0,0,0,100,100,0,0,1,4.6,0,2,10,10,10,1
Style: 鸟,微软雅黑,35,&H00FFFFFF,&HF08B581A,&H00F3B70F,&H0084561E,-1,0,0,0,100,100,0,0,1,2,1,2,100,20,465,1
Style: 哈利,微软雅黑,35,&H00FFFFFF,&HF08B581A,&H00445FE1,&H00445FE1,-1,0,0,0,100,100,0,0,1,2,1,2,0,150,220,1
Style: 期数,Berlin Sans FB,25,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,-1,0,0,0,100,100,0,0,1,2,1,9,10,10,0,1
Style: HamAsuka-屏幕字,方正卡通_GBK,50,&H00FFFFFF,&H000000FF,&H00D08C27,&H00010102,-1,0,0,0,100,100,0,0,1,3.5,3,2,900,10,170,1
Style: HamAsuka-屏幕字 小黑,方正粗黑宋简体,65,&H00000000,&H000000FF,&H00FFFFFF,&H00010102,0,0,0,0,100,100,0,0,1,0,0,2,10,10,170,1
Style: HamAsuka-屏幕字 蓝底,微软雅黑,80,&H00FFFFFF,&H000000FF,&H00A21C14,&H00FFFFFF,-1,0,0,0,100,100,0,0,3,4,0,2,10,10,10,1
Style: HamAsuka-屏幕字 标题,微软雅黑,90,&H00303030,&H0006C6F6,&H00FFFFFF,&H00010102,-1,0,0,0,100,100,0,0,1,0,0,2,10,10,10,1
Style: HamAsuka-屏幕字 问题 白底,微软雅黑,90,&H002C2C2C,&H00B77B1B,&H00FFFFFF,&H00010102,-1,0,0,0,100,100,0,0,3,5,0,2,10,10,10,1
Style: HamAsuka 歌词,微软雅黑,70,&H00FFFFFF,&H00000000,&H00000000,&H00010102,-1,0,0,0,100,100,0,0,1,0,0,2,10,10,10,1
Style: HamAsuka 小窗,微软雅黑,50,&H00FFFFFF,&HF0000000,&H00000000,&H96000000,-1,0,0,0,100,100,0,0,1,1.5,2,9,10,10,300,134
Style: HamAsuka-屏幕字 标题 蓝底,微软雅黑,90,&H00F9F8FB,&H000000FF,&H00AC9769,&H00000000,-1,0,0,0,100,100,0,0,3,5,0,2,10,10,10,1
Style: HamAsuka-屏幕字 标题 黑字,微软雅黑,80,&H00292B2C,&H000000FF,&H00FFFFFF,&H00000000,-1,0,0,0,100,100,0,0,3,5,0,2,10,10,10,1
Style: 毕业曲MV 中文歌词,思源黑体 CN,76,&H0AFFFFFF,&H000000FF,&H0F000000,&H00FFFFFF,-1,0,0,0,100,100,0,0,1,1,0,2,10,10,75,1
Style: 毕业曲MV 日文歌词,思源黑体 CN,58,&H0AFFFFFF,&H000000FF,&H0F000000,&H00FFFFFF,-1,0,0,0,100,100,0,0,1,1,0,2,10,10,15,1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,'''
# ADD MORE
}
# if __name__ == "__main__":
# srt2ass('sub_split_test.srt','sugawaraCN','No','Aggressive')