Spaces:
Running
Running
import re | |
def timeformat_srt(time):
    """Format an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        time: Offset in seconds (int or float).

    Returns:
        A string with zero-padded hours, minutes and seconds, followed by a
        comma and a three-digit millisecond field. The value is rounded to
        the nearest millisecond.
    """
    # Work in whole milliseconds from the start. Truncating the float's
    # fractional part instead loses a millisecond on values such as 1.001
    # or 2.3, whose binary representation sits just below the decimal value.
    total_ms = int(round(time * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
def timeformat_vtt(time):
    """Format an offset in seconds as a WebVTT timestamp (``HH:MM:SS.mmm``).

    Args:
        time: Offset in seconds (int or float).

    Returns:
        A string with zero-padded hours, minutes and seconds, followed by a
        dot and a three-digit millisecond field. The value is rounded to
        the nearest millisecond.
    """
    # Work in whole milliseconds from the start. Truncating the float's
    # fractional part instead loses a millisecond on values such as 1.001
    # or 2.3, whose binary representation sits just below the decimal value.
    total_ms = int(round(time * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
def write_file(subtitle, output_file):
    """Write ``subtitle`` to ``output_file`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so any existing content is replaced.

    Args:
        subtitle: The text to write.
        output_file: Path of the file to create or overwrite.
    """
    handle = open(output_file, mode='w', encoding='utf-8')
    with handle:
        handle.write(subtitle)
def get_srt(segments):
    """Serialize segments to SubRip (SRT) text.

    Each segment becomes a block consisting of a 1-based counter, a
    ``start --> end`` line formatted with ``timeformat_srt``, the segment
    text with one leading space removed (if present), and a blank line.

    Args:
        segments: Iterable of mappings providing ``'start'``, ``'end'`` and
            ``'text'`` keys.

    Returns:
        The concatenated SRT blocks, or an empty string when ``segments``
        is empty. The input mappings are not modified.
    """
    blocks = []
    for number, segment in enumerate(segments, start=1):
        # Strip into a local so the caller's segment data is left untouched.
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        blocks.append(
            f"{number}\n"
            f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
            f"{text}\n\n"
        )
    return "".join(blocks)
def get_vtt(segments):
    """Serialize segments to WebVTT text.

    The output starts with a header block, followed by one block per
    segment: a 1-based counter, a ``start --> end`` line formatted with
    ``timeformat_vtt``, the segment text with one leading space removed
    (if present), and a blank line.

    Args:
        segments: Iterable of mappings providing ``'start'``, ``'end'`` and
            ``'text'`` keys.

    Returns:
        The header plus the concatenated cue blocks. The input mappings are
        not modified.
    """
    # NOTE(review): the WebVTT specification's file signature is upper-case
    # "WEBVTT". The mixed-case spelling is kept because other code may match
    # on it — confirm with the readers of these files before changing it.
    blocks = ["WebVTT\n\n"]
    for number, segment in enumerate(segments, start=1):
        # Strip into a local so the caller's segment data is left untouched.
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        blocks.append(
            f"{number}\n"
            f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
            f"{text}\n\n"
        )
    return "".join(blocks)
def get_txt(segments):
    """Serialize segments to plain text, one segment per line.

    A single leading space is removed from each segment's text (if present)
    before it is emitted.

    Args:
        segments: Iterable of mappings providing a ``'text'`` key.

    Returns:
        The texts joined with a trailing newline after each one, or an
        empty string when ``segments`` is empty. The input mappings are not
        modified.
    """
    lines = []
    for segment in segments:
        # Strip into a local so the caller's segment data is left untouched.
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        lines.append(f"{text}\n")
    return "".join(lines)
def parse_srt(file_path):
    """Read an SRT file and return its cues as a list of dicts.

    The file content is split on blank lines; within each non-empty block
    the first line is taken as the index, the second as the timestamp line,
    and the remaining lines are joined with single spaces as the sentence.

    Args:
        file_path: Path of the SRT file to read.

    Returns:
        A list of ``{"index", "timestamp", "sentence"}`` dicts with string
        values, in file order.

    Raises:
        IndexError: If a non-empty block has fewer than two lines.
    """
    # 'utf-8-sig' drops a leading byte-order mark when present (and reads
    # BOM-less files unchanged); with plain 'utf-8' the BOM would end up
    # glued to the first cue's index.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        srt_data = file.read()

    data = []
    for block in srt_data.split('\n\n'):
        stripped = block.strip()
        if not stripped:
            continue
        lines = stripped.split('\n')
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:])
        })
    return data
def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    The file content is split on blank lines. Blocks that start with the
    file header (``WEBVTT``, or the mixed-case ``WebVTT`` spelling) are
    skipped. Within every other non-empty block the first line is taken as
    the index, the second as the timestamp line, and the remaining lines
    are joined with single spaces as the sentence.

    Args:
        file_path: Path of the WebVTT file to read.

    Returns:
        A list of ``{"index", "timestamp", "sentence"}`` dicts with string
        values, in file order.

    Raises:
        IndexError: If a non-header block has fewer than two lines.
    """
    # 'utf-8-sig' drops a leading byte-order mark when present; otherwise the
    # BOM would hide the header from the startswith() check below.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        webvtt_data = file.read()

    # Accept the upper-case signature as well as the mixed-case spelling
    # that was the only one recognised before.
    header_prefixes = ("WebVTT", "WEBVTT")

    data = []
    for block in webvtt_data.split('\n\n'):
        stripped = block.strip()
        if not stripped or stripped.startswith(header_prefixes):
            continue
        lines = stripped.split('\n')
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:])
        })
    return data
def get_serialized_srt(dicts):
    """Render parsed cue dicts back into SRT text.

    Args:
        dicts: Iterable of mappings with ``"index"``, ``"timestamp"`` and
            ``"sentence"`` keys.

    Returns:
        One block per entry (index line, timestamp line, sentence line and
        a blank line), concatenated in order; an empty string when
        ``dicts`` is empty.
    """
    return "".join(
        f'{entry["index"]}\n{entry["timestamp"]}\n{entry["sentence"]}\n\n'
        for entry in dicts
    )
def get_serialized_vtt(dicts):
    """Render parsed cue dicts back into WebVTT text.

    Args:
        dicts: Iterable of mappings with ``"index"``, ``"timestamp"`` and
            ``"sentence"`` keys.

    Returns:
        The header block followed by one block per entry (index line,
        timestamp line, sentence line and a blank line), in order. Only the
        header is returned when ``dicts`` is empty.
    """
    header = "WebVTT\n\n"
    body = "".join(
        f'{entry["index"]}\n{entry["timestamp"]}\n{entry["sentence"]}\n\n'
        for entry in dicts
    )
    return header + body
def safe_filename(name, max_length=20):
    """Replace characters that are invalid in file names, shortening on Colab.

    Every character matching ``[<>:"/\\|?*]`` or in the range ``\\x00-\\x1f``
    is replaced with an underscore. When ``_args.colab`` (imported from
    ``app``) is truthy and the result is longer than ``max_length``, the
    name is truncated to ``max_length`` characters, keeping the text after
    the last dot as the extension when it fits.

    Args:
        name: The proposed file name.
        max_length: Maximum length of the returned name when truncation
            applies. Defaults to 20, the limit that was previously
            hard-coded.

    Returns:
        The sanitized (and possibly truncated) file name.
    """
    # Imported inside the function, as the original code does.
    # NOTE(review): presumably this avoids a circular import with `app` — confirm.
    from app import _args

    INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name)

    if not _args.colab or len(safe_name) <= max_length:
        return safe_name

    # Text after the last dot; for a name without any dot this is the whole
    # name, which then falls through to the plain truncation below.
    file_extension = safe_name.split('.')[-1]
    if len(file_extension) + 1 < max_length:
        stem_budget = max_length - len(file_extension) - 1
        return safe_name[:stem_budget] + '.' + file_extension
    return safe_name[:max_length]