import gc
import os
import re
import shutil
import uuid

import click
import gradio as gr
import torch
from faster_whisper import WhisperModel

from utils import language_dict

def get_language_name(lang_code):
    # Map an ISO language code (e.g. "en") back to its display name
    # using the language_dict imported from utils.
    for language, details in language_dict.items():
        if details["lang_code"] == lang_code:
            return language
    # Fall back to the raw code if it is not in the dictionary.
    return lang_code
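
# Hypothetical example, assuming utils.language_dict contains an entry like
# {"English": {"lang_code": "en", ...}}:
#   get_language_name("en")  -> "English"
#   get_language_name("zz")  -> "zz"   (unknown codes fall through unchanged)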

def clean_file_name(file_path):
    # Split the base name from its extension.
    file_name = os.path.basename(file_path)
    file_name, file_extension = os.path.splitext(file_name)
    # Replace non-alphanumeric characters with underscores, then collapse
    # repeated underscores and trim them from the ends.
    cleaned = re.sub(r'[^a-zA-Z\d]+', '_', file_name)
    cleaned = re.sub(r'_+', '_', cleaned).strip('_')
    # Append a short random suffix so repeated uploads never collide.
    random_uuid = uuid.uuid4().hex[:6]
    return os.path.join(os.path.dirname(file_path), f"{cleaned}_{random_uuid}{file_extension}")
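
# Hypothetical example (the 6-character hex suffix is random):
#   clean_file_name("/tmp/My Video! (final).mp4")
#   -> "/tmp/My_Video_final_3fa2b1.mp4"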

def format_segments(segments):
    # Materialize the segments generator so it can be iterated safely.
    saved_segments = list(segments)
    sentence_timestamp = []
    words_timestamp = []
    speech_to_text = ""
    for segment in saved_segments:
        text = segment.text.strip()
        sentence_id = len(sentence_timestamp)  # Index of the new entry
        sentence_timestamp.append({
            "id": sentence_id,
            "text": text,
            "start": segment.start,
            "end": segment.end,
            "words": []  # Word-level timestamps for this sentence
        })
        speech_to_text += text + " "
        # Collect word-level timestamps, both per sentence and globally.
        for word in segment.words:
            word_data = {
                "word": word.word.strip(),
                "start": word.start,
                "end": word.end
            }
            sentence_timestamp[sentence_id]["words"].append(word_data)
            words_timestamp.append(word_data)
    return sentence_timestamp, words_timestamp, speech_to_text
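
# Illustrative output shapes (values invented):
#   sentence_timestamp -> [{"id": 0, "text": "Hello world.", "start": 0.0,
#                           "end": 1.2, "words": [...]}, ...]
#   words_timestamp    -> [{"word": "Hello", "start": 0.0, "end": 0.4}, ...]
#   speech_to_text     -> "Hello world. ..."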

def combine_word_segments(words_timestamp, max_words_per_subtitle=8, min_silence_between_words=0.5):
    # Group word-level timestamps into subtitle segments, starting a new
    # segment after a long pause, when the word limit is reached, or after
    # sentence-ending punctuation.
    max_words_per_subtitle = max(1, max_words_per_subtitle)
    before_translate = {}
    subtitle_id = 1
    text = ""
    start = None
    end = None
    word_count = 0
    last_end_time = None
    prev_ended_sentence = False
    for entry in words_timestamp:
        try:
            word = entry['word']
            word_start = entry['start']
            word_end = entry['end']
            # Break before this word if there was a long silence, the word
            # limit was reached, or the previous word ended a sentence (so
            # the punctuation closes its own segment).
            if ((last_end_time is not None and word_start - last_end_time > min_silence_between_words)
                    or word_count >= max_words_per_subtitle
                    or prev_ended_sentence):
                # Store the previous subtitle, if any
                if text:
                    before_translate[subtitle_id] = {
                        "text": text,
                        "start": start,
                        "end": end
                    }
                    subtitle_id += 1
                # Reset for the new subtitle segment
                text = word
                start = word_start
                word_count = 1
            else:
                if word_count == 0:  # First word of the current segment
                    start = word_start
                text = f"{text} {word}".strip()
                word_count += 1
            end = word_end  # Update the end timestamp
            last_end_time = word_end
            prev_ended_sentence = word.endswith(('.', '?', '!'))
        except KeyError as e:
            print(f"KeyError: {e} - skipping word")
    # Flush the final segment after the loop.
    if text:
        before_translate[subtitle_id] = {
            "text": text,
            "start": start,
            "end": end
        }
    return before_translate
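
# Illustrative example (timings invented): with max_words_per_subtitle=3, the
# words ["Hello", "world.", "How", "are", "you?"] are grouped roughly as
#   {1: {"text": "Hello world.", ...}, 2: {"text": "How are you?", ...}}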

def custom_word_segments(words_timestamp, min_silence_between_words=0.3, max_characters_per_subtitle=17):
    # Build short, character-limited segments (used for vertical/"shorts"
    # captions). Note: min_silence_between_words is accepted for signature
    # symmetry with combine_word_segments() but is not currently used.
    before_translate = []
    text = ""
    start = None
    end = None
    i = 0
    while i < len(words_timestamp):
        word = words_timestamp[i]['word']
        word_start = words_timestamp[i]['start']
        word_end = words_timestamp[i]['end']
        # Merge hyphen-prefixed continuation tokens (e.g. "co" + "-operate")
        # into a single word, keeping the hyphen.
        if i + 1 < len(words_timestamp) and words_timestamp[i + 1]['word'].startswith("-"):
            combined_text = word + words_timestamp[i + 1]['word']
            combined_start_time = word_start
            combined_end_time = words_timestamp[i + 1]['end']
            i += 1  # The next word has been consumed
            # Keep merging while further continuation tokens follow.
            while i + 1 < len(words_timestamp) and words_timestamp[i + 1]['word'].startswith("-"):
                combined_text += words_timestamp[i + 1]['word']
                combined_end_time = words_timestamp[i + 1]['end']
                i += 1
        else:
            # No continuation token; take the current word as-is.
            combined_text = word
            combined_start_time = word_start
            combined_end_time = word_end
        # Start a new segment when adding this word would exceed the limit.
        if len(text) + len(combined_text) > max_characters_per_subtitle:
            if text:
                before_translate.append({
                    "word": text.strip(),
                    "start": start,
                    "end": end
                })
            text = combined_text
            start = combined_start_time
        else:
            # Accumulate text into the current segment.
            if not text:
                start = combined_start_time
            text += " " + combined_text
        end = combined_end_time  # Update the end timestamp
        i += 1
    # Flush the final segment if text remains.
    if text:
        before_translate.append({
            "word": text.strip(),
            "start": start,
            "end": end
        })
    return before_translate
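
# Illustrative example (timings invented), with max_characters_per_subtitle=17:
#   words ["This", "is", "a", "long", "caption", "line"]
# yields segments like
#   [{"word": "This is a long", ...}, {"word": "caption line", ...}]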

def convert_time_to_srt_format(seconds):
    """Convert seconds to the SRT time format (HH:MM:SS,ms)."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    milliseconds = int((seconds - int(seconds)) * 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

def write_subtitles_to_file(subtitles, filename="subtitles.srt"):
    # subtitles is a dict of {id: {"text", "start", "end"}} as produced by
    # combine_word_segments().
    with open(filename, 'w', encoding='utf-8') as f:
        for subtitle_id, entry in subtitles.items():
            # Skip malformed entries rather than crashing on a None timestamp.
            if entry['start'] is None or entry['end'] is None:
                print(f"Skipping subtitle {subtitle_id}: missing timestamp")
                continue
            start_time = convert_time_to_srt_format(entry['start'])
            end_time = convert_time_to_srt_format(entry['end'])
            f.write(f"{subtitle_id}\n{start_time} --> {end_time}\n{entry['text']}\n\n")

def word_level_srt(words_timestamp, srt_path="word_level_subtitle.srt", shorts=False):
    # Strip punctuation so each subtitle frame shows a bare word.
    punctuation_pattern = re.compile(r'[.,!?;:"\'“”_~^+*|]')
    with open(srt_path, 'w', encoding='utf-8') as srt_file:
        for i, word_info in enumerate(words_timestamp, start=1):
            start_time = convert_time_to_srt_format(word_info['start'])
            end_time = convert_time_to_srt_format(word_info['end'])
            word = punctuation_pattern.sub('', word_info['word'])
            if word.strip() == 'i':
                word = "I"
            if not shorts:
                word = word.replace("-", "")
            srt_file.write(f"{i}\n{start_time} --> {end_time}\n{word}\n\n")

def generate_srt_from_sentences(sentence_timestamp, srt_path="default_subtitle.srt"):
    # Write one SRT block per transcribed sentence.
    with open(srt_path, 'w', encoding='utf-8') as srt_file:
        for index, sentence in enumerate(sentence_timestamp):
            start_time = convert_time_to_srt_format(sentence['start'])
            end_time = convert_time_to_srt_format(sentence['end'])
            srt_file.write(f"{index + 1}\n{start_time} --> {end_time}\n{sentence['text']}\n\n")

def get_audio_file(uploaded_file):
    # Copy the upload into the temp folder under a sanitized, unique name.
    file_path = os.path.join(temp_folder, os.path.basename(uploaded_file))
    file_path = clean_file_name(file_path)
    shutil.copy(uploaded_file, file_path)
    return file_path

def whisper_subtitle(uploaded_file, Source_Language, max_words_per_subtitle=8):
    # Load the model: GPU with float16 when CUDA is available, otherwise
    # CPU with int8.
    if torch.cuda.is_available():
        device = "cuda"
        compute_type = "float16"  # alternatively "int8_float16"
    else:
        device = "cpu"
        compute_type = "int8"
    faster_whisper_model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2", device=device, compute_type=compute_type)
    audio_path = get_audio_file(uploaded_file)
    if Source_Language == "Automatic":
        segments, info = faster_whisper_model.transcribe(audio_path, word_timestamps=True)
        src_lang = get_language_name(info.language)
    else:
        lang = language_dict[Source_Language]['lang_code']
        segments, info = faster_whisper_model.transcribe(audio_path, word_timestamps=True, language=lang)
        src_lang = Source_Language
    sentence_timestamp, words_timestamp, text = format_segments(segments)
    if os.path.exists(audio_path):
        os.remove(audio_path)
    # Free the model before writing the output files.
    del faster_whisper_model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    word_segments = combine_word_segments(words_timestamp, max_words_per_subtitle=max_words_per_subtitle, min_silence_between_words=0.5)
    shorts_segments = custom_word_segments(words_timestamp, min_silence_between_words=0.3, max_characters_per_subtitle=17)
    # Derive the output file names from the upload and detected language.
    base_name = os.path.basename(uploaded_file).rsplit('.', 1)[0][:30]
    save_name = f"{subtitle_folder}/{base_name}_{src_lang}.srt"
    original_srt_name = clean_file_name(save_name)
    original_txt_name = original_srt_name.replace(".srt", ".txt")
    word_level_srt_name = original_srt_name.replace(".srt", "_word_level.srt")
    customize_srt_name = original_srt_name.replace(".srt", "_customize.srt")
    shorts_srt_name = original_srt_name.replace(".srt", "_shorts.srt")
    generate_srt_from_sentences(sentence_timestamp, srt_path=original_srt_name)
    word_level_srt(words_timestamp, srt_path=word_level_srt_name)
    word_level_srt(shorts_segments, srt_path=shorts_srt_name, shorts=True)
    write_subtitles_to_file(word_segments, filename=customize_srt_name)
    with open(original_txt_name, 'w', encoding='utf-8') as f1:
        f1.write(text)
    return original_srt_name, customize_srt_name, word_level_srt_name, shorts_srt_name, original_txt_name
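
# Hypothetical output names for an upload "talk.mp3" detected as English
# (the 6-character hex suffix is random):
#   ./generated_subtitle/talk_English_a1b2c3.srt
#   ./generated_subtitle/talk_English_a1b2c3_customize.srt
#   ./generated_subtitle/talk_English_a1b2c3_word_level.srt
#   ./generated_subtitle/talk_English_a1b2c3_shorts.srt
#   ./generated_subtitle/talk_English_a1b2c3.txt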

#@title Using Gradio Interface
def subtitle_maker(Audio_or_Video_File, Source_Language, max_words_per_subtitle):
    # Thin wrapper for Gradio: return the five file paths, or Nones on failure.
    try:
        default_srt_path, customize_srt_path, word_level_srt_path, shorts_srt_name, text_path = whisper_subtitle(
            Audio_or_Video_File, Source_Language, max_words_per_subtitle=max_words_per_subtitle
        )
    except Exception as e:
        print(f"Error in whisper_subtitle: {e}")
        default_srt_path, customize_srt_path, word_level_srt_path, shorts_srt_name, text_path = None, None, None, None, None
    return default_srt_path, customize_srt_path, word_level_srt_path, shorts_srt_name, text_path

base_path = "."
subtitle_folder = f"{base_path}/generated_subtitle"
temp_folder = f"{base_path}/subtitle_audio"
# exist_ok=True makes the separate existence checks unnecessary.
os.makedirs(subtitle_folder, exist_ok=True)
os.makedirs(temp_folder, exist_ok=True)

source_lang_list = ['Automatic']
source_lang_list.extend(language_dict.keys())

@click.command()
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
def main(debug, share):
    description = """**Note**: Avoid uploading large video files. Instead, upload the audio from the video for faster processing.
You can find the model at [faster-whisper-large-v3-turbo-ct2](https://huggingface.co/deepdml/faster-whisper-large-v3-turbo-ct2)"""
    # Define the Gradio inputs and outputs
    gradio_inputs = [
        gr.File(label="Upload Audio or Video File"),
        gr.Dropdown(label="Language", choices=source_lang_list, value="Automatic"),
        gr.Number(label="Max Words Per Subtitle Segment [Useful for Vertical Videos]", value=8)
    ]
    gradio_outputs = [
        gr.File(label="Default SRT File", show_label=True),
        gr.File(label="Customized SRT File", show_label=True),
        gr.File(label="Word-Level SRT File", show_label=True),
        gr.File(label="SRT File for Shorts", show_label=True),
        gr.File(label="Text File", show_label=True)
    ]
    # Create the Gradio interface
    demo = gr.Interface(
        fn=subtitle_maker,
        inputs=gradio_inputs,
        outputs=gradio_outputs,
        title="Auto Subtitle Generator Using Whisper-Large-V3-Turbo-Ct2",
        description=description
    )
    # Launch Gradio with the command-line options
    demo.queue().launch(debug=debug, share=share)

if __name__ == "__main__":
    main()
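
# Example launch (assuming this file is saved as app.py):
#   python app.py --share   # also create a public Gradio link
#   python app.py --debug   # verbose Gradio logging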