import os
import re
import time

import gradio as gr
import stable_whisper
import moviepy.editor as mp
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm


def load_assets(TOKEN):
    # Load the NMT model and tokenizer from the Hugging Face Hub and move the model to GPU if available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = AutoModelForSeq2SeqLM.from_pretrained("InferciaNLP/NMT-SubGen", token=TOKEN)
    tokenizer = AutoTokenizer.from_pretrained("InferciaNLP/NMT-SubGen", token=TOKEN)
    model = model.to(device)
    return model, tokenizer


def translate_text(model, tokenizer, text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    inputs = tokenizer(text, return_tensors="pt").input_ids.to(device)
    # Force Persian (fa_IR) as the decoder's target language.
    outputs = model.generate(inputs, num_beams=5, num_return_sequences=1, max_new_tokens=64,
                             forced_bos_token_id=tokenizer.lang_code_to_id["fa_IR"])
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def generate_subtitles(model, tokenizer):
    # Translate each block of the Whisper-generated SRT, keeping the original indices and timecodes.
    with open("source_transcription.srt", "r", encoding="utf-8") as f:
        srt = f.read()
    segments = re.split("\n\n", srt.strip())
    translated_segments = []
    for segment in tqdm(segments):
        lines = segment.split("\n")
        if len(lines) < 3:
            continue  # skip empty or malformed SRT blocks
        index, timecode, text = lines[0], lines[1], " ".join(lines[2:])
        translated_text = translate_text(model, tokenizer, text)
        translated_segment = "\n".join([index, timecode, translated_text])
        translated_segments.append(translated_segment)
    translated_srt = "\n\n".join(translated_segments)
    with open("target_transcription.srt", "w", encoding="utf-8") as f:
        f.write(translated_srt)


def transcribe_audio():
    # Transcribe the extracted audio with stable-whisper and save a segment-level SRT.
    model = stable_whisper.load_model("medium")
    result = model.transcribe("audio_sample.ogg", language='en')
    srt = result.to_srt_vtt(word_level=False, segment_level=True)
    with open("source_transcription.srt", "w", encoding="utf-8") as f:
        f.write(srt)


def extract_audio(video):
    # Pull the audio track out of the uploaded video and save it as an .ogg file.
    clip = mp.VideoFileClip(video)
    audio = clip.audio
    audio.write_audiofile("audio_sample.ogg")


def merge_srt(video, subtitle):
    # Currently unused (see the commented-out call in init): expects `video` to be a
    # VideoFileClip and `subtitle` to be the text to overlay, not an .srt file path.
    subtitles = mp.TextClip(subtitle, font="Arial", fontsize=24, color="white")
    subtitles = subtitles.set_position(("center", "bottom")).set_duration(video.duration)
    final = mp.CompositeVideoClip([video, subtitles])
    final.write_videofile("target_video.mp4")
    return final


def init(video):
    init_time = time.time()
    print('Starting Process')
    # Read the Hugging Face token from the environment rather than hard-coding it.
    TOKEN = os.environ.get('INFERCIA_TOKEN')
    model, tokenizer = load_assets(TOKEN)
    print(f"Assets Loaded, Time Taken: {time.time() - init_time}")
    init_time = time.time()
    extract_audio(video)
    print(f"Audio Extracted from Video, Time Taken: {time.time() - init_time}")
    init_time = time.time()
    transcribe_audio()
    print(f"Audio Transcribed using Whisper, Time Taken: {time.time() - init_time}")
    init_time = time.time()
    generate_subtitles(model, tokenizer)
    print(f"Transcriptions Translated, Time Taken: {time.time() - init_time}")
    init_time = time.time()
    # translated_video = merge_srt(video, 'target_transcription.srt')
    # print(f"Subtitle Added, Time Taken: {time.time() - init_time}")
    # init_time = time.time()
    return 'target_transcription.srt'


video_input = gr.Video(type="file", label="Video")
srt_outputs = gr.File(label="Subtitle", file_types=[".srt", ".vtt"])

# init returns the path to the translated .srt file, so the output component is the File above.
interface = gr.Interface(fn=init, inputs=video_input, outputs=srt_outputs, title="Subtitle Generator")
interface.launch(share=True)


# WITH BUFFERS
# import io

# def extract_audio(video):
#     clip = mp.VideoFileClip(video)
#     audio = clip.audio
#     buffer = io.BytesIO()
#     audio.write_audiofile(buffer)
#     return buffer

# def transcribe_audio(buffer, model, tokenizer):
#     model = whisper.load_model("medium")
#     buffer.seek(0)  # reset the buffer position to the beginning
#     result = model.transcribe(buffer, language='en')
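
# A hedged sketch of one way to get the same in-memory behaviour as the buffered version
# above: moviepy's write_audiofile expects a real file path and Whisper's transcribe does
# not take a BytesIO object, but whisper / stable-whisper can transcribe a raw numpy array,
# so the buffer can be skipped entirely. The helper name transcribe_clip_in_memory and the
# 16 kHz mono float32 format are assumptions, not part of the original code; whisper_model
# is assumed to be the stable_whisper model loaded in transcribe_audio, whose result object
# provides to_srt_vtt.
# import numpy as np
#
# def transcribe_clip_in_memory(video_path, whisper_model):
#     clip = mp.VideoFileClip(video_path)
#     samples = clip.audio.to_soundarray(fps=16000)  # floats in [-1, 1], shape (n,) or (n, channels)
#     if samples.ndim > 1:
#         samples = samples.mean(axis=1)             # downmix stereo to mono
#     audio = samples.astype(np.float32)             # Whisper expects 16 kHz mono float32
#     result = whisper_model.transcribe(audio, language='en')
#     return result.to_srt_vtt(word_level=False, segment_level=True)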