# SubGen / app.py
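"""Generate Persian subtitles for an English-language video.

Pipeline: extract the audio track with moviepy, transcribe it to SRT with
stable-whisper, translate each subtitle segment with the InferciaNLP/NMT-SubGen
seq2seq model, and expose the whole flow through a Gradio interface.
"""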
import os
import re
import gradio as gr
import stable_whisper
import moviepy.editor as mp
from moviepy.video.tools.subtitles import SubtitlesClip
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import time
from tqdm import tqdm
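

# Download the fine-tuned translation model and tokenizer from the Hugging Face
# Hub (the repo requires a token) and move the model to GPU when available.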
def load_assets(TOKEN):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSeq2SeqLM.from_pretrained("InferciaNLP/NMT-SubGen", token=TOKEN)
tokenizer = AutoTokenizer.from_pretrained("InferciaNLP/NMT-SubGen", token=TOKEN)
model = model.to(device)
return model, tokenizer
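

# Translate a single subtitle text to Persian with beam search; the forced BOS
# token tells the mBART-style decoder which target language to emit.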
def translate_text(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(
        inputs,
        num_beams=5,
        num_return_sequences=1,
        max_new_tokens=64,
        forced_bos_token_id=tokenizer.lang_code_to_id["fa_IR"],  # force Persian (fa_IR) output
    )
return tokenizer.decode(outputs[0], skip_special_tokens=True)
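

# Parse the Whisper-generated SRT block by block (index, timecode, text lines),
# translate each text, and write the rebuilt SRT to target_transcription.srt.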
def generate_subtitles(model, tokenizer):
    with open("source_transcription.srt", "r", encoding="utf-8") as f:
        srt = f.read()
    segments = re.split("\n\n", srt.strip())
    translated_segments = []
    for segment in tqdm(segments):
        lines = segment.split("\n")
        if len(lines) < 3:
            continue  # skip malformed or empty blocks (e.g. from a trailing newline)
        index, timecode, text = lines[0], lines[1], " ".join(lines[2:])
translated_text = translate_text(model, tokenizer, text)
translated_segment = "\n".join([index, timecode, translated_text])
translated_segments.append(translated_segment)
translated_srt = "\n\n".join(translated_segments)
    with open("target_transcription.srt", "w", encoding="utf-8") as f:
f.write(translated_srt)
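

# Transcribe the extracted audio to an English SRT file with stable-whisper's
# medium model, keeping segment-level (not word-level) timestamps.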
def transcribe_audio():
model = stable_whisper.load_model("medium")
result = model.transcribe("audio_sample.ogg", language='en')
srt = result.to_srt_vtt(word_level=False, segment_level=True)
    with open("source_transcription.srt", "w", encoding="utf-8") as f:
f.write(srt)
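

# Pull the audio track out of the uploaded video and save it as an OGG file.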
def extract_audio(video):
    clip = mp.VideoFileClip(video)
    clip.audio.write_audiofile("audio_sample.ogg")
    clip.close()  # release the file handle held by the ffmpeg reader
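

# Burn the generated subtitles into the video (currently unused; see init()).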
def merge_srt(video, subtitle_path):
    # The original TextClip call rendered the subtitle *file path* as one static
    # caption and assumed `video` had a .duration attribute, although init()
    # passes a file path; SubtitlesClip renders the timed .srt entries instead.
    generator = lambda txt: mp.TextClip(txt, font="Arial", fontsize=24, color="white")
    subtitles = SubtitlesClip(subtitle_path, generator).set_position(("center", "bottom"))
    clip = mp.VideoFileClip(video)
    final = mp.CompositeVideoClip([clip, subtitles])
    final.write_videofile("target_video.mp4")
    return final
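

# End-to-end pipeline invoked by Gradio: load the models, extract the audio,
# transcribe it, translate the transcription, and return the translated .srt.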
def init(video):
    init_time = time.time()
    print('Starting Process')
    TOKEN = os.environ.get('INFERCIA_TOKEN')  # read the Hub token from the environment; never hardcode credentials
    model, tokenizer = load_assets(TOKEN)
    print(f"Assets Loaded, Time Taken: {time.time() - init_time:.2f}s"); init_time = time.time()
extract_audio(video)
    print(f"Audio Extracted from Video, Time Taken: {time.time() - init_time:.2f}s"); init_time = time.time()
    transcribe_audio()
    print(f"Audio Transcribed using Whisper, Time Taken: {time.time() - init_time:.2f}s"); init_time = time.time()
    generate_subtitles(model, tokenizer)
    print(f"Transcriptions Translated, Time Taken: {time.time() - init_time:.2f}s"); init_time = time.time()
# translated_video = merge_srt(video, 'target_transcription.srt')
# print(f"Subtitle Added, Time Taken: {time.time() - init_time}"); init_time = time.time()
return 'target_transcription.srt'
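

# Gradio interface: upload a video, receive the translated .srt as a file.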
video_input = gr.Video(label="Video")
srt_output = gr.File(label="Subtitle", file_types=[".srt", ".vtt"])
interface = gr.Interface(fn=init, inputs=video_input, outputs=srt_output, title="Subtitle Generator")
interface.launch(share=True)
# WITH BUFFERS (disabled): neither moviepy's write_audiofile nor whisper's
# transcribe accepts a file-like object, so this in-memory variant does not
# work as-is.
# import io
# def extract_audio(video):
#     clip = mp.VideoFileClip(video)
#     buffer = io.BytesIO()
#     clip.audio.write_audiofile(buffer)
#     return buffer
# def transcribe_audio(buffer):
#     model = stable_whisper.load_model("medium")
#     buffer.seek(0)  # reset the buffer position to the beginning
#     result = model.transcribe(buffer, language='en')