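# Gradio Space: generate Persian subtitles for an uploaded video.
# Pipeline: extract the audio track with moviepy, transcribe it to an English .srt
# with stable-whisper, translate each cue with the InferciaNLP/NMT-SubGen seq2seq
# model, and return the translated .srt file.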
import os
import re
import gradio as gr
import stable_whisper
import moviepy.editor as mp
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import time
from tqdm import tqdm
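# Download the NMT-SubGen translation model and tokenizer from the Hugging Face Hub
# and move the model to GPU when one is available.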
def load_assets(TOKEN):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = AutoModelForSeq2SeqLM.from_pretrained("InferciaNLP/NMT-SubGen", token=TOKEN)
    tokenizer = AutoTokenizer.from_pretrained("InferciaNLP/NMT-SubGen", token=TOKEN)
    model = model.to(device)
    return model, tokenizer
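# Translate a single subtitle cue. forced_bos_token_id with lang_code_to_id["fa_IR"]
# assumes an mBART-style tokenizer and forces Persian as the output language.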
def translate_text(model, tokenizer, text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    inputs = tokenizer(text, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(inputs, num_beams=5,
                             num_return_sequences=1, max_new_tokens=64,
                             forced_bos_token_id=tokenizer.lang_code_to_id["fa_IR"])
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
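# Read the English transcription, translate it cue by cue, and write the result to
# target_transcription.srt, keeping the original indices and timecodes.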
def generate_subtitles(model, tokenizer):
    with open("source_transcription.srt", "r", encoding="utf-8") as f:
        srt = f.read()
    # Split the SRT into cues; strip() avoids an empty trailing segment.
    segments = re.split("\n\n", srt.strip())
    translated_segments = []
    for segment in tqdm(segments):
        lines = segment.split("\n")
        index, timecode, text = lines[0], lines[1], " ".join(lines[2:])
        translated_text = translate_text(model, tokenizer, text)
        translated_segment = "\n".join([index, timecode, translated_text])
        translated_segments.append(translated_segment)
    translated_srt = "\n\n".join(translated_segments)
    with open("target_transcription.srt", "w", encoding="utf-8") as f:
        f.write(translated_srt)
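# Transcribe the extracted audio with stable-whisper and save it as an English .srt.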
def transcribe_audio():
    model = stable_whisper.load_model("medium")
    result = model.transcribe("audio_sample.ogg", language='en')
    srt = result.to_srt_vtt(word_level=False, segment_level=True)
    with open("source_transcription.srt", "w", encoding="utf-8") as f:
        f.write(srt)
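# Pull the audio track out of the uploaded video so Whisper has a file to transcribe.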
def extract_audio(video):
    clip = mp.VideoFileClip(video)
    audio = clip.audio
    audio.write_audiofile("audio_sample.ogg")
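# Currently unused (its call in init is commented out). Note that TextClip renders
# the `subtitle` string it is given, not a parsed .srt file, so burning subtitles
# into the video would need per-cue handling.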
def merge_srt(video, subtitle):
    subtitles = mp.TextClip(subtitle, font="Arial", fontsize=24, color="white")
    subtitles = subtitles.set_position(("center", "bottom")).set_duration(video.duration)
    final = mp.CompositeVideoClip([video, subtitles])
    final.write_videofile("target_video.mp4")
    return final
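# End-to-end pipeline invoked by Gradio: load assets, extract audio, transcribe,
# translate, and return the path of the translated subtitle file.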
def init(video):
    init_time = time.time()
    print('Starting Process')
    # Read the Hub token from the environment rather than hardcoding it.
    TOKEN = os.environ.get('INFERCIA_TOKEN')
    model, tokenizer = load_assets(TOKEN)
    print(f"Assets Loaded, Time Taken: {time.time() - init_time}"); init_time = time.time()
    extract_audio(video)
    print(f"Audio Extracted from Video, Time Taken: {time.time() - init_time}"); init_time = time.time()
    transcribe_audio()
    print(f"Audio Transcribed using Whisper, Time Taken: {time.time() - init_time}"); init_time = time.time()
    generate_subtitles(model, tokenizer)
    print(f"Transcriptions Translated, Time Taken: {time.time() - init_time}"); init_time = time.time()
    # translated_video = merge_srt(video, 'target_transcription.srt')
    # print(f"Subtitle Added, Time Taken: {time.time() - init_time}"); init_time = time.time()
    return 'target_transcription.srt'
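# Gradio UI: a video input and a subtitle file output wired to init().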
video_input = gr.Video(label="Video")  # the uploaded video is passed to init as a filepath
srt_outputs = gr.File(label="Subtitle", file_types=[".srt", ".vtt"])
# init returns the path of the translated .srt, so the output component is the File above.
interface = gr.Interface(fn=init, inputs=video_input, outputs=srt_outputs, title="Subtitle Generator")
interface.launch(share=True)
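# Unused in-memory variant of the pipeline, kept below for reference (incomplete).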
# WITH BUFFERS
# import io
# def extract_audio(video):
#     clip = mp.VideoFileClip(video)
#     audio = clip.audio
#     buffer = io.BytesIO()
#     audio.write_audiofile(buffer)
#     return buffer
# def transcribe_audio(buffer, model, tokenizer):
#     model = whisper.load_model("medium")
#     buffer.seek(0)  # reset the buffer position to the beginning
#     result = model.transcribe(buffer, language='en')