import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from subprocess import run
from faster_whisper import WhisperModel
import json
import tempfile
import os
import ffmpeg
from zipfile import ZipFile
import stat
import uuid
import subprocess
import torch
import bitsandbytes
import scipy
from googletrans import Translator
import re
import datetime
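# Unpack the FFmpeg build bundled with the Space and make the binary executable.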
ZipFile("ffmpeg.zip").extractall()
st = os.stat('ffmpeg')
os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
with open('google_lang_codes.json', 'r') as f:
    google_lang_codes = json.load(f)
translator = Translator()
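# Load the faster-whisper large-v2 model on the GPU with float16 for transcription.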
whisper_model = WhisperModel("large-v2", device="cuda", compute_type="float16")
print("cwd", os.getcwd())
print(os.listdir())
def process_video(Video, target_language, translate_video):
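    """Transcribe a video with faster-whisper, optionally translate the SRT with googletrans, and burn the subtitles into the video with FFmpeg."""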
    current_path = os.getcwd()
    print("Starting process_video")
    common_uuid = uuid.uuid4()
    print("Checking FFmpeg availability...")
    run(["ffmpeg", "-version"])
    audio_file = f"{common_uuid}.wav"
    run(["ffmpeg", "-i", Video, audio_file])
    transcript_file = f"{current_path}/{common_uuid}.srt"
    # Transcription with Whisper.
    target_language_code = google_lang_codes.get(target_language, "en")
    print("Starting transcription with Whisper")
    segments, _ = whisper_model.transcribe(audio_file, beam_size=5)
    segments = list(segments)
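    # Write the transcript as an SRT file: sequential counter, "HH:MM:SS,mmm --> HH:MM:SS,mmm" timestamps, then the segment text.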
    with open(transcript_file, "w+", encoding="utf-8") as f:
        counter = 1
        for segment in segments:
            start_hours = int(segment.start // 3600)
            start_minutes = int((segment.start % 3600) // 60)
            start_seconds = int(segment.start % 60)
            start_milliseconds = int((segment.start - int(segment.start)) * 1000)
            end_hours = int(segment.end // 3600)
            end_minutes = int((segment.end % 3600) // 60)
            end_seconds = int(segment.end % 60)
            end_milliseconds = int((segment.end - int(segment.end)) * 1000)
            formatted_start = f"{start_hours:02d}:{start_minutes:02d}:{start_seconds:02d},{start_milliseconds:03d}"
            formatted_end = f"{end_hours:02d}:{end_minutes:02d}:{end_seconds:02d},{end_milliseconds:03d}"
            f.write(f"{counter}\n")
            f.write(f"{formatted_start} --> {formatted_end}\n")
            f.write(f"{segment.text}\n\n")
            counter += 1
        # Check if translation is needed
        if translate_video:
            # Translating the SRT from Whisper with Google Translate.
            translated_lines = []
            f.seek(0)  # Move the file pointer to the beginning of the file.
            for line in f:
                if line.strip().isnumeric() or "-->" in line:
                    translated_lines.append(line)
                elif line.strip() != "":
                    translated_text = translator.translate(line.strip(), dest=target_language_code).text
                    translated_lines.append(translated_text + "\n")
                else:
                    translated_lines.append("\n")
            f.seek(0)  # Move the file pointer to the beginning of the file and truncate it.
            f.truncate()
            f.writelines(translated_lines)  # Write the translated lines back into the original file.
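    # Burn the (possibly translated) SRT into the video with FFmpeg's subtitles filter.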
    output_video = f"{common_uuid}_output_video.mp4"
    # Debugging: Validate FFmpeg command for subtitle embedding
    print("Validating FFmpeg command for subtitle embedding...")
    print(f"Translated SRT file: {transcript_file}")
    with open(transcript_file, 'r', encoding='utf-8') as f:
        print(f"First few lines of translated SRT: {f.readlines()[:10]}")
    if os.path.exists(transcript_file):
        print(f"{transcript_file} exists.")
    else:
        print(f"{transcript_file} does not exist.")
    #transcript_file_abs_path = os.path.abspath(transcript_file)
    try:
        if target_language_code == 'ja':  # 'ja' is the language code for Japanese
            subtitle_style = "FontName=Noto Sans CJK JP,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
        else:
            subtitle_style = "FontName=Arial Unicode MS,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
        result = subprocess.run(["ffmpeg", "-i", Video, "-vf", f"subtitles={transcript_file}:force_style='{subtitle_style}'", output_video], capture_output=True, text=True)
        if result.returncode == 0:
            print("FFmpeg executed successfully.")
        else:
            print(f"FFmpeg failed with return code {result.returncode}.")
            print("Stdout:", result.stdout)
            print("Stderr:", result.stderr)
    except Exception as e:
        print(f"An exception occurred: {e}")
    print("process_video completed successfully")
    os.unlink(audio_file)
    os.unlink(transcript_file)
    print(f"Returning output video path: {output_video}")
    return output_video
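# Gradio UI: video upload, target-language dropdown, and a translate toggle; returns the subtitled video.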
iface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(),
        gr.Dropdown(choices=list(google_lang_codes.keys()), label="Target Language for Translation", value="English"),
        gr.Checkbox(label="Translate Video", value=True, info="Check to translate the video to the selected language. Uncheck for transcription only."),
    ],
    outputs=[
        gr.Video(),
        #gr.FileExplorer()
    ],
    live=False,
    title="VIDEO TRANSCRIPTION AND TRANSLATION",
    description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Test the [Video Dubbing](https://huggingface.co/spaces/artificialguybr/video-dubbing) space!""",
    allow_flagging="never"
)
with gr.Blocks() as demo:
    iface.render()
    gr.Markdown("""
**Note:**
- The video limit is 15 minutes. The tool transcribes the audio and translates the subtitles.
- The tool relies entirely on open-source models. This is an alpha version.
""")
demo.queue(concurrency_count=1, max_size=15)
demo.launch()