import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from subprocess import run
from faster_whisper import WhisperModel
import json
import tempfile
import os
import ffmpeg
from zipfile import ZipFile
import stat
import uuid
import subprocess
import torch
import bitsandbytes
import scipy
from googletrans import Translator
import re
import datetime

# Extract the bundled ffmpeg binary and make it executable.
ZipFile("ffmpeg.zip").extractall()
st = os.stat('ffmpeg')
os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)

# Mapping of human-readable language names to Google Translate language codes.
with open('google_lang_codes.json', 'r') as f:
    google_lang_codes = json.load(f)

translator = Translator()
whisper_model = WhisperModel("large-v2", device="cuda", compute_type="float16")

print("cwd", os.getcwd())
print(os.listdir())


def process_video(Video, target_language, translate_video):
    current_path = os.getcwd()
    print("Starting process_video")
    common_uuid = uuid.uuid4()

    print("Checking FFmpeg availability...")
    run(["ffmpeg", "-version"])

    # Extract the audio track from the uploaded video.
    audio_file = f"{common_uuid}.wav"
    run(["ffmpeg", "-i", Video, audio_file])

    transcript_file = f"{current_path}/{common_uuid}.srt"

    # Transcription with Whisper.
    target_language_code = google_lang_codes.get(target_language, "en")
    print("Starting transcription with Whisper")
    segments, _ = whisper_model.transcribe(audio_file, beam_size=5)
    segments = list(segments)

    with open(transcript_file, "w+", encoding="utf-8") as f:
        counter = 1
        for segment in segments:
            # Convert segment boundaries (seconds) into SRT timestamps (HH:MM:SS,mmm).
            start_hours = int(segment.start // 3600)
            start_minutes = int((segment.start % 3600) // 60)
            start_seconds = int(segment.start % 60)
            start_milliseconds = int((segment.start - int(segment.start)) * 1000)

            end_hours = int(segment.end // 3600)
            end_minutes = int((segment.end % 3600) // 60)
            end_seconds = int(segment.end % 60)
            end_milliseconds = int((segment.end - int(segment.end)) * 1000)

            formatted_start = f"{start_hours:02d}:{start_minutes:02d}:{start_seconds:02d},{start_milliseconds:03d}"
            formatted_end = f"{end_hours:02d}:{end_minutes:02d}:{end_seconds:02d},{end_milliseconds:03d}"

            f.write(f"{counter}\n")
            f.write(f"{formatted_start} --> {formatted_end}\n")
            f.write(f"{segment.text}\n\n")
            counter += 1

        # Check if translation is needed.
        if translate_video:
            # Translating the SRT from Whisper with Google Translate.
            translated_lines = []
            f.seek(0)  # Move the file pointer to the beginning of the file.
            for line in f:
                if line.strip().isnumeric() or "-->" in line:
                    # Keep cue numbers and timestamp lines unchanged.
                    translated_lines.append(line)
                elif line.strip() != "":
                    translated_text = translator.translate(line.strip(), dest=target_language_code).text
                    translated_lines.append(translated_text + "\n")
                else:
                    translated_lines.append("\n")
            f.seek(0)  # Move the file pointer to the beginning of the file and truncate it.
            f.truncate()
            f.writelines(translated_lines)  # Write the translated lines back into the original file.
    output_video = f"{common_uuid}_output_video.mp4"

    # Debugging: validate the FFmpeg command for subtitle embedding.
    print("Validating FFmpeg command for subtitle embedding...")
    print(f"Translated SRT file: {transcript_file}")
    with open(transcript_file, 'r', encoding='utf-8') as f:
        print(f"First few lines of translated SRT: {f.readlines()[:10]}")
    if os.path.exists(transcript_file):
        print(f"{transcript_file} exists.")
    else:
        print(f"{transcript_file} does not exist.")

    #transcript_file_abs_path = os.path.abspath(transcript_file)
    try:
        if target_language_code == 'ja':  # 'ja' is the language code for Japanese.
            subtitle_style = "FontName=Noto Sans CJK JP,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
        else:
            subtitle_style = "FontName=Arial Unicode MS,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
        # Burn the subtitles into the video with FFmpeg's subtitles filter.
        result = subprocess.run(
            ["ffmpeg", "-i", Video, "-vf", f"subtitles={transcript_file}:force_style='{subtitle_style}'", output_video],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            print("FFmpeg executed successfully.")
        else:
            print(f"FFmpeg failed with return code {result.returncode}.")
            print("Stdout:", result.stdout)
            print("Stderr:", result.stderr)
    except Exception as e:
        print(f"An exception occurred: {e}")

    print("process_video finished successfully")

    # Clean up intermediate files.
    os.unlink(audio_file)
    os.unlink(transcript_file)

    print(f"Returning output video path: {output_video}")
    return output_video


iface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(),
        gr.Dropdown(choices=list(google_lang_codes.keys()), label="Target Language for Translation", value="English"),
        gr.Checkbox(label="Translate Video", value=True, info="Check to translate the video to the selected language. Uncheck for transcription only."),
    ],
    outputs=[
        gr.Video(),
        #gr.FileExplorer()
    ],
    live=False,
    title="VIDEO TRANSCRIPTION AND TRANSLATION",
    description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Test the [Video Dubbing](https://huggingface.co/spaces/artificialguybr/video-dubbing) space!""",
    allow_flagging="never"
)

with gr.Blocks() as demo:
    iface.render()
    gr.Markdown("""
    **Note:**
    - The video limit is 15 minutes. The tool transcribes the audio and translates the subtitles.
    - All models used are open source. This is an alpha version.
    """)

demo.queue(concurrency_count=1, max_size=15)
demo.launch()
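# The dropdown labels and googletrans destination codes both come from
# google_lang_codes.json, loaded at startup: keys are the language names shown in
# the dropdown, values are the codes passed to translator.translate(dest=...).
# A minimal sketch of the assumed structure (illustrative entries only, not the
# actual file shipped with the Space):
#
# {
#     "English": "en",
#     "Portuguese": "pt",
#     "Japanese": "ja"
# }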