import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from subprocess import run
from faster_whisper import WhisperModel
import json
import tempfile
import os
import ffmpeg
from zipfile import ZipFile
import stat
import uuid
import subprocess
import torch
import bitsandbytes
import scipy
from googletrans import Translator
import re
import datetime
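# Unpack the FFmpeg build bundled with the Space and make the binary executable.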
ZipFile("ffmpeg.zip").extractall()
st = os.stat('ffmpeg')
os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
with open('google_lang_codes.json', 'r') as f:
    google_lang_codes = json.load(f)
translator = Translator()
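# Load the faster-whisper large-v2 model on the GPU with float16 for transcription.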
whisper_model = WhisperModel("large-v2", device="cuda", compute_type="float16")
print("cwd", os.getcwd())
print(os.listdir())
def process_video(Video, target_language, translate_video):
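    """Transcribe a video with faster-whisper, optionally translate the SRT with googletrans, and burn the subtitles into the video with FFmpeg."""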
    current_path = os.getcwd()
    print("Starting process_video")
    common_uuid = uuid.uuid4()
    print("Checking FFmpeg availability...")
    run(["ffmpeg", "-version"])
    audio_file = f"{common_uuid}.wav"
    run(["ffmpeg", "-i", Video, audio_file])
    transcript_file = f"{current_path}/{common_uuid}.srt"
    # Transcription with Whisper.
    target_language_code = google_lang_codes.get(target_language, "en")
    print("Starting transcription with Whisper")
    segments, _ = whisper_model.transcribe(audio_file, beam_size=5)
    segments = list(segments)
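    # Write the transcript as an SRT file: sequential counter, "HH:MM:SS,mmm --> HH:MM:SS,mmm" timestamps, then the segment text.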
    with open(transcript_file, "w+", encoding="utf-8") as f:
        counter = 1
        for segment in segments:
            start_hours = int(segment.start // 3600)
            start_minutes = int((segment.start % 3600) // 60)
            start_seconds = int(segment.start % 60)
            start_milliseconds = int((segment.start - int(segment.start)) * 1000)
            end_hours = int(segment.end // 3600)
            end_minutes = int((segment.end % 3600) // 60)
            end_seconds = int(segment.end % 60)
            end_milliseconds = int((segment.end - int(segment.end)) * 1000)
            formatted_start = f"{start_hours:02d}:{start_minutes:02d}:{start_seconds:02d},{start_milliseconds:03d}"
            formatted_end = f"{end_hours:02d}:{end_minutes:02d}:{end_seconds:02d},{end_milliseconds:03d}"
            f.write(f"{counter}\n")
            f.write(f"{formatted_start} --> {formatted_end}\n")
            f.write(f"{segment.text}\n\n")
            counter += 1
        # Check if translation is needed
        if translate_video:
            # Translating the SRT from Whisper with Google Translate.
            translated_lines = []
            f.seek(0)  # Move the file pointer to the beginning of the file.
            for line in f:
                if line.strip().isnumeric() or "-->" in line:
                    translated_lines.append(line)
                elif line.strip() != "":
                    translated_text = translator.translate(line.strip(), dest=target_language_code).text
                    translated_lines.append(translated_text + "\n")
                else:
                    translated_lines.append("\n")
            f.seek(0)  # Move the file pointer to the beginning of the file and truncate it.
            f.truncate()
            f.writelines(translated_lines)  # Write the translated lines back into the original file.
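    # Burn the (possibly translated) SRT into the video with FFmpeg's subtitles filter.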
    output_video = f"{common_uuid}_output_video.mp4"
    # Debugging: Validate FFmpeg command for subtitle embedding
    print("Validating FFmpeg command for subtitle embedding...")
    print(f"Translated SRT file: {transcript_file}")
    with open(transcript_file, 'r', encoding='utf-8') as f:
        print(f"First few lines of translated SRT: {f.readlines()[:10]}")
    if os.path.exists(transcript_file):
        print(f"{transcript_file} exists.")
    else:
        print(f"{transcript_file} does not exist.")
    #transcript_file_abs_path = os.path.abspath(transcript_file)
    try:
        if target_language_code == 'ja':  # 'ja' is the language code for Japanese
            subtitle_style = "FontName=Noto Sans CJK JP,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
        else:
            subtitle_style = "FontName=Arial Unicode MS,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
        result = subprocess.run(["ffmpeg", "-i", Video, "-vf", f"subtitles={transcript_file}:force_style='{subtitle_style}'", output_video], capture_output=True, text=True)
        if result.returncode == 0:
            print("FFmpeg executed successfully.")
        else:
            print(f"FFmpeg failed with return code {result.returncode}.")
            print("Stdout:", result.stdout)
            print("Stderr:", result.stderr)
    except Exception as e:
        print(f"An exception occurred: {e}")
    print("process_video completed successfully")
    os.unlink(audio_file)
    os.unlink(transcript_file)
    print(f"Returning output video path: {output_video}")
    return output_video
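# Gradio UI: video upload, target-language dropdown, and a translate toggle; returns the subtitled video.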
iface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(),
        gr.Dropdown(choices=list(google_lang_codes.keys()), label="Target Language for Translation", value="English"),
        gr.Checkbox(label="Translate Video", value=True, info="Check to translate the video to the selected language. Uncheck for transcription only."),
    ],
    outputs=[
        gr.Video(),
        #gr.FileExplorer()
    ],
    live=False,
    title="VIDEO TRANSCRIPTION AND TRANSLATION",
    description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Test the [Video Dubbing](https://huggingface.co/spaces/artificialguybr/video-dubbing) space!""",
    allow_flagging="never"
)
with gr.Blocks() as demo:
    iface.render()
    gr.Markdown("""
**Note:**
- The video limit is 15 minutes. The tool transcribes the audio and translates the subtitles.
- The tool relies entirely on open-source models. This is an alpha version.
""")
demo.queue(concurrency_count=1, max_size=15)
demo.launch()