import os
import sys
import subprocess

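# Make the FFmpeg binary bundled with imageio-ffmpeg executable and visible on
# PATH, so pydub and the ffmpeg shell commands further down can find it.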
try:
    import imageio_ffmpeg

    ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
    ffmpeg_dir = os.path.dirname(ffmpeg_path)
    os.environ["PATH"] += os.pathsep + ffmpeg_dir
    subprocess.run(["chmod", "+x", ffmpeg_path])
    print(f"✅ FFmpeg configured at: {ffmpeg_path}")
except ImportError:
    print("⚠️ imageio-ffmpeg not found. Please add it to requirements.txt")

import gradio as gr
import torch
import spaces

from soni_translate.logging_setup import logger, set_logging_level, configure_logging_libs

configure_logging_libs()

import whisperx
from soni_translate.preprocessor import audio_video_preprocessor, audio_preprocessor
from soni_translate.postprocessor import media_out, get_no_ext_filename, sound_separate, get_subtitle_speaker
from soni_translate.speech_segmentation import transcribe_speech, align_speech, diarize_speech, ASR_MODEL_OPTIONS, find_whisper_models, diarization_models, COMPUTE_TYPE_CPU, COMPUTE_TYPE_GPU
from soni_translate.translate_segments import translate_text, TRANSLATION_PROCESS_OPTIONS
from soni_translate.text_to_speech import audio_segmentation_to_voice, edge_tts_voices_list, coqui_xtts_voices_list, piper_tts_voices_list
from soni_translate.audio_segments import create_translated_audio, accelerate_segments
from soni_translate.language_configuration import LANGUAGES, LANGUAGES_LIST
from soni_translate.utils import remove_files, get_link_list, get_valid_files, is_audio_file, is_subtitle_file
from soni_translate.text_multiformat_processor import process_subtitles, srt_file_to_segments, break_aling_segments
from soni_translate.languages_gui import language_data
import hashlib
import json
import copy
from pydub import AudioSegment

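# Optional API keys: report whether they are present in the Space secrets,
# but start the app either way.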
if "GOOGLE_API_KEY" in os.environ: |
|
|
print("✅ Google API Key found in secrets.") |
|
|
else: |
|
|
print("⚠️ Google API Key not found. Please set it in the Space secrets.") |
|
|
|
|
|
if "OPENAI_API_KEY" in os.environ: |
|
|
print("✅ OpenAI API Key found in secrets.") |
|
|
else: |
|
|
print("⚠️ OpenAI API Key not found. Please set it in the Space secrets if you use OpenAI models.") |
|
|
|
|
|
|
|
|
|
|
|
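# Working directories used by the pipeline for downloads, model weights, and
# intermediate/output files.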
directories = ["downloads", "logs", "weights", "clean_song_output", "_XTTS_", "audio", "outputs"]
for directory in directories:
    os.makedirs(directory, exist_ok=True)


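# End-to-end dubbing pipeline: preprocess the media, transcribe and align with
# WhisperX, diarize speakers, translate the segments, synthesize TTS audio,
# then mix and mux the dub back into the original video.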
class SoniTranslate:
    def __init__(self):
        self.result_diarize = None
        self.align_language = None
        self.result_source_lang = None
        self.tts_info = self._get_tts_info()

    def _get_tts_info(self):
        class TTS_Info:
            def tts_list(self):
                try:
                    return edge_tts_voices_list()
                except Exception as e:
                    logger.warning(f"Could not get Edge-TTS voices: {e}")
                    return ["en-US-JennyNeural-Female"]

        return TTS_Info()

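    # ZeroGPU: reserve a GPU for up to 300 seconds while this method runs.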
    @spaces.GPU(duration=300)
    def multilingual_media_conversion(
        self,
        media_file,
        link_media,
        directory_input,
        origin_language,
        target_language,
        tts_voice,
        transcriber_model,
        max_speakers,
        is_gui=True,
        progress=gr.Progress(),
    ):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Working on device: {self.device}")

        try:
            progress(0.05, desc="Starting process...")

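            # Resolve the input source: uploaded file, pasted URL, or local path.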
            input_media = None
            if media_file is not None:
                input_media = media_file.name
            elif link_media:
                input_media = link_media
            elif directory_input and os.path.exists(directory_input):
                input_media = directory_input

            if not input_media:
                raise ValueError("No input media specified. Please upload a file or provide a URL.")

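            # Fixed intermediate filenames in the working directory; clear any
            # leftovers from a previous run before preprocessing.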
            base_audio_wav = "audio.wav"
            base_video_file = "video.mp4"
            remove_files(base_audio_wav, base_video_file)

            progress(0.1, desc="Processing input media...")
            if is_audio_file(input_media):
                audio_preprocessor(False, input_media, base_audio_wav)
            else:
                audio_video_preprocessor(False, input_media, base_video_file, base_audio_wav)

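            # Transcribe with WhisperX: float16 on GPU, int8 quantization on CPU.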
            progress(0.25, desc="Transcribing audio with WhisperX...")
            source_lang_code = LANGUAGES[origin_language] if origin_language != "Automatic detection" else None
            compute_type = "float16" if self.device == "cuda" else "int8"

            audio, result = transcribe_speech(
                base_audio_wav,
                transcriber_model,
                compute_type,
                16,
                source_lang_code,
            )

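            # Word-level alignment; keep the detected language as the translation source.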
            progress(0.4, desc="Aligning transcription...")
            self.align_language = result["language"]
            result = align_speech(audio, result)

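            # Speaker diarization (pyannote) needs a Hugging Face token with
            # access to the gated pyannote models.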
            progress(0.5, desc="Separating speakers...")
            hf_token = os.environ.get("HF_TOKEN")
            if not hf_token:
                logger.warning("Hugging Face token not found. Diarization might fail.")

            self.result_diarize = diarize_speech(
                base_audio_wav,
                result,
                1,
                max_speakers,
                hf_token,
                diarization_models["pyannote_3.1"],
            )
            self.result_source_lang = copy.deepcopy(self.result_diarize)

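            # Translate segment texts in batches with the Google Translator backend.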
            progress(0.6, desc="Translating text...")
            translate_to_code = LANGUAGES[target_language]
            self.result_diarize["segments"] = translate_text(
                self.result_diarize["segments"],
                translate_to_code,
                "google_translator_batch",
                chunk_size=1800,
                source=self.align_language,
            )

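            # Synthesize the dubbed speech per segment with the selected TTS voice.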
            progress(0.75, desc="Generating dubbed audio...")
            valid_speakers = audio_segmentation_to_voice(
                self.result_diarize,
                translate_to_code,
                is_gui,
                tts_voice,
            )

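            # Accelerate the dubbed segments as needed and place them on the
            # original timeline to build the solo dub track.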
            progress(0.85, desc="Synchronizing and mixing audio...")
            dub_audio_file = "audio_dub_solo.ogg"
            remove_files(dub_audio_file)
            audio_files, _ = accelerate_segments(self.result_diarize, 1.8, valid_speakers)
            create_translated_audio(self.result_diarize, audio_files, dub_audio_file, False, False)

            mix_audio_file = "audio_mix.mp3"
            remove_files(mix_audio_file)

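            # Duck the original track (volume=0.1) under the louder dub (volume=1.5)
            # and mix them into a single MP3.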
            command_volume_mix = (
                f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} '
                f'-filter_complex "[0:0]volume=0.1[a];[1:0]volume=1.5[b];[a][b]amix=inputs=2:duration=longest" '
                f'-c:a libmp3lame {mix_audio_file}'
            )
            os.system(command_volume_mix)

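            # Mux the mixed audio back into the original video; if the input was
            # audio-only, deliver an MP3 instead.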
            progress(0.95, desc="Creating final video...")
            output_filename = "video_dub.mp4"
            remove_files(output_filename)

            if os.path.exists(base_video_file):
                os.system(
                    f"ffmpeg -y -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy "
                    f"-map 0:v -map 1:a -shortest {output_filename}"
                )
                final_output = media_out(input_media, translate_to_code, "", "mp4", file_obj=output_filename)
            else:
                final_output = media_out(input_media, translate_to_code, "", "mp3", file_obj=mix_audio_file)

            progress(1.0, desc="Done!")
            return final_output

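        # Log failures and surface them in the Gradio UI instead of returning silently.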
        except Exception as e:
            logger.error(f"An error occurred: {e}")
            raise gr.Error(f"An error occurred: {e}") from e


SoniTr = SoniTranslate()


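# Gradio UI. Labels are in Persian; English glosses are given in the comments below.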
with gr.Blocks(theme="Taithrah/Minimal") as app:
    # "AI Video Dubbing Tool"
    gr.Markdown("<center><h1>📽️ ابزار دوبله ویدیو با هوش مصنوعی 🈷️</h1></center>")
    # "Created by aigolden - based on SoniTranslate"
    gr.Markdown("ساخته شده توسط [aigolden](https://youtube.com/@aigolden) - بر پایه [SoniTranslate](https://github.com/r3gm/SoniTranslate)")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### ۱. ورودی ویدیو")  # "1. Video input"
            video_file_input = gr.File(label="آپلود ویدیو")  # "Upload video"
            link_media_input = gr.Textbox(label="یا لینک یوتیوب", placeholder="https://www.youtube.com/watch?v=...")  # "Or a YouTube link"

            gr.Markdown("### ۲. تنظیمات دوبله")  # "2. Dubbing settings"
            origin_language_input = gr.Dropdown(LANGUAGES_LIST, value="Automatic detection", label="زبان اصلی ویدیو")  # "Source language of the video"
            target_language_input = gr.Dropdown(LANGUAGES_LIST[1:], value="Persian (fa)", label="زبان مقصد دوبله")  # "Target dubbing language"
            tts_voice_input = gr.Dropdown(SoniTr.tts_info.tts_list(), value="fa-IR-FaridNeural", label="صدای گوینده")  # "Narrator voice"

            with gr.Accordion("تنظیمات پیشرفته", open=False):  # "Advanced settings"
                transcriber_model_input = gr.Dropdown(
                    ASR_MODEL_OPTIONS + find_whisper_models(),
                    value="large-v3",
                    label="مدل استخراج متن (Whisper)",  # "Transcription model (Whisper)"
                    info="مدلهای بزرگتر دقیقتر اما کندتر هستند."  # "Larger models are more accurate but slower."
                )
                max_speakers_input = gr.Slider(1, 10, value=2, step=1, label="حداکثر تعداد گوینده")  # "Maximum number of speakers"

            process_button = gr.Button("شروع دوبله", variant="primary")  # "Start dubbing"

        with gr.Column():
            gr.Markdown("### ۳. خروجی")  # "3. Output"
            output_video = gr.Video(label="ویدیوی دوبله شده")  # "Dubbed video"
            output_file = gr.File(label="دانلود فایل")  # "Download file"

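    # The hidden Textbox stands in for the unused directory_input parameter.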
    process_button.click(
        SoniTr.multilingual_media_conversion,
        inputs=[
            video_file_input,
            link_media_input,
            gr.Textbox(visible=False),
            origin_language_input,
            target_language_input,
            tts_voice_input,
            transcriber_model_input,
            max_speakers_input,
        ],
        outputs=[output_file],
    )


if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)