subtify / app.py
Maximofn's picture
Add delete button
d6c6cf5
raw history blame
No virus
27.4 kB
import argparse
import os
import re
import shutil
import subprocess
import sys
from time import sleep

import gradio as gr
import torch
# import pyperclip
from pytube import YouTube
from tqdm import tqdm

from lang_list import union_language_dict
# Width, in characters, of the '*' separator lines printed between stages.
NUMBER = 100

# Compute device handed to the worker scripts on their command line.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Stage switches read by the pipeline functions below.
DOWNLOAD = True
SLICE_AUDIO = True
SEPARE_VOCALS = False
TRANSCRIBE_AUDIO = True
CONCATENATE_TRANSCRIPTIONS = True
TRANSLATE_TRANSCRIPTIONS = True
ADD_SUBTITLES_TO_VIDEO = True
REMOVE_FILES = False

# Value passed to slice_audio.py and concat_transcriptions.py (presumably the
# chunk length in seconds -- confirm against those scripts). On "cpu" the app
# is assumed to be running on the Hugging Face server; both cases currently
# use the same value.
SECONDS = 300 if DEVICE == "cpu" else 300

# Source identifiers.
YOUTUBE = "youtube"
TWITCH = "twitch"
ERROR = "error"

# Language choices offered by the source/target dropdowns.
language_dict = union_language_dict()
def subtify_no_ui():
    """Run the complete subtitling pipeline for a hard-coded URL, without UI.

    Seven stages run in order, each delegated to a sibling script executed
    with the current interpreter: download, slice audio, separate vocals (or
    simply move/copy the chunks), transcribe, concatenate transcriptions,
    translate, and add the subtitles to the video. Every stage except the
    vocals one is guarded by its module-level switch; the vocals stage always
    runs and chooses its branch from ``SEPARE_VOCALS``. When ``REMOVE_FILES``
    is set, the intermediate files of each stage are deleted once used.

    The exit status of the worker scripts is not checked.
    """

    def run_script(script, *args):
        """Run a pipeline script with the interpreter running this file."""
        # An argv list (no shell) keeps arguments containing spaces or shell
        # metacharacters intact.
        subprocess.run([sys.executable, script, *map(str, args)], check=False)

    def remove_file(path):
        """Delete *path*, reporting (not raising) on failure, like ``rm``."""
        try:
            os.remove(path)
        except OSError as err:
            print(f"Could not remove {path}: {err}")

    def read_lines(path):
        """Return the non-blank lines of the text file *path*."""
        with open(path, 'r') as f:
            return [line for line in f.read().splitlines() if line.strip()]

    def stem_of(path):
        """Return the file name of *path* without directory or extension."""
        # Unlike tuple-unpacking ``split(".")``/``split("/")``, this also
        # works for names with several dots or nested directories.
        return os.path.splitext(os.path.basename(path))[0]

    number_works = 7
    folder_vocals = "vocals"
    folder_chunck = "chunks"
    folder_concatenated = "concatenated_transcriptions"
    folder_translated_transcriptions = "translated_transcriptions"
    for folder in (
        folder_vocals,
        folder_chunck,
        folder_concatenated,
        folder_translated_transcriptions,
    ):
        os.makedirs(folder, exist_ok=True)

    # The context manager closes the progress bar even if a stage raises.
    with tqdm(total=number_works, desc="Subtify") as progress_bar:
        ################## Download video and audio ##################
        if DOWNLOAD:
            print('*'*NUMBER)
            # url = "https://www.twitch.tv/videos/1936119752"       # twitch, Rob Mula, 2 hours
            # url = "https://www.youtube.com/watch?v=yX5EJf4R77s"   # ✅ debate, several speakers, 3 minutes
            # url = "https://www.youtube.com/watch?v=cgx0QnXo1OU"   # ✅ smart home, single speaker, 4:42 minutes
            url = "https://www.youtube.com/watch?v=dgOBxhi19T8"     # ✅ rob mula, many speakers, 4:28 minutes
            # url = "https://www.youtube.com/watch?v=Coj72EzmX20"   # rob mula, single speaker, 16 minutes
            # url = "https://www.youtube.com/watch?v=Tqth0fKo0_g"   # short conversation
            print(f"Downloading video and audio from {url}")
            run_script("download.py", url)
            sleep(5)
            print('*'*NUMBER)
            print("\n\n")
        progress_bar.update(1)

        ################## Slice audio ##################
        if SLICE_AUDIO:
            print('*'*NUMBER)
            print("Slicing audio")
            audio = "audios/download_audio.mp3"
            run_script("slice_audio.py", audio, SECONDS)
            print('*'*NUMBER)
            print("\n\n")
        progress_bar.update(1)

        ################## Get vocals ##################
        chunck_file = "chunks/output_files.txt"
        print('*'*NUMBER)
        if SEPARE_VOCALS:
            print("Get vocals")
            run_script("separe_vocals.py", chunck_file, DEVICE)
            if REMOVE_FILES:
                for file in read_lines(chunck_file):
                    remove_file(file)
        else:
            print("Moving chunks")
            # A speaker count of 0 is what the later stages read back from
            # this file when no vocal separation was performed.
            with open(f"{folder_vocals}/speakers.txt", 'w') as f:
                f.write(str(0))
            transfer = shutil.move if REMOVE_FILES else shutil.copy
            for name in sorted(os.listdir(folder_chunck)):
                # Same selection as the shell glob "*.mp3" (no dot-files).
                if name.endswith(".mp3") and not name.startswith("."):
                    transfer(
                        os.path.join(folder_chunck, name),
                        os.path.join(folder_vocals, name),
                    )
        print('*'*NUMBER)
        print("\n\n")
        progress_bar.update(1)

        ################# Transcript vocals ##################
        speakers_file = "vocals/speakers.txt"
        if TRANSCRIBE_AUDIO:
            print('*'*NUMBER)
            print("Transcript vocals")
            language = "English"
            run_script(
                "transcribe.py",
                chunck_file,
                language,
                speakers_file,
                DEVICE,
                not SEPARE_VOCALS,
            )
            if REMOVE_FILES:
                vocals_folder = "vocals"
                speakers = int(read_lines(speakers_file)[0])
                for file in read_lines(chunck_file):
                    file_name = stem_of(file)
                    if speakers > 0:
                        # One wav file per separated speaker.
                        for i in range(speakers):
                            remove_file(f'{vocals_folder}/{file_name}_speaker{i:03d}.wav')
                    else:
                        remove_file(f'{vocals_folder}/{file_name}.mp3')
            print('*'*NUMBER)
            print("\n\n")
        progress_bar.update(1)

        ################## Concatenate transcriptions ##################
        if CONCATENATE_TRANSCRIPTIONS:
            print('*'*NUMBER)
            print("Concatenate transcriptions")
            run_script("concat_transcriptions.py", chunck_file, SECONDS, speakers_file)
            if REMOVE_FILES:
                for file in read_lines(chunck_file):
                    remove_file(f"transcriptions/{stem_of(file)}.srt")
            print('*'*NUMBER)
            print("\n\n")
        progress_bar.update(1)

        ################## Translate transcription ##################
        # Defined outside the guard: the subtitle stage needs it as well.
        target_languaje = "Español"
        if TRANSLATE_TRANSCRIPTIONS:
            print('*'*NUMBER)
            print("Translate transcription")
            transcription_file = "concatenated_transcriptions/download_audio.srt"
            source_languaje = "English"
            run_script(
                "translate_transcriptions.py",
                transcription_file,
                "--source_languaje", source_languaje,
                "--target_languaje", target_languaje,
                "--device", DEVICE,
            )
            if REMOVE_FILES:
                remove_file(transcription_file)
            print('*'*NUMBER)
            print("\n\n")
        progress_bar.update(1)

        ################## Add subtitles to video ##################
        if ADD_SUBTITLES_TO_VIDEO:
            print('*'*NUMBER)
            print("Add subtitles to video")
            transcription_file = f"translated_transcriptions/download_audio_{target_languaje}.srt"
            input_video_file = "videos/download_video.mp4"
            input_audio_file = "audios/download_audio.mp3"
            run_script(
                "add_subtitles_to_video.py",
                transcription_file,
                input_video_file,
                input_audio_file,
            )
            if REMOVE_FILES:
                for leftover in (
                    input_video_file,
                    input_audio_file,
                    transcription_file,
                    "chunks/output_files.txt",
                    "vocals/speakers.txt",
                ):
                    remove_file(leftover)
            print('*'*NUMBER)
            print("\n\n")
        progress_bar.update(1)
def remove_all():
    """Delete every working folder created by the pipeline.

    Removal is best-effort: folders that do not exist (or cannot be removed)
    are skipped without raising, matching the previous ``rm -r`` behaviour
    while no longer depending on a POSIX shell.
    """
    working_folders = (
        "audios",
        "chunks",
        "concatenated_transcriptions",
        "transcriptions",
        "translated_transcriptions",
        "videos",
        "vocals",
    )
    for folder in working_folders:
        shutil.rmtree(folder, ignore_errors=True)
# # def copy_url_from_clipboard():
# # return pyperclip.paste()
def clear_video_url():
    """Build the values that reset the UI after the URL is cleared.

    Returns:
        An 18-item tuple: an empty string for the URL textbox followed by
        hidden versions of the thumbnail image, the three dropdowns, the
        subtify button, the seven status textboxes, the four path textboxes
        and the subtitled-video player, in that order.
    """
    hidden = False
    speaker_counts = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

    def language_dropdown(label, default):
        return gr.Dropdown(visible=hidden, label=label, show_label=True, value=default, choices=language_dict, scale=1, interactive=True)

    def status_box(label, elem_id):
        return gr.Textbox(placeholder="Waiting", label=label, elem_id=elem_id, interactive=False, visible=hidden)

    def path_box(label, elem_id):
        return gr.Textbox(label=label, elem_id=elem_id, visible=hidden)

    return (
        "",
        gr.Image(visible=hidden, scale=1),
        language_dropdown("Source languaje", "English"),
        language_dropdown("Target languaje", "Español"),
        gr.Dropdown(visible=hidden, label="Number of speakers", show_label=True, value=10, choices=speaker_counts, scale=1, interactive=True),
        gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=hidden),
        status_box("Start block", "start_block"),
        status_box("Video downloaded", "video_downloaded"),
        status_box("Video sliced", "video_sliced"),
        status_box("Video transcribed", "video_transcribed"),
        status_box("Transcriptions concatenated", "transcriptions_concatenated"),
        status_box("Transcription translated", "transcription_translated"),
        status_box("Video subtitled", "video_subtitled"),
        path_box("Original audio path", "original_audio_path"),
        path_box("Original video path", "original_video_path"),
        path_box("Original audio transcribed", "original_audio_transcribed"),
        path_box("Original audio translated", "original_audio_translated"),
        gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=hidden, interactive=hidden),
    )
def get_youtube_thumbnail(url):
    """Return the thumbnail URL pytube reports for the YouTube video *url*."""
    return YouTube(url).thumbnail_url
def is_valid_youtube_url(url):
    """Tell whether *url* starts like a YouTube watch or youtu.be link.

    The pattern is anchored at the start of the string only, so any text
    after the video id is accepted.
    """
    youtube_pattern = r'(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+'
    return bool(re.match(youtube_pattern, url))
def is_valid_url(url):
    """Choose the preview image and control visibility for the typed URL.

    Args:
        url: Current content of the URL textbox.

    Returns:
        A 5-item tuple ``(image, source dropdown, target dropdown, number of
        speakers dropdown, subtify button)``. For a valid YouTube URL or any
        URL containing "twitch" the items are visible; otherwise all of them
        are hidden.
    """
    num_speaker = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

    def controls(visible):
        """Build the three dropdowns and the button with one visibility."""
        return (
            gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True),
            gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True),
            gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True),
            gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible),
        )

    def preview(value, visible=True):
        """Build the preview image component."""
        return gr.Image(value=value, visible=visible, show_download_button=False, container=False)

    url = url or ""
    lowered = url.lower()

    # Youtube
    if "youtube" in lowered or "youtu.be" in lowered:
        if is_valid_youtube_url(url):
            # This handler runs on every change of the textbox, so it sees
            # half-typed URLs that pass the loose pattern but that pytube
            # rejects, as well as network failures. Any such failure falls
            # back to the placeholder image instead of surfacing as an error.
            try:
                thumbnail = get_youtube_thumbnail(url)
            except Exception as err:
                print(f"Could not get the thumbnail of {url}: {err}")
                thumbnail = None
            return (
                preview(thumbnail if thumbnail else "assets/youtube-no-thumbnails.webp"),
                *controls(True),
            )

    # Twitch ("twitch.tv" always contains "twitch", one test is enough)
    elif "twitch" in lowered:
        return (preview("assets/twitch.webp"), *controls(True))

    # Error: unknown source or malformed YouTube URL, hide everything
    return (preview("assets/youtube_error.webp", visible=False), *controls(False))
def change_visibility_texboxes(url=None):
    """Reveal the status textboxes and mark the start block as done.

    Args:
        url: Unused. It is optional so the function works whether or not the
            event that calls it supplies an input component.

    Returns:
        A 7-item tuple of ``gr.Textbox``: the (still hidden) start block with
        value "Done", followed by the six stage status boxes, now visible.
    """
    def waiting_box(label, elem_id):
        return gr.Textbox(placeholder="Waiting", label=label, elem_id=elem_id, interactive=False, visible=True)

    return (
        gr.Textbox(value="Done", label="Start block", elem_id="start_block", interactive=False, visible=False),
        waiting_box("Video downloaded", "video_downloaded"),
        waiting_box("Video sliced", "video_sliced"),
        waiting_box("Video transcribed", "video_transcribed"),
        waiting_box("Transcriptions concatenated", "transcriptions_concatenated"),
        waiting_box("Transcription translated", "transcription_translated"),
        waiting_box("Video subtitled", "video_subtitled"),
    )
def get_audio_and_video_from_video(url):
    """Download the audio and video of *url* by running ``download.py``.

    Args:
        url: Video URL typed by the user.

    Returns:
        A 3-item tuple of ``gr.Textbox``: the "Video downloaded" status set
        to "Ok", and two hidden boxes carrying the audio and video paths
        (the locations ``download.py`` is expected to write to).
    """
    python_file = "download.py"
    # The URL is user input. Passing it as a single argv entry (no shell)
    # keeps characters such as "&" or ";" -- common in YouTube URLs, e.g.
    # "&t=10s" -- from being interpreted as shell syntax. The exit status is
    # ignored, as before.
    subprocess.run([sys.executable, python_file, str(url)], check=False)
    sleep(5)

    audio = "audios/download_audio.mp3"
    video = "videos/download_video.mp4"
    return (
        gr.Textbox(value="Ok", label="Video downloaded", elem_id="video_downloaded", interactive=False, visible=True),
        gr.Textbox(value=audio, label="Original audio path", elem_id="original_audio_path", visible=False),
        gr.Textbox(value=video, label="Original video path", elem_id="original_video_path", visible=False),
    )
def slice_audio(audio_path):
    """Slice the audio with ``slice_audio.py`` and move the chunks to vocals.

    Args:
        audio_path: Path of the audio file to slice.

    Returns:
        The "Video sliced" ``gr.Textbox`` with value "Ok".
    """
    folder_vocals = "vocals"
    folder_chunck = "chunks"
    os.makedirs(folder_vocals, exist_ok=True)
    os.makedirs(folder_chunck, exist_ok=True)

    python_file = "slice_audio.py"
    # argv list instead of a shell string: paths with spaces stay intact.
    subprocess.run([sys.executable, python_file, str(audio_path), str(SECONDS)], check=False)

    # No vocal separation in the UI flow: record a speaker count of 0, which
    # the transcription step reads back from this file.
    with open(f"{folder_vocals}/speakers.txt", 'w') as f:
        f.write(str(0))

    # Portable replacement for `mv chunks/*.mp3 vocals/`; like the shell
    # glob, dot-files are not selected.
    for name in sorted(os.listdir(folder_chunck)):
        if name.endswith(".mp3") and not name.startswith("."):
            shutil.move(os.path.join(folder_chunck, name), os.path.join(folder_vocals, name))

    return (
        gr.Textbox(value="Ok", label="Video sliced", elem_id="video_sliced", interactive=False, visible=True)
    )
def trascribe_audio(source_languaje):
    """Transcribe the audio chunks with ``transcribe.py`` and clean up.

    After the script finishes, the vocal files listed through
    ``chunks/output_files.txt`` are deleted from the ``vocals`` folder: one
    wav per speaker when the speaker count stored in ``vocals/speakers.txt``
    is positive, otherwise the mp3 chunk itself.

    Args:
        source_languaje: Language of the video, as selected in the UI.

    Returns:
        The "Video transcribed" ``gr.Textbox`` with value "Ok".

    Raises:
        OSError: If the chunk list or the speakers file cannot be read.
    """
    folder_vocals = "vocals"
    python_file = "transcribe.py"
    chunck_file = "chunks/output_files.txt"
    speakers_file = "vocals/speakers.txt"
    # argv list instead of a shell string, so a language name containing
    # spaces reaches the script as a single argument.
    subprocess.run(
        [
            sys.executable,
            python_file,
            chunck_file,
            str(source_languaje),
            speakers_file,
            DEVICE,
            str(not SEPARE_VOCALS),
        ],
        check=False,
    )

    with open(chunck_file, 'r') as f:
        files = [line for line in f.read().splitlines() if line.strip()]
    with open(speakers_file, 'r') as f:
        speakers = int(f.read().splitlines()[0])

    for file in files:
        # Works for names with several dots or nested folders as well.
        file_name = os.path.splitext(os.path.basename(file))[0]
        if speakers > 0:
            vocals = [f'{folder_vocals}/{file_name}_speaker{i:03d}.wav' for i in range(speakers)]
        else:
            vocals = [f'{folder_vocals}/{file_name}.mp3']
        for vocal in vocals:
            # Best-effort removal, like `rm`: report and continue.
            try:
                os.remove(vocal)
            except OSError as err:
                print(f"Could not remove {vocal}: {err}")

    return (
        gr.Textbox(value="Ok", label="Video transcribed", elem_id="video_transcribed", interactive=False, visible=True)
    )
def concatenate_transcriptions():
    """Merge the chunk transcriptions with ``concat_transcriptions.py``.

    The per-chunk ``.srt`` files in ``transcriptions`` are deleted afterwards.

    Returns:
        A 2-item tuple of ``gr.Textbox``: the "Transcriptions concatenated"
        status set to "Ok" and a hidden box carrying the path of the
        concatenated transcription.

    Raises:
        OSError: If the chunk list or the concatenated transcription cannot
            be opened.
    """
    folder_concatenated = "concatenated_transcriptions"
    os.makedirs(folder_concatenated, exist_ok=True)

    chunck_file = "chunks/output_files.txt"
    speakers_file = "vocals/speakers.txt"
    python_file = "concat_transcriptions.py"
    subprocess.run([sys.executable, python_file, chunck_file, str(SECONDS), speakers_file], check=False)

    with open(chunck_file, 'r') as f:
        files = [line for line in f.read().splitlines() if line.strip()]
    transcriptions_folder = "transcriptions"
    transcription_extension = "srt"
    for file in files:
        # Works for names with several dots or nested folders as well.
        file_name = os.path.splitext(os.path.basename(file))[0]
        chunk_transcription = f"{transcriptions_folder}/{file_name}.{transcription_extension}"
        # Best-effort removal, like `rm`: report and continue.
        try:
            os.remove(chunk_transcription)
        except OSError as err:
            print(f"Could not remove {chunk_transcription}: {err}")

    audio_transcribed = "concatenated_transcriptions/download_audio.srt"
    # Open (without reading) so a missing result still fails here, before
    # "Ok" is reported to the UI.
    with open(audio_transcribed, 'r'):
        pass

    return (
        gr.Textbox(value="Ok", label="Transcriptions concatenated", elem_id="transcriptions_concatenated", interactive=False, visible=True),
        gr.Textbox(value=audio_transcribed, label="Original audio transcribed", elem_id="original_audio_transcribed", visible=False),
    )
def translate_transcription(original_audio_transcribed_path, source_languaje, target_languaje):
    """Translate the transcription with ``translate_transcriptions.py``.

    The concatenated transcription is deleted once the translated file has
    been produced.

    Args:
        original_audio_transcribed_path: Path of the transcription to translate.
        source_languaje: Language of the transcription.
        target_languaje: Language to translate to; also part of the name of
            the translated file.

    Returns:
        A 2-item tuple of ``gr.Textbox``: the "Transcription translated"
        status set to "Ok" and a hidden box carrying the translated path.

    Raises:
        OSError: If the translated transcription cannot be opened.
    """
    folder_translated_transcriptions = "translated_transcriptions"
    os.makedirs(folder_translated_transcriptions, exist_ok=True)

    python_file = "translate_transcriptions.py"
    # argv list instead of a shell string, so language names containing
    # spaces reach the script as single arguments.
    subprocess.run(
        [
            sys.executable,
            python_file,
            str(original_audio_transcribed_path),
            "--source_languaje", str(source_languaje),
            "--target_languaje", str(target_languaje),
            "--device", DEVICE,
        ],
        check=False,
    )

    translated_transcription = f"translated_transcriptions/download_audio_{target_languaje}.srt"
    # Open (without reading) so a missing result still fails here, before
    # the source transcription is deleted and "Ok" is reported.
    with open(translated_transcription, 'r'):
        pass

    transcription_file = "concatenated_transcriptions/download_audio.srt"
    # Best-effort removal, like `rm`: report and continue.
    try:
        os.remove(transcription_file)
    except OSError as err:
        print(f"Could not remove {transcription_file}: {err}")

    return (
        gr.Textbox(value="Ok", label="Transcription translated", elem_id="transcription_translated", interactive=False, visible=True),
        gr.Textbox(value=translated_transcription, label="Original audio translated", elem_id="original_audio_translated", visible=False)
    )
def add_translated_subtitles_to_video(original_video_path, original_audio_path, original_audio_translated_path):
    """Add the translated subtitles to the video and delete the inputs.

    Runs ``add_subtitles_to_video.py`` and then removes the original video,
    the original audio, the translated transcription, the chunk list and the
    speakers file.

    Args:
        original_video_path: Path of the downloaded video.
        original_audio_path: Path of the downloaded audio.
        original_audio_translated_path: Path of the translated transcription.

    Returns:
        A 2-item tuple: the "Video subtitled" ``gr.Textbox`` set to "Ok" and
        a ``gr.Video`` pointing at ``videos/download_video_with_subtitles.mp4``.
    """
    python_file = "add_subtitles_to_video.py"
    # argv list instead of a shell string: the translated path contains the
    # target language name, which may include spaces.
    subprocess.run(
        [
            sys.executable,
            python_file,
            str(original_audio_translated_path),
            str(original_video_path),
            str(original_audio_path),
        ],
        check=False,
    )

    leftovers = (
        original_video_path,
        original_audio_path,
        original_audio_translated_path,
        "chunks/output_files.txt",
        "vocals/speakers.txt",
    )
    for leftover in leftovers:
        # Best-effort removal, like `rm`: report and continue.
        try:
            os.remove(str(leftover))
        except OSError as err:
            print(f"Could not remove {leftover}: {err}")

    subtitled_video = "videos/download_video_with_subtitles.mp4"
    return (
        gr.Textbox(value="Ok", label="Video subtitled", elem_id="video_subtitled", interactive=False, visible=True),
        gr.Video(value=subtitled_video, label="Subtitled video", elem_id="subtitled_video", visible=True, interactive=False),
    )
def subtify():
    """Build the Gradio interface, wire its events and launch it.

    The pipeline is a chain of ``change`` events: pressing the subtify button
    updates the start block, whose change triggers the download; each stage
    then updates its own status textbox, which triggers the next stage, up to
    the subtitled video.
    """
    with gr.Blocks() as demo:
        num_speaker = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

        # Layout
        gr.Markdown("""# Subtify""")
        with gr.Row(variant="panel"):
            url_textbox = gr.Textbox(placeholder="Add video URL here", label="Video URL", elem_id="video_url", scale=1, interactive=True)
            # copy_button = gr.Button(size="sm", icon="icons/copy.svg", value="", min_width="10px", scale=0)
            delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="", min_width="10px", scale=0)

        # Everything below starts hidden and is revealed by the event handlers.
        visible = False
        with gr.Row(equal_height=False):
            image = gr.Image(visible=visible, scale=1)
            with gr.Column():
                with gr.Row():
                    source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
                    target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
                    number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True, info="Number of speakers in the video, if you don't know, select 10")
                with gr.Row():
                    subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)

        start_block = gr.Textbox(placeholder="Waiting", label="Start block", elem_id="start_block", interactive=False, visible=visible)
        with gr.Row():
            video_donwloaded = gr.Textbox(placeholder="Waiting", label="Video downloaded", elem_id="video_downloaded", interactive=False, visible=visible)
            video_sliced = gr.Textbox(placeholder="Waiting", label="Video sliced", elem_id="video_sliced", interactive=False, visible=visible)
            video_transcribed = gr.Textbox(placeholder="Waiting", label="Video transcribed", elem_id="video_transcribed", interactive=False, visible=visible)
            transcriptions_concatenated = gr.Textbox(placeholder="Waiting", label="Transcriptions concatenated", elem_id="transcriptions_concatenated", interactive=False, visible=visible)
            video_translated = gr.Textbox(placeholder="Waiting", label="Transcription translated", elem_id="transcription_translated", interactive=False, visible=visible)
            video_subtitled = gr.Textbox(placeholder="Waiting", label="Video subtitled", elem_id="video_subtitled", interactive=False, visible=visible)

        # Hidden boxes that carry file paths from one stage to the next.
        original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
        original_video_path = gr.Textbox(label="Original video path", elem_id="original_video_path", visible=visible)
        original_audio_transcribed_path = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", visible=visible)
        original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
        subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)

        # Events
        # copy_button.click(fn=copy_url_from_clipboard, outputs=url_textbox)
        delete_button.click(
            fn=clear_video_url,
            # One output per value returned by clear_video_url, in the same
            # order. The four path boxes must be listed: without them the
            # video player would receive the value meant for the audio path.
            outputs=[
                url_textbox,
                image,
                source_languaje,
                target_languaje,
                number_of_speakers,
                subtify_button,
                start_block,
                video_donwloaded,
                video_sliced,
                video_transcribed,
                transcriptions_concatenated,
                video_translated,
                video_subtitled,
                original_audio_path,
                original_video_path,
                original_audio_transcribed_path,
                original_audio_translated_path,
                subtitled_video,
            ]
        )
        url_textbox.change(
            fn=is_valid_url,
            inputs=url_textbox,
            outputs=[image, source_languaje, target_languaje, number_of_speakers, subtify_button]
        )
        subtify_button.click(
            fn=change_visibility_texboxes,
            # The handler takes the URL as its parameter, so it must be given
            # an input component.
            inputs=[url_textbox],
            outputs=[start_block, video_donwloaded, video_sliced, video_transcribed, transcriptions_concatenated, video_translated, video_subtitled]
        )
        start_block.change(
            fn=get_audio_and_video_from_video,
            inputs=[url_textbox],
            outputs=[video_donwloaded, original_audio_path, original_video_path]
        )
        video_donwloaded.change(
            fn=slice_audio,
            inputs=[original_audio_path],
            outputs=[video_sliced]
        )
        video_sliced.change(
            fn=trascribe_audio,
            inputs=[source_languaje],
            outputs=[video_transcribed]
        )
        video_transcribed.change(
            fn=concatenate_transcriptions,
            outputs=[transcriptions_concatenated, original_audio_transcribed_path]
        )
        transcriptions_concatenated.change(
            fn=translate_transcription,
            inputs=[original_audio_transcribed_path, source_languaje, target_languaje],
            outputs=[video_translated, original_audio_translated_path]
        )
        video_translated.change(
            fn=add_translated_subtitles_to_video,
            inputs=[original_video_path, original_audio_path, original_audio_translated_path],
            outputs=[video_subtitled, subtitled_video]
        )

    demo.launch()
if __name__ == "__main__":
    # Command-line entry point: --no_ui runs the pipeline headless,
    # --remove_all deletes the working folders, and with neither flag the
    # Gradio interface is launched. --no_ui wins when both flags are given.
    parser = argparse.ArgumentParser()
    for flag in ("--no_ui", "--remove_all"):
        parser.add_argument(flag, action="store_true")
    args = parser.parse_args()

    if args.no_ui:
        entry_point = subtify_no_ui
    elif args.remove_all:
        entry_point = remove_all
    else:
        entry_point = subtify
    entry_point()