# Hugging Face Spaces page banner captured with this file (Space status: "Sleeping").
import os
import shlex
import subprocess
import tempfile

import arabic_reshaper
import gradio as gr
import whisper
import yt_dlp
from bidi.algorithm import get_display
from docx import Document
from docx.oxml import OxmlElement
from pptx import Presentation
from reportlab.lib.pagesizes import A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
# Whisper speech-to-text model, loaded once at import time and shared by all
# requests. The "tiny" checkpoint is chosen for faster transcription.
model = whisper.load_model(name="tiny")
# Target languages accepted for translation: UI code -> M2M100 language code.
_M2M100_LANG_CODES = {
    "fa": "fa",  # Persian (Farsi)
    "es": "es",  # Spanish
    "fr": "fr",  # French
    "de": "de",  # German
    "it": "it",  # Italian
    "pt": "pt",  # Portuguese
    "ar": "ar",  # Arabic
    "zh": "zh",  # Chinese
    "hi": "hi",  # Hindi
    "ja": "ja",  # Japanese
    "ko": "ko",  # Korean
    "ru": "ru",  # Russian
}

_M2M100_CHECKPOINT = "facebook/m2m100_418M"

# Holds the loaded M2M100 model so the weights are read only once per process
# instead of once per request.
_m2m100_model_cache = {}


def load_translation_model(target_language, source_language="en"):
    """Return a ``(tokenizer, model)`` pair configured for M2M100 translation.

    Args:
        target_language: Code of the language to translate into; must be one
            of the keys of ``_M2M100_LANG_CODES``.
        source_language: Code of the language the input text is written in.
            Defaults to ``"en"``, which was previously hard-coded.

    Returns:
        A tuple ``(tokenizer, translation_model)``. The tokenizer has its
        ``src_lang`` and ``tgt_lang`` set; the model instance is shared
        between calls.

    Raises:
        ValueError: If ``target_language`` is not in the supported table.
    """
    target_lang_code = _M2M100_LANG_CODES.get(target_language)
    if not target_lang_code:
        raise ValueError(f"Translation model for {target_language} not supported")

    # The tokenizer carries per-request state (src_lang / tgt_lang), so a
    # fresh one is created for every call rather than being shared.
    tokenizer = M2M100Tokenizer.from_pretrained(_M2M100_CHECKPOINT)
    tokenizer.src_lang = source_language
    tokenizer.tgt_lang = target_lang_code

    # The model itself is language-independent, so it is loaded once and reused.
    translation_model = _m2m100_model_cache.get(_M2M100_CHECKPOINT)
    if translation_model is None:
        translation_model = M2M100ForConditionalGeneration.from_pretrained(_M2M100_CHECKPOINT)
        _m2m100_model_cache[_M2M100_CHECKPOINT] = translation_model

    return tokenizer, translation_model
def translate_text(text, tokenizer, model):
    """Translate *text* with the given M2M100 tokenizer/model pair.

    Args:
        text: The text to translate.
        tokenizer: Tokenizer whose ``tgt_lang`` selects the output language.
        model: Generation model exposing ``generate``.

    Returns:
        The decoded translation of the first generated sequence. Blank
        strings are returned unchanged without calling the model.

    Raises:
        RuntimeError: If tokenization, generation or decoding fails; the
            original exception is attached as ``__cause__``.
    """
    # Nothing to translate; skip the model call for empty/whitespace-only text.
    if isinstance(text, str) and not text.strip():
        return text

    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Forcing the first generated token to the target-language id is how
        # the output language is selected.
        target_lang_id = tokenizer.get_lang_id(tokenizer.tgt_lang)
        translated = model.generate(**inputs, forced_bos_token_id=target_lang_id)
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        raise RuntimeError(f"Error during translation: {e}") from e
# Helper function to format timestamps in SRT format
def format_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields, so float representation error (e.g. ``1.001``) does not lose a
    millisecond and a rounded-up fraction carries into the seconds field.
    Negative offsets are clamped to zero.

    Args:
        seconds: Offset from the start of the media, in seconds.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"``.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"
# Write the transcription segments as an SRT subtitle file
def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write the segments of *transcription* to *output_file* in SRT format.

    Args:
        transcription: Mapping with a ``"segments"`` list; each segment has
            ``"start"``, ``"end"`` and ``"text"`` entries.
        output_file: Path of the SRT file to create or overwrite.
        tokenizer: Tokenizer passed through to ``translate_text``.
        translation_model: When given, every segment text is translated
            before being written.
    """
    # Explicit UTF-8: translated subtitles contain non-ASCII text, and the
    # platform default encoding is not guaranteed to be able to encode it.
    with open(output_file, "w", encoding="utf-8") as f:
        for index, segment in enumerate(transcription["segments"], start=1):
            text = segment["text"]
            if translation_model is not None:
                text = translate_text(text, tokenizer, translation_model)
            start_time = format_timestamp(segment["start"])
            end_time = format_timestamp(segment["end"])
            f.write(f"{index}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{text.strip()}\n\n")
# Embedding subtitles into video (hardsub)
def _escape_ffmpeg_filter_path(path):
    """Escape *path* for use inside a single-quoted ffmpeg filter argument."""
    # Backslash and colon are special to the filter option parser; a single
    # quote has to close the quoted span, be emitted escaped, and reopen it.
    return (
        str(path)
        .replace("\\", "\\\\")
        .replace(":", "\\:")
        .replace("'", "'\\\\\\''")
    )


def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Burn the subtitles from *srt_file* into *video_file* using ffmpeg.

    Args:
        video_file: Path of the input video.
        srt_file: Path of the SRT subtitle file to render onto the frames.
        output_video: Path of the re-encoded video to write (overwritten if
            it already exists).

    Raises:
        RuntimeError: If ffmpeg cannot be started, exceeds the 300-second
            timeout, or exits with a non-zero status.
    """
    # An argument list (no shell string, no shlex round-trip) keeps paths with
    # spaces, quotes or backslashes intact.
    command = [
        "ffmpeg",
        "-y",  # overwrite a leftover output instead of waiting on a prompt
        "-i", str(video_file),
        "-vf", f"subtitles='{_escape_ffmpeg_filter_path(srt_file)}'",
        "-c:v", "libx264",
        "-crf", "23",
        "-preset", "medium",
        str(output_video),
    ]
    try:
        process = subprocess.run(command, capture_output=True, text=True, timeout=300)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("ffmpeg process timed out.") from e
    except Exception as e:
        raise RuntimeError(f"Error running ffmpeg: {e}") from e

    # Checked outside the try block so this error is not re-wrapped by the
    # generic handler above.
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")
# Helper function to write Word documents
def _set_paragraph_rtl(paragraph):
    """Mark a python-docx paragraph and its runs as right-to-left."""
    # python-docx's documented ParagraphFormat API has no right-to-left
    # property, so the <w:bidi/> paragraph property is added directly.
    paragraph._p.get_or_add_pPr().append(OxmlElement("w:bidi"))
    for run in paragraph.runs:
        run.font.rtl = True


def write_word(transcription, output_file, tokenizer=None, translation_model=None, target_language=None):
    """Write the transcription as a numbered list of paragraphs in a .docx file.

    Args:
        transcription: Mapping with a ``"segments"`` list whose items have a
            ``"text"`` entry.
        output_file: Path of the Word document to create.
        tokenizer: Tokenizer passed through to ``translate_text``.
        translation_model: When given, every segment text is translated first.
        target_language: Language code of the output text; ``"fa"`` and
            ``"ar"`` produce right-to-left paragraphs, matching the languages
            ``write_pdf`` treats as RTL.
    """
    doc = Document()
    rtl = target_language in ("fa", "ar")
    for index, segment in enumerate(transcription["segments"], start=1):
        text = segment["text"]
        if translation_model is not None:
            text = translate_text(text, tokenizer, translation_model)
        para = doc.add_paragraph(f"{index}. {text.strip()}")
        if rtl:
            _set_paragraph_rtl(para)
    doc.save(output_file)
# Helper that mirrors every word of a string for naive RTL rendering
def reverse_text_for_rtl(text):
    """Return *text* with the characters of each whitespace-separated word reversed.

    Word order is kept. Because the text is split on whitespace and re-joined
    with single spaces, leading/trailing whitespace is dropped and runs of
    whitespace collapse to one space.
    """
    mirrored_words = []
    for word in text.split():
        mirrored_words.append("".join(reversed(word)))
    return " ".join(mirrored_words)
# Helper function to write PDF documents
def write_pdf(transcription, output_file, tokenizer=None, translation_model=None, target_language=None):
    """Write the transcription as numbered lines in an A4 PDF.

    Persian (``"fa"``) and Arabic (``"ar"``) output uses a TrueType font file
    located next to this module, is reshaped/reordered for display and is
    right-aligned. Every other language (including ``None``) is drawn
    left-aligned in the built-in Helvetica font.
    NOTE(review): Helvetica's glyph coverage is limited; scripts such as
    Chinese, Japanese, Korean, Hindi or Russian will need a TTF entry in
    ``custom_fonts`` to render correctly.

    Args:
        transcription: Mapping with a ``"segments"`` list whose items have a
            ``"text"`` entry.
        output_file: Path of the PDF to create.
        tokenizer: Tokenizer passed through to ``translate_text``.
        translation_model: When given, every segment text is translated first.
        target_language: Language code of the output text.

    Returns:
        ``output_file``.

    Raises:
        FileNotFoundError: If the font file required for ``target_language``
            is missing.
        RuntimeError: If the font file cannot be registered.
    """
    font_size = 12
    margin = 50  # points kept free at the top, bottom, left and right
    line_height = 20
    page_width, page_height = A4
    is_rtl = target_language in ("fa", "ar")

    # Font files are expected in the directory that contains this module.
    app_dir = os.path.dirname(os.path.abspath(__file__))
    custom_fonts = {
        "fa": os.path.join(app_dir, "B-NAZANIN.TTF"),  # Persian font
        "ar": os.path.join(app_dir, "Amiri-Regular.ttf"),  # Arabic font
    }

    font_path = custom_fonts.get(target_language)
    if font_path is None:
        # Built-in PDF font: needs no file on disk and no registration. The
        # previous default ('Arial') was checked as a file path and therefore
        # always failed.
        font_name = "Helvetica"
    else:
        if not os.path.exists(font_path):
            raise FileNotFoundError(f"Font file not found at {font_path}. Please ensure it is available.")
        try:
            pdfmetrics.registerFont(TTFont("custom_font", font_path))
        except Exception as e:
            raise RuntimeError(f"Error registering font: {e}.") from e
        font_name = "custom_font"

    c = canvas.Canvas(output_file, pagesize=A4)
    c.setFont(font_name, font_size)

    y_position = page_height - margin
    for index, segment in enumerate(transcription["segments"], start=1):
        text = segment["text"]
        if translation_model is not None:
            text = translate_text(text, tokenizer, translation_model)
        line = f"{index}. {text.strip()}"

        # Start a new page once the bottom margin is reached; the font has to
        # be selected again after showPage().
        if y_position < margin:
            c.showPage()
            c.setFont(font_name, font_size)
            y_position = page_height - margin

        if is_rtl:
            # Join the letters into their contextual forms, then reorder the
            # characters for display before drawing right-aligned.
            display_text = get_display(arabic_reshaper.reshape(line))
            c.drawRightString(page_width - margin, y_position, display_text)
        else:
            c.drawString(margin, y_position, line)

        y_position -= line_height

    c.save()
    return output_file
# Helper that exports the transcription as a PowerPoint deck
def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Create a presentation with one slide per transcription segment.

    Each slide's title holds the segment number followed by the segment text
    (translated first when *translation_model* is given). The deck is saved to
    *output_file*.
    """
    presentation = Presentation()
    for number, segment in enumerate(transcription["segments"], start=1):
        caption = segment["text"]
        if translation_model:
            caption = translate_text(caption, tokenizer, translation_model)
        # Layout index 5 - presumably the "Title Only" layout of the default
        # python-pptx template; verify if a custom template is ever used.
        new_slide = presentation.slides.add_slide(presentation.slide_layouts[5])
        heading = new_slide.shapes.title
        heading.text = f"{number}. {caption.strip()}"
    presentation.save(output_file)
# Function to download YouTube video
def download_youtube_video(url):
    """Download the video at *url* with yt-dlp and return the local file path.

    Every call downloads into its own freshly created temporary directory.
    With the previous fixed file name in the working directory, a later or
    concurrent request could collide with, or pick up, a file left behind by
    an earlier download.
    NOTE: the temporary directory is not removed here because the caller
    derives its output file names from the returned path.

    Args:
        url: Address of the video to download.

    Returns:
        Absolute path of the downloaded ``downloaded_video.mp4`` file.
    """
    download_dir = tempfile.mkdtemp(prefix="yt_download_")
    output_path = os.path.join(download_dir, "downloaded_video.mp4")
    ydl_opts = {
        "format": "mp4",
        "outtmpl": output_path,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return output_path
# Transcribing video and generating output
def transcribe_video(video_file, video_url, language, target_language, output_format):
    """Transcribe a video and export the result in the requested format.

    Args:
        video_file: Uploaded file (an object with a ``name`` path attribute,
            or a plain path string); used when *video_url* is empty.
        video_url: Optional video URL; takes precedence over *video_file*.
        language: Language spoken in the video, passed to Whisper.
        target_language: Language of the generated output. Translation is
            skipped when it is ``"en"`` or equal to *language*.
        output_format: One of ``"SRT"``, ``"Video with Hardsub"``,
            ``"Word"``, ``"PDF"`` or ``"PowerPoint"``.

    Returns:
        Path of the generated file.

    Raises:
        ValueError: If *output_format* is unknown, or neither a file nor a
            URL was supplied.
        RuntimeError: If the translation model cannot be loaded or the
            subtitles cannot be embedded into the video.
    """
    # Validate cheap inputs first so no download/transcription work is wasted.
    supported_formats = ("SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint")
    if output_format not in supported_formats:
        raise ValueError(f"Unsupported output format: {output_format}")

    video_url = (video_url or "").strip()
    if video_url:
        video_file_path = download_youtube_video(video_url)
    elif video_file is not None:
        # Accept both a file-like object exposing .name and a plain path.
        video_file_path = getattr(video_file, "name", video_file)
    else:
        raise ValueError("Please upload a video file or provide a YouTube URL.")

    result = model.transcribe(video_file_path, language=language)
    video_name = os.path.splitext(video_file_path)[0]

    # NOTE(review): a non-English video with target "en" is still left
    # untranslated, as before - the translation table has no "en" target.
    tokenizer, translation_model = None, None
    if target_language != "en" and target_language != language:
        try:
            tokenizer, translation_model = load_translation_model(target_language)
            if language:
                # The transcript is in the spoken language, so translate from
                # that language rather than always from English.
                tokenizer.src_lang = language
        except Exception as e:
            raise RuntimeError(f"Error loading translation model: {e}") from e

    # The SRT file is only needed for these two formats; writing it for the
    # others would translate every segment a second time.
    if output_format in ("SRT", "Video with Hardsub"):
        srt_file = f"{video_name}.srt"
        write_srt(result, srt_file, tokenizer, translation_model)
        if output_format == "SRT":
            return srt_file
        output_video = f"{video_name}_with_subtitles.mp4"
        try:
            embed_hardsub_in_video(video_file_path, srt_file, output_video)
        except Exception as e:
            raise RuntimeError(f"Error embedding subtitles in video: {e}") from e
        return output_video

    if output_format == "Word":
        word_file = f"{video_name}.docx"
        write_word(result, word_file, tokenizer, translation_model, target_language)
        return word_file

    if output_format == "PDF":
        pdf_file = f"{video_name}.pdf"
        # target_language selects the font and the RTL handling in write_pdf.
        write_pdf(result, pdf_file, tokenizer, translation_model, target_language)
        return pdf_file

    # Only "PowerPoint" remains after the validation above.
    ppt_file = f"{video_name}.pptx"
    write_ppt(result, ppt_file, tokenizer, translation_model)
    return ppt_file
# Gradio UI: a file upload or a YouTube link, two language selectors and the
# output format, wired to transcribe_video.
_INTERFACE_INPUTS = [
    # No 'optional=True' here: the upload may simply be left empty.
    gr.File(label="Upload Video File (or leave empty for YouTube link)"),
    gr.Textbox(
        label="YouTube Video URL (optional)",
        placeholder="https://www.youtube.com/watch?v=...",
    ),
    gr.Dropdown(
        label="Select Original Video Language",
        choices=["en", "es", "fr", "de", "it", "pt"],
        value="en",
    ),
    gr.Dropdown(
        label="Select Subtitle Translation Language",
        choices=["en", "fa", "es", "de", "fr", "it", "pt"],
        value="fa",
    ),
    gr.Radio(
        label="Choose Output Format",
        choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"],
        value="Video with Hardsub",
    ),
]

_INTERFACE_DESCRIPTION = (
    "This tool allows you to generate subtitles from a video file or YouTube link using Whisper, "
    "translate the subtitles into multiple languages using M2M100, and export them "
    "in various formats including SRT, hardcoded subtitles in video, Word, PDF, or PowerPoint."
)

iface = gr.Interface(
    fn=transcribe_video,
    inputs=_INTERFACE_INPUTS,
    outputs=gr.File(label="Download File"),
    title="Video Subtitle Generator with Translation & Multi-Format Output (Supports YouTube)",
    description=_INTERFACE_DESCRIPTION,
    theme="compact",
    live=False,
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()