ChiBenevisamPas committed on
Commit
1a2251f
·
verified ·
1 Parent(s): a939c89
Files changed (1) hide show
  1. app.py +33 -68
app.py CHANGED
@@ -2,16 +2,14 @@ import gradio as gr
2
  import whisper
3
  import os
4
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
5
- from docx import Document # For Word output
6
- from fpdf import FPDF # For PDF output
7
- from pptx import Presentation # For PowerPoint output
8
- import subprocess # To use ffmpeg for embedding subtitles
9
- import shlex # For better command-line argument handling
10
- from docx.oxml.ns import qn
11
- from docx.oxml import OxmlElement
12
 
13
# Load the Whisper model once at import time; transcribe_video() reuses this
# module-level instance. "tiny" is the smaller checkpoint, chosen for faster
# transcription.
model = whisper.load_model("tiny")  # Smaller model for faster transcription
15
 
16
  # Load M2M100 translation model for different languages
17
  def load_translation_model(target_language):
@@ -24,7 +22,6 @@ def load_translation_model(target_language):
24
  if not target_lang_code:
25
  raise ValueError(f"Translation model for {target_language} not supported")
26
 
27
- # Load M2M100 model and tokenizer
28
  tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
29
  translation_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
30
 
@@ -41,7 +38,7 @@ def translate_text(text, tokenizer, model):
41
  except Exception as e:
42
  raise RuntimeError(f"Error during translation: {e}")
43
 
44
- # Helper function to format timestamps in SRT format (hh:mm:ss,ms)
45
  def format_timestamp(seconds):
46
  milliseconds = int((seconds % 1) * 1000)
47
  seconds = int(seconds)
@@ -68,102 +65,66 @@ def write_srt(transcription, output_file, tokenizer=None, translation_model=None
68
  f.write(f"{start_time} --> {end_time}\n")
69
  f.write(f"{text.strip()}\n\n")
70
 
 
71
def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Use ffmpeg to burn the subtitles in ``srt_file`` into ``video_file``.

    The video stream is re-encoded with libx264 (CRF 23, ``medium`` preset)
    and the result is written to ``output_video``.

    Args:
        video_file: Path of the input video.
        srt_file: Path of the SRT file rendered onto the frames.
        output_video: Path of the hard-subbed video to create.

    Raises:
        RuntimeError: If ffmpeg cannot be started, does not finish within
            300 seconds, or exits with a non-zero status.
    """
    # Build the argument list directly. Formatting a shell-style string and
    # re-splitting it with shlex mangled paths that contain double quotes or
    # backslashes.
    # NOTE(review): the subtitle path is still embedded in a filter
    # expression; a single quote (and possibly ':' or '\') in srt_file would
    # need ffmpeg filter escaping -- confirm against the filtergraph docs.
    command = [
        "ffmpeg",
        "-y",  # overwrite an existing output instead of stopping to ask
        "-i", str(video_file),
        "-vf", f"subtitles='{srt_file}'",
        "-c:v", "libx264",
        "-crf", "23",
        "-preset", "medium",
        str(output_video),
    ]
    print(f"Running command: {shlex.join(command)}")  # Debug statement
    try:
        process = subprocess.run(command, capture_output=True, text=True, timeout=300)
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError("ffmpeg process timed out.") from exc
    except Exception as e:
        raise RuntimeError(f"Error running ffmpeg: {e}") from e

    print(f"ffmpeg output: {process.stdout}")  # Debug statement
    # Checked outside the try block so this error is not caught by the
    # generic handler above and wrapped a second time.
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")
85
 
86
- from docx.oxml.ns import qn
87
- from docx.oxml import OxmlElement
88
-
89
def write_word(transcription, output_file, tokenizer=None, translation_model=None, target_language=None):
    """Create a Word document with one numbered paragraph per segment.

    Args:
        transcription: Mapping with a ``'segments'`` list; each segment's
            ``'text'`` is written.
        output_file: Destination path passed to ``Document.save``.
        tokenizer: Forwarded to ``translate_text``; only used when
            ``translation_model`` is truthy.
        translation_model: When truthy, every segment is translated before
            it is written.
        target_language: Language code; ``"fa"`` (Persian) switches the
            paragraphs to right-to-left layout.
    """
    # Local import: only the RTL branch needs low-level XML access.
    from docx.oxml import OxmlElement

    doc = Document()
    rtl = target_language == "fa"
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        para = doc.add_paragraph(f"{i + 1}. {text.strip()}")
        if rtl:
            # python-docx's documented ParagraphFormat API has no
            # right_to_left property, so assigning one never reaches the
            # document XML, and a fresh run may have no rPr element to append
            # to. Mark the paragraph bidirectional in its own properties
            # (<w:bidi/>) and flag each run as RTL through Font.rtl.
            para._p.get_or_add_pPr().append(OxmlElement('w:bidi'))
            for run in para.runs:
                run.font.rtl = True
    doc.save(output_file)
116
 
 
117
def reverse_text_for_rtl(text):
    """Reverse the characters of every whitespace-separated word in ``text``.

    Word order is kept. The words are re-joined with single spaces, so
    leading, trailing and repeated whitespace is not preserved.
    """
    flipped_words = (token[::-1] for token in text.split())
    return ' '.join(flipped_words)
120
 
 
121
def _contains_rtl_characters(text):
    """Return True when ``text`` holds at least one right-to-left character."""
    import unicodedata  # local import keeps this block self-contained

    return any(unicodedata.bidirectional(char) in ("R", "AL") for char in text)


def write_pdf(transcription, output_file, tokenizer=None, translation_model=None,
              font_path="/home/user/app/B-NAZANIN.TTF"):
    """Create a PDF with one numbered line per segment, without timestamps.

    Args:
        transcription: Mapping with a ``'segments'`` list; each segment's
            ``'text'`` is written.
        output_file: Destination path passed to ``FPDF.output``.
        tokenizer: Forwarded to ``translate_text``; only used when
            ``translation_model`` is truthy.
        translation_model: When truthy, every segment is translated before
            it is written.
        font_path: TrueType font registered (with ``uni=True``) under the
            name ``B-NAZANIN``; defaults to the previously hard-coded path.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.add_font('B-NAZANIN', '', font_path, uni=True)
    pdf.set_font('B-NAZANIN', size=12)
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        # Only right-to-left text gets the word-reversal workaround and right
        # alignment. Applying it unconditionally garbled Latin-script output
        # ("Hello" became "olleH").
        if _contains_rtl_characters(text):
            line, alignment = reverse_text_for_rtl(text), 'R'
        else:
            line, alignment = text, 'L'
        pdf.multi_cell(0, 10, f"{i + 1}. {line.strip()}", align=alignment)
    pdf.output(output_file)
144
 
 
145
def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Create a PowerPoint deck with one slide per segment, without timestamps.

    Each segment's text (translated first when ``translation_model`` is
    truthy) becomes the numbered title of its own slide, and the deck is
    saved to ``output_file``.
    """
    deck = Presentation()
    for number, entry in enumerate(transcription['segments'], start=1):
        caption = entry['text']
        if translation_model:
            caption = translate_text(caption, tokenizer, translation_model)
        # Layout index 5 is used for every slide; the code relies on that
        # layout providing a title placeholder.
        layout = deck.slide_layouts[5]
        new_slide = deck.slides.add_slide(layout)
        heading = new_slide.shapes.title
        heading.text = f"{number}. {caption.strip()}"
    deck.save(output_file)
160
 
 
161
  def transcribe_video(video_file, language, target_language, output_format):
162
- # Transcribe the video with Whisper
163
  result = model.transcribe(video_file.name, language=language)
164
  video_name = os.path.splitext(video_file.name)[0]
165
-
166
- # Load the translation model for the selected subtitle language
167
  if target_language != "en":
168
  try:
169
  tokenizer, translation_model = load_translation_model(target_language)
@@ -172,11 +133,9 @@ def transcribe_video(video_file, language, target_language, output_format):
172
  else:
173
  tokenizer, translation_model = None, None
174
 
175
- # Save the SRT file
176
  srt_file = f"{video_name}.srt"
177
  write_srt(result, srt_file, tokenizer, translation_model)
178
 
179
- # Output based on user's selection
180
  if output_format == "SRT":
181
  return srt_file
182
  elif output_format == "Video with Hardsub":
@@ -199,18 +158,24 @@ def transcribe_video(video_file, language, target_language, output_format):
199
  write_ppt(result, ppt_file, tokenizer, translation_model)
200
  return ppt_file
201
 
202
- # Gradio interface
203
# Gradio interface: one upload, two language selectors and an output-format
# choice feed transcribe_video(); the generated file is offered for download.
iface = gr.Interface(
    fn=transcribe_video,
    title="Video Subtitle Generator with Hardsub and Document Formats",
    description=(
        "Upload a video file to generate subtitles in SRT format, download the video "
        "with hardsubbed subtitles, or generate Word, PDF, or PowerPoint documents "
        "using Whisper and M2M100 for translation."
    ),
    inputs=[
        gr.File(label="Upload Video"),
        gr.Dropdown(
            label="Select Video Language",
            choices=["en", "es", "fr", "de", "it", "pt"],
            value="en",
        ),
        gr.Dropdown(
            label="Select Subtitle Language",
            choices=["en", "fa", "es", "fr"],
            value="fa",
        ),
        gr.Radio(
            label="Output Format",
            choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"],
            value="Video with Hardsub",
        ),
    ],
    outputs=gr.File(label="Download Subtitles, Video, or Document"),
)
215
 
216
  if __name__ == "__main__":
 
2
  import whisper
3
  import os
4
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
5
+ from docx import Document
6
+ from fpdf import FPDF
7
+ from pptx import Presentation
8
+ import subprocess
9
+ import shlex
 
 
10
 
11
# Load the Whisper model once at import time; transcribe_video() reuses this
# module-level instance. "tiny" is the smaller checkpoint, chosen for faster
# transcription.
model = whisper.load_model("tiny")
13
 
14
  # Load M2M100 translation model for different languages
15
  def load_translation_model(target_language):
 
22
  if not target_lang_code:
23
  raise ValueError(f"Translation model for {target_language} not supported")
24
 
 
25
  tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
26
  translation_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
27
 
 
38
  except Exception as e:
39
  raise RuntimeError(f"Error during translation: {e}")
40
 
41
+ # Helper function to format timestamps in SRT format
42
  def format_timestamp(seconds):
43
  milliseconds = int((seconds % 1) * 1000)
44
  seconds = int(seconds)
 
65
  f.write(f"{start_time} --> {end_time}\n")
66
  f.write(f"{text.strip()}\n\n")
67
 
68
+ # Embedding subtitles into video (hardsub)
69
def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Use ffmpeg to burn the subtitles in ``srt_file`` into ``video_file``.

    The video stream is re-encoded with libx264 (CRF 23, ``medium`` preset)
    and the result is written to ``output_video``.

    Args:
        video_file: Path of the input video.
        srt_file: Path of the SRT file rendered onto the frames.
        output_video: Path of the hard-subbed video to create.

    Raises:
        RuntimeError: If ffmpeg cannot be started, does not finish within
            300 seconds, or exits with a non-zero status.
    """
    # Build the argument list directly. Formatting a shell-style string and
    # re-splitting it with shlex mangled paths that contain double quotes or
    # backslashes.
    # NOTE(review): the subtitle path is still embedded in a filter
    # expression; a single quote (and possibly ':' or '\') in srt_file would
    # need ffmpeg filter escaping -- confirm against the filtergraph docs.
    command = [
        "ffmpeg",
        "-y",  # overwrite an existing output instead of stopping to ask
        "-i", str(video_file),
        "-vf", f"subtitles='{srt_file}'",
        "-c:v", "libx264",
        "-crf", "23",
        "-preset", "medium",
        str(output_video),
    ]
    try:
        process = subprocess.run(command, capture_output=True, text=True, timeout=300)
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError("ffmpeg process timed out.") from exc
    except Exception as e:
        raise RuntimeError(f"Error running ffmpeg: {e}") from e

    # Checked outside the try block so this error is not caught by the
    # generic handler above and wrapped a second time.
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")
79
 
80
+ # Helper function to write Word documents
 
 
81
def write_word(transcription, output_file, tokenizer=None, translation_model=None, target_language=None):
    """Create a Word document with one numbered paragraph per segment.

    Args:
        transcription: Mapping with a ``'segments'`` list; each segment's
            ``'text'`` is written.
        output_file: Destination path passed to ``Document.save``.
        tokenizer: Forwarded to ``translate_text``; only used when
            ``translation_model`` is truthy.
        translation_model: When truthy, every segment is translated before
            it is written.
        target_language: Language code; ``"fa"`` (Persian) switches the
            paragraphs to right-to-left layout.
    """
    # Local import: only the RTL branch needs low-level XML access.
    from docx.oxml import OxmlElement

    doc = Document()
    rtl = target_language == "fa"
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        para = doc.add_paragraph(f"{i + 1}. {text.strip()}")
        if rtl:
            # python-docx's documented ParagraphFormat API has no
            # right_to_left property, so assigning one never reaches the
            # document XML. Mark the paragraph bidirectional in its own
            # properties (<w:bidi/>) and flag each run as RTL through
            # Font.rtl.
            para._p.get_or_add_pPr().append(OxmlElement('w:bidi'))
            for run in para.runs:
                run.font.rtl = True
    doc.save(output_file)
92
 
93
+ # Helper function to reverse text for RTL
94
def reverse_text_for_rtl(text):
    """Reverse the characters of every whitespace-separated word in ``text``.

    Word order is kept. The words are re-joined with single spaces, so
    leading, trailing and repeated whitespace is not preserved.
    """
    flipped_words = (token[::-1] for token in text.split())
    return ' '.join(flipped_words)
96
 
97
+ # Helper function to write PDF documents
98
def _contains_rtl_characters(text):
    """Return True when ``text`` holds at least one right-to-left character."""
    import unicodedata  # local import keeps this block self-contained

    return any(unicodedata.bidirectional(char) in ("R", "AL") for char in text)


def write_pdf(transcription, output_file, tokenizer=None, translation_model=None,
              font_path="/home/user/app/B-NAZANIN.TTF"):
    """Create a PDF with one numbered line per segment, without timestamps.

    Args:
        transcription: Mapping with a ``'segments'`` list; each segment's
            ``'text'`` is written.
        output_file: Destination path passed to ``FPDF.output``.
        tokenizer: Forwarded to ``translate_text``; only used when
            ``translation_model`` is truthy.
        translation_model: When truthy, every segment is translated before
            it is written.
        font_path: TrueType font registered (with ``uni=True``) under the
            name ``B-NAZANIN``; defaults to the previously hard-coded path.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.add_font('B-NAZANIN', '', font_path, uni=True)
    pdf.set_font('B-NAZANIN', size=12)
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        # Only right-to-left text gets the word-reversal workaround and right
        # alignment. Applying it unconditionally garbled Latin-script output
        # ("Hello" became "olleH").
        if _contains_rtl_characters(text):
            line, alignment = reverse_text_for_rtl(text), 'R'
        else:
            line, alignment = text, 'L'
        pdf.multi_cell(0, 10, f"{i + 1}. {line.strip()}", align=alignment)
    pdf.output(output_file)
111
 
112
+ # Helper function to write PowerPoint slides
113
def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Create a PowerPoint deck with one slide per transcription segment.

    Each segment's text (translated first when ``translation_model`` is
    truthy) becomes the numbered title of its own slide, and the deck is
    saved to ``output_file``.
    """
    deck = Presentation()
    for number, entry in enumerate(transcription['segments'], start=1):
        caption = entry['text']
        if translation_model:
            caption = translate_text(caption, tokenizer, translation_model)
        # Layout index 5 is used for every slide; the code relies on that
        # layout providing a title placeholder.
        layout = deck.slide_layouts[5]
        new_slide = deck.slides.add_slide(layout)
        heading = new_slide.shapes.title
        heading.text = f"{number}. {caption.strip()}"
    deck.save(output_file)
123
 
124
+ # Transcribing video and generating output
125
  def transcribe_video(video_file, language, target_language, output_format):
 
126
  result = model.transcribe(video_file.name, language=language)
127
  video_name = os.path.splitext(video_file.name)[0]
 
 
128
  if target_language != "en":
129
  try:
130
  tokenizer, translation_model = load_translation_model(target_language)
 
133
  else:
134
  tokenizer, translation_model = None, None
135
 
 
136
  srt_file = f"{video_name}.srt"
137
  write_srt(result, srt_file, tokenizer, translation_model)
138
 
 
139
  if output_format == "SRT":
140
  return srt_file
141
  elif output_format == "Video with Hardsub":
 
158
  write_ppt(result, ppt_file, tokenizer, translation_model)
159
  return ppt_file
160
 
161
+ # Gradio interface with better UI
162
# Gradio interface: one upload, two language selectors and an output-format
# choice feed transcribe_video(); the generated file is offered for download.
iface = gr.Interface(
    fn=transcribe_video,
    title="Video Subtitle Generator with Translation & Multi-Format Output",
    description=(
        "This tool allows you to generate subtitles from a video file using Whisper, translate the "
        "subtitles into multiple languages using M2M100, and export them in various formats "
        "including SRT, hardcoded subtitles in video, Word, PDF, or PowerPoint."
    ),
    inputs=[
        gr.File(label="Upload Video File"),
        gr.Dropdown(
            label="Select Original Video Language",
            choices=["en", "es", "fr", "de", "it", "pt"],
            value="en",
        ),
        gr.Dropdown(
            label="Select Subtitle Translation Language",
            choices=["en", "fa", "es", "fr"],
            value="fa",
        ),
        gr.Radio(
            label="Choose Output Format",
            choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"],
            value="Video with Hardsub",
        ),
    ],
    outputs=gr.File(label="Download File"),
    theme="compact",
    live=False,  # No live interaction needed
)
180
 
181
  if __name__ == "__main__":