artificialguybr committed on
Commit
49f95b1
·
verified ·
1 Parent(s): 4fe6158

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -53
app.py CHANGED
@@ -14,7 +14,6 @@ from huggingface_hub import HfApi
14
  import moviepy.editor as mp
15
  import spaces
16
 
17
-
18
  # Constants and initialization
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
  REPO_ID = "artificialguybr/video-dubbing"
@@ -50,7 +49,6 @@ language_mapping = {
50
  'Greek': ('el', 'el-GR-NestorasNeural')
51
  }
52
 
53
-
54
  print("Starting the program...")
55
 
56
  def generate_unique_filename(extension):
@@ -62,20 +60,6 @@ def cleanup_files(*files):
62
  os.remove(file)
63
  print(f"Removed file: {file}")
64
 
65
- def check_for_faces(video_path):
66
- face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
67
- cap = cv2.VideoCapture(video_path)
68
-
69
- while True:
70
- ret, frame = cap.read()
71
- if not ret:
72
- break
73
- gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
74
- faces = face_cascade.detectMultiScale(gray, 1.1, 4)
75
- if len(faces) > 0:
76
- return True
77
- return False
78
-
79
  @spaces.GPU(duration=90)
80
  def transcribe_audio(file_path):
81
  print(f"Starting transcription of file: {file_path}")
@@ -128,7 +112,7 @@ async def text_to_speech(text, voice, output_file):
128
  await communicate.save(output_file)
129
 
130
  @spaces.GPU
131
- def process_video(radio, video, target_language, has_closeup_face):
132
  try:
133
  if target_language is None:
134
  raise ValueError("Please select a Target Language for Dubbing.")
@@ -163,12 +147,12 @@ def process_video(radio, video, target_language, has_closeup_face):
163
 
164
  asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
165
 
166
- if has_closeup_face or check_for_faces(video_path):
167
  try:
168
  subprocess.run(f"python Wav2Lip/inference.py --checkpoint_path 'Wav2Lip/checkpoints/wav2lip_gan.pth' --face '{video_path}' --audio '{run_uuid}_output_synth.wav' --pads 0 15 0 0 --resize_factor 1 --nosmooth --outfile '{run_uuid}_output_video.mp4'", shell=True, check=True)
169
  except subprocess.CalledProcessError as e:
170
  print(f"Wav2Lip error: {str(e)}")
171
- gr.Warning("Wav2lip didn't detect a face or encountered an error. Falling back to simple audio replacement.")
172
  subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
173
  else:
174
  subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
@@ -190,43 +174,53 @@ def process_video(radio, video, target_language, has_closeup_face):
190
  print(f"Error in process_video: {str(e)}")
191
  return None, f"Error: {str(e)}"
192
 
193
- def swap(radio):
194
- return gr.update(source="upload" if radio == "Upload" else "webcam")
195
-
196
  # Gradio interface setup
197
- video = gr.Video()
198
- radio = gr.Radio(["Upload", "Record"], value="Upload", show_label=False)
199
- iface = gr.Interface(
200
- fn=process_video,
201
- inputs=[
202
- radio,
203
- video,
204
- gr.Dropdown(choices=list(language_mapping.keys()), label="Target Language for Dubbing", value="Spanish"),
205
- gr.Checkbox(label="Video has a close-up face. Use Wav2lip.", value=False, info="Say if video have close-up face. For Wav2lip. Will not work if checked wrongly.")
206
- ],
207
- outputs=[
208
- gr.Video(label="Processed Video"),
209
- gr.Textbox(label="Error Message")
210
- ],
211
- live=False,
212
- title="AI Video Dubbing",
213
- description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Thanks [@yeswondwer](https://twitter.com/@yeswondwerr) for original code. Test the [Video Transcription and Translate](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) space!""",
214
- allow_flagging=False
215
- )
216
-
217
- with gr.Blocks() as demo:
218
- iface.render()
219
- radio.change(swap, inputs=[radio], outputs=video)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  gr.Markdown("""
221
- **Note:**
222
- - Video limit is 1 minute. It will dubbing all people using just one voice.
223
- - Generation may take up to 5 minutes.
224
- - The tool uses open-source models for all models. It's an alpha version.
225
- - Quality can be improved but would require more processing time per video. For scalability and hardware limitations, speed was chosen, not just quality.
226
- - If you need more than 1 minute, duplicate the Space and change the limit on app.py.
227
- - If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.
228
  """)
229
 
230
  print("Launching Gradio interface...")
231
  demo.queue()
232
- demo.launch()
 
14
  import moviepy.editor as mp
15
  import spaces
16
 
 
17
  # Constants and initialization
18
  HF_TOKEN = os.environ.get("HF_TOKEN")
19
  REPO_ID = "artificialguybr/video-dubbing"
 
49
  'Greek': ('el', 'el-GR-NestorasNeural')
50
  }
51
 
 
52
  print("Starting the program...")
53
 
54
  def generate_unique_filename(extension):
 
60
  os.remove(file)
61
  print(f"Removed file: {file}")
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  @spaces.GPU(duration=90)
64
  def transcribe_audio(file_path):
65
  print(f"Starting transcription of file: {file_path}")
 
112
  await communicate.save(output_file)
113
 
114
  @spaces.GPU
115
+ def process_video(video, target_language, use_wav2lip):
116
  try:
117
  if target_language is None:
118
  raise ValueError("Please select a Target Language for Dubbing.")
 
147
 
148
  asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
149
 
150
+ if use_wav2lip:
151
  try:
152
  subprocess.run(f"python Wav2Lip/inference.py --checkpoint_path 'Wav2Lip/checkpoints/wav2lip_gan.pth' --face '{video_path}' --audio '{run_uuid}_output_synth.wav' --pads 0 15 0 0 --resize_factor 1 --nosmooth --outfile '{run_uuid}_output_video.mp4'", shell=True, check=True)
153
  except subprocess.CalledProcessError as e:
154
  print(f"Wav2Lip error: {str(e)}")
155
+ gr.Warning("Wav2lip encountered an error. Falling back to simple audio replacement.")
156
  subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
157
  else:
158
  subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
 
174
  print(f"Error in process_video: {str(e)}")
175
  return None, f"Error: {str(e)}"
176
 
 
 
 
177
  # Gradio interface setup
178
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
179
+ gr.Markdown("# AI Video Dubbing")
180
+ gr.Markdown("This tool uses AI to dub videos into different languages. Upload a video, choose a target language, and get a dubbed version!")
181
+
182
+ with gr.Row():
183
+ with gr.Column(scale=2):
184
+ video_input = gr.Video(label="Upload Video")
185
+ target_language = gr.Dropdown(
186
+ choices=list(language_mapping.keys()),
187
+ label="Target Language for Dubbing",
188
+ value="Spanish"
189
+ )
190
+ use_wav2lip = gr.Checkbox(
191
+ label="Use Wav2Lip for lip sync",
192
+ value=False,
193
+ info="Enable this if the video has close-up faces. May not work for all videos."
194
+ )
195
+ submit_button = gr.Button("Process Video", variant="primary")
196
+
197
+ with gr.Column(scale=2):
198
+ output_video = gr.Video(label="Processed Video")
199
+ error_message = gr.Textbox(label="Status/Error Message")
200
+
201
+ submit_button.click(
202
+ process_video,
203
+ inputs=[video_input, target_language, use_wav2lip],
204
+ outputs=[output_video, error_message]
205
+ )
206
+
207
+ gr.Markdown("""
208
+ ## Notes:
209
+ - Video limit is 1 minute. The tool will dub all speakers using a single voice.
210
+ - Processing may take up to 5 minutes.
211
+ - This is an alpha version using open-source models.
212
+ - Quality vs. speed trade-off was made for scalability and hardware limitations.
213
+ - For videos longer than 1 minute, please duplicate this Space and adjust the limit in the code.
214
+ """)
215
+
216
  gr.Markdown("""
217
+ ---
218
+ Developed by [@artificialguybr](https://twitter.com/artificialguybr) using open-source tools.
219
+ Special thanks to Hugging Face for GPU support and [@yeswondwer](https://twitter.com/@yeswondwerr) for the original code.
220
+
221
+ Try our [Video Transcription and Translation](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) tool!
 
 
222
  """)
223
 
224
  print("Launching Gradio interface...")
225
  demo.queue()
226
+ demo.launch()