artificialguybr committed
Commit 7ac5c7d
1 Parent(s): 2cc952b

Update app.py

Files changed (1): app.py (+56 -85)
app.py CHANGED
@@ -1,90 +1,78 @@
+import os
+import stat
+import uuid
+import subprocess
 import tempfile
+from zipfile import ZipFile
 import gradio as gr
+import spaces
-import subprocess
-import os, stat
-import uuid
 from googletrans import Translator
 from TTS.api import TTS
-import ffmpeg
 from faster_whisper import WhisperModel
-from scipy.signal import wiener
 import soundfile as sf
-from pydub import AudioSegment
 import numpy as np
-import librosa
-from zipfile import ZipFile
-import shlex
 import cv2
-import torch
-import torchvision
-from tqdm import tqdm
-from numba import jit
 from huggingface_hub import HfApi
 
-
 HF_TOKEN = os.environ.get("HF_TOKEN")
 os.environ["COQUI_TOS_AGREED"] = "1"
 api = HfApi(token=HF_TOKEN)
 repo_id = "artificialguybr/video-dubbing"
+
+# Extract FFmpeg
 ZipFile("ffmpeg.zip").extractall()
 st = os.stat('ffmpeg')
 os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
-#Whisper
+
+# Whisper model initialization
 model_size = "small"
-model = WhisperModel(model_size, device="cuda", compute_type="float16")
+model = WhisperModel(model_size, device="cpu", compute_type="int8")
 
 def check_for_faces(video_path):
     face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
     cap = cv2.VideoCapture(video_path)
-
+
     while True:
         ret, frame = cap.read()
        if not ret:
             break
-
+
         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
         faces = face_cascade.detectMultiScale(gray, 1.1, 4)
-
+
         if len(faces) > 0:
             return True
-
-    return False
 
+    return False
+
+@spaces.GPU
 def process_video(radio, video, target_language, has_closeup_face):
     if target_language is None:
         return gr.Error("Please select a Target Language for Dubbing.")
-
+
     run_uuid = uuid.uuid4().hex[:6]
     output_filename = f"{run_uuid}_resized_video.mp4"
-    ffmpeg.input(video).output(output_filename, vf='scale=-2:720').run()
-
-    video_path = output_filename
 
+    # Use FFmpeg via subprocess
+    subprocess.run(['ffmpeg', '-i', video, '-vf', 'scale=-2:720', output_filename])
+
+    video_path = output_filename
     if not os.path.exists(video_path):
         return f"Error: {video_path} does not exist."
-
-    # Move the duration check here
-    video_info = ffmpeg.probe(video_path)
-    video_duration = float(video_info['streams'][0]['duration'])
-
+
+    # Check video duration
+    video_info = subprocess.run(['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', video_path], capture_output=True, text=True)
+    video_duration = float(video_info.stdout)
+
     if video_duration > 60:
-        os.remove(video_path) # Delete the resized video
+        os.remove(video_path)
         return gr.Error("Video duration exceeds 1 minute. Please upload a shorter video.")
-
-    ffmpeg.input(video_path).output(f"{run_uuid}_output_audio.wav", acodec='pcm_s24le', ar=48000, map='a').run()
-
-    #y, sr = sf.read(f"{run_uuid}_output_audio.wav")
-    #y = y.astype(np.float32)
-    #y_denoised = wiener(y)
-    #sf.write(f"{run_uuid}_output_audio_denoised.wav", y_denoised, sr)
-
-    #sound = AudioSegment.from_file(f"{run_uuid}_output_audio_denoised.wav", format="wav")
-    #sound = sound.apply_gain(0)
-    #sound = sound.low_pass_filter(3000).high_pass_filter(100)
-    #sound.export(f"{run_uuid}_output_audio_processed.wav", format="wav")
-
-    shell_command = f"ffmpeg -y -i {run_uuid}_output_audio.wav -af lowpass=3000,highpass=100 {run_uuid}_output_audio_final.wav".split(" ")
-    subprocess.run([item for item in shell_command], capture_output=False, text=True, check=True)
+
+    # Extract audio
+    subprocess.run(['ffmpeg', '-i', video_path, '-acodec', 'pcm_s24le', '-ar', '48000', '-map', 'a', f"{run_uuid}_output_audio.wav"])
+
+    # Audio processing
+    subprocess.run(['ffmpeg', '-y', '-i', f"{run_uuid}_output_audio.wav", '-af', 'lowpass=3000,highpass=100', f"{run_uuid}_output_audio_final.wav"])
 
     print("Attempting to transcribe with Whisper...")
     try:
@@ -95,54 +83,36 @@ def process_video(radio, video, target_language, has_closeup_face):
     except RuntimeError as e:
         print(f"RuntimeError encountered: {str(e)}")
         if "CUDA failed with error device-side assert triggered" in str(e):
-            gr.Warning("Error. Space need to restart. Please retry in a minute")
-            # Restart the script
+            gr.Warning("Error. Space needs to restart. Please retry in a minute")
             api.restart_space(repo_id=repo_id)
-
+
     language_mapping = {'English': 'en', 'Spanish': 'es', 'French': 'fr', 'German': 'de', 'Italian': 'it', 'Portuguese': 'pt', 'Polish': 'pl', 'Turkish': 'tr', 'Russian': 'ru', 'Dutch': 'nl', 'Czech': 'cs', 'Arabic': 'ar', 'Chinese (Simplified)': 'zh-cn'}
     target_language_code = language_mapping[target_language]
     translator = Translator()
     translated_text = translator.translate(whisper_text, src=whisper_language, dest=target_language_code).text
     print(translated_text)
-
+
     tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
-    tts.to('cuda')
     tts.tts_to_file(translated_text, speaker_wav=f"{run_uuid}_output_audio_final.wav", file_path=f"{run_uuid}_output_synth.wav", language=target_language_code)
 
-    pad_top = 0
-    pad_bottom = 15
-    pad_left = 0
-    pad_right = 0
-    rescaleFactor = 1
-
-    video_path_fix = video_path
-
-    if has_closeup_face:
-        has_face = True
-    else:
-        has_face = check_for_faces(video_path)
-
+    has_face = check_for_faces(video_path) if not has_closeup_face else True
+
     if has_closeup_face:
         try:
-            cmd = f"python Wav2Lip/inference.py --checkpoint_path 'Wav2Lip/checkpoints/wav2lip_gan.pth' --face {shlex.quote(video_path)} --audio '{run_uuid}_output_synth.wav' --pads {pad_top} {pad_bottom} {pad_left} {pad_right} --resize_factor {rescaleFactor} --nosmooth --outfile '{run_uuid}_output_video.mp4'"
-            subprocess.run(cmd, shell=True, check=True)
+            subprocess.run(['python', 'Wav2Lip/inference.py', '--checkpoint_path', 'Wav2Lip/checkpoints/wav2lip_gan.pth', '--face', video_path, '--audio', f'{run_uuid}_output_synth.wav', '--pads', '0', '15', '0', '0', '--resize_factor', '1', '--nosmooth', '--outfile', f'{run_uuid}_output_video.mp4'], check=True)
         except subprocess.CalledProcessError as e:
             if "Face not detected! Ensure the video contains a face in all the frames." in str(e.stderr):
-                # Fallback to FFmpeg merge
                 gr.Warning("Wav2lip didn't detect a face. Please try again with the option disabled.")
-                cmd = f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4"
-                subprocess.run(cmd, shell=True)
-    else:
-        # Merge audio with the original video without running Wav2Lip
-        cmd = f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4"
-        subprocess.run(cmd, shell=True)
-
+                subprocess.run(['ffmpeg', '-i', video_path, '-i', f'{run_uuid}_output_synth.wav', '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental', '-map', '0:v:0', '-map', '1:a:0', f'{run_uuid}_output_video.mp4'])
+    else:
+        subprocess.run(['ffmpeg', '-i', video_path, '-i', f'{run_uuid}_output_synth.wav', '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental', '-map', '0:v:0', '-map', '1:a:0', f'{run_uuid}_output_video.mp4'])
+
     if not os.path.exists(f"{run_uuid}_output_video.mp4"):
         raise FileNotFoundError(f"Error: {run_uuid}_output_video.mp4 was not generated.")
-
+
     output_video_path = f"{run_uuid}_output_video.mp4"
-
-    # Cleanup: Delete all generated files except the final output video
+
+    # Cleanup
     files_to_delete = [
         f"{run_uuid}_resized_video.mp4",
         f"{run_uuid}_output_audio.wav",
@@ -154,16 +124,15 @@ def process_video(radio, video, target_language, has_closeup_face):
             os.remove(file)
         except FileNotFoundError:
            print(f"File {file} not found for deletion.")
-
-    return output_video_path
-
 
+    return output_video_path
+
 def swap(radio):
-    if(radio == "Upload"):
+    if radio == "Upload":
        return gr.update(source="upload")
    else:
        return gr.update(source="webcam")
-
+
 video = gr.Video()
 radio = gr.Radio(["Upload", "Record"], value="Upload", show_label=False)
 iface = gr.Interface(
@@ -173,9 +142,9 @@ iface = gr.Interface(
        video,
        gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)"], label="Target Language for Dubbing", value="Spanish"),
        gr.Checkbox(
-            label="Video has a close-up face. Use Wav2lip.",
-            value=False,
-            info="Say if video have close-up face. For Wav2lip. Will not work if checked wrongly.")
+            label="Video has a close-up face. Use Wav2lip.",
+            value=False,
+            info="Say if video have close-up face. For Wav2lip. Will not work if checked wrongly.")
    ],
    outputs=gr.Video(),
    live=False,
@@ -183,6 +152,7 @@ iface = gr.Interface(
    description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Thanks [@yeswondwer](https://twitter.com/@yeswondwerr) for original code. Test the [Video Transcription and Translate](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) space!""",
    allow_flagging=False
 )
+
 with gr.Blocks() as demo:
    iface.render()
    radio.change(swap, inputs=[radio], outputs=video)
@@ -196,5 +166,6 @@ with gr.Blocks() as demo:
    - If you need more than 1 minute, duplicate the Space and change the limit on app.py.
    - If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.
    """)
+
 demo.queue(concurrency_count=1, max_size=15)
 demo.launch()
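For context on the new `import spaces` / `@spaces.GPU` lines: on Hugging Face Spaces this decorator requests a GPU only while the decorated function runs, which is presumably why the module-level Whisper model now loads with device="cpu" and compute_type="int8". A minimal sketch of the pattern, assuming the `spaces` package is available in the Space; the function name and body are illustrative, not from this commit:

import spaces
import torch

@spaces.GPU  # GPU is attached only for the duration of each call (ZeroGPU)
def double_on_gpu(t: torch.Tensor) -> torch.Tensor:
    # Illustrative body: CUDA is available inside the decorated call.
    return (t.to("cuda") * 2).cpu()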
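The replacement for `ffmpeg.probe` shells out to ffprobe and parses its stdout. A standalone sketch of that duration probe, assuming `ffprobe` is on PATH; `get_duration` is a hypothetical helper name, and `check=True` is an addition over the committed call so a failed probe raises immediately instead of `float("")` failing later:

import subprocess

def get_duration(path: str) -> float:
    # Mirrors the commit's ffprobe invocation: print only the container
    # duration, with no section wrapper and no key= prefix.
    result = subprocess.run(
        ['ffprobe', '-v', 'error',
         '-show_entries', 'format=duration',
         '-of', 'default=noprint_wrappers=1:nokey=1',
         path],
        capture_output=True, text=True,
        check=True,  # not in the committed call; surfaces ffprobe errors
    )
    return float(result.stdout)

Called as get_duration(f"{run_uuid}_resized_video.mp4"), it returns the duration in seconds as a float, matching float(video_info.stdout) in the diff.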