r3gm committed on
Commit
fc97911
·
1 Parent(s): f994bb7

Upload 4 files

Browse files
soni_translate/text_to_speech.py CHANGED
@@ -3,28 +3,31 @@ import edge_tts
3
  import asyncio
4
  import nest_asyncio
5
 
6
- def make_voice(tts_text, tts_voice, filename):
 
7
  try:
8
  nest_asyncio.apply()
9
  asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
10
- except 1:
11
- tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
12
- tts.save(filename)
13
- print('USE GTTS')
14
- except 2:
15
- tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
16
- tts.save(filename)
17
- print('REPLACE AUDIO GTTS')
 
18
 
19
- def make_voice_gradio(tts_text, tts_voice, filename):
20
  print(tts_text, filename)
21
  try:
22
  asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
23
- except 1:
24
- tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
25
- tts.save(filename)
26
- print('USE GTTS')
27
- except 2:
28
- tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
29
- tts.save(filename)
30
- print('REPLACE AUDIO GTTS')
 
 
3
  import asyncio
4
  import nest_asyncio
5
 
6
def make_voice(tts_text, tts_voice, filename, language):
    """Synthesize ``tts_text`` into ``filename``, preferring edge-tts over gTTS.

    Fallback chain:
      1. edge-tts with the requested voice,
      2. gTTS speaking ``tts_text`` in ``language``,
      3. gTTS speaking the placeholder ``'a'`` so that a file always exists.
    If step 3 fails as well, the gTTS exception propagates to the caller.

    Args:
        tts_text: Text to speak.
        tts_voice: Voice label. Everything before its last ``-`` is passed to
            edge-tts as the voice name; the trailing component is dropped.
        filename: Path of the audio file to write.
        language: Language code handed to gTTS when edge-tts fails.
    """
    try:
        # nest_asyncio patches asyncio so asyncio.run() can be used even when
        # an event loop is already running (see the nest_asyncio docs).
        nest_asyncio.apply()
        # The voice name is computed inside the try so that a malformed
        # tts_voice (e.g. None) also triggers the gTTS fallback.
        edge_voice = "-".join(tts_voice.split('-')[:-1])
        asyncio.run(edge_tts.Communicate(tts_text, edge_voice).save(filename))
    except Exception as edge_error:
        # ``except Exception`` (not a bare except) lets Ctrl-C / SystemExit
        # propagate; the reason is printed instead of being discarded.
        print(f'edge-tts failed: {edge_error}')
        try:
            tts = gTTS(tts_text, lang=language)
            tts.save(filename)
            print(f'No audio was received. Please change the tts voice for {tts_voice}. USING gTTS.')
        except Exception as gtts_error:
            print(f'gTTS failed: {gtts_error}')
            # Last resort: write a placeholder clip so later steps find a file.
            tts = gTTS('a', lang=language)
            tts.save(filename)
            print('Error: Audio will be replaced.')
20
 
21
def make_voice_gradio(tts_text, tts_voice, filename, language):
    """Synthesize ``tts_text`` into ``filename`` for the Gradio entry point.

    Same fallback chain as the non-Gradio variant, but without calling
    ``nest_asyncio.apply()``:
      1. edge-tts with the requested voice,
      2. gTTS speaking ``tts_text`` in ``language``,
      3. gTTS speaking the placeholder ``'a'`` so that a file always exists.
    If step 3 fails as well, the gTTS exception propagates to the caller.

    Args:
        tts_text: Text to speak.
        tts_voice: Voice label. Everything before its last ``-`` is passed to
            edge-tts as the voice name; the trailing component is dropped.
        filename: Path of the audio file to write.
        language: Language code handed to gTTS when edge-tts fails.
    """
    print(tts_text, filename)
    try:
        # The voice name is computed inside the try so that a malformed
        # tts_voice (e.g. None) also triggers the gTTS fallback.
        edge_voice = "-".join(tts_voice.split('-')[:-1])
        asyncio.run(edge_tts.Communicate(tts_text, edge_voice).save(filename))
    except Exception as edge_error:
        # ``except Exception`` (not a bare except) lets Ctrl-C / SystemExit
        # propagate; the reason is printed instead of being discarded.
        print(f'edge-tts failed: {edge_error}')
        try:
            tts = gTTS(tts_text, lang=language)
            tts.save(filename)
            print(f'No audio was received. Please change the tts voice for {tts_voice}. USING gTTS.')
        except Exception as gtts_error:
            print(f'gTTS failed: {gtts_error}')
            # Last resort: write a placeholder clip so later steps find a file.
            tts = gTTS('a', lang=language)
            tts.save(filename)
            print('Error: Audio will be replaced.')
soni_translate/translate_segments.py CHANGED
@@ -2,9 +2,15 @@ from tqdm import tqdm
2
  from deep_translator import GoogleTranslator
3
 
4
  def translate_text(segments, TRANSLATE_AUDIO_TO):
 
 
 
 
 
 
5
  for line in tqdm(range(len(segments))):
6
  text = segments[line]['text']
7
- translator = GoogleTranslator(source='auto', target=TRANSLATE_AUDIO_TO)
8
  translated_line = translator.translate(text.strip())
9
  segments[line]['text'] = translated_line
 
10
  return segments
 
2
  from deep_translator import GoogleTranslator
3
 
4
  def translate_text(segments, TRANSLATE_AUDIO_TO):
5
+
6
+ if TRANSLATE_AUDIO_TO == "zh":
7
+ TRANSLATE_AUDIO_TO = "zh-CN"
8
+
9
+ translator = GoogleTranslator(source='auto', target=TRANSLATE_AUDIO_TO)
10
+
11
  for line in tqdm(range(len(segments))):
12
  text = segments[line]['text']
 
13
  translated_line = translator.translate(text.strip())
14
  segments[line]['text'] = translated_line
15
+
16
  return segments
soni_translate/video_dubbing.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import gradio as gr
3
+ import whisperx
4
+ import torch
5
+ from gtts import gTTS
6
+ import librosa
7
+ import edge_tts
8
+ import gc
9
+ from pydub import AudioSegment
10
+ from tqdm import tqdm
11
+ from deep_translator import GoogleTranslator
12
+ import os
13
+ from soni_translate.audio_segments import create_translated_audio
14
+ from soni_translate.text_to_speech import make_voice
15
+ from soni_translate.translate_segments import translate_text
16
+ import time
17
+
18
def _run_command(args):
    """Run an external program from an argv list (no shell) and return its exit status."""
    import subprocess  # local import keeps this block self-contained

    try:
        return subprocess.run(args).returncode
    except OSError as error:
        # A missing executable used to be a non-fatal shell error; keep the
        # pipeline best-effort instead of raising here.
        print(f'Could not run {args[0]}: {error}')
        return 127


def _remove_if_exists(path):
    """Delete ``path`` if it is an existing regular file."""
    if os.path.isfile(path):
        os.remove(path)


def _tempo_ratio(duration_tts, duration_true):
    """Return the ffmpeg ``atempo`` factor that fits a TTS clip into its original slot."""
    if duration_true <= 0:
        # Zero-length slot: the ratio would be infinite, so use the upper clamp.
        return 2.1
    ratio = duration_tts / duration_true
    if ratio > 2.1:
        ratio = 2.1
    elif 0.8 <= ratio <= 1.2:
        # Close enough to the original length: leave the speed untouched.
        ratio = 1.0
    elif ratio < 0.8:
        ratio = 0.8
    return round(ratio, 1)


def _gtts_to_file(text, language, filename):
    """Write gTTS speech for ``text`` to ``filename``, or a placeholder clip if that fails."""
    try:
        gTTS(text, lang=language).save(filename)
        print('Using GTTS')
    except Exception as error:
        print(f'gTTS failed: {error}')
        gTTS('a', lang=language).save(filename)
        print('Error: Audio will be replaced.')


def _fetch_media(video, preview, video_file, audio_file):
    """Produce ``video_file`` and ``audio_file`` from a local path or a URL; return False on failure."""
    extract_audio = [
        "ffmpeg", "-y", "-i", video_file, "-vn",
        "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2", audio_file,
    ]

    if os.path.exists(video):
        transcode = ["ffmpeg", "-y", "-i", video]
        if preview:
            print('Creating preview video, 10 seconds')
            transcode += ["-ss", "00:00:20", "-t", "00:00:10"]
        transcode += ["-c:v", "libx264", "-c:a", "aac", "-strict", "experimental", video_file]
        _run_command(transcode)
        _run_command(extract_audio)
        return True

    # Not a local file: treat ``video`` as a link for yt-dlp. It is passed as
    # a single argv element after ``--`` so that characters such as ``&`` or
    # a leading ``-`` cannot alter the command.
    yt_common = [
        "--force-overwrites", "--max-downloads", "1", "--no-warnings",
        "--no-abort-on-error", "--ignore-no-formats-error",
    ]
    download_video = ["yt-dlp", "-f", "mp4"]
    if preview:
        # https://github.com/yt-dlp/yt-dlp/issues/2220
        download_video += [
            "--downloader", "ffmpeg",
            "--downloader-args", "ffmpeg_i: -ss 00:00:20 -t 00:00:10",
        ]
    download_video += yt_common + ["--restrict-filenames", "-o", video_file, "--", video]

    if preview:
        print('Creating preview from link, 10 seconds')
        _run_command(download_video)
        _run_command(extract_audio)
        return True

    download_audio = (
        ["python", "-m", "yt_dlp", "--output", audio_file]
        + yt_common
        + ["--extract-audio", "--audio-format", "wav", "--", video]
    )
    _run_command(download_audio)

    # Wait up to ~120 s for the wav to exist with no leftover audio.webm
    # before fetching the video stream.
    for _ in range(120):
        time.sleep(1)
        print('process audio')
        if os.path.exists(audio_file) and not os.path.exists('audio.webm'):
            time.sleep(1)
            _run_command(download_video)
            return True

    print('Error donwloading the audio')
    return False


def translate_from_video(
    video,
    YOUR_HF_TOKEN,
    preview=False,
    WHISPER_MODEL_SIZE="large-v1",
    batch_size=16,
    compute_type="float16",
    SOURCE_LANGUAGE= "Automatic detection",
    TRANSLATE_AUDIO_TO="en",
    min_speakers=1,
    max_speakers=2,
    tts_voice00="en-AU-WilliamNeural-Male",
    tts_voice01="en-CA-ClaraNeural-Female",
    tts_voice02="en-GB-ThomasNeural-Male",
    tts_voice03="en-GB-SoniaNeural-Female",
    tts_voice04="en-NZ-MitchellNeural-Male",
    tts_voice05="en-GB-MaisieNeural-Female",
    video_output="video_dub.mp4"
    ):
    """Dub a video into another language and return the path of the result.

    Pipeline: fetch/transcode the video, transcribe with whisperx, align,
    diarize, translate each segment, synthesize speech per segment, fit each
    clip to its original time slot with ffmpeg ``atempo``, assemble the dub,
    mix it over the original audio and mux it with the video.

    Working files (``Video.mp4``, ``audio.wav``, ``audio_dub_solo.ogg``,
    ``audio_mix.mp3``) and directories (``audio/``, ``audio2/audio/``) are
    created relative to the current working directory.

    Args:
        video: Path of an existing local file; anything else is handed to
            yt-dlp as a link.
        YOUR_HF_TOKEN: Token passed to ``whisperx.DiarizationPipeline``. When
            empty or None, the ``YOUR_HF_TOKEN`` environment variable is used.
        preview: If true, process only a 10-second clip starting at 00:00:20.
        WHISPER_MODEL_SIZE: Model name given to ``whisperx.load_model``.
        batch_size: Batch size for transcription.
        compute_type: Compute type for whisperx; forced to ``"float32"`` when
            CUDA is not available.
        SOURCE_LANGUAGE: Language given to whisperx, or
            ``"Automatic detection"`` to pass ``None``.
        TRANSLATE_AUDIO_TO: Target language code for translation and gTTS.
        min_speakers, max_speakers: Bounds passed to the diarization model.
        tts_voice00 .. tts_voice05: Voices for ``SPEAKER_00`` .. ``SPEAKER_05``.
            The string ``'None'`` disables a voice; such speakers, and
            speakers without a mapping, are synthesized with gTTS.
        video_output: Path of the dubbed video to write.

    Returns:
        ``video_output``, or ``None`` when the audio download never completes
        or no speech segments are found.
    """
    if not YOUR_HF_TOKEN:
        YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")

    os.makedirs('audio', exist_ok=True)
    os.makedirs('audio2/audio', exist_ok=True)

    # Check GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float32" if device == "cpu" else compute_type

    OutputFile = 'Video.mp4'
    audio_wav = "audio.wav"
    Output_name_file = "audio_dub_solo.ogg"
    mix_audio = "audio_mix.mp3"

    # Start from a clean slate so stale files are never mistaken for this
    # run's downloads.
    for stale in ("Video.mp4", "audio.webm", "audio.wav"):
        _remove_if_exists(stale)

    if not _fetch_media(video, preview, OutputFile, audio_wav):
        return

    print("Set file complete.")

    SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE

    # 1. Transcribe with original whisper (batched)
    model = whisperx.load_model(
        WHISPER_MODEL_SIZE,
        device,
        compute_type=compute_type,
        language= SOURCE_LANGUAGE,
        )
    audio = whisperx.load_audio(audio_wav)
    result = model.transcribe(audio, batch_size=batch_size)
    # Drop the reference first; collecting before ``del`` frees nothing.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    print("Transcript complete")

    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"],
        device=device
        )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=True,
        )
    del model_a
    gc.collect()
    torch.cuda.empty_cache()
    print("Align complete")

    if not result['segments']:
        print('No active speech found in audio')
        return

    # 3. Assign speaker labels
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
    diarize_segments = diarize_model(
        audio_wav,
        min_speakers=min_speakers,
        max_speakers=max_speakers)
    result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
    del diarize_model
    gc.collect()
    torch.cuda.empty_cache()
    print("Diarize complete")

    result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
    print("Translation complete")

    audio_files = []

    # Mapping speakers to voice variables
    speaker_to_voice = {
        'SPEAKER_00': tts_voice00,
        'SPEAKER_01': tts_voice01,
        'SPEAKER_02': tts_voice02,
        'SPEAKER_03': tts_voice03,
        'SPEAKER_04': tts_voice04,
        'SPEAKER_05': tts_voice05
        }

    for segment in tqdm(result_diarize['segments']):

        text = segment['text']
        start = segment['start']
        end = segment['end']

        if 'speaker' not in segment:
            # The label is written back into the segment, so later consumers
            # of result_diarize see it too.
            segment['speaker'] = "SPEAKER_99"
            print("NO SPEAKER DETECT IN SEGMENT")
        speaker = segment['speaker']

        # make the tts audio
        filename = f"audio/{start}.ogg"

        voice = speaker_to_voice.get(speaker, 'None')
        if voice != 'None':
            make_voice(text, voice, filename, TRANSLATE_AUDIO_TO)
        else:
            # Covers SPEAKER_99, speakers beyond the six mapped voices and
            # voices set to 'None'; previously the last two wrote no file and
            # the duration probe below crashed.
            _gtts_to_file(text, TRANSLATE_AUDIO_TO, filename)

        # Speed factor that fits the clip into the original segment length
        duration_true = end - start
        duration_tts = librosa.get_duration(filename=filename)
        porcentaje = _tempo_ratio(duration_tts, duration_true)

        # apply acceleration or the opposite to the audio file in audio2 folder
        status = _run_command([
            "ffmpeg", "-y", "-loglevel", "panic", "-i", filename,
            "-filter:a", f"atempo={porcentaje}", f"audio2/{filename}",
            ])
        if status != 0:
            print(f'Could not adjust tempo of {filename}; keeping the original clip.')

        audio_files.append(filename)

    # replace files with the accelerated ones
    adjusted_dir = 'audio2/audio'
    for name in os.listdir(adjusted_dir):
        if name.endswith('.ogg'):
            os.replace(os.path.join(adjusted_dir, name), os.path.join('audio', name))

    _remove_if_exists(Output_name_file)
    create_translated_audio(result_diarize, audio_files, Output_name_file)

    # Mix the dub over the original track; the original is sidechain-compressed
    # by the dub before both are merged.
    _remove_if_exists(mix_audio)
    _run_command([
        "ffmpeg", "-i", audio_wav, "-i", Output_name_file,
        "-filter_complex",
        "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]",
        "-map", "[final]", mix_audio,
        ])

    _remove_if_exists(video_output)
    _run_command([
        "ffmpeg", "-i", OutputFile, "-i", mix_audio,
        "-c:v", "copy", "-c:a", "copy",
        "-map", "0:v", "-map", "1:a", "-shortest", video_output,
        ])

    return video_output