sc45 committed
Commit f0ceee4
1 Parent(s): 2165f59

Initial Commit

Files changed (10)
  1. .DS_Store +0 -0
  2. UI.py +25 -0
  3. diarization.py +81 -0
  4. main.py +63 -0
  5. opus.py +63 -0
  6. requirements.txt +197 -0
  7. translated_video.py +77 -0
  8. tts.py +96 -0
  9. video_to_text.py +86 -0
  10. yt_download.py +53 -0
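
Taken together, these files implement an English-to-Spanish video dubbing pipeline: yt_download.py fetches the video with pytube, video_to_text.py extracts the audio and transcribes it with Whisper, opus.py translates the transcript with Helsinki-NLP/opus-mt-en-es, diarization.py separates speakers with pyannote, tts.py re-voices each line with XTTS v2, and translated_video.py muxes the new audio and subtitles back onto the video. As a rough usage sketch (assuming the packages in requirements.txt are installed and a Hugging Face token is configured for the gated pyannote model):

python main.py "<youtube-url>"   # end-to-end run; writes ./translated/final_video.mp4
python UI.py                     # or serve the same pipeline through the Gradio interface
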
.DS_Store ADDED
Binary file (6.15 kB).
 
UI.py ADDED
@@ -0,0 +1,25 @@
+ import gradio as gr
+ from main import main as process_video
+
+ def run_pipeline(youtube_url):
+     # Run the main processing function from your script
+     # This function should save the final video in the '/translated/' directory
+     process_video(youtube_url)
+
+     # Construct the path to the final video
+     # Assuming the video is named 'final_video.mp4' and stored in '/translated/'
+     final_video_path = './translated/final_video.mp4'
+
+     # Return the path for Gradio to display
+     return final_video_path
+
+ iface = gr.Interface(
+     fn=run_pipeline,
+     inputs=gr.Textbox(lines=2, placeholder="Enter YouTube Video URL here..."),
+     outputs=gr.Video(),
+     title="YouTube Video Processing",
+     description="Enter a YouTube URL to process the video through transcription, translation, and more."
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
diarization.py ADDED
@@ -0,0 +1,81 @@
+ from pyannote.audio import Pipeline
+ from pydub import AudioSegment
+ import os
+ import re
+ import torch
+
+ def perform_diarization(audio_file_path, translated_file_path, output_dir='./audio/diarization'):
+     # Initialize diarization pipeline
+     pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
+
+     # Send pipeline to GPU (when available)
+     pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+     # Load audio file
+     audio = AudioSegment.from_wav(audio_file_path)
+
+     # Apply pretrained pipeline
+     diarization = pipeline(audio_file_path)
+
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Process and save each speaker's audio segments
+     speaker_segments_audio = {}
+     for turn, _, speaker in diarization.itertracks(yield_label=True):
+         start_ms = int(turn.start * 1000)  # Convert to milliseconds
+         end_ms = int(turn.end * 1000)  # Convert to milliseconds
+         segment = audio[start_ms:end_ms]
+
+         if speaker in speaker_segments_audio:
+             speaker_segments_audio[speaker] += segment
+         else:
+             speaker_segments_audio[speaker] = segment
+
+     # Save audio segments
+     for speaker, segment in speaker_segments_audio.items():
+         output_path = os.path.join(output_dir, f"{speaker}.wav")
+         segment.export(output_path, format="wav")
+         print(f"Combined audio for speaker {speaker} saved in {output_path}")
+
+     # Load translated text
+     with open(translated_file_path, "r") as file:
+         translated_lines = file.readlines()
+
+     # Process and align translated text with diarization data
+     last_speaker = None
+     aligned_text = []
+     timestamp_pattern = re.compile(r'\[(\d+\.\d+)\-(\d+\.\d+)\]')
+     for line in translated_lines:
+         match = timestamp_pattern.match(line)
+
+         if match:
+             start_time = float(match.group(1))
+             end_time = float(match.group(2))
+             text = line[match.end():].strip()  # Extract text part
+
+             speaker_found = False
+             # Find corresponding speaker
+             for turn, _, speaker in diarization.itertracks(yield_label=True):
+                 speaker_start = turn.start
+                 speaker_end = turn.end
+                 # Check for overlap between speaker segment and line timestamp
+                 if max(speaker_start, start_time) < min(speaker_end, end_time):
+                     aligned_text.append(f"[{speaker}] [{start_time}-{end_time}] {text}")
+                     speaker_found = True
+                     last_speaker = speaker
+                     break
+
+             # If no speaker found, use the last speaker
+             if not speaker_found:
+                 if last_speaker is not None:
+                     aligned_text.append(f"[{last_speaker}] [{start_time}-{end_time}] {text}")
+                 else:
+                     aligned_text.append(f"[Unknown Speaker] [{start_time}-{end_time}] {text}")
+
+     # Save aligned text to a single file
+     aligned_text_output_path = os.path.join(output_dir, "aligned_text.txt")
+     with open(aligned_text_output_path, "w") as aligned_text_file:
+         aligned_text_file.write('\n'.join(aligned_text))
+     print(f"Aligned text saved in {aligned_text_output_path}")
main.py ADDED
@@ -0,0 +1,63 @@
+ import argparse
+ import os
+ from yt_download import download_video
+ from video_to_text import convert_video_to_text
+ from opus import translate_file
+ from diarization import perform_diarization
+ from tts import main as tts_main
+ from translated_video import create_translated_video
+
+ def get_transcription_filename(video_path):
+     base_name = os.path.splitext(os.path.basename(video_path))[0]
+     return f'./transcribed/{base_name}.txt'
+
+ def get_audio_filename(video_path):
+     base_name = os.path.splitext(os.path.basename(video_path))[0]
+     return f'./audio/{base_name}.wav'
+
+ def main(youtube_url):
+     # Ensure necessary directories exist
+     if not os.path.exists('./downloads'):
+         os.makedirs('./downloads')
+     if not os.path.exists('./audio'):
+         os.makedirs('./audio')
+     if not os.path.exists('./transcribed'):
+         os.makedirs('./transcribed')
+     if not os.path.exists('./translated'):
+         os.makedirs('./translated')
+
+     # Step 1: Download the video
+     downloaded_video_path = download_video(youtube_url)
+
+     # Step 2: Transcribe the video's audio
+     transcribed_text_path = get_transcription_filename(downloaded_video_path)
+     model_type = 'base'  # You can specify the Whisper model type
+     convert_video_to_text(downloaded_video_path, model_type)
+
+     # Step 3: Translate the transcribed text to Spanish
+     translated_text_path = './translated/translated_text.txt'
+     translate_file(transcribed_text_path, translated_text_path)
+
+     # Step 4: Perform diarization
+     audio_path = get_audio_filename(downloaded_video_path)
+     diarized_audio_dir = './audio/diarization'
+     perform_diarization(audio_path, translated_text_path, diarized_audio_dir)
+
+     # Step 5: Generate speech for translated text
+     speaker_directory = './audio/diarization'
+     aligned_text_file = './audio/diarization/aligned_text.txt'  # Ensure this is the correct path
+     output_audio_file = './translated/final_audio.wav'
+     tts_main(speaker_directory, aligned_text_file, output_audio_file)
+
+     # Step 6: Create the final translated video
+     final_video_path = create_translated_video(downloaded_video_path, output_audio_file, translated_text_path)
+
+     print(f"Final translated video created at {final_video_path}")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Process a YouTube video with multiple steps.")
+     parser.add_argument("youtube_url", help="YouTube video URL")
+     args = parser.parse_args()
+
+     main(args.youtube_url)
opus.py ADDED
@@ -0,0 +1,63 @@
+ from transformers import MarianMTModel, MarianTokenizer
+ from tqdm import tqdm
+ import os
+ import re
+ import argparse
+
+ # Load Model and Tokenizer
+ model_name = "Helsinki-NLP/opus-mt-en-es"
+ tokenizer = MarianTokenizer.from_pretrained(model_name)
+ model = MarianMTModel.from_pretrained(model_name)
+
+ # Extract & separate timestamp and text
+ def extract_timestamp_and_text(line):
+     match = re.match(r'\[(\d+\.\d+\-\d+\.\d+)\]\s+(.*)', line)
+     if match:
+         return match.group(1), match.group(2)
+     return '', line
+
+ # Translate text
+ def translate_text(text):
+     lines = text.split('\n')
+     translated_lines = []
+
+     for line in tqdm(lines, desc="Translating lines", leave=False):
+         if not line.strip():
+             translated_lines.append('')
+             continue
+
+         timestamp, line_text = extract_timestamp_and_text(line)
+
+         if line_text.strip():
+             model_inputs = tokenizer(line_text, return_tensors="pt", truncation=True, padding="longest")
+             translated = model.generate(**model_inputs)
+             translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
+             translated_line = f'[{timestamp}] {translated_text}'
+         else:
+             translated_line = f'[{timestamp}]'
+
+         translated_lines.append(translated_line)
+
+     return '\n'.join(translated_lines)
+
+ # Main function to translate a file
+ def translate_file(src_file_path, dst_file_path):
+     try:
+         with open(src_file_path, 'r') as file:
+             english_text = file.read()
+         spanish_text = translate_text(english_text)
+
+         with open(dst_file_path, 'w') as file:
+             file.write(spanish_text)
+         print(f"Translation completed: {dst_file_path}")
+
+     except Exception as e:
+         print(f"Error processing file: {e}")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Translate English text to Spanish")
+     parser.add_argument("src_file_path", help="Path to the source file with English text")
+     parser.add_argument("dst_file_path", help="Path to save the translated Spanish text")
+     args = parser.parse_args()
+
+     translate_file(args.src_file_path, args.dst_file_path)
requirements.txt ADDED
@@ -0,0 +1,197 @@
+ absl-py==2.0.0
+ aiohttp==3.9.0
+ aiosignal==1.3.1
+ alembic==1.12.1
+ annotated-types==0.6.0
+ antlr4-python3-runtime==4.9.3
+ anyascii==0.3.2
+ asteroid-filterbanks==0.4.0
+ attrs==23.1.0
+ audioread==3.0.1
+ Babel==2.13.1
+ bangla==0.0.2
+ blinker==1.7.0
+ blis==0.7.11
+ bnnumerizer==0.0.2
+ bnunicodenormalizer==0.1.6
+ Brotli @ file:///D:/bld/brotli-split_1695989908365/work
+ cachetools==5.3.2
+ catalogue==2.0.10
+ certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1700303426725/work/certifi
+ cffi==1.16.0
+ charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1698833585322/work
+ click==8.1.7
+ cloudpathlib==0.16.0
+ colorama==0.4.6
+ colorlog==6.7.0
+ confection==0.1.3
+ contourpy==1.2.0
+ coqpit==0.0.17
+ cycler==0.12.1
+ cymem==2.0.8
+ Cython==3.0.5
+ dateparser==1.1.8
+ decorator==4.4.2
+ docopt==0.6.2
+ einops==0.7.0
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
+ encodec==0.1.1
+ ffmpeg-python==0.2.0
+ filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1698714947081/work
+ fire==0.5.0
+ Flask==3.0.0
+ fonttools==4.45.0
+ frozenlist==1.4.0
+ fsspec==2023.10.0
+ future==0.18.3
+ g2pkk==0.1.2
+ google-auth==2.23.4
+ google-auth-oauthlib==1.1.0
+ greenlet==3.0.1
+ grpcio==1.59.3
+ gruut==2.2.3
+ gruut-ipa==0.13.0
+ gruut-lang-de==2.0.0
+ gruut-lang-en==2.0.0
+ gruut-lang-es==2.0.0
+ gruut-lang-fr==2.0.2
+ hangul-romanize==0.1.0
+ huggingface-hub==0.19.4
+ HyperPyYAML==1.2.2
+ idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1663625384323/work
+ imageio==2.33.0
+ imageio-ffmpeg==0.4.9
+ inflect==7.0.0
+ itsdangerous==2.1.2
+ jamo==0.4.1
+ jieba==0.42.1
+ Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1654302431367/work
+ joblib==1.3.2
+ jsonlines==1.2.0
+ julius==0.2.7
+ kiwisolver==1.4.5
+ langcodes==3.3.0
+ lazy_loader==0.3
+ librosa==0.10.1
+ lightning==2.1.2
+ lightning-utilities==0.10.0
+ llvmlite==0.41.1
+ Mako==1.3.0
+ Markdown==3.5.1
+ markdown-it-py==3.0.0
+ MarkupSafe @ file:///D:/bld/markupsafe_1695367436673/work
+ matplotlib==3.8.2
+ mdurl==0.1.2
+ more-itertools==10.1.0
+ moviepy==1.0.3
+ mpmath @ file:///home/conda/feedstock_root/build_artifacts/mpmath_1678228039184/work
+ msgpack==1.0.7
+ multidict==6.0.4
+ murmurhash==1.0.10
+ networkx==2.8.8
+ nltk==3.8.1
+ num2words==0.5.13
+ numba==0.58.1
+ numpy @ file:///D:/bld/numpy_1694920156760/work/dist/numpy-1.26.0-cp311-cp311-win_amd64.whl#sha256=52e1af97f7d84aafe72cc1aaae3e1c9d52dff69c7ffcc96e2f4f7799fdad7a0c
+ oauthlib==3.2.2
+ omegaconf==2.3.0
+ openai-whisper==20231117
+ opencv-python==4.8.1.78
+ optuna==3.4.0
+ packaging==23.2
+ pandas==1.5.3
+ Pillow @ file:///D:/bld/pillow_1697423754480/work
+ platformdirs==4.0.0
+ pooch==1.8.0
+ preshed==3.0.9
+ primePy==1.3
+ proglog==0.1.10
+ protobuf==4.23.4
+ psutil==5.9.6
+ pyannote.audio==3.1.0
+ pyannote.core==5.0.0
+ pyannote.database==5.0.1
+ pyannote.metrics==3.2.1
+ pyannote.pipeline==3.0.1
+ pyasn1==0.5.1
+ pyasn1-modules==0.3.0
+ pycparser==2.21
+ pydantic==2.5.2
+ pydantic_core==2.14.5
+ pydub==0.25.1
+ Pygments==2.17.2
+ pymp3==0.1.9
+ pynndescent==0.5.11
+ pyparsing==3.1.1
+ pypinyin==0.49.0
+ pysbd==0.3.4
+ PySocks @ file:///D:/bld/pysocks_1661604991356/work
+ PySoundFile==0.9.0.post1
+ python-crfsuite==0.9.9
+ python-dateutil==2.8.2
+ pytorch-lightning==2.1.2
+ pytorch-metric-learning==2.3.0
+ pytube==15.0.0
+ pytz==2023.3.post1
+ PyYAML @ file:///D:/bld/pyyaml_1695373635661/work
+ regex==2023.10.3
+ requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1684774241324/work
+ requests-oauthlib==1.3.1
+ rich==13.7.0
+ rsa==4.9
+ ruamel.yaml==0.18.5
+ ruamel.yaml.clib==0.2.8
+ sacremoses==0.1.1
+ safetensors==0.4.0
+ scikit-learn==1.3.2
+ scipy==1.11.4
+ semver==3.0.2
+ sentencepiece==0.1.99
+ shellingham==1.5.4
+ six==1.16.0
+ smart-open==6.4.0
+ sortedcontainers==2.4.0
+ soundfile==0.12.1
+ soxr==0.3.7
+ spacy==3.7.2
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ speechbrain==0.5.16
+ SQLAlchemy==2.0.23
+ srsly==2.4.8
+ srt==3.5.3
+ SudachiDict-core==20230927
+ SudachiPy==0.6.7
+ sympy @ file:///home/conda/feedstock_root/build_artifacts/sympy_1684180539862/work
+ tabulate==0.9.0
+ tensorboard==2.15.1
+ tensorboard-data-server==0.7.2
+ tensorboardX==2.6.2.2
+ termcolor==2.4.0
+ thinc==8.2.1
+ threadpoolctl==3.2.0
+ tiktoken==0.5.1
+ tokenizers==0.15.0
+ torch==2.1.1
+ torch-audiomentations==0.11.0
+ torch-pitch-shift==1.2.4
+ torch-time-stretch==1.0.3
+ torchaudio==2.1.1
+ torchmetrics==1.2.0
+ torchvision==0.16.1
+ tqdm==4.66.1
+ trainer==0.0.32
+ transformers==4.35.2
+ TTS==0.21.3
+ typer==0.9.0
+ typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1695040754690/work
+ tzdata==2023.3
+ tzlocal==5.2
+ umap-learn==0.5.5
+ Unidecode==1.3.7
+ urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1699933488691/work
+ wasabi==1.1.2
+ weasel==0.3.4
+ Werkzeug==3.0.1
+ win-inet-pton @ file:///D:/bld/win_inet_pton_1667051142467/work
+ yarl==1.9.3
translated_video.py ADDED
@@ -0,0 +1,77 @@
+ from moviepy.editor import VideoFileClip, AudioFileClip
+ from pydub import AudioSegment
+ import srt
+ import datetime
+ import ffmpeg
+ import os
+ import re
+
+ def create_translated_video(original_video_path, translated_audio_path, translated_text_path, output_dir='./translated'):
+     # Load original video
+     video = VideoFileClip(original_video_path)
+
+     # Load TTS audio
+     new_audio = AudioFileClip(translated_audio_path)
+     video = video.set_audio(new_audio)
+     audio_segment = AudioSegment.from_file(translated_audio_path, format="wav")
+
+     # Check if new audio is shorter to pad with silence
+     if new_audio.duration < video.duration:
+         silence_duration = (video.duration - new_audio.duration) * 1000  # convert to milliseconds
+         silence_segment = AudioSegment.silent(duration=silence_duration)
+         audio_segment = audio_segment + silence_segment
+         padded_audio_path = os.path.join(output_dir, 'padded_audio.wav')
+         audio_segment.export(padded_audio_path, format='wav')
+         new_audio = AudioFileClip(padded_audio_path)
+         # Attach the padded audio so the written video actually uses it
+         video = video.set_audio(new_audio)
+
+     # Parse the translated text into SRT subtitles
+     def parse_translated_text(file_path):
+         with open(file_path, 'r') as file:
+             content = file.readlines()
+
+         subtitles = []
+         timestamp_pattern = re.compile(r'\[(\d+\.\d+)\-(\d+\.\d+)\]')
+         for line in content:
+             match = timestamp_pattern.match(line)
+             if match:
+                 start_time = datetime.timedelta(seconds=float(match.group(1)))
+                 end_time = datetime.timedelta(seconds=float(match.group(2)))
+                 text = line[match.end():].strip()
+
+                 subtitle = srt.Subtitle(index=len(subtitles)+1,
+                                         start=start_time,
+                                         end=end_time,
+                                         content=text)
+                 subtitles.append(subtitle)
+
+         return srt.compose(subtitles)
+
+     # Generate SRT content
+     srt_content = parse_translated_text(translated_text_path)
+
+     # Write to an SRT file
+     srt_file = './translated/translated.srt'
+     with open(srt_file, 'w', encoding='utf-8') as file:
+         file.write(srt_content)
+
+     # Write the final video file
+     temp = "./translated/temp.mp4"
+     video.write_videofile(temp)
+
+     # Add subtitles
+     final_video_file = os.path.join(output_dir, "final_video.mp4")
+
+     # Subtitle filter string for ffmpeg
+     subtitle_filter_str = f"subtitles='{srt_file}'"
+
+     try:
+         ffmpeg.input(temp).output(final_video_file, vf=subtitle_filter_str).run()
+     except ffmpeg.Error as e:
+         print(f"Error creating final video: {e}")
+         return None
+
+     # Remove temp file
+     os.remove(temp)
+     return final_video_file
tts.py ADDED
@@ -0,0 +1,96 @@
+ from TTS.api import TTS
+ from pydub import AudioSegment
+ import os
+ import re
+ import ffmpeg
+ import shutil
+ import argparse
+ import torch
+
+ def adjust_speed(input_file, speed_factor):
+     output_file = input_file.replace(".wav", "_adjusted.wav")
+     ffmpeg.input(input_file).filter('atempo', speed_factor).output(output_file, acodec='pcm_s16le').run()
+     return output_file
+
+ def generate_speech(text, speaker_voice_map, output_file):
+     combined_audio = AudioSegment.empty()
+     temp_files = []
+
+     # Use the GPU when available, otherwise fall back to CPU
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+
+     for line in text.split("\n"):
+         if not line.strip():
+             continue
+
+         match = re.match(r"\[SPEAKER_(\d+)\] \[(\d+\.\d+)-(\d+\.\d+)\] (.+)", line)
+         if not match:
+             continue
+
+         speaker_id, start_time, end_time, sentence = match.groups()
+         start_time, end_time = float(start_time), float(end_time)
+         segment_duration = (end_time - start_time) * 1000  # Duration in milliseconds
+
+         speaker_wav = speaker_voice_map.get(f"SPEAKER_{speaker_id}")
+         if not speaker_wav:
+             continue
+
+         os.makedirs('./audio/temp', exist_ok=True)
+         temp_file_path = f"./audio/temp/temp_output_part_{len(temp_files)}.wav"
+         temp_files.append(temp_file_path)
+
+         tts_speed = 1.0
+         tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
+
+         segment_audio = AudioSegment.from_wav(temp_file_path)
+
+         if segment_audio.duration_seconds * 1000 > segment_duration:
+             while tts_speed < 2.0 and segment_audio.duration_seconds * 1000 > segment_duration:
+                 tts_speed += 0.5
+                 tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
+                 segment_audio = AudioSegment.from_wav(temp_file_path)
+
+         if segment_audio.duration_seconds * 1000 > segment_duration:
+             required_speed = segment_duration / (segment_audio.duration_seconds * 1000)
+             if required_speed < 1.0:
+                 required_speed = 1.0 / required_speed
+             temp_file_path = adjust_speed(temp_file_path, required_speed)
+             segment_audio = AudioSegment.from_wav(temp_file_path)
+
+         if combined_audio.duration_seconds == 0 and start_time > 0:
+             combined_audio = AudioSegment.silent(duration=start_time * 1000) + combined_audio
+
+         if segment_audio.duration_seconds * 1000 > segment_duration:
+             segment_audio = segment_audio[:segment_duration]
+         else:
+             segment_audio = segment_audio + AudioSegment.silent(duration=segment_duration - len(segment_audio))
+
+         combined_audio += segment_audio
+
+     combined_audio.export(output_file, format="wav")
+
+     for temp_file in temp_files:
+         os.remove(temp_file)
+
+ def map_speaker_ids(directory):
+     speaker_voice_map = {}
+     for file in os.listdir(directory):
+         if file.endswith(".wav"):
+             speaker_id = file.replace(".wav", "")
+             speaker_voice_map[speaker_id] = os.path.join(directory, file)
+     return speaker_voice_map
+
+ def main(speaker_directory, aligned_text_file, output_audio_file):
+     speaker_voice_map = map_speaker_ids(speaker_directory)
+     with open(aligned_text_file, 'r') as file:
+         translated_text = file.read()
+     generate_speech(translated_text, speaker_voice_map, output_audio_file)
+     if os.path.exists('./audio/temp'):
+         shutil.rmtree('./audio/temp')
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Generate speech from translated text")
+     parser.add_argument("speaker_directory", help="Directory containing speaker voice clips")
+     parser.add_argument("aligned_text_file", help="Path to the translated and aligned text file")
+     parser.add_argument("output_audio_file", help="Path to save the generated speech audio file")
+     args = parser.parse_args()
+
+     main(args.speaker_directory, args.aligned_text_file, args.output_audio_file)
video_to_text.py ADDED
@@ -0,0 +1,86 @@
+ import argparse
+ from moviepy.editor import VideoFileClip
+ import whisper
+ import os
+ import re
+
+ def extract_audio(video_path, audio_dir='./audio'):
+     os.makedirs(audio_dir, exist_ok=True)
+     base_filename = os.path.splitext(os.path.basename(video_path))[0]
+     audio_filename = os.path.join(audio_dir, base_filename + '.wav')
+     video_clip = VideoFileClip(video_path)
+     video_clip.audio.write_audiofile(audio_filename)
+     video_clip.close()
+     return audio_filename
+
+ def transcribe_audio(audio_path, model_type='base', transcribed_dir='./transcribed'):
+     model = whisper.load_model(model_type)
+     result = model.transcribe(audio_path)
+
+     os.makedirs(transcribed_dir, exist_ok=True)
+     base_filename = os.path.splitext(os.path.basename(audio_path))[0]
+     transcribed_filename = os.path.join(transcribed_dir, base_filename + '.txt')
+
+     with open(transcribed_filename, 'w') as file:
+         for segment in result['segments']:
+             start = segment['start']
+             end = segment['end']
+             text = segment['text']
+             file.write(f"[{start:.2f}-{end:.2f}] {text}\n")
+
+     return transcribed_filename, result['text']
+
+ def merge_lines(file_path):
+     timestamp_pattern = re.compile(r'\[(\d+\.\d+)-(\d+\.\d+)\]')
+
+     with open(file_path, 'r') as file:
+         lines = file.readlines()
+
+     merged_lines = []
+     i = 0
+
+     while i < len(lines):
+         line = lines[i].strip()
+         match = timestamp_pattern.match(line)
+
+         if match:
+             start_time = float(match.group(1))
+             end_time = float(match.group(2))
+             text = line[match.end():].strip()
+
+             # If the sentence looks unfinished, merge it with the next timestamped line
+             if not (text.endswith('.') or text.endswith('?')) and i + 1 < len(lines):
+                 next_line = lines[i + 1].strip()
+                 next_match = timestamp_pattern.match(next_line)
+
+                 if next_match:
+                     end_time = float(next_match.group(2))
+                     next_text = next_line[next_match.end():].strip()
+                     text = text + ' ' + next_text
+                     i += 1
+
+             merged_lines.append(f"[{start_time:.2f}-{end_time:.2f}] {text}\n")
+
+         i += 1
+
+     with open(file_path, 'w') as file:
+         file.writelines(merged_lines)
+
+     return file_path
+
+ def convert_video_to_text(video_file_path, model_type='base'):
+     audio_path = extract_audio(video_file_path)
+     transcribed_path, _ = transcribe_audio(audio_path, model_type)
+     merge_lines(transcribed_path)
+     return transcribed_path
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Transcribe audio from video")
+     parser.add_argument("video_file", help="Path to the video file")
+     parser.add_argument("--model", help="Size of the whisper model (e.g., tiny, base, small, medium, large).", default="base")
+     args = parser.parse_args()
+
+     convert_video_to_text(args.video_file, args.model)
yt_download.py ADDED
@@ -0,0 +1,53 @@
+ import argparse
+ from pytube import YouTube
+ from tqdm import tqdm
+ import os
+
+ def download_youtube_video(video_url, download_captions=False):
+     progress_bar = None
+
+     def progress_function(stream, chunk, bytes_remaining):
+         nonlocal progress_bar
+         if progress_bar is None:
+             progress_bar = tqdm(total=stream.filesize, unit='B', unit_scale=True, desc="Downloading Video")
+         current = stream.filesize - bytes_remaining
+         progress_bar.n = current
+         progress_bar.last_print_n = current
+         progress_bar.update()
+
+     if not os.path.exists('./downloads'):
+         os.makedirs('./downloads')
+
+     yt = YouTube(
+         video_url,
+         on_progress_callback=progress_function,
+     )
+
+     stream = yt.streams.get_highest_resolution()
+     video_path = stream.download(output_path='./downloads')
+     if progress_bar:
+         progress_bar.close()
+
+     if download_captions:
+         caption = yt.captions.get('en') or yt.captions.get('a.en')
+         if caption:
+             caption_convert_to_srt = caption.generate_srt_captions()
+             caption_convert_to_srt = caption_convert_to_srt.replace("\n\n", "\n")
+             with open(os.path.join('./downloads', f"{yt.title}.srt"), "w", encoding="utf-8") as file:
+                 file.write(caption_convert_to_srt)
+             print(f"Captions saved to 'downloads/{yt.title}.srt'")
+         else:
+             print("No English captions found for this video.")
+
+     return video_path
+
+ def download_video(url, download_captions=False):
+     # Download the video and return the path of the saved file
+     return download_youtube_video(url, download_captions)
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Download YouTube video and captions")
+     parser.add_argument("video_url", help="YouTube video URL")
+     parser.add_argument("--captions", action="store_true", help="Download captions if available")
+     args = parser.parse_args()
+
+     download_video(args.video_url, args.captions)