juancopi81 committed on
Commit
6249bc9
1 Parent(s): 73a3627

Create YouTube Illustrated Summary

Files changed (9)
  1. .gitignore +132 -0
  2. README.md +4 -4
  3. app.py +241 -87
  4. requirements.txt +10 -2
  5. textprocessor.py +56 -0
  6. transcriber.py +12 -0
  7. utils.py +6 -0
  8. videocreator.py +46 -0
  9. youtubeaudioextractor.py +25 -0
.gitignore ADDED
@@ -0,0 +1,132 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # vscode
+ .vscode/
README.md CHANGED
@@ -1,15 +1,15 @@
  ---
- title: Whisper Demo
- emoji: 🇪🇸
  colorFrom: indigo
  colorTo: red
  sdk: gradio
- sdk_version: 3.9.1
  app_file: app.py
  pinned: false
  tags:
  - whisper-event
- duplicated_from: whisper-event/whisper-demo
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

  ---
+ title: YouTube to Illustrated Summary
+ emoji: 🏢
  colorFrom: indigo
  colorTo: red
  sdk: gradio
+ sdk_version: 3.14.0
  app_file: app.py
  pinned: false
+ license: openrail
  tags:
  - whisper-event
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,97 +1,251 @@
- import torch
-
  import gradio as gr
- import pytube as pt

  from transformers import pipeline
- from huggingface_hub import model_info

- MODEL_NAME = "juancopi81/whisper-medium-es" #this always needs to stay in line 8 :D sorry for the hackiness
  lang = "es"

- device = 0 if torch.cuda.is_available() else "cpu"
- pipe = pipeline(
      task="automatic-speech-recognition",
-     model=MODEL_NAME,
      chunk_length_s=30,
      device=device,
  )
-
- pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
-
- def transcribe(microphone, file_upload):
-     warn_output = ""
-     if (microphone is not None) and (file_upload is not None):
-         warn_output = (
-             "WARNING: You've uploaded an audio file and used the microphone. "
-             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
-         )
-
-     elif (microphone is None) and (file_upload is None):
-         return "ERROR: You have to either use the microphone or upload an audio file"
-
-     file = microphone if microphone is not None else file_upload
-
-     text = pipe(file)["text"]
-
-     return warn_output + text
-
-
- def _return_yt_html_embed(yt_url):
-     video_id = yt_url.split("?v=")[-1]
-     HTML_str = (
-         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-         " </center>"
-     )
-     return HTML_str
-
-
- def yt_transcribe(yt_url):
-     yt = pt.YouTube(yt_url)
-     html_embed_str = _return_yt_html_embed(yt_url)
-     stream = yt.streams.filter(only_audio=True)[0]
-     stream.download(filename="audio.mp3")
-
-     text = pipe("audio.mp3")["text"]
-
-     return html_embed_str, text
-
-
- demo = gr.Blocks()
-
- mf_transcribe = gr.Interface(
-     fn=transcribe,
-     inputs=[
-         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
-         gr.inputs.Audio(source="upload", type="filepath", optional=True),
-     ],
-     outputs="text",
-     layout="horizontal",
-     theme="huggingface",
-     title="Whisper Demo: Transcribe Audio",
-     description=(
-         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned"
-         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-         " of arbitrary length."
-     ),
-     allow_flagging="never",
- )
-
- yt_transcribe = gr.Interface(
-     fn=yt_transcribe,
-     inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
-     outputs=["html", "text"],
-     layout="horizontal",
-     theme="huggingface",
-     title="Whisper Demo: Transcribe YouTube",
-     description=(
-         "Transcribe long-form YouTube videos with the click of a button! Demo uses the fine-tuned checkpoint:"
-         f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
-         " arbitrary length."
-     ),
-     allow_flagging="never",
- )
-
- with demo:
-     gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
-
- demo.launch(enable_queue=True)
  import gradio as gr
+ from typing import Any
+
+ import torch
  from transformers import pipeline
+ from diffusers import StableDiffusionPipeline
+ from TTS.api import TTS

+ import utils
+ from youtubeaudioextractor import PytubeAudioExtractor
+ from transcriber import Transcriber
+ from textprocessor import TextProcessor
+ from videocreator import VideoCreator
+
+ TRANSCRIBER_MODEL_NAME = "juancopi81/whisper-medium-es"
  lang = "es"

+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = torch.float16 if device == "cuda" else torch.float32
+
+ # Detect if code is running in Colab
+ is_colab = utils.is_google_colab()
+ colab_instruction = "" if is_colab else """
+ <p>You can skip the queue using Colab:
+ <a href="">
+ <img data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" src="https://colab.research.google.com/assets/colab-badge.svg"></a></p>"""
+ device_print = "GPU 🔥" if torch.cuda.is_available() else "CPU 🥶"
+
+ # Initialize components
+ audio_extractor = PytubeAudioExtractor()
+ transcription_pipe = pipeline(
      task="automatic-speech-recognition",
+     model=TRANSCRIBER_MODEL_NAME,
      chunk_length_s=30,
      device=device,
  )
+ transcription_pipe.model.config.forced_decoder_ids = transcription_pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+ audio_transcriber = Transcriber(transcription_pipe)
+ openai_model = "text-davinci-003"
+ text_processor = TextProcessor(openai_model)
+
+ image_model_id = "runwayml/stable-diffusion-v1-5"
+ image_pipeline = StableDiffusionPipeline.from_pretrained(image_model_id,
+                                                          torch_dtype=dtype,
+                                                          revision="fp16")
+
+ image_pipeline = image_pipeline.to(device)
+
+ # Index-based pick from Coqui's released model list (fragile if the list changes)
+ vo_model_name = TTS.list_models()[22]
+ # Init TTS
+ tts = TTS(vo_model_name)
+ video_creator = VideoCreator(tts, image_pipeline)
+
+ def datapipeline(url: str) -> Any:
+     audio_path_file = audio_extractor.extract(url)
+     print(f"Audio file created at: {audio_path_file}")
+     transcribed_text = audio_transcriber.transcribe(audio_path_file)
+     print("Audio transcription ready!")
+     json_scenes = text_processor.get_json_scenes(transcribed_text)
+     print("Scenes ready")
+     video = video_creator.create_video(json_scenes)
+     return video, video
+
+ css = """
+ a {
+     color: inherit;
+     text-decoration: underline;
+ }
+ .gradio-container {
+     font-family: 'IBM Plex Sans', sans-serif;
+ }
+ .gr-button {
+     color: white;
+     border-color: #000000;
+     background: #000000;
+ }
+ input[type='range'] {
+     accent-color: #000000;
+ }
+ .dark input[type='range'] {
+     accent-color: #dfdfdf;
+ }
+ .container {
+     max-width: 730px;
+     margin: auto;
+     padding-top: 1.5rem;
+ }
+ #gallery {
+     min-height: 22rem;
+     margin-bottom: 15px;
+     margin-left: auto;
+     margin-right: auto;
+     border-bottom-right-radius: .5rem !important;
+     border-bottom-left-radius: .5rem !important;
+ }
+ #gallery>div>.h-full {
+     min-height: 20rem;
+ }
+ .details:hover {
+     text-decoration: underline;
+ }
+ .gr-button {
+     white-space: nowrap;
+ }
+ .gr-button:focus {
+     border-color: rgb(147 197 253 / var(--tw-border-opacity));
+     outline: none;
+     box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
+     --tw-border-opacity: 1;
+     --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
+     --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
+     --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
+     --tw-ring-opacity: .5;
+ }
+ #advanced-btn {
+     font-size: .7rem !important;
+     line-height: 19px;
+     margin-top: 12px;
+     margin-bottom: 12px;
+     padding: 2px 8px;
+     border-radius: 14px !important;
+ }
+ #advanced-options {
+     margin-bottom: 20px;
+ }
+ .footer {
+     margin-bottom: 45px;
+     margin-top: 35px;
+     text-align: center;
+     border-bottom: 1px solid #e5e5e5;
+ }
+ .footer>p {
+     font-size: .8rem;
+     display: inline-block;
+     padding: 0 10px;
+     transform: translateY(10px);
+     background: white;
+ }
+ .dark .footer {
+     border-color: #303030;
+ }
+ .dark .footer>p {
+     background: #0b0f19;
+ }
+ .acknowledgments h4 {
+     margin: 1.25em 0 .25em 0;
+     font-weight: bold;
+     font-size: 115%;
+ }
+ #container-advanced-btns {
+     display: flex;
+     flex-wrap: wrap;
+     justify-content: space-between;
+     align-items: center;
+ }
+ .animate-spin {
+     animation: spin 1s linear infinite;
+ }
+ @keyframes spin {
+     from {
+         transform: rotate(0deg);
+     }
+     to {
+         transform: rotate(360deg);
+     }
+ }
+ #share-btn-container {
+     display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
+ }
+ #share-btn {
+     all: initial; color: #ffffff; font-weight: 600; cursor: pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
+ }
+ #share-btn * {
+     all: unset;
+ }
+ .gr-form {
+     flex: 1 1 50%; border-top-right-radius: 0; border-bottom-right-radius: 0;
+ }
+ #prompt-container {
+     gap: 0;
+ }
+ #generated_id {
+     min-height: 700px;
+ }
+ """
+ block = gr.Blocks(css=css)
+
+ with block as demo:
+     gr.HTML(
+         f"""
+             <div style="text-align: center; max-width: 650px; margin: 0 auto;">
+               <div
+                 style="
+                   display: inline-flex;
+                   align-items: center;
+                   gap: 0.8rem;
+                   font-size: 1.75rem;
+                 "
+               >
+                 <h1 style="font-weight: 900; margin-bottom: 7px;">
+                   YouTube to Illustrated Summary
+                 </h1>
+               </div>
+               <p style="margin-bottom: 10px; font-size: 94%">
+                 Enter the URL of a YouTube video (in Spanish) and you'll receive a video with an illustrated summary.
+                 It works for audio books, history lessons, etc. Try it out with a short video (less than 10 minutes).
+               </p>
+               <p style="margin-bottom: 10px; font-size: 94%">
+                 Running on <b>{device_print}</b>
+               </p>
+             </div>
+         """
+     )
+     with gr.Group():
+         with gr.Box():
+             with gr.Row().style(mobile_collapse=False, equal_height=True):
+                 url = gr.Textbox(
+                     label="Enter the URL of the YouTube video", show_label=False, max_lines=1
+                 ).style(
+                     border=(True, False, True, True),
+                     rounded=(True, False, False, True),
+                     container=False,
+                 )
+                 btn = gr.Button("Run").style(
+                     margin=False,
+                     rounded=(False, True, True, False),
+                 )
+         video_output = gr.Video()
+         file_output = gr.File()
+
+     btn.click(datapipeline,
+               inputs=[url],
+               outputs=[video_output, file_output])
+
+     gr.HTML(
+         """
+         <div class="footer">
+             <p>This demo is part of the Whisper Sprint (Dec. 2022).</p>
+         </div>
+         """
+     )
+     gr.Markdown('''
+     [![Twitter Follow](https://img.shields.io/twitter/follow/juancopi81?style=social)](https://twitter.com/juancopi81)
+     ![visitors](https://visitor-badge.glitch.me/badge?page_id=Juancopi81.yt-illustraded-summary)
+     ''')
+
+ if not is_colab:
+     demo.queue(concurrency_count=1)
+ demo.launch(debug=is_colab, share=is_colab)
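
A minimal smoke test for the wiring above (a sketch only: the video URL is a placeholder, and it assumes `SECRET_KEY_OPENAI` is set in the environment):

```python
# Sketch: run the full pipeline on a short Spanish video.
# "VIDEO_ID" is a placeholder, not a real video.
video_path, file_path = datapipeline("https://www.youtube.com/watch?v=VIDEO_ID")
print(video_path)  # path of the rendered summary video, e.g. "final_video.mp4"
```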
requirements.txt CHANGED
@@ -1,3 +1,11 @@
- git+https://github.com/huggingface/transformers
  torch
- pytube

  torch
+ pytube
+ git+https://github.com/huggingface/transformers
+ openai
+ typing
+ git+https://github.com/huggingface/diffusers.git
+ accelerate
+ TTS
+ moviepy
+ imageio==2.4.1
+ tensorboard
textprocessor.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ import json
+ from typing import Dict
+
+ import openai
+
+ context_prompt = """
+
+ You are a creator of illustrated books building a series of scenes for your book.
+ Your boss asked you to write a summary and the illustrations of the following text:
+
+ $TRANSCRIPTION
+
+ You have to write the summary using a maximum of 7 scenes, and using the following JSON format:
+
+ Write your answer in JSON format that has: the number of the scene, the summary for each scene, and the illustration for each scene.
+ The value for "Summary" should be in Spanish and should not be longer than 30 words.
+ The value for "Illustration" should be in English and no longer than 20 words. It should be a detailed description of an illustration for this scene, with an artistic style for the illustration that matches the text.
+
+ Just answer with the JSON object, so your boss can easily parse it.
+
+ """
+
+ openai.api_key = os.getenv("SECRET_KEY_OPENAI")
+
+ class TextProcessor:
+     def __init__(self,
+                  model: str = "text-davinci-003",
+                  temperature: float = 0.7,
+                  max_tokens: int = 2500,
+                  top_p: int = 1,
+                  frequency_penalty: int = 0,
+                  presence_penalty: int = 0) -> None:
+         self.model = model
+         self.temperature = temperature
+         self.max_tokens = max_tokens
+         self.top_p = top_p
+         self.frequency_penalty = frequency_penalty
+         self.presence_penalty = presence_penalty
+
+     def get_json_scenes(self, prompt: str) -> Dict:
+         gpt_prompt = context_prompt.replace("$TRANSCRIPTION", prompt)
+         response = openai.Completion.create(
+             model=self.model,
+             prompt=gpt_prompt,
+             temperature=self.temperature,
+             max_tokens=self.max_tokens,
+             top_p=self.top_p,
+             frequency_penalty=self.frequency_penalty,
+             presence_penalty=self.presence_penalty
+         )
+         scenes = json.loads(response["choices"][0]["text"])
+         # The model sometimes returns a JSON array; normalize it to a dict keyed by index.
+         if isinstance(scenes, list):
+             scenes = {i: d for i, d in enumerate(scenes)}
+         return scenes
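
`VideoCreator` reads the "Summary" and "Illustration" fields of each scene, so `get_json_scenes` is expected to return something shaped like this (hypothetical values; the real text depends entirely on the model's output):

```python
# Hypothetical return value of get_json_scenes after normalization.
scenes = {
    0: {"Summary": "Un resumen corto en español de la primera escena.",
        "Illustration": "A warm watercolor of a village at dawn, soft light"},
    1: {"Summary": "Otro resumen corto de la segunda escena.",
        "Illustration": "An oil painting of a stormy sea, dramatic style"},
}
```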
transcriber.py ADDED
@@ -0,0 +1,12 @@
+ from transformers import Pipeline
+
+ class Transcriber:
+     """Thin wrapper around a Hugging Face ASR pipeline."""
+
+     def __init__(self, pipe: Pipeline) -> None:
+         self.pipe = pipe
+
+     def transcribe(self, file_path: str = "yt_audio.mp3") -> str:
+         try:
+             transcription = self.pipe(file_path)["text"]
+             return transcription
+         except Exception:
+             return "ERROR: No audio file found to transcribe"
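
A usage sketch, pairing the class with the same Whisper checkpoint that app.py builds (device selection omitted for brevity):

```python
from transformers import pipeline

from transcriber import Transcriber

# Wrap the same ASR pipeline that app.py constructs.
asr_pipe = pipeline(task="automatic-speech-recognition",
                    model="juancopi81/whisper-medium-es",
                    chunk_length_s=30)
transcriber = Transcriber(asr_pipe)
print(transcriber.transcribe("yt_audio.mp3"))  # file produced by PytubeAudioExtractor
```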
utils.py ADDED
@@ -0,0 +1,6 @@
+ def is_google_colab():
+     # google.colab is only importable when the code runs inside Colab.
+     try:
+         import google.colab
+         return True
+     except ImportError:
+         return False
videocreator.py ADDED
@@ -0,0 +1,46 @@
+ from typing import Dict
+
+ import gradio as gr  # needed for gr.make_waveform
+ from moviepy.editor import VideoFileClip, concatenate_videoclips
+
+ class VideoCreator:
+     def __init__(self,
+                  tts_pipeline,
+                  image_pipeline) -> None:
+         self.tts_pipeline = tts_pipeline
+         self.image_pipeline = image_pipeline
+
+     def create_video(self, scenes: Dict) -> str:
+         videos_dict = {}
+         for scene in scenes:
+             video_scene = self._create_video_from_scene(scenes[scene])
+             videos_dict[scene] = video_scene
+         merged_video = self._merge_videos(videos_dict)
+         return merged_video
+
+     def _create_video_from_scene(self, scene: Dict) -> str:
+         audio_file = self._get_audio_from_text(scene["Summary"])
+         bg_image = self._get_bg_image_from_description(scene["Illustration"])
+         # Render the voice-over as a waveform video over the generated image.
+         video = gr.make_waveform(audio=audio_file,
+                                  bg_image=bg_image)
+         return video
+
+     def _get_audio_from_text(self, voice_over: str) -> str:
+         self.tts_pipeline.tts_to_file(text=voice_over,
+                                       file_path="output.wav")
+         return "output.wav"
+
+     def _get_bg_image_from_description(self, img_desc: str) -> str:
+         images = self.image_pipeline(img_desc)
+         print("Image generated!")
+         image_output = images.images[0]
+         image_output.save("img.png")
+         return "img.png"
+
+     def _merge_videos(self, videos_dict: Dict) -> str:
+         videos_to_concatenate = []
+         # Iterate in insertion order so scenes appear in sequence.
+         for video_file in videos_dict.values():
+             video_clip = VideoFileClip(video_file)
+             videos_to_concatenate.append(video_clip)
+         final_video = concatenate_videoclips(videos_to_concatenate)
+         final_video.write_videofile("final_video.mp4")
+         return "final_video.mp4"
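
A usage sketch, assuming the `tts` and `image_pipeline` objects built in app.py and a `scenes` dict as returned by `TextProcessor.get_json_scenes`:

```python
from videocreator import VideoCreator

creator = VideoCreator(tts, image_pipeline)  # TTS + Stable Diffusion pipelines
final_path = creator.create_video(scenes)    # one clip per scene, then concatenated
print(final_path)                            # "final_video.mp4"
```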
youtubeaudioextractor.py ADDED
@@ -0,0 +1,25 @@
+ from abc import ABC, abstractmethod
+
+ import pytube as pt
+
+ class YouTubeAudioExtractor(ABC):
+
+     @abstractmethod
+     def extract(self, url: str, save_path: str) -> str:
+         pass
+
+ class PytubeAudioExtractor(YouTubeAudioExtractor):
+
+     def __init__(self,
+                  only_audio: bool = True,
+                  extension: str = ".mp3") -> None:
+         self.only_audio = only_audio
+         self.extension = extension
+
+     def extract(self, url: str,
+                 save_path: str = "yt_audio") -> str:
+         yt = pt.YouTube(url)
+         stream = yt.streams.filter(only_audio=self.only_audio)[0]
+         filename = save_path + self.extension
+         stream.download(filename=filename)
+         return filename
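
A usage sketch for the extractor (the URL is a placeholder):

```python
from youtubeaudioextractor import PytubeAudioExtractor

extractor = PytubeAudioExtractor()
audio_file = extractor.extract("https://www.youtube.com/watch?v=VIDEO_ID")
print(audio_file)  # "yt_audio.mp3"
```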