Commit 595153d: Add English support
juancopi81 committed
Parent(s): 259b168

Files changed:
- app.py +68 -15
- requirements.txt +2 -1
- textprocessor.py +5 -3
- transcriber.py +21 -1
- videocreator.py +9 -6
app.py
CHANGED
@@ -5,15 +5,16 @@ import torch
 from transformers import pipeline
 from diffusers import StableDiffusionPipeline
 from TTS.api import TTS
+import whisper
 
 import utils
 from youtubeaudioextractor import PytubeAudioExtractor
-from transcriber import
+from transcriber import SpanishTranscriber, WhisperTranscriber
 from textprocessor import TextProcessor
 from videocreator import VideoCreator
 
-
-
+spanish_transcribe_model = "juancopi81/whisper-medium-es"
+languages = {"Spanish": "es", "English": "en"}
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 device_dict = {"cuda": 0, "cpu": -1}
@@ -29,14 +30,20 @@ device_print = "GPU 🔥" if torch.cuda.is_available() else "CPU 🥶"
 
 # Initialize components
 audio_extractor = PytubeAudioExtractor()
-
+es_transcription_pipe = pipeline(
     task="automatic-speech-recognition",
-    model=
+    model=spanish_transcribe_model,
     chunk_length_s=30,
     device=device_dict[device],
 )
-
-
+es_transcription_pipe.model.config.forced_decoder_ids = es_transcription_pipe.tokenizer.get_decoder_prompt_ids(language="es",
+                                                                                                                task="transcribe")
+es_audio_transcriber = SpanishTranscriber(es_transcription_pipe)
+
+en_transcription_pipe = whisper.load_model("base")
+
+en_audio_transcriber = WhisperTranscriber(en_transcription_pipe)
+
 openai_model = "text-davinci-003"
 text_processor = TextProcessor(openai_model)
 
@@ -47,19 +54,33 @@ image_pipeline = StableDiffusionPipeline.from_pretrained(image_model_id,
 
 image_pipeline = image_pipeline.to(device)
 
-
+es_vo_model_name = TTS.list_models()[22]
+en_vo_model_name = TTS.list_models()[8]
 # Init TTS
-
-
+es_tts = TTS(es_vo_model_name)
+en_tts = TTS(en_vo_model_name)
 
-def datapipeline(url: str
+def datapipeline(url: str,
+                 video_language: str,
+                 summary_language: str,
+                 video_styles: str) -> Any:
     audio_path_file = audio_extractor.extract(url)
     print(f"Audio file created at: {audio_path_file}")
+    # Select transcriber
+    if video_language == "Spanish":
+        audio_transcriber = es_audio_transcriber
+        video_creator = VideoCreator(es_tts, image_pipeline)
+    elif video_language == "English":
+        audio_transcriber = en_audio_transcriber
+        video_creator = VideoCreator(en_tts, image_pipeline)
+    else:
+        return "Language not supported"
     transcribed_text = audio_transcriber.transcribe(audio_path_file)
     print("Audio transcription ready!")
-    json_scenes = text_processor.get_json_scenes(transcribed_text
+    json_scenes = text_processor.get_json_scenes(transcribed_text,
+                                                 summary_language)
     print("Scenes ready")
-    video = video_creator.create_video(json_scenes)
+    video = video_creator.create_video(json_scenes, video_styles)
     print("Video at", video)
     return video, video
 
@@ -184,6 +205,11 @@ css = """
 #generated_id{
   min-height: 700px
 }
+#setting_id{
+  margin-bottom: 12px;
+  text-align: center;
+  font-weight: 900;
+}
 """
 block = gr.Blocks(css=css)
 
@@ -214,6 +240,7 @@ with block as demo:
         Some samples videos you can try:
         <ul>
         <li>https://www.youtube.com/watch?v=Hk5evm1NgzA (Little Red Riding Hood. Infer time: c.a. 196 seconds)</li>
+        <li>https://www.youtube.com/watch?v=nJxWS9jZ9-c (Elon Musk's Biography. Infer time: c.a. 176 seconds)</li>
         <li>https://www.youtube.com/watch?v=sRmmQBBln9Q (Cook recipe. Infer time: c.a. 200 seconds)</li>
         <li>https://www.youtube.com/watch?v=qz4Wc48KITA (Poem by Edgar Allan Poe. Infer time: c.a. 200 seconds)</li>
        <li>https://www.youtube.com/watch?v=2D8CaoIY7Lk (The history of Christmas trees. Infer time: c.a. 130 seconds)</li>
@@ -224,12 +251,32 @@ with block as demo:
         </div>
         """
     )
+    with gr.Group():
+        with gr.Box():
+            with gr.Row(elem_id="setting_id").style(mobile_collapse=False, equal_height=True):
+                gr.HTML("<h1>Setting</h1>")
+            with gr.Row():
+                with gr.Column():
+                    video_language = gr.Radio(choices=["Spanish", "English"],
+                                              label="Language of your input video:",
+                                              value="Spanish")
+                with gr.Column():
+                    summary_language = gr.Radio(choices=["Spanish", "English"],
+                                                label="Language of your output video:",
+                                                value="Spanish")
+            with gr.Row():
+                video_styles = gr.Textbox(label="(OPTIONAL) Enter the styles for your ouput video",
+                                          value="",
+                                          placeholder="illustration, highly detailed, digital painting, concept art, matte, art by wlop and artgerm and greg rutkowski and alphonse mucha, masterpiece")
     with gr.Group():
         with gr.Box():
             with gr.Row().style(mobile_collapse=False, equal_height=True):
 
                 url = gr.Textbox(
-                    label="Enter the URL of the YouTubeVideo",
+                    label="Enter the URL of the YouTubeVideo",
+                    show_label=False,
+                    max_lines=1,
+                    placeholder="YouTube URL"
                 ).style(
                     border=(True, False, True, True),
                     rounded=(True, False, False, True),
@@ -243,9 +290,15 @@ with block as demo:
         file_output = gr.File()
 
         btn.click(datapipeline,
-                  inputs=[url
+                  inputs=[url,
+                          video_language,
+                          summary_language,
+                          video_styles],
                   outputs=[video_output, file_output])
 
+    #gr.Examples(
+    #    examples=[[], []]
+    #)
     gr.HTML(
         """
         <div class="footer">
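Taken together, the app.py changes turn datapipeline into a four-argument entry point. A minimal usage sketch, assuming it runs in app.py's module context (so the pipelines, transcribers and TTS models above already exist); the URL is one of the samples listed in the UI, and the style suffix is an illustrative value, not part of the commit:

# Hedged local test of the new signature (url, video_language, summary_language, video_styles).
video_path, download_path = datapipeline(
    url="https://www.youtube.com/watch?v=qz4Wc48KITA",   # sample URL from the Space UI
    video_language="English",                            # routes to en_audio_transcriber and en_tts
    summary_language="English",                          # language requested from text-davinci-003
    video_styles=", digital painting, highly detailed",  # example suffix appended to image prompts
)
print("Video written to:", video_path)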
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ accelerate
 TTS
 moviepy
 imageio==2.4.1
-tensorboard
+tensorboard
+git+https://github.com/openai/whisper.git
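whisper is now pulled straight from GitHub rather than PyPI. A quick sanity check, assuming that install succeeded; it loads the same checkpoint app.py uses for English transcription:

import whisper

model = whisper.load_model("base")  # checkpoint used by the English path in app.py
print(type(model))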
textprocessor.py
CHANGED
@@ -9,9 +9,9 @@ You are a creator of illustrated books building a series of scenes for your book
 Your boss asked you to write a summary and illustrations of this text:
 $TRANSCRIPTION
 You have to write the summary using a maximum of 7 scenes in a JSON object following these instructions:
-[{Scene": int, "Summary":
+[{Scene": int, "Summary": $SUMMARY_LANGUAGE str, "Illustration": English str}, ...] where:
 "Scene": The number of the scene.
-"Summary":
+"Summary": $SUMMARY_LANGUAGE string with a summary of the scene. It should be in $SUMMARY_LANGUAGE, and it should be less than 30 words. Readers should understand it without looking at the illustration.
 "Illustration": English string with a detailed English description of an illustration for this scene. It must be written in English and in less than 20 words. It should include many details and an artistic style for the image that matches the text.
 Just answer with the JSON object:
 """
@@ -34,8 +34,10 @@ class TextProcessor:
         self.presence_penalty = presence_penalty
 
     def get_json_scenes(self,
-                        prompt: str
+                        prompt: str,
+                        summary_language: str) -> Dict:
         gpt_prompt = context_prompt.replace("$TRANSCRIPTION", prompt)
+        gpt_prompt = gpt_prompt.replace("$SUMMARY_LANGUAGE ", summary_language)
         print("gpt_prompt", gpt_prompt)
         response = openai.Completion.create(
             model=self.model,
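The prompt now carries a $SUMMARY_LANGUAGE placeholder alongside $TRANSCRIPTION, and get_json_scenes substitutes it before calling the Completion API. A short sketch of the call site and of the fields each scene is prompted to contain (the call mirrors the app.py diff; no model output is shown here):

# summary_language is whatever the user picked in the UI ("Spanish" or "English").
scenes = text_processor.get_json_scenes(transcribed_text, summary_language)

# Per the template, each scene entry carries three fields consumed downstream:
#   "Scene"        - the scene number
#   "Summary"      - <= 30 words, written in summary_language (fed to TTS)
#   "Illustration" - <= 20 words, always English (fed to Stable Diffusion)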
transcriber.py
CHANGED
@@ -1,6 +1,14 @@
+from abc import ABC, abstractmethod
+
 from transformers import pipeline
 
-class Transcriber:
+class Transcriber(ABC):
+
+    @abstractmethod
+    def transcribe(self, file_path: str) -> str:
+        pass
+
+class SpanishTranscriber(Transcriber):
     def __init__(self, pipe: pipeline) -> None:
         self.pipe = pipe
 
@@ -8,4 +16,16 @@ class Transcriber:
         print("Pipe:", self.pipe)
         print("Audo file at:", file_path)
         transcription = self.pipe(file_path)["text"]
+        return transcription
+
+class WhisperTranscriber(Transcriber):
+    def __init__(self, model, without_timestamps: bool=True) -> None:
+        self.model = model
+        self.without_timestamps = without_timestamps
+
+    def transcribe(self, file_path: str = "yt_audio.mp3") -> str:
+        print("Model:", self.model)
+        print("Audo file at:", file_path)
+        transcription = self.model.transcribe(file_path,
+                                              without_timestamps=self.without_timestamps)["text"]
         return transcription
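Transcriber is now an abstract base class with two implementations that share the same transcribe(file_path) interface. A minimal construction sketch mirroring the wiring in app.py (model names as in the diff; the audio file name is the default used by WhisperTranscriber):

import whisper
from transformers import pipeline

from transcriber import SpanishTranscriber, WhisperTranscriber

# Spanish: a Hugging Face ASR pipeline wrapped by SpanishTranscriber.
es_pipe = pipeline(task="automatic-speech-recognition",
                   model="juancopi81/whisper-medium-es",
                   chunk_length_s=30)
es_transcriber = SpanishTranscriber(es_pipe)

# English: an openai/whisper model wrapped by WhisperTranscriber.
en_transcriber = WhisperTranscriber(whisper.load_model("base"))

# Either object can be swapped in downstream without changing the call site.
text = en_transcriber.transcribe("yt_audio.mp3")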
videocreator.py
CHANGED
@@ -11,17 +11,20 @@ class VideoCreator:
         self.tts_pipeline = tts_pipeline
         self.image_pipeline = image_pipeline
 
-    def create_video(self,
+    def create_video(self,
+                     scenes: Dict,
+                     video_styles: str) -> str:
         videos_dict = {}
         for index, scene in enumerate(scenes):
-            video_scene = self._create_video_from_scene(scenes[scene]
+            video_scene = self._create_video_from_scene(scenes[scene],
+                                                        video_styles)
             videos_dict[index] = video_scene
         merged_video = self._merge_videos(videos_dict)
         return merged_video
 
-    def _create_video_from_scene(self, scene: Dict) -> str:
+    def _create_video_from_scene(self, scene: Dict, video_styles: str) -> str:
         audio_file = self._get_audio_from_text(scene["Summary"])
-        bg_image = self._get_bg_image_from_description(scene["Illustration"])
+        bg_image = self._get_bg_image_from_description(scene["Illustration"], video_styles)
         video = gr.make_waveform(audio=audio_file,
                                  bg_image=bg_image)
         return video
@@ -31,8 +34,8 @@ class VideoCreator:
                                              file_path="output.wav")
         return "output.wav"
 
-    def _get_bg_image_from_description(self, img_desc: str):
-        images = self.image_pipeline(img_desc)
+    def _get_bg_image_from_description(self, img_desc: str, video_styles: str):
+        images = self.image_pipeline(img_desc + video_styles)
         print("Image generated!")
         image_output = images.images[0]
         image_output.save("img.png")
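With the style string threaded through, VideoCreator appends it to every illustration prompt while each scene's summary still drives the TTS track. A hedged end-to-end sketch, assuming the objects created in app.py (en_tts, image_pipeline, json_scenes) are available; the style value here is an example, not from the commit:

# One VideoCreator per output language: the TTS voice differs, the Stable Diffusion
# pipeline is shared. video_styles is concatenated to each scene's "Illustration" text.
video_creator = VideoCreator(en_tts, image_pipeline)
final_video = video_creator.create_video(json_scenes,
                                         video_styles=", concept art, matte painting")
print("Video at", final_video)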