juancopi81 committed on
Commit 595153d
1 Parent(s): 259b168

Add English support

Files changed (5):
  1. app.py +68 -15
  2. requirements.txt +2 -1
  3. textprocessor.py +5 -3
  4. transcriber.py +21 -1
  5. videocreator.py +9 -6
app.py CHANGED
@@ -5,15 +5,16 @@ import torch
 from transformers import pipeline
 from diffusers import StableDiffusionPipeline
 from TTS.api import TTS
+import whisper
 
 import utils
 from youtubeaudioextractor import PytubeAudioExtractor
-from transcriber import Transcriber
+from transcriber import SpanishTranscriber, WhisperTranscriber
 from textprocessor import TextProcessor
 from videocreator import VideoCreator
 
-TRANSCRIBER_MODEL_NAME = "juancopi81/whisper-medium-es"
-lang = "es"
+spanish_transcribe_model = "juancopi81/whisper-medium-es"
+languages = {"Spanish": "es", "English": "en"}
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 device_dict = {"cuda": 0, "cpu": -1}
@@ -29,14 +30,20 @@ device_print = "GPU 🔥" if torch.cuda.is_available() else "CPU 🥶"
 
 # Initialize components
 audio_extractor = PytubeAudioExtractor()
-transcription_pipe = pipeline(
+es_transcription_pipe = pipeline(
     task="automatic-speech-recognition",
-    model=TRANSCRIBER_MODEL_NAME,
+    model=spanish_transcribe_model,
     chunk_length_s=30,
     device=device_dict[device],
 )
-transcription_pipe.model.config.forced_decoder_ids = transcription_pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
-audio_transcriber = Transcriber(transcription_pipe)
+es_transcription_pipe.model.config.forced_decoder_ids = es_transcription_pipe.tokenizer.get_decoder_prompt_ids(language="es",
+                                                                                                                task="transcribe")
+es_audio_transcriber = SpanishTranscriber(es_transcription_pipe)
+
+en_transcription_pipe = whisper.load_model("base")
+
+en_audio_transcriber = WhisperTranscriber(en_transcription_pipe)
+
 openai_model = "text-davinci-003"
 text_processor = TextProcessor(openai_model)
 
@@ -47,19 +54,33 @@ image_pipeline = StableDiffusionPipeline.from_pretrained(image_model_id,
 
 image_pipeline = image_pipeline.to(device)
 
-vo_model_name = TTS.list_models()[22]
+es_vo_model_name = TTS.list_models()[22]
+en_vo_model_name = TTS.list_models()[8]
 # Init TTS
-tts = TTS(vo_model_name)
-video_creator = VideoCreator(tts, image_pipeline)
+es_tts = TTS(es_vo_model_name)
+en_tts = TTS(en_vo_model_name)
 
-def datapipeline(url: str) -> Any:
+def datapipeline(url: str,
+                 video_language: str,
+                 summary_language: str,
+                 video_styles: str) -> Any:
     audio_path_file = audio_extractor.extract(url)
     print(f"Audio file created at: {audio_path_file}")
+    # Select transcriber and voice for the language of the input video
+    if video_language == "Spanish":
+        audio_transcriber = es_audio_transcriber
+        video_creator = VideoCreator(es_tts, image_pipeline)
+    elif video_language == "English":
+        audio_transcriber = en_audio_transcriber
+        video_creator = VideoCreator(en_tts, image_pipeline)
+    else:
+        return "Language not supported"
     transcribed_text = audio_transcriber.transcribe(audio_path_file)
     print("Audio transcription ready!")
-    json_scenes = text_processor.get_json_scenes(transcribed_text)
+    json_scenes = text_processor.get_json_scenes(transcribed_text,
+                                                 summary_language)
     print("Scenes ready")
-    video = video_creator.create_video(json_scenes)
+    video = video_creator.create_video(json_scenes, video_styles)
     print("Video at", video)
     return video, video
 
@@ -184,6 +205,11 @@ css = """
 #generated_id{
     min-height: 700px
 }
+#setting_id{
+    margin-bottom: 12px;
+    text-align: center;
+    font-weight: 900;
+}
 """
 block = gr.Blocks(css=css)
 
@@ -214,6 +240,7 @@ with block as demo:
             Some sample videos you can try:
             <ul>
             <li>https://www.youtube.com/watch?v=Hk5evm1NgzA (Little Red Riding Hood. Infer time: ca. 196 seconds)</li>
+            <li>https://www.youtube.com/watch?v=nJxWS9jZ9-c (Elon Musk's biography. Infer time: ca. 176 seconds)</li>
            <li>https://www.youtube.com/watch?v=sRmmQBBln9Q (Cooking recipe. Infer time: ca. 200 seconds)</li>
            <li>https://www.youtube.com/watch?v=qz4Wc48KITA (Poem by Edgar Allan Poe. Infer time: ca. 200 seconds)</li>
            <li>https://www.youtube.com/watch?v=2D8CaoIY7Lk (The history of Christmas trees. Infer time: ca. 130 seconds)</li>
@@ -224,12 +251,32 @@ with block as demo:
             </div>
             """
         )
+    with gr.Group():
+        with gr.Box():
+            with gr.Row(elem_id="setting_id").style(mobile_collapse=False, equal_height=True):
+                gr.HTML("<h1>Setting</h1>")
+            with gr.Row():
+                with gr.Column():
+                    video_language = gr.Radio(choices=["Spanish", "English"],
+                                              label="Language of your input video:",
+                                              value="Spanish")
+                with gr.Column():
+                    summary_language = gr.Radio(choices=["Spanish", "English"],
+                                                label="Language of your output video:",
+                                                value="Spanish")
+            with gr.Row():
+                video_styles = gr.Textbox(label="(OPTIONAL) Enter the styles for your output video",
+                                          value="",
+                                          placeholder="illustration, highly detailed, digital painting, concept art, matte, art by wlop and artgerm and greg rutkowski and alphonse mucha, masterpiece")
     with gr.Group():
         with gr.Box():
             with gr.Row().style(mobile_collapse=False, equal_height=True):
 
                 url = gr.Textbox(
-                    label="Enter the URL of the YouTube video", show_label=False, max_lines=1
+                    label="Enter the URL of the YouTube video",
+                    show_label=False,
+                    max_lines=1,
+                    placeholder="YouTube URL"
                 ).style(
                     border=(True, False, True, True),
                     rounded=(True, False, False, True),
@@ -243,9 +290,15 @@
     file_output = gr.File()
 
     btn.click(datapipeline,
-              inputs=[url],
+              inputs=[url,
+                      video_language,
+                      summary_language,
+                      video_styles],
              outputs=[video_output, file_output])
 
+    # gr.Examples(
+    #     examples=[[], []]
+    # )
    gr.HTML(
        """
        <div class="footer">
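
Note: app.py picks the TTS voices by index into TTS.list_models() (22 for Spanish, 8 for English), which can silently change meaning if a TTS release reorders or extends its model list. A minimal sketch of pinning the voices by name instead; the two Coqui model names below are an assumption, so verify them against TTS.list_models() before swapping them in:

from TTS.api import TTS

# Assumed Coqui model names -- confirm they appear in TTS.list_models()
# before relying on them instead of the index-based lookup above.
ES_VOICE = "tts_models/es/mai/tacotron2-DDC"
EN_VOICE = "tts_models/en/ljspeech/tacotron2-DDC"

es_tts = TTS(ES_VOICE)
en_tts = TTS(EN_VOICE)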
requirements.txt CHANGED
@@ -8,4 +8,5 @@ accelerate
 TTS
 moviepy
 imageio==2.4.1
-tensorboard
+tensorboard
+git+https://github.com/openai/whisper.git
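
Note: whisper is installed straight from the openai/whisper GitHub repository because app.py now imports it directly. A quick smoke test after installing the requirements; "base" matches the model app.py loads:

import whisper

# Lists the checkpoint names whisper ships; should include "base".
print(whisper.available_models())
model = whisper.load_model("base")  # downloads weights on first run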
textprocessor.py CHANGED
@@ -9,9 +9,9 @@ You are a creator of illustrated books building a series of scenes for your book
 Your boss asked you to write a summary and illustrations of this text:
 $TRANSCRIPTION
 You have to write the summary using a maximum of 7 scenes in a JSON object following these instructions:
-[{"Scene": int, "Summary": Spanish str, "Illustration": English str}, ...] where:
+[{"Scene": int, "Summary": $SUMMARY_LANGUAGE str, "Illustration": English str}, ...] where:
 "Scene": The number of the scene.
-"Summary": Spanish string with a summary of the scene. It should be in Spanish, and it should be less than 30 words. Readers should understand it without looking at the illustration.
+"Summary": $SUMMARY_LANGUAGE string with a summary of the scene. It should be in $SUMMARY_LANGUAGE, and it should be less than 30 words. Readers should understand it without looking at the illustration.
 "Illustration": English string with a detailed English description of an illustration for this scene. It must be written in English and in less than 20 words. It should include many details and an artistic style for the image that matches the text.
 Just answer with the JSON object:
 """
@@ -34,8 +34,10 @@ class TextProcessor:
         self.presence_penalty = presence_penalty
 
     def get_json_scenes(self,
-                        prompt: str) -> Dict:
+                        prompt: str,
+                        summary_language: str) -> Dict:
         gpt_prompt = context_prompt.replace("$TRANSCRIPTION", prompt)
+        gpt_prompt = gpt_prompt.replace("$SUMMARY_LANGUAGE", summary_language)
         print("gpt_prompt", gpt_prompt)
         response = openai.Completion.create(
             model=self.model,
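
Note: the prompt asks the model for a bare JSON array, so whatever consumes get_json_scenes presumably parses the completion text into the scenes dict. A hedged sketch of that shape; the sample completion string below is invented for illustration, and real responses may need stray text stripped before json.loads succeeds:

import json

# Invented example of the structure the prompt requests.
completion_text = '[{"Scene": 1, "Summary": "Una niña camina por el bosque.", "Illustration": "girl walking in a forest, watercolor, soft light"}]'

scenes = json.loads(completion_text)
for scene in scenes:
    print(scene["Scene"], scene["Summary"], scene["Illustration"])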
transcriber.py CHANGED
@@ -1,6 +1,14 @@
+from abc import ABC, abstractmethod
+
 from transformers import pipeline
 
-class Transcriber:
+class Transcriber(ABC):
+
+    @abstractmethod
+    def transcribe(self, file_path: str) -> str:
+        pass
+
+class SpanishTranscriber(Transcriber):
     def __init__(self, pipe: pipeline) -> None:
         self.pipe = pipe
 
@@ -8,4 +16,16 @@ class Transcriber:
         print("Pipe:", self.pipe)
         print("Audio file at:", file_path)
         transcription = self.pipe(file_path)["text"]
-        return transcription
+        return transcription
+
+class WhisperTranscriber(Transcriber):
+    def __init__(self, model, without_timestamps: bool = True) -> None:
+        self.model = model
+        self.without_timestamps = without_timestamps
+
+    def transcribe(self, file_path: str = "yt_audio.mp3") -> str:
+        print("Model:", self.model)
+        print("Audio file at:", file_path)
+        transcription = self.model.transcribe(file_path,
+                                              without_timestamps=self.without_timestamps)["text"]
+        return transcription
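
Note: both concrete classes satisfy the same Transcriber interface, which is what lets datapipeline swap them per language. A minimal usage sketch for the new English path; the audio file name is the default the class assumes:

import whisper

from transcriber import WhisperTranscriber

model = whisper.load_model("base")
transcriber = WhisperTranscriber(model)        # without_timestamps=True by default
text = transcriber.transcribe("yt_audio.mp3")  # returns the plain transcription string
print(text)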
videocreator.py CHANGED
@@ -11,17 +11,20 @@ class VideoCreator:
         self.tts_pipeline = tts_pipeline
         self.image_pipeline = image_pipeline
 
-    def create_video(self, scenes: Dict) -> str:
+    def create_video(self,
+                     scenes: Dict,
+                     video_styles: str) -> str:
         videos_dict = {}
         for index, scene in enumerate(scenes):
-            video_scene = self._create_video_from_scene(scenes[scene])
+            video_scene = self._create_video_from_scene(scenes[scene],
+                                                        video_styles)
             videos_dict[index] = video_scene
         merged_video = self._merge_videos(videos_dict)
         return merged_video
 
-    def _create_video_from_scene(self, scene: Dict) -> str:
+    def _create_video_from_scene(self, scene: Dict, video_styles: str) -> str:
         audio_file = self._get_audio_from_text(scene["Summary"])
-        bg_image = self._get_bg_image_from_description(scene["Illustration"])
+        bg_image = self._get_bg_image_from_description(scene["Illustration"], video_styles)
         video = gr.make_waveform(audio=audio_file,
                                  bg_image=bg_image)
         return video
@@ -31,8 +34,8 @@ class VideoCreator:
                                  file_path="output.wav")
         return "output.wav"
 
-    def _get_bg_image_from_description(self, img_desc: str):
-        images = self.image_pipeline(img_desc)
+    def _get_bg_image_from_description(self, img_desc: str, video_styles: str):
+        images = self.image_pipeline(img_desc + ", " + video_styles)  # separator keeps the description and styles from running together
         print("Image generated!")
         image_output = images.images[0]
         image_output.save("img.png")
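
Note: the optional styles are appended to every scene's illustration description before it reaches the Stable Diffusion pipeline. A small sketch of the composed prompt, with invented values:

# Invented scene description and styles, showing the concatenation
# _get_bg_image_from_description performs.
img_desc = "girl walking in a dark forest, watercolor"
video_styles = "illustration, highly detailed, digital painting"

prompt = img_desc + ", " + video_styles
print(prompt)
# -> girl walking in a dark forest, watercolor, illustration, highly detailed, digital painting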