jjz5463 committed
Commit 544723a · 1 Parent(s): e9388c9

fail to make narrative

Files changed (3)
  1. app.py +6 -19
  2. baseline_utils.py +3 -45
  3. requirements.txt +3 -2
app.py CHANGED
@@ -6,16 +6,13 @@ from baseline_utils import (detect_text_in_image,
                             analyze_writer_image,
                             generate_video,
                             break_diary_to_scenes,
-                            scenes_caption,
-                            summarizer_for_audio,
-                            narration_generate)
+                            scenes_caption)
 import os
 
 # Load secrets from Hugging Face Spaces environment
 openai_api_key = os.getenv("OPENAI_API_KEY")
 google_service_account_info = json.loads(os.getenv("GOOGLE_SERVICE_ACCOUNT"))
 gemini_api_key = os.getenv("GEMINI_API_KEY")
-eleven_api_key = os.getenv("ELEVEN_API_KEY")
 
 # Initialize OpenAI
 openai.api_key = openai_api_key
@@ -26,7 +23,7 @@ def get_google_credentials():
     return service_account.Credentials.from_service_account_info(google_service_account_info)
 
 
-def process_images(diary_image, writer_image, audio_option):
+def process_images(diary_image, writer_image):
     # Save the file-like objects as image files
     diary_image_path = "temp_upload_images/temp_diary_image.png"
     writer_image_path = "temp_upload_images/temp_writer_image.png"
@@ -45,12 +42,8 @@ def process_images(diary_image, writer_image, audio_option):
     scene_list = [scene.strip() for scene in scenes.split("Scene")[1:]]
     scene_list = [scene.split(": ", 1)[1] for scene in scene_list]
 
-    # Generate the narration audio which is less than 10 second
-    # This will create a mp3 file for narration
-    narration_summarize = summarizer_for_audio(detected_text)
-    narration_generate(narration_summarize, eleven_api_key)
     # Generate the video based on the summaries
-    video_path = generate_video(scene_list, writer_summary, audio_option, fps=24)
+    video_path = generate_video(scene_list, writer_summary, fps=24)
 
     caption = scenes_caption(scene_list, openai_api_key)
 
@@ -58,9 +51,9 @@ def process_images(diary_image, writer_image, audio_option):
 
 
 # Define the Gradio interface
-def gradio_interface(diary_image, writer_image, audio_option):
+def gradio_interface(diary_image, writer_image):
     # Process the images and generate the video
-    video_paths, prompts = process_images(diary_image, writer_image, audio_option)
+    video_paths, prompts = process_images(diary_image, writer_image)
 
     # Return the paths and corresponding prompts
     return video_paths, prompts
@@ -75,12 +68,6 @@ with gr.Blocks() as interface:
     with gr.Column():
         diary_image_input = gr.Image(label="Upload your handwritten diary image", type="pil")
         writer_image_input = gr.Image(label="Upload a photo of the writer", type="pil")
-        # Add a radio button for selecting audio options
-        audio_option = gr.Radio(
-            ["Narration", "Meow"],
-            label="Choose Audio Option",
-            value="Narration"  # Default selection
-        )
         submit_button = gr.Button("Generate Video")
 
     # Right column for generated video and caption
@@ -91,7 +78,7 @@ with gr.Blocks() as interface:
     # Bind the submit button click to trigger the video generation and display
     submit_button.click(
         fn=gradio_interface,
-        inputs=[diary_image_input, writer_image_input, audio_option],
+        inputs=[diary_image_input, writer_image_input],
         outputs=[video_output, caption_output]
     )
 
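After this commit, the app no longer threads an audio choice through the UI. As a quick orientation for reviewers, a minimal local sketch of the simplified call path (the file names `diary.png`/`writer.png` and the local-run setup are assumptions for illustration, not part of the commit):

```python
# Minimal local smoke test for the simplified pipeline (a sketch, not code from
# this commit). Assumes the environment secrets referenced in app.py are set and
# that diary.png / writer.png exist in the working directory.
from PIL import Image

diary_image = Image.open("diary.png")    # stands in for the Gradio upload
writer_image = Image.open("writer.png")  # stands in for the Gradio upload

# process_images now takes only the two images; the audio_option parameter is gone.
video_paths, prompts = process_images(diary_image, writer_image)
print(video_paths)
print(prompts)
```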
 
 
baseline_utils.py CHANGED
@@ -8,9 +8,6 @@ from diffusers.utils import export_to_video
 import os
 import spaces
 from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
-from elevenlabs import generate, play
-import requests
-from transformers import pipeline
 
 # Utilize the Google Cloud Vision API to recognize text in the
 # input input_images (diary input_images), https://cloud.google.com/vision.
@@ -123,7 +120,7 @@ def scenes_caption(scenes, api_key):
 
 
 @spaces.GPU
-def generate_video(scene_list, writer_description, audio_option, fps=24):  # Lower fps
+def generate_video(scene_list, writer_description, fps=24):  # Lower fps
 
     pipe = CogVideoXPipeline.from_pretrained(
         "THUDM/CogVideoX-5b",
@@ -164,14 +161,11 @@ def generate_video(scene_list, writer_description, audio_option, fps=24): # Low
 
     # Concatenate the generated videos into a single video
     concatenated_video_path = "videos/combined_video.mp4"
-    if audio_option == "Narration":
-        concatenate_videos(video_paths, concatenated_video_path, audio_path="narration.mp3")
-    else:
-        concatenate_videos(video_paths, concatenated_video_path, audio_path="meow-meow-meow-tiktok.mp3")
+    concatenate_videos(video_paths, concatenated_video_path, audio_path="meow-meow-meow-tiktok.mp3")
     return concatenated_video_path
 
 
-def concatenate_videos(video_paths, output_path, audio_path):
+def concatenate_videos(video_paths, output_path, audio_path="meow-meow-meow-tiktok.mp3"):
     # Load each video file as a VideoFileClip
     clips = [VideoFileClip(video) for video in video_paths]
 
@@ -189,39 +183,3 @@ def concatenate_videos(video_paths, output_path, audio_path):
 
     # Write the concatenated video to a file
     final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
-
-def narration_generate(input, api_key):
-    url = "https://api.elevenlabs.io/v1/text-to-speech/9BWtsMINqrJLrRacOk9x"
-    headers = {
-        "Accept": "audio/mpeg",
-        "Content-Type": "application/json",
-        "xi-api-key": api_key
-    }
-
-    data = {
-        "text": input,
-        "model_id": "eleven_monolingual_v1",
-        "voice_settings": {
-            "stability": 0.5,
-            "similarity_boost": 0.5
-        }
-    }
-
-    response = requests.post(url, json=data, headers=headers)
-    with open('narration.mp3', 'wb') as f:
-        for chunk in response.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-
-def summarizer_for_audio(input_text):
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-
-    # Generate the summary
-    summary = summarizer(
-        input_text,
-        max_length=25,
-        min_length=20,
-        do_sample=False
-    )[0]["summary_text"]
-
-    return summary
 
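The diff gives `concatenate_videos` a default `audio_path` but does not show the unchanged body. For readers following along, a hedged sketch of what the audio-attachment step presumably looks like, reconstructed from the moviepy imports and the surviving context lines (the body below is an assumption, not code from this commit):

```python
# Assumed shape of concatenate_videos after this commit (a sketch, not the
# repository's actual body). Uses only the moviepy 1.x APIs imported above.
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips

def concatenate_videos(video_paths, output_path, audio_path="meow-meow-meow-tiktok.mp3"):
    # Load each video file as a VideoFileClip and join them end to end.
    clips = [VideoFileClip(video) for video in video_paths]
    final_clip = concatenate_videoclips(clips)

    # Attach the backing track, trimmed so it never runs past the video.
    audio = AudioFileClip(audio_path)
    end = min(audio.duration, final_clip.duration)
    final_clip = final_clip.set_audio(audio.subclip(0, end))

    # Write the concatenated video to a file.
    final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
```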
requirements.txt CHANGED
@@ -4,7 +4,8 @@ google-auth
 google-generativeai
 diffusers
 torch
+streamlit
 transformers
 accelerate
-SentencePiece
-moviepy
+moviepy
+SentencePiece
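Some entries visible in this hunk have PyPI names that differ from their import names (SentencePiece, google-generativeai), so a small sanity check can confirm the reordered file still resolves in the Space's runtime. The script below is illustrative only, not part of the commit, and covers just the packages shown in the hunk:

```python
# Illustrative check (not part of the commit): verify each requirements entry
# visible in the hunk resolves to an importable module.
# PyPI names on the left, import names on the right.
import importlib

PACKAGES = {
    "google-generativeai": "google.generativeai",
    "diffusers": "diffusers",
    "torch": "torch",
    "streamlit": "streamlit",
    "transformers": "transformers",
    "accelerate": "accelerate",
    "moviepy": "moviepy.editor",
    "SentencePiece": "sentencepiece",
}

for pypi_name, module_name in PACKAGES.items():
    importlib.import_module(module_name)
    print(f"{pypi_name}: OK")
```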