fail to make narrative
- app.py +6 -19
- baseline_utils.py +3 -45
- requirements.txt +3 -2
app.py
CHANGED
@@ -6,16 +6,13 @@ from baseline_utils import (detect_text_in_image,
                             analyze_writer_image,
                             generate_video,
                             break_diary_to_scenes,
-                            scenes_caption,
-                            summarizer_for_audio,
-                            narration_generate)
+                            scenes_caption)
 import os
 
 # Load secrets from Hugging Face Spaces environment
 openai_api_key = os.getenv("OPENAI_API_KEY")
 google_service_account_info = json.loads(os.getenv("GOOGLE_SERVICE_ACCOUNT"))
 gemini_api_key = os.getenv("GEMINI_API_KEY")
-eleven_api_key = os.getenv("ELEVEN_API_KEY")
 
 # Initialize OpenAI
 openai.api_key = openai_api_key
@@ -26,7 +23,7 @@ def get_google_credentials():
     return service_account.Credentials.from_service_account_info(google_service_account_info)
 
 
-def process_images(diary_image, writer_image, audio_option):
+def process_images(diary_image, writer_image):
     # Save the file-like objects as image files
     diary_image_path = "temp_upload_images/temp_diary_image.png"
     writer_image_path = "temp_upload_images/temp_writer_image.png"
@@ -45,12 +42,8 @@ def process_images(diary_image, writer_image, audio_option):
     scene_list = [scene.strip() for scene in scenes.split("Scene")[1:]]
     scene_list = [scene.split(": ", 1)[1] for scene in scene_list]
 
-    # Generate the narration audio which is less than 10 second
-    # This will create a mp3 file for narration
-    narration_summarize = summarizer_for_audio(detected_text)
-    narration_generate(narration_summarize, eleven_api_key)
     # Generate the video based on the summaries
-    video_path = generate_video(scene_list, writer_summary, audio_option, fps=24)
+    video_path = generate_video(scene_list, writer_summary, fps=24)
 
     caption = scenes_caption(scene_list, openai_api_key)
 
@@ -58,9 +51,9 @@ def process_images(diary_image, writer_image, audio_option):
 
 
 # Define the Gradio interface
-def gradio_interface(diary_image, writer_image, audio_option):
+def gradio_interface(diary_image, writer_image):
     # Process the images and generate the video
-    video_paths, prompts = process_images(diary_image, writer_image, audio_option)
+    video_paths, prompts = process_images(diary_image, writer_image)
 
     # Return the paths and corresponding prompts
     return video_paths, prompts
@@ -75,12 +68,6 @@ with gr.Blocks() as interface:
         with gr.Column():
            diary_image_input = gr.Image(label="Upload your handwritten diary image", type="pil")
            writer_image_input = gr.Image(label="Upload a photo of the writer", type="pil")
-           # Add a radio button for selecting audio options
-           audio_option = gr.Radio(
-               ["Narration", "Meow"],
-               label="Choose Audio Option",
-               value="Narration"  # Default selection
-           )
            submit_button = gr.Button("Generate Video")
 
         # Right column for generated video and caption
@@ -91,7 +78,7 @@ with gr.Blocks() as interface:
     # Bind the submit button click to trigger the video generation and display
     submit_button.click(
         fn=gradio_interface,
-        inputs=[diary_image_input, writer_image_input, audio_option],
+        inputs=[diary_image_input, writer_image_input],
         outputs=[video_output, caption_output]
     )
 
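For reference, here is a minimal, self-contained sketch of how the simplified two-input Gradio wiring above fits together after this change. The `gr.Row`, the output component types (`gr.Video`, `gr.Textbox`), their labels, and the stubbed body of `gradio_interface` are assumptions for illustration; only the two image inputs, the button, and the `click` binding are taken from the diff.

import gradio as gr

def gradio_interface(diary_image, writer_image):
    # Stand-in for process_images(): the real app returns the rendered
    # video path and the captions generated from the diary scenes.
    video_path = None  # e.g. "videos/combined_video.mp4" once generated
    caption = "Example caption for the generated scenes."
    return video_path, caption

with gr.Blocks() as interface:
    with gr.Row():
        with gr.Column():
            diary_image_input = gr.Image(label="Upload your handwritten diary image", type="pil")
            writer_image_input = gr.Image(label="Upload a photo of the writer", type="pil")
            submit_button = gr.Button("Generate Video")
        # Right column for generated video and caption (assumed component types)
        with gr.Column():
            video_output = gr.Video(label="Generated Video")
            caption_output = gr.Textbox(label="Caption")

    # After this commit, only the two image inputs feed the click handler.
    submit_button.click(
        fn=gradio_interface,
        inputs=[diary_image_input, writer_image_input],
        outputs=[video_output, caption_output],
    )

if __name__ == "__main__":
    interface.launch()
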
baseline_utils.py
CHANGED
@@ -8,9 +8,6 @@ from diffusers.utils import export_to_video
 import os
 import spaces
 from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
-from elevenlabs import generate, play
-import requests
-from transformers import pipeline
 
 # Utilize the Google Cloud Vision API to recognize text in the
 # input input_images (diary input_images), https://cloud.google.com/vision.
@@ -123,7 +120,7 @@ def scenes_caption(scenes, api_key):
 
 
 @spaces.GPU
-def generate_video(scene_list, writer_description, audio_option, fps=24): # Lower fps
+def generate_video(scene_list, writer_description, fps=24): # Lower fps
 
     pipe = CogVideoXPipeline.from_pretrained(
         "THUDM/CogVideoX-5b",
@@ -164,14 +161,11 @@ def generate_video(scene_list, writer_description, audio_option, fps=24): # Lower fps
 
     # Concatenate the generated videos into a single video
     concatenated_video_path = "videos/combined_video.mp4"
-    if audio_option == "Narration":
-        concatenate_videos(video_paths, concatenated_video_path, audio_path="narration.mp3")
-    else:
-        concatenate_videos(video_paths, concatenated_video_path, audio_path="meow-meow-meow-tiktok.mp3")
+    concatenate_videos(video_paths, concatenated_video_path, audio_path="meow-meow-meow-tiktok.mp3")
     return concatenated_video_path
 
 
-def concatenate_videos(video_paths, output_path, audio_path):
+def concatenate_videos(video_paths, output_path, audio_path="meow-meow-meow-tiktok.mp3"):
     # Load each video file as a VideoFileClip
     clips = [VideoFileClip(video) for video in video_paths]
 
@@ -189,39 +183,3 @@ def concatenate_videos(video_paths, output_path, audio_path):
 
     # Write the concatenated video to a file
     final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
-
-def narration_generate(input, api_key):
-    url = "https://api.elevenlabs.io/v1/text-to-speech/9BWtsMINqrJLrRacOk9x"
-    headers = {
-        "Accept": "audio/mpeg",
-        "Content-Type": "application/json",
-        "xi-api-key": api_key
-    }
-
-    data = {
-        "text": input,
-        "model_id": "eleven_monolingual_v1",
-        "voice_settings": {
-            "stability": 0.5,
-            "similarity_boost": 0.5
-        }
-    }
-
-    response = requests.post(url, json=data, headers=headers)
-    with open('narration.mp3', 'wb') as f:
-        for chunk in response.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-
-def summarizer_for_audio(input_text):
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-
-    # Generate the summary
-    summary = summarizer(
-        input_text,
-        max_length=25,
-        min_length=20,
-        do_sample=False
-    )[0]["summary_text"]
-
-    return summary
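The diff does not show the body of `concatenate_videos` between loading the clips and the final `write_videofile` call, so the following is only a hedged sketch of how the fixed audio track could be attached with moviepy after this change; the `set_audio`/`subclip` handling is an assumption, not the file's actual implementation.

from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips

def concatenate_videos(video_paths, output_path, audio_path="meow-meow-meow-tiktok.mp3"):
    # Load each generated scene clip and join them back to back.
    clips = [VideoFileClip(path) for path in video_paths]
    final_clip = concatenate_videoclips(clips)

    # Attach the bundled audio track, trimmed to the video's length
    # (assumed handling; the real body is not visible in the diff).
    audio = AudioFileClip(audio_path)
    final_clip = final_clip.set_audio(audio.subclip(0, min(audio.duration, final_clip.duration)))

    # Write the concatenated video with the codecs used in baseline_utils.py.
    final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
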
requirements.txt
CHANGED
@@ -4,7 +4,8 @@ google-auth
 google-generativeai
 diffusers
 torch
+streamlit
 transformers
 accelerate
-
-
+moviepy
+SentencePiece
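The dependency stack above (torch, diffusers, transformers, accelerate, SentencePiece) backs the CogVideoX pipeline that generate_video loads. As a rough sketch of that generation step, assuming a CUDA device, with the prompt, dtype, frame count, and step count chosen purely for illustration (only the THUDM/CogVideoX-5b checkpoint, the export_to_video helper, and the fps=24 value come from the diff):

import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

# Load the CogVideoX checkpoint referenced in baseline_utils.py.
pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    torch_dtype=torch.bfloat16,  # assumed dtype; not shown in the diff
)
pipe.to("cuda")

# Render a single scene prompt into a short clip.
frames = pipe(
    prompt="A person writing in a diary at a sunlit desk",  # illustrative prompt
    num_frames=49,
    num_inference_steps=50,
    guidance_scale=6.0,
).frames[0]

# Export at the lowered frame rate used by the app.
export_to_video(frames, "videos/scene_0.mp4", fps=24)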