IrisDeng committed on
Commit
f1d429a
·
verified ·
1 Parent(s): ee1b029

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -48
app.py CHANGED
@@ -6,62 +6,55 @@ import torch
6
 
7
  st.set_page_config(page_title="Image to Audio Story", page_icon="🦜")
8
 
 
 
 
 
 
9
  def extract_image_caption(image_data):
10
- img_obj = Image.open(image_data)
11
- caption_pipeline = pipeline(
12
- "image-to-text",
13
- model="Salesforce/blip-image-captioning-base",
14
- )
15
- caption_results = caption_pipeline(img_obj)
16
- caption_text = caption_results[0]['generated_text']
17
- return caption_text
18
 
19
  def compose_story_from_caption(caption_detail):
20
- story_pipeline = pipeline(
21
- "text-generation",
22
- model="Qwen/Qwen2-1.5B",
23
- )
24
- prompt_text = (
25
- "You are a talented and imaginative storyteller for children aged 3 to 10. "
26
- "Using the details derived from the image below, craft a captivating tale that goes beyond merely describing the scene. "
27
- "Let your creativity shine by introducing engaging characters, adventurous journeys, and delightful surprises. "
28
- "Your story should be vivid, original, and between 100 and 300 words in length.\n\n"
29
- f"Image Details: {caption_detail}\n\nStory:"
30
- )
31
- story_results = story_pipeline(prompt_text, num_return_sequences=1)
32
- story_text = story_results[0]['generated_text']
33
- if "Story:" in story_text:
34
- story = story_text.split("Story:", 1)[1].strip()
35
- else:
36
- story = story_text.strip()
37
- return story
38
 
39
  def convert_text_to_audio(text_content, audio_path="output.mp3"):
40
- tts_engine = gTTS(text=text_content, lang="en")
41
- tts_engine.save(audio_path)
42
- return audio_path
43
 
44
  def run_app():
45
- st.markdown("<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
46
- st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")
47
- uploaded_image = st.file_uploader("Select an Image", type=["png", "jpg", "jpeg"])
48
-
49
- if uploaded_image is not None:
50
- image_display = Image.open(uploaded_image)
51
- st.image(image_display, caption="Uploaded Image", use_container_width=True)
52
 
53
- with st.spinner("Generating caption for the image..."):
54
- caption_text = extract_image_caption(uploaded_image)
55
- st.write("**Generated Caption:**", caption_text)
56
-
57
- with st.spinner("Composing story..."):
58
- story_text = compose_story_from_caption(caption_text)
59
- st.write("**Story:**")
60
- st.write(story_text)
61
-
62
- with st.spinner("Converting text to audio..."):
63
- audio_file = convert_text_to_audio(story_text)
64
- st.audio(audio_file, format="audio/mp3")
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  if __name__ == "__main__":
67
  run_app()
 
6
 
7
  st.set_page_config(page_title="Image to Audio Story", page_icon="🦜")
8
 
9
+ # Load models once
10
+ caption_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
11
+ story_pipeline = pipeline("text-generation", model="Qwen/Qwen2-1.5B")
12
+
13
+
14
  def extract_image_caption(image_data):
15
+ img_obj = Image.open(image_data)
16
+ caption_results = caption_pipeline(img_obj)
17
+ return caption_results[0]['generated_text']
 
 
 
 
 
18
 
19
  def compose_story_from_caption(caption_detail):
20
+ prompt_text = (
21
+ "You are a talented and imaginative storyteller for children aged 3 to 10. "
22
+ "Using the details derived from the image below, craft a captivating tale that goes beyond merely describing the scene. "
23
+ "Let your creativity shine by introducing engaging characters, adventurous journeys, and delightful surprises. "
24
+ "Your story should be vivid, original, and between 100 and 300 words in length.\n\n"
25
+ f"Image Details: {caption_detail}\n\nStory:"
26
+ )
27
+ story_results = story_pipeline(prompt_text, num_return_sequences=1)
28
+ story_text = story_results[0]['generated_text']
29
+ return story_text.split("Story:", 1)[1].strip() if "Story:" in story_text else story_text.strip()
 
 
 
 
 
 
 
 
30
 
31
  def convert_text_to_audio(text_content, audio_path="output.mp3"):
32
+ tts_engine = gTTS(text=text_content, lang="en")
33
+ tts_engine.save(audio_path)
34
+ return audio_path
35
 
36
  def run_app():
37
+ st.markdown("<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
38
+ st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")
 
 
 
 
 
39
 
40
+ uploaded_image = st.file_uploader("Select an Image", type=["png", "jpg", "jpeg"])
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ if uploaded_image is not None:
43
+ image_display = Image.open(uploaded_image)
44
+ st.image(image_display, caption="Uploaded Image", use_container_width=True)
45
+
46
+ with st.spinner("Generating caption for the image..."):
47
+ caption_text = extract_image_caption(uploaded_image)
48
+ st.write("**Generated Caption:**", caption_text)
49
+
50
+ with st.spinner("Composing story..."):
51
+ story_text = compose_story_from_caption(caption_text)
52
+ st.write("**Story:**")
53
+ st.write(story_text)
54
+
55
+ with st.spinner("Converting text to audio..."):
56
+ audio_file = convert_text_to_audio(story_text)
57
+ st.audio(audio_file, format="audio/mp3")
58
+
59
  if __name__ == "__main__":
60
  run_app()