leoxia711 committed
Commit 03b1165 · verified · 1 Parent(s): fa695ee

Update function.py

Files changed (1): function.py +44 -36
function.py CHANGED
@@ -1,62 +1,74 @@
 from transformers import pipeline
 import torch
-from datasets import load_dataset
-import soundfile as sf
+import soundfile as sf
 from transformers import AutoTokenizer, AutoModelForCausalLM, VitsModel
 import numpy as np
 import re
 
-# Convert image to text description using a vision-language model
+# ====================
+# Load models globally
+# ====================
+
+# Image captioning pipeline
+image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
+
+# Story generation model (DistilGPT2)
+story_tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+story_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+
+# Text-to-speech model (Facebook MMS)
+tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
+tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
+
+
+# ====================
+# Function 1: Image → Text
+# ====================
 def img2text(url):
-    image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
     text = image_to_text_model(url)[0]["generated_text"]
-
-    # Remove art-related words to make the description more neutral
     for word in ["illustration", "drawing", "painting", "rendering"]:
         text = text.replace(word, "").strip()
-
     return text
 
-# Generate a short story from a given text prompt
+
+# ====================
+# Function 2: Text → Story
+# ====================
 def text2story(caption):
     """
-    Generates a child-friendly story (50–100 words) from a given image caption.
-    Ensures it avoids dark/adult themes and encourages a whimsical tone.
+    Generates a child-friendly story (up to 100 words) from a given image caption.
+    Uses DistilGPT2 for fast story generation.
     """
-    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-    model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-
-    # Prompt to guide the model
     prompt = (
-        f"Write a short, cheerful story for a 5-year-old based entirely on: {caption}. "
-        f"Make it magical, fun, and avoid anything scary or sad.\n\nStory:"
+        f"Write a short, cheerful story for a 5-year-old. The story must mention {caption}. "
+        f"The characters and location should be entirely based on {caption}.\n\nStory:"
     )
 
-    inputs = tokenizer(prompt, return_tensors="pt")
+    inputs = story_tokenizer(prompt, return_tensors="pt")
 
-    outputs = model.generate(
+    outputs = story_model.generate(
         inputs.input_ids,
-        max_length=150,
+        max_length=120,  # faster than 200, still enough for ~90 words
         do_sample=True,
         top_p=0.95,
-        temperature=0.9,
-        pad_token_id=tokenizer.eos_token_id
+        temperature=0.8,
+        pad_token_id=story_tokenizer.eos_token_id
     )
 
-    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    output_text = story_tokenizer.decode(outputs[0], skip_special_tokens=True)
 
     # Remove prompt prefix if present
     if "Story:" in output_text:
         output_text = output_text.split("Story:")[-1].strip()
 
-    # Limit to ~100 words, but try to cut at a sentence ending (., !, ?)
+    # Trim to 100 words max, cutting at sentence boundaries
     word_list = output_text.split()
-    cut_text = " ".join(word_list[:130])  # give buffer for sentence endings
+    cut_text = " ".join(word_list[:130])  # small buffer
 
     sentences = re.split(r'(?<=[.!?])\s+', cut_text)
-
     trimmed_story = ""
     total_words = 0
+
     for sentence in sentences:
         sentence = sentence.strip()
         word_count = len(sentence.split())
@@ -68,7 +80,6 @@ def text2story(caption):
 
     story = trimmed_story.strip()
 
-    # If no sentence-ending punctuation found, just force cut at 100 words
     if not story:
         story = " ".join(word_list[:100])
         if not story.endswith(('.', '!', '?')):
@@ -76,20 +87,17 @@ def text2story(caption):
 
     return story
 
-# Convert text story into audio using a speech synthesis model
-def text2audio(story_text):
-    model = VitsModel.from_pretrained("facebook/mms-tts-eng")
-    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
-
-    inputs = tokenizer(story_text, return_tensors="pt")
 
-    # Important: convert input IDs to LongTensor to avoid runtime error
-    inputs["input_ids"] = inputs["input_ids"].long()
+# ====================
+# Function 3: Story → Audio
+# ====================
+def text2audio(story_text):
+    inputs = tts_tokenizer(story_text, return_tensors="pt")
+    inputs["input_ids"] = inputs["input_ids"].long()  # Ensure correct type for VitsModel
 
     with torch.no_grad():
-        output = model(**inputs).waveform
+        output = tts_model(**inputs).waveform
 
-    # Convert tensor to NumPy array and save it as a .wav file
     audio_np = output.squeeze().cpu().numpy()
     output_path = "generated_audio.wav"
    sf.write(output_path, audio_np, 22050)
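
For reference, a minimal sketch of how the three updated functions might be chained after this commit. The import assumes function.py is on the Python path, and the image path is a hypothetical placeholder, not part of the commit:

# Hypothetical usage sketch, not part of the commit
from function import img2text, text2story, text2audio

caption = img2text("example.jpg")   # hypothetical image path; BLIP caption with art words stripped
story = text2story(caption)         # child-friendly story, trimmed to at most ~100 words
text2audio(story)                   # synthesizes speech and writes generated_audio.wav

print("Caption:", caption)
print("Story:", story)

One caveat worth double-checking: VITS checkpoints report their native sampling rate in tts_model.config.sampling_rate (16 kHz for the MMS English model), so the hard-coded 22050 passed to sf.write would make the saved file play back faster and higher-pitched than intended; writing with the config value would be safer.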