import re

import torch
import soundfile as sf
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    VitsModel,
)
# Convert an image to a text description using a vision-language model
def img2text(url):
    image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    text = image_to_text_model(url)[0]["generated_text"]
    # Remove art-related words so the caption reads as a neutral scene description
    for word in ["illustration", "drawing", "painting", "rendering"]:
        text = text.replace(word, "")
    # Collapse the double spaces left behind by the removals
    return re.sub(r"\s+", " ", text).strip()
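# Illustrative usage (the URL below is a placeholder assumption, not part of the
# app; the image-to-text pipeline accepts a URL, a local path, or a PIL image):
#
#   caption = img2text("https://example.com/photo.jpg")
#   print(caption)  # e.g. "a cat sitting on a sunny windowsill"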
# Generate a short story from a given text prompt
def text2story(caption):
    """
    Generate a child-friendly story (roughly 50-100 words) from an image caption.
    The prompt steers the model toward a whimsical tone and away from dark or
    adult themes.
    """
    tokenizer = AutoTokenizer.from_pretrained("pranavpsv/gpt2-genre-story-generator")
    model = AutoModelForCausalLM.from_pretrained("pranavpsv/gpt2-genre-story-generator")
    # Prompt to guide the model
    prompt = (
        f"Write a heartwarming story for a child. "
        f"Use {caption} for the places and characters in the story."
        f"\n\nStory:"
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=180,
        do_sample=True,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Strip the prompt prefix so only the generated story remains
    if "Story:" in output_text:
        output_text = output_text.split("Story:")[-1].strip()
    # Limit to ~100 words, preferring to cut at a sentence ending (., !, ?)
    word_list = output_text.split()
    cut_text = " ".join(word_list[:130])  # buffer so the final sentence can complete
    sentences = re.split(r"(?<=[.!?])\s+", cut_text)
    trimmed_story = ""
    total_words = 0
    for sentence in sentences:
        sentence = sentence.strip()
        word_count = len(sentence.split())
        if total_words + word_count > 100:
            break
        if sentence:
            trimmed_story += sentence + " "
            total_words += word_count
    story = trimmed_story.strip()
    # Fallback: if no complete sentence fit, force a cut at 100 words
    if not story:
        story = " ".join(word_list[:100])
    if not story.endswith((".", "!", "?")):
        story += "."
    return story
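# Illustrative usage (the caption string is a made-up example, not real model
# output; actual captions come from img2text above):
#
#   story = text2story("a dog running through a field of flowers")
#   print(len(story.split()))  # capped at roughly 100 words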
# Convert the text story into audio using a speech-synthesis model
def text2audio(story_text):
    model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    inputs = tokenizer(story_text, return_tensors="pt")
    # Important: VITS expects LongTensor input IDs; cast to avoid a runtime error
    inputs["input_ids"] = inputs["input_ids"].long()
    with torch.no_grad():
        output = model(**inputs).waveform
    # Convert the waveform tensor to a NumPy array and save it as a .wav file.
    # Use the model's own sampling rate (16 kHz for MMS-TTS) instead of a
    # hard-coded 22050, otherwise the audio plays back at the wrong speed.
    audio_np = output.squeeze().cpu().numpy()
    output_path = "generated_audio.wav"
    sf.write(output_path, audio_np, model.config.sampling_rate)
    return output_path
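# A minimal end-to-end sketch chaining the three steps above: caption an image,
# expand the caption into a story, then synthesize the story as speech. The
# image URL is a placeholder assumption; swap in any image source the
# captioning pipeline accepts.
if __name__ == "__main__":
    image_url = "https://example.com/photo.jpg"  # placeholder, not a real asset
    caption = img2text(image_url)
    print(f"Caption: {caption}")
    story = text2story(caption)
    print(f"Story: {story}")
    audio_file = text2audio(story)
    print(f"Audio saved to: {audio_file}")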