import re

import torch
import soundfile as sf
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, VitsModel

# Convert image to text description using a vision-language model
def img2text(url):
    image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    text = image_to_text_model(url)[0]["generated_text"]
    # Remove art-related words to make the description more neutral,
    # then collapse the double spaces the removal leaves behind
    for word in ["illustration", "drawing", "painting", "rendering"]:
        text = text.replace(word, "").strip()
    text = " ".join(text.split())
    return text

# Generate a short story from a given text prompt
def text2story(caption):
    """
    Generates a child-friendly story (50–100 words) from a given image caption.
    The prompt steers the model toward a whimsical tone and away from
    dark or adult themes.
    """
    tokenizer = AutoTokenizer.from_pretrained("pranavpsv/gpt2-genre-story-generator")
    model = AutoModelForCausalLM.from_pretrained("pranavpsv/gpt2-genre-story-generator")

    # Prompt to guide the model
    prompt = (
        "Write a heartwarming story for a child. "
        f"The story must use {caption} as its places and characters."
        "\n\nStory:"
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=180,
        do_sample=True,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove the prompt prefix if present
    if "Story:" in output_text:
        output_text = output_text.split("Story:")[-1].strip()

    # Limit to ~100 words, cutting at a sentence ending (., !, ?) where possible
    word_list = output_text.split()
    cut_text = " ".join(word_list[:130])  # buffer so a sentence ending can be found
    sentences = re.split(r'(?<=[.!?])\s+', cut_text)
    trimmed_story = ""
    total_words = 0
    for sentence in sentences:
        sentence = sentence.strip()
        word_count = len(sentence.split())
        if total_words + word_count > 100:
            break
        if sentence:
            trimmed_story += sentence + " "
            total_words += word_count
    story = trimmed_story.strip()

    # If no sentence-ending punctuation was found, force a cut at 100 words
    if not story:
        story = " ".join(word_list[:100])
    if not story.endswith(('.', '!', '?')):
        story += "."
    return story
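
# Hedged illustration (not from the original file) of the trimming step in
# text2story: the lookbehind split keeps sentence-ending punctuation attached,
# e.g. re.split(r'(?<=[.!?])\s+', "Hi there. All good! Yes?")
# returns ['Hi there.', 'All good!', 'Yes?'], so whole sentences can be
# accumulated until the 100-word budget is reached.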

# Convert the text story into audio using a speech synthesis model
def text2audio(story_text):
    model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    inputs = tokenizer(story_text, return_tensors="pt")
    # Important: convert input IDs to LongTensor to avoid a runtime error
    inputs["input_ids"] = inputs["input_ids"].long()
    with torch.no_grad():
        output = model(**inputs).waveform
    # Convert the tensor to a NumPy array and save it as a .wav file at the
    # model's own sampling rate (16 kHz for MMS-TTS); a hard-coded 22050
    # would make playback faster and higher-pitched
    audio_np = output.squeeze().cpu().numpy()
    output_path = "generated_audio.wav"
    sf.write(output_path, audio_np, model.config.sampling_rate)
    return output_path
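
# Minimal end-to-end sketch (an addition, not part of the original Space):
# wires the three steps together; "example.jpg" is a placeholder path, not
# a file named in the source.
if __name__ == "__main__":
    caption = img2text("example.jpg")   # image -> neutral caption
    story = text2story(caption)         # caption -> ~100-word story
    audio_path = text2audio(story)      # story -> generated_audio.wav
    print(f"Caption: {caption}\n\nStory: {story}\n\nAudio saved to: {audio_path}")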