import re

import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, VitsModel
# Convert an image to a text description using a vision-language model
def img2text(url):
    image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    text = image_to_text_model(url)[0]["generated_text"]
    # Remove art-related words to make the description more neutral
    for word in ["illustration", "drawing", "painting", "rendering"]:
        text = text.replace(word, "").strip()
    return text
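# Example usage (hypothetical URL; the caption shown is illustrative, not a
# guaranteed model output):
#   img2text("https://example.com/dog-in-park.jpg")
#   -> "a dog running through a grassy park"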
# Generate a short story from a given image caption
def text2story(caption):
    """
    Generates a child-friendly story (50-100 words) from a given image caption.
    Aims to avoid dark/adult themes and encourage a whimsical tone.
    """
    tokenizer = AutoTokenizer.from_pretrained("pranavpsv/gpt2-genre-story-generator")
    model = AutoModelForCausalLM.from_pretrained("pranavpsv/gpt2-genre-story-generator")
    # Prompt to guide the model
    prompt = (
        f"Write a heartwarming story for a child. "
        f"Must use {caption} as places and characters in the story. "
        f"\n\nStory:"
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs.input_ids,
        max_length=180,
        do_sample=True,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Strip the prompt prefix if the model echoed it back
    if "Story:" in output_text:
        output_text = output_text.split("Story:")[-1].strip()
    # Limit to ~100 words, cutting at a sentence ending (., !, ?) where possible
    word_list = output_text.split()
    cut_text = " ".join(word_list[:130])  # buffer so a sentence ending can be found
    sentences = re.split(r'(?<=[.!?])\s+', cut_text)
    trimmed_story = ""
    total_words = 0
    for sentence in sentences:
        sentence = sentence.strip()
        word_count = len(sentence.split())
        if total_words + word_count > 100:
            break
        if sentence:
            trimmed_story += sentence + " "
            total_words += word_count
    story = trimmed_story.strip()
    # If no complete sentence fit under the limit, force a cut at 100 words
    if not story:
        story = " ".join(word_list[:100])
    # Make sure the story ends with sentence-ending punctuation
    if not story.endswith(('.', '!', '?')):
        story += "."
    return story
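# Example usage (output is illustrative only; sampling makes each run differ):
#   text2story("a dog running through a grassy park")
#   -> "Once upon a time, a cheerful dog dashed through the sunny park, ..."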
# Convert the text story into audio using a speech-synthesis model
def text2audio(story_text):
    model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    inputs = tokenizer(story_text, return_tensors="pt")
    # Important: convert input IDs to LongTensor to avoid a runtime dtype error
    inputs["input_ids"] = inputs["input_ids"].long()
    with torch.no_grad():
        output = model(**inputs).waveform
    # Convert the tensor to a NumPy array and save it as a .wav file, using the
    # model's own sampling rate (16 kHz for MMS-TTS, not a hardcoded 22,050 Hz,
    # which would make playback too fast)
    audio_np = output.squeeze().cpu().numpy()
    output_path = "generated_audio.wav"
    sf.write(output_path, audio_np, model.config.sampling_rate)
    return output_path
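
# Minimal end-to-end sketch tying the three steps together. The image URL is a
# placeholder (assumption); any path or URL the captioning pipeline accepts works,
# and all three models are downloaded on first run.
if __name__ == "__main__":
    caption = img2text("https://example.com/sample.jpg")  # hypothetical URL
    story = text2story(caption)
    audio_path = text2audio(story)
    print(f"Caption: {caption}")
    print(f"Story: {story}")
    print(f"Audio saved to: {audio_path}")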