import gradio as gr
import numpy as np
import torch
from diffusers import StableDiffusionPipeline
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
# Text-to-text pipeline used by generate_prompt() to rewrite user descriptions.
# NOTE(review): facebook/bart-large-cnn is published as a summarization
# checkpoint, so it may condense rather than expand the text - confirm this is
# the intended model.
prompt_generator = pipeline(
    task="text2text-generation",
    model="facebook/bart-large-cnn",
)
def generate_prompt(description: str) -> str:
    """Rewrite a short description as an image prompt.

    The description is embedded in a fixed instruction sentence and sent to
    the module-level ``prompt_generator`` pipeline with ``max_length=150``.

    Args:
        description: Free-form text describing the desired image.

    Returns:
        The ``generated_text`` field of the pipeline's first result.
    """
    instruction = (
        f"Expand this description into a detailed prompt for an image: {description}"
    )
    candidates = prompt_generator(instruction, max_length=150)
    return candidates[0]["generated_text"]
# Image generator used by generate_image(); loaded once at import time and
# placed on the CPU.
# NOTE(review): the device is fixed to "cpu"; if GPU hosts are a target,
# consider selecting it via torch.cuda.is_available() - confirm deployment.
stable_diffusion = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base"
).to("cpu")
def generate_image(prompt: str):
    """Render ``prompt`` with the module-level ``stable_diffusion`` pipeline.

    Args:
        prompt: Text prompt handed to the pipeline unchanged.

    Returns:
        The first entry of the pipeline result's ``images``.
    """
    result = stable_diffusion(prompt)
    return result.images[0]
# Speech-to-text components used by transcribe_audio(). The processor and the
# model are loaded from the same checkpoint.
_WHISPER_CHECKPOINT = "openai/whisper-large"

processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
# Sampling rate (Hz) passed to the Whisper processor; audio is resampled to it.
_WHISPER_SAMPLE_RATE = 16_000


def _to_whisper_waveform(audio):
    """Convert a Gradio numpy audio value to mono float32 at 16 kHz."""
    if isinstance(audio, (tuple, list)) and len(audio) == 2:
        # gr.Audio(type="numpy") delivers (sample_rate, samples).
        sample_rate, samples = audio
    else:
        # A bare array carries no rate information; assume it is already 16 kHz.
        sample_rate, samples = _WHISPER_SAMPLE_RATE, audio

    samples = np.asarray(samples)
    if samples.size == 0:
        raise ValueError("The audio input is empty; nothing to transcribe.")

    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM (Gradio normally sends int16) is scaled to [-1.0, 1.0].
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Multi-channel audio arrives as (samples, channels); average to mono.
        samples = samples.mean(axis=1)

    if sample_rate != _WHISPER_SAMPLE_RATE:
        # Linear interpolation keeps the dependency footprint small; it is a
        # simple resampler without an anti-aliasing filter.
        source_len = samples.shape[0]
        target_len = max(1, int(round(source_len * _WHISPER_SAMPLE_RATE / sample_rate)))
        source_times = np.arange(source_len) / float(sample_rate)
        target_times = np.arange(target_len) / float(_WHISPER_SAMPLE_RATE)
        samples = np.interp(target_times, source_times, samples)

    return samples.astype(np.float32)


def transcribe_audio(audio) -> str:
    """Transcribe recorded speech with the module-level Whisper model.

    Args:
        audio: Either the ``(sample_rate, samples)`` tuple produced by
            ``gr.Audio(type="numpy")`` or a bare sample array, which is
            assumed to already be sampled at 16 kHz.

    Returns:
        The decoded transcription with special tokens removed.

    Raises:
        ValueError: If ``audio`` is ``None`` or contains no samples.
    """
    if audio is None:
        raise ValueError("No audio was provided; record or upload a clip first.")

    waveform = _to_whisper_waveform(audio)

    # The sampling rate is passed explicitly so the processor does not have to
    # assume what rate the waveform was recorded at.
    input_features = processor(
        waveform,
        sampling_rate=_WHISPER_SAMPLE_RATE,
        return_tensors="pt",
    ).input_features
    predicted_ids = model.generate(input_features)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
def process_input(description: str, creativity: float, include_background: bool):
    """Handle a text-tab request: build the prompt, then render the image.

    Args:
        description: Text passed to ``generate_prompt``.
        creativity: Value of the creativity slider.
            NOTE(review): this argument is never read in this function, so the
            slider currently has no effect on the result.
        include_background: When true, a fixed background phrase is appended
            to the generated prompt.

    Returns:
        A ``(prompt, image)`` tuple.
    """
    background_suffix = (
        " with a detailed, vibrant background." if include_background else ""
    )
    prompt = generate_prompt(description) + background_suffix
    return prompt, generate_image(prompt)
def process_audio_input(audio):
    """Handle a voice-tab request: transcribe, build the prompt, render.

    Args:
        audio: Value from the audio widget, forwarded to ``transcribe_audio``.

    Returns:
        A ``(prompt, image)`` tuple.
    """
    spoken_description = transcribe_audio(audio)
    prompt = generate_prompt(spoken_description)
    return prompt, generate_image(prompt)
# Input widgets for the text tab.
text_input = gr.Textbox(
    label="Enter Description",
    placeholder="E.g., A magical treehouse in the sky",
)
creativity_slider = gr.Slider(
    minimum=0,
    maximum=1,
    step=0.1,
    value=0.7,
    label="Creativity (0 to 1)",
)
background_checkbox = gr.Checkbox(
    label="Include Background",
    value=True,
)

# Input widget for the voice tab.
audio_input = gr.Audio(
    type="numpy",
    label="Speak your Description",
)
# Text tab: description, creativity and background controls go to
# process_input; the generated prompt and image are displayed.
interface = gr.Interface(
    title="Magical Image Generator",
    description="Enter a short description or speak it to generate a magical image! Adjust creativity and background options.",
    theme="huggingface",
    fn=process_input,
    inputs=[text_input, creativity_slider, background_checkbox],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
)
# Voice tab: the recorded clip goes to process_audio_input; the generated
# prompt and image are displayed.
interface_with_audio = gr.Interface(
    title="Magical Image Generator with Voice Input",
    description="Speak a short description and generate a magical image!",
    fn=process_audio_input,
    inputs=[audio_input],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
)
# Combine the text and voice interfaces into a single tabbed app.
demo = gr.TabbedInterface([interface, interface_with_audio])

# Start the web server only when this file is run as a script, so importing
# the module does not launch the UI as a side effect.
if __name__ == "__main__":
    demo.launch()