Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Community

Bils committed on Jan 11

Commit

2019ee0

verified ·

1 Parent(s): 3229fa2

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -130

app.py CHANGED Viewed

@@ -1,142 +1,120 @@
-import spaces
-import os
-import tempfile
 import gradio as gr
-from dotenv import load_dotenv
 import torch
-from scipy.io.wavfile import write
-from diffusers import DiffusionPipeline
-from transformers import pipeline
-from pathlib import Path
-load_dotenv()
-hf_token = os.getenv("HF_TKN")
-device_id = 0 if torch.cuda.is_available() else -1
-captioning_pipeline = pipeline(
-    "image-to-text",
-    model="nlpconnect/vit-gpt2-image-captioning",
-    device=device_id
-)
-pipe = DiffusionPipeline.from_pretrained(
-    "cvssp/audioldm2",
-    use_auth_token=hf_token
 )
-@spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
     try:
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-            temp_file.write(image_file)
-            temp_image_path = temp_file.name
-        results = captioning_pipeline(temp_image_path)
-        if not results or not isinstance(results, list):
-            return "Error: Could not generate caption.", True
-        caption = results[0].get("generated_text", "").strip()
-        if not caption:
-            return "No caption was generated.", True
-        return caption, False
     except Exception as e:
-        return f"Error analyzing image: {e}", True
-@spaces.GPU(duration=120)
-def get_audioldm_from_caption(caption):
     try:
-        pipe.to("cuda")
-        audio_output = pipe(
-            prompt=caption,
-            num_inference_steps=50,
-            guidance_scale=7.5
         )
-        pipe.to("cpu")
-        audio = audio_output.audios[0]
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-            write(temp_wav.name, 16000, audio)
-            return temp_wav.name
     except Exception as e:
-        print(f"Error generating audio from caption: {e}")
-        return None
-css = """
-#col-container{
-    margin: 0 auto;
-    max-width: 800px;
-    }
-"""
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.HTML("""
-    <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-    <p style="text-align: center;">
-        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-    </p>
-        """)
-    gr.Markdown("""
-    Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-    descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-    **💡 How it works:**
-    1. **Upload an image**: Choose an image that you'd like to analyze.
-    2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-    3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-       sound effect that matches the image context.
-    Enjoy the journey from visual to auditory sensation with just a few clicks!
-    """)
-    image_upload = gr.File(label="Upload Image", type="binary")
-    generate_description_button = gr.Button("Generate Description")
-    caption_display = gr.Textbox(label="Image Description", interactive=False)
-    generate_sound_button = gr.Button("Generate Sound Effect")
-    audio_output = gr.Audio(label="Generated Sound Effect")
-    gr.Markdown("""
-    ## 👥 How You Can Contribute
-    We welcome contributions and suggestions for improvements. Your feedback is invaluable
-    to the continuous enhancement of this application.
-    For support, questions, or to contribute, please contact us at
-    [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
-    Support our work and get involved by donating through
-    [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
-    """)
-    gr.Markdown("""
-    ## 📢 Stay Connected
-    This app is a testament to the creative possibilities that emerge when technology meets art.
-    Enjoy exploring the auditory landscape of your images!
-    """)
-    def update_caption(image_file):
-        description, _ = analyze_image_with_free_model(image_file)
-        return description
-    def generate_sound(description):
-        if not description or description.startswith("Error"):
-            return None
-        audio_path = get_audioldm_from_caption(description)
-        return audio_path
-    generate_description_button.click(
-        fn=update_caption,
-        inputs=image_upload,
-        outputs=caption_display
-    )
-    generate_sound_button.click(
-        fn=generate_sound,
-        inputs=caption_display,
-        outputs=audio_output
-    )
-demo.launch(debug=True, share=True)

 import gradio as gr
+import os
 import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    pipeline,
+    AutoProcessor,
+    MusicgenForConditionalGeneration
 )
+import scipy.io.wavfile as wav
+# ---------------------------------------------------------------------
+# Load Llama 3 Model with Zero GPU
+# ---------------------------------------------------------------------
+def load_llama_pipeline_zero_gpu(model_id: str, token: str):
     try:
+        if not torch.cuda.is_available():
+            raise RuntimeError("ZeroGPU is not properly initialized or GPU is unavailable.")
+        tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            use_auth_token=token,
+            torch_dtype=torch.float16,
+            device_map="auto",  # Use device map to offload computations
+            trust_remote_code=True  # Enables execution of remote code for Zero GPU
+        )
+        return pipeline("text-generation", model=model, tokenizer=tokenizer)
     except Exception as e:
+        return str(e)
+# ---------------------------------------------------------------------
+# Generate Radio Script
+# ---------------------------------------------------------------------
+def generate_script(user_input: str, pipeline_llama):
     try:
+        system_prompt = (
+            "You are a top-tier radio imaging producer using Llama 3. "
+            "Take the user's concept and craft a short, creative promo script."
         )
+        combined_prompt = f"{system_prompt}\nUser concept: {user_input}\nRefined script:"
+        result = pipeline_llama(combined_prompt, max_new_tokens=200, do_sample=True, temperature=0.9)
+        return result[0]['generated_text'].split("Refined script:")[-1].strip()
     except Exception as e:
+        return f"Error generating script: {e}"
+# ---------------------------------------------------------------------
+# Load MusicGen Model
+# ---------------------------------------------------------------------
+def load_musicgen_model():
+    try:
+        model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+        return model, processor
+    except Exception as e:
+        return None, str(e)
+# ---------------------------------------------------------------------
+# Generate Audio
+# ---------------------------------------------------------------------
+def generate_audio(prompt: str, audio_length: int, mg_model, mg_processor):
+    try:
+        inputs = mg_processor(text=[prompt], padding=True, return_tensors="pt")
+        outputs = mg_model.generate(**inputs, max_new_tokens=audio_length)
+        sr = mg_model.config.audio_encoder.sampling_rate
+        audio_data = outputs[0, 0].cpu().numpy()
+        normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")
+        output_file = "radio_jingle.wav"
+        wav.write(output_file, rate=sr, data=normalized_audio)
+        return sr, normalized_audio
+    except Exception as e:
+        return str(e)
+# ---------------------------------------------------------------------
+# Gradio Interface
+# ---------------------------------------------------------------------
+def radio_imaging_app(user_prompt, llama_model_id, hf_token, audio_length):
+    # Load Llama 3 Pipeline with Zero GPU
+    pipeline_llama = load_llama_pipeline_zero_gpu(llama_model_id, hf_token)
+    if isinstance(pipeline_llama, str):
+        return pipeline_llama, None
+    # Generate Script
+    script = generate_script(user_prompt, pipeline_llama)
+    # Load MusicGen
+    mg_model, mg_processor = load_musicgen_model()
+    if isinstance(mg_processor, str):
+        return script, mg_processor
+    # Generate Audio
+    audio_data = generate_audio(script, audio_length, mg_model, mg_processor)
+    if isinstance(audio_data, str):
+        return script, audio_data
+    return script, audio_data
+# ---------------------------------------------------------------------
+# Interface
+# ---------------------------------------------------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("# 🎧 AI Radio Imaging with Llama 3 + MusicGen (Zero GPU)")
+    with gr.Row():
+        user_prompt = gr.Textbox(label="Enter your promo idea", placeholder="E.g., A 15-second hype jingle for a morning talk show, fun and energetic.")
+        llama_model_id = gr.Textbox(label="Llama 3 Model ID", value="meta-llama/Meta-Llama-3-70B")
+        hf_token = gr.Textbox(label="Hugging Face Token", type="password")
+        audio_length = gr.Slider(label="Audio Length (tokens)", minimum=128, maximum=1024, step=64, value=512)
+    generate_button = gr.Button("Generate Promo Script and Audio")
+    script_output = gr.Textbox(label="Generated Script")
+    audio_output = gr.Audio(label="Generated Audio", type="numpy")
+    generate_button.click(radio_imaging_app,
+                          inputs=[user_prompt, llama_model_id, hf_token, audio_length],
+                          outputs=[script_output, audio_output])
+# ---------------------------------------------------------------------
+# Launch App
+# ---------------------------------------------------------------------
+demo.launch()