Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -1,142 +1,120 @@ | |
| 1 | 
            -
            import spaces
         | 
| 2 | 
            -
            import os
         | 
| 3 | 
            -
            import tempfile
         | 
| 4 | 
             
            import gradio as gr
         | 
| 5 | 
            -
             | 
| 6 | 
             
            import torch
         | 
| 7 | 
            -
            from  | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
            hf_token = os.getenv("HF_TKN")
         | 
| 14 | 
            -
             | 
| 15 | 
            -
            device_id = 0 if torch.cuda.is_available() else -1
         | 
| 16 | 
            -
             | 
| 17 | 
            -
            captioning_pipeline = pipeline(
         | 
| 18 | 
            -
                "image-to-text",
         | 
| 19 | 
            -
                model="nlpconnect/vit-gpt2-image-captioning",
         | 
| 20 | 
            -
                device=device_id
         | 
| 21 | 
            -
            )
         | 
| 22 | 
            -
             | 
| 23 | 
            -
            pipe = DiffusionPipeline.from_pretrained(
         | 
| 24 | 
            -
                "cvssp/audioldm2",
         | 
| 25 | 
            -
                use_auth_token=hf_token
         | 
| 26 | 
             
            )
         | 
|  | |
| 27 |  | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
|  | |
|  | |
| 30 | 
             
                try:
         | 
| 31 | 
            -
                     | 
| 32 | 
            -
                         | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
                         | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
                     | 
| 41 | 
            -
             | 
| 42 | 
            -
                    return caption, False
         | 
| 43 | 
            -
             | 
| 44 | 
             
                except Exception as e:
         | 
| 45 | 
            -
                    return  | 
| 46 |  | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
|  | |
|  | |
| 49 | 
             
                try:
         | 
| 50 | 
            -
                     | 
| 51 | 
            -
             | 
| 52 | 
            -
                         | 
| 53 | 
            -
                        num_inference_steps=50,
         | 
| 54 | 
            -
                        guidance_scale=7.5
         | 
| 55 | 
             
                    )
         | 
| 56 | 
            -
                     | 
| 57 | 
            -
                     | 
| 58 | 
            -
             | 
| 59 | 
            -
                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
         | 
| 60 | 
            -
                        write(temp_wav.name, 16000, audio)
         | 
| 61 | 
            -
                        return temp_wav.name
         | 
| 62 | 
            -
             | 
| 63 | 
             
                except Exception as e:
         | 
| 64 | 
            -
                     | 
| 65 | 
            -
                    return None
         | 
| 66 | 
            -
             | 
| 67 | 
            -
            css = """
         | 
| 68 | 
            -
            #col-container{
         | 
| 69 | 
            -
                margin: 0 auto;
         | 
| 70 | 
            -
                max-width: 800px;
         | 
| 71 | 
            -
                }
         | 
| 72 | 
            -
            """
         | 
| 73 |  | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
                 | 
| 79 | 
            -
                     | 
| 80 | 
            -
             | 
| 81 | 
            -
                     | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
                Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a 
         | 
| 85 | 
            -
                descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
         | 
| 86 | 
            -
                
         | 
| 87 | 
            -
                **💡 How it works:**
         | 
| 88 | 
            -
                1. **Upload an image**: Choose an image that you'd like to analyze.
         | 
| 89 | 
            -
                2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
         | 
| 90 | 
            -
                3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a 
         | 
| 91 | 
            -
                   sound effect that matches the image context.
         | 
| 92 | 
            -
                
         | 
| 93 | 
            -
                Enjoy the journey from visual to auditory sensation with just a few clicks!
         | 
| 94 | 
            -
                """)
         | 
| 95 | 
            -
             | 
| 96 | 
            -
                image_upload = gr.File(label="Upload Image", type="binary")
         | 
| 97 | 
            -
                generate_description_button = gr.Button("Generate Description")
         | 
| 98 | 
            -
                caption_display = gr.Textbox(label="Image Description", interactive=False)
         | 
| 99 | 
            -
                generate_sound_button = gr.Button("Generate Sound Effect")
         | 
| 100 | 
            -
                audio_output = gr.Audio(label="Generated Sound Effect")
         | 
| 101 | 
            -
             | 
| 102 | 
            -
                gr.Markdown("""
         | 
| 103 | 
            -
                ## 👥 How You Can Contribute
         | 
| 104 | 
            -
                We welcome contributions and suggestions for improvements. Your feedback is invaluable 
         | 
| 105 | 
            -
                to the continuous enhancement of this application. 
         | 
| 106 | 
            -
                
         | 
| 107 | 
            -
                For support, questions, or to contribute, please contact us at 
         | 
| 108 | 
            -
                [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
         | 
| 109 | 
            -
                
         | 
| 110 | 
            -
                Support our work and get involved by donating through 
         | 
| 111 | 
            -
                [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
         | 
| 112 | 
            -
                """)
         | 
| 113 | 
            -
             | 
| 114 | 
            -
                gr.Markdown("""
         | 
| 115 | 
            -
                ## 📢 Stay Connected
         | 
| 116 | 
            -
                This app is a testament to the creative possibilities that emerge when technology meets art. 
         | 
| 117 | 
            -
                Enjoy exploring the auditory landscape of your images!
         | 
| 118 | 
            -
                """)
         | 
| 119 | 
            -
             | 
| 120 | 
            -
                def update_caption(image_file):
         | 
| 121 | 
            -
                    description, _ = analyze_image_with_free_model(image_file)
         | 
| 122 | 
            -
                    return description
         | 
| 123 | 
            -
             | 
| 124 | 
            -
                def generate_sound(description):
         | 
| 125 | 
            -
                    if not description or description.startswith("Error"):
         | 
| 126 | 
            -
                        return None
         | 
| 127 | 
            -
                    audio_path = get_audioldm_from_caption(description)
         | 
| 128 | 
            -
                    return audio_path
         | 
| 129 | 
            -
             | 
| 130 | 
            -
                generate_description_button.click(
         | 
| 131 | 
            -
                    fn=update_caption,
         | 
| 132 | 
            -
                    inputs=image_upload,
         | 
| 133 | 
            -
                    outputs=caption_display
         | 
| 134 | 
            -
                )
         | 
| 135 | 
            -
             | 
| 136 | 
            -
                generate_sound_button.click(
         | 
| 137 | 
            -
                    fn=generate_sound,
         | 
| 138 | 
            -
                    inputs=caption_display,
         | 
| 139 | 
            -
                    outputs=audio_output
         | 
| 140 | 
            -
                )
         | 
| 141 |  | 
| 142 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
|  | |
|  | |
|  | |
| 1 | 
             
            import gradio as gr
         | 
| 2 | 
            +
            import os
         | 
| 3 | 
             
            import torch
         | 
| 4 | 
            +
            from transformers import (
         | 
| 5 | 
            +
                AutoTokenizer, 
         | 
| 6 | 
            +
                AutoModelForCausalLM, 
         | 
| 7 | 
            +
                pipeline,
         | 
| 8 | 
            +
                AutoProcessor, 
         | 
| 9 | 
            +
                MusicgenForConditionalGeneration
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 10 | 
             
            )
         | 
| 11 | 
            +
            import scipy.io.wavfile as wav
         | 
| 12 |  | 
| 13 | 
            +
            # ---------------------------------------------------------------------
         | 
| 14 | 
            +
            # Load Llama 3 Model with Zero GPU
         | 
| 15 | 
            +
            # ---------------------------------------------------------------------
         | 
| 16 | 
            +
            def load_llama_pipeline_zero_gpu(model_id: str, token: str, trust_remote_code: bool = False):
                """Build a text-generation pipeline for ``model_id`` on the available GPU.

                Args:
                    model_id: Hub repository id of the causal language model to load.
                    token: Hugging Face access token used for the tokenizer and model downloads.
                    trust_remote_code: Whether modelling code shipped inside the repository
                        may be executed. Defaults to False because ``model_id`` is supplied
                        by the caller and an arbitrary repository must not be able to run
                        its own code in this process; opt in explicitly for repos you trust.

                Returns:
                    The text-generation pipeline on success. On any failure the error
                    message is returned as a ``str`` instead of raising, so callers tell
                    the two outcomes apart with ``isinstance(result, str)``.
                """
                try:
                    # Fail early with a readable message instead of a device-map error later.
                    if not torch.cuda.is_available():
                        raise RuntimeError("ZeroGPU is not properly initialized or GPU is unavailable.")

                    # NOTE(review): newer transformers releases prefer `token=` over
                    # `use_auth_token=`; confirm the pinned version before switching.
                    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
                    model = AutoModelForCausalLM.from_pretrained(
                        model_id,
                        use_auth_token=token,
                        torch_dtype=torch.float16,
                        device_map="auto",  # let the library decide weight placement
                        trust_remote_code=trust_remote_code,
                    )
                    return pipeline("text-generation", model=model, tokenizer=tokenizer)
                except Exception as e:
                    # Error-as-string is the contract the caller checks for.
                    return str(e)
         | 
| 31 |  | 
| 32 | 
            +
            # ---------------------------------------------------------------------
         | 
| 33 | 
            +
            # Generate Radio Script
         | 
| 34 | 
            +
            # ---------------------------------------------------------------------
         | 
| 35 | 
            +
            def generate_script(user_input: str, pipeline_llama):
                """Turn a promo concept into a short script using a text-generation pipeline.

                Args:
                    user_input: Free-text concept for the promo.
                    pipeline_llama: Callable invoked as
                        ``pipeline_llama(prompt, max_new_tokens=..., do_sample=..., temperature=...)``
                        that returns a list whose first item has a ``'generated_text'`` key.

                Returns:
                    The generated script with surrounding whitespace removed, or a string
                    starting with ``"Error generating script:"`` if anything fails.
                """
                try:
                    system_prompt = (
                        "You are a top-tier radio imaging producer using Llama 3. "
                        "Take the user's concept and craft a short, creative promo script."
                    )
                    combined_prompt = f"{system_prompt}\nUser concept: {user_input}\nRefined script:"
                    result = pipeline_llama(combined_prompt, max_new_tokens=200, do_sample=True, temperature=0.9)
                    generated = result[0]['generated_text']

                    if generated.startswith(combined_prompt):
                        # The output echoes the prompt: drop exactly that prefix. Splitting on
                        # the marker and keeping the last piece would throw away most of the
                        # script whenever the model (or the user's concept) repeats the marker.
                        script = generated[len(combined_prompt):]
                    else:
                        # Prompt not echoed verbatim; fall back to the marker heuristic.
                        script = generated.split("Refined script:")[-1]
                    return script.strip()
                except Exception as e:
                    return f"Error generating script: {e}"
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 46 |  | 
| 47 | 
            +
            # ---------------------------------------------------------------------
         | 
| 48 | 
            +
            # Load MusicGen Model
         | 
| 49 | 
            +
            # ---------------------------------------------------------------------
         | 
| 50 | 
            +
            def load_musicgen_model(model_name: str = "facebook/musicgen-small"):
                """Load a MusicGen model together with its processor.

                Args:
                    model_name: Checkpoint id passed to both ``from_pretrained`` calls.
                        Defaults to the checkpoint that was previously hard-coded.

                Returns:
                    ``(model, processor)`` on success. On failure ``(None, message)``,
                    where ``message`` is the error text; callers detect failure by
                    checking whether the second element is a ``str``.
                """
                try:
                    model = MusicgenForConditionalGeneration.from_pretrained(model_name)
                    processor = AutoProcessor.from_pretrained(model_name)
                    return model, processor
                except Exception as e:
                    return None, str(e)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 57 |  | 
| 58 | 
            +
            # ---------------------------------------------------------------------
         | 
| 59 | 
            +
            # Generate Audio
         | 
| 60 | 
            +
            # ---------------------------------------------------------------------
         | 
| 61 | 
            +
            def generate_audio(prompt: str, audio_length: int, mg_model, mg_processor):
                """Generate audio for ``prompt`` and return it as 16-bit PCM samples.

                Side effect: the samples are also written to ``radio_jingle.wav`` in the
                current working directory.

                Args:
                    prompt: Text description passed to the processor.
                    audio_length: Number of new tokens to generate; coerced to ``int``
                        because UI sliders may deliver a float.
                    mg_model: Model exposing ``generate(**inputs, max_new_tokens=...)`` and
                        ``config.audio_encoder.sampling_rate``.
                    mg_processor: Callable producing the keyword inputs for ``generate``.

                Returns:
                    ``(sampling_rate, int16_samples)`` on success, or the error message as
                    a ``str`` on failure.
                """
                try:
                    inputs = mg_processor(text=[prompt], padding=True, return_tensors="pt")
                    outputs = mg_model.generate(**inputs, max_new_tokens=int(audio_length))
                    sr = mg_model.config.audio_encoder.sampling_rate
                    audio_data = outputs[0, 0].cpu().numpy()

                    # Array-level abs/max instead of the Python builtin, which would walk
                    # the samples one element at a time.
                    peak = float(abs(audio_data).max())
                    if peak > 0:
                        normalized_audio = (audio_data / peak * 32767).astype("int16")
                    else:
                        # Pure silence: dividing by a zero peak would yield NaN samples.
                        normalized_audio = audio_data.astype("int16")

                    output_file = "radio_jingle.wav"
                    wav.write(output_file, rate=sr, data=normalized_audio)
                    return sr, normalized_audio
                except Exception as e:
                    return str(e)
         | 
| 73 | 
            +
             | 
| 74 | 
            +
            # ---------------------------------------------------------------------
         | 
| 75 | 
            +
            # Gradio Interface
         | 
| 76 | 
            +
            # ---------------------------------------------------------------------
         | 
| 77 | 
            +
            def radio_imaging_app(user_prompt, llama_model_id, hf_token, audio_length):
                """Run the full promo pipeline: load the LLM, write a script, render audio.

                Args:
                    user_prompt: The promo concept typed by the user.
                    llama_model_id: Hub id of the language model to load.
                    hf_token: Hugging Face access token for the model download.
                    audio_length: Number of audio tokens to generate.

                Returns:
                    A ``(text, audio)`` pair for the script textbox and the audio player.
                    ``audio`` is ``(sampling_rate, samples)`` on success and ``None``
                    whenever a step fails; in that case the failure reason is carried in
                    ``text``. Error strings are never returned in the audio slot, because
                    an audio output component would interpret a string as a file path.
                """
                # Load the language model; a str result is the error message.
                pipeline_llama = load_llama_pipeline_zero_gpu(llama_model_id, hf_token)
                if isinstance(pipeline_llama, str):
                    return pipeline_llama, None

                # Generate the script; do not feed an error message to the audio model.
                script = generate_script(user_prompt, pipeline_llama)
                if script.startswith("Error generating script:"):
                    return script, None

                # Load MusicGen; on failure the second element holds the error text.
                mg_model, mg_processor = load_musicgen_model()
                if isinstance(mg_processor, str):
                    return f"{script}\n\n[MusicGen could not be loaded: {mg_processor}]", None

                # Render the audio; a str result is the error message.
                audio_data = generate_audio(script, audio_length, mg_model, mg_processor)
                if isinstance(audio_data, str):
                    return f"{script}\n\n[Audio generation failed: {audio_data}]", None

                return script, audio_data
         | 
| 97 | 
            +
             | 
| 98 | 
            +
            # ---------------------------------------------------------------------
         | 
| 99 | 
            +
            # Interface
         | 
| 100 | 
            +
            # ---------------------------------------------------------------------
         | 
| 101 | 
            +
            with gr.Blocks() as demo:
                # Page heading.
                gr.Markdown("# 🎧 AI Radio Imaging with Llama 3 + MusicGen (Zero GPU)")

                # All user-supplied settings share one row.
                with gr.Row():
                    user_prompt = gr.Textbox(
                        label="Enter your promo idea",
                        placeholder="E.g., A 15-second hype jingle for a morning talk show, fun and energetic.",
                    )
                    llama_model_id = gr.Textbox(
                        label="Llama 3 Model ID",
                        value="meta-llama/Meta-Llama-3-70B",
                    )
                    hf_token = gr.Textbox(label="Hugging Face Token", type="password")
                    audio_length = gr.Slider(
                        label="Audio Length (tokens)",
                        minimum=128,
                        maximum=1024,
                        step=64,
                        value=512,
                    )

                # Trigger button followed by the two result widgets.
                generate_button = gr.Button("Generate Promo Script and Audio")
                script_output = gr.Textbox(label="Generated Script")
                audio_output = gr.Audio(label="Generated Audio", type="numpy")

                # One click runs the whole pipeline and fills both outputs.
                generate_button.click(
                    fn=radio_imaging_app,
                    inputs=[user_prompt, llama_model_id, hf_token, audio_length],
                    outputs=[script_output, audio_output],
                )

            # ---------------------------------------------------------------------
            # Launch App
            # ---------------------------------------------------------------------
            demo.launch()
         | 
