import os
import tempfile

import streamlit as st
import torch
from diffusers import AudioLDMPipeline
from moviepy.editor import ImageSequenceClip, concatenate_videoclips
from moviepy.audio.AudioClip import AudioArrayClip
from PIL import Image
from transformers import AutoProcessor, ClapModel, BlipProcessor, BlipForConditionalGeneration

# Make the Space compatible with CPU duplicates
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load the diffusers pipeline
repo_id = "cvssp/audioldm-m-full"
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
pipe.unet = torch.compile(pipe.unet)

# CLAP model (only required for automatic scoring)
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")

generator = torch.Generator(device)

# Load the BLIP model and processor for image captioning
image_caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
image_caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Streamlit app setup
st.set_page_config(
    page_title="Text to Media",
    page_icon="📷 🎵",
)

st.title("Video Slideshow and Music Generator")

# Select the images
uploaded_files = st.file_uploader(
    "Select images (PNG, JPG, JPEG)",
    type=["png", "jpg", "jpeg"],
    accept_multiple_files=True,
)

if uploaded_files:
    # Create a temporary directory to store the images
    temp_dir = tempfile.mkdtemp()

    # Save the uploaded images to the temporary directory
    image_paths = []
    descriptions = []  # To store the generated captions
    for uploaded_file in uploaded_files:
        image_path = os.path.join(temp_dir, uploaded_file.name)
        with open(image_path, "wb") as f:
            f.write(uploaded_file.read())
        image_paths.append(image_path)

        # Generate a caption for each image
        try:
            image = Image.open(image_path).convert("RGB")
            inputs = image_caption_processor(image, return_tensors="pt").to(device)
            out = image_caption_model.generate(**inputs)
            caption = image_caption_processor.decode(out[0], skip_special_tokens=True)
            descriptions.append(caption)
        except Exception:
            descriptions.append("Error while generating the caption")

    # Build a video from the images
    st.header("Video Slideshow Creation")

    # Select how long each image is displayed, with a slider (in seconds)
    image_duration = st.slider("Select the display duration of each image (in seconds)", 1, 10, 4)

    # Frame rate (derived from the duration of each image)
    frame_rate = 1 / image_duration

    image_clips = [
        ImageSequenceClip([image_path], fps=frame_rate, durations=[image_duration])
        for image_path in image_paths
    ]
    final_clip = concatenate_videoclips(image_clips, method="compose")

    # Generate music from the captions
    st.header("Music Generation from the Captions")

    # Use the generated captions as the music prompt
    music_input = "\n".join(descriptions)
    st.text_area("Captions used for the music", music_input, height=200)

    # Music settings
    seed = st.number_input("Seed", value=45)
    music_duration = final_clip.duration  # Make sure the music duration matches the video

    if st.button("Generate music"):
        waveforms = pipe(
            music_input,
            audio_length_in_s=music_duration,
            guidance_scale=2.5,  # You can adjust this value
            num_inference_steps=100,
            num_waveforms_per_prompt=1,
            generator=generator.manual_seed(int(seed)),
        )["audios"]

        # AudioLDM generates mono audio at a 16 kHz sample rate
        sample_rate = 16000
        waveform = waveforms[0]

        # Make sure the generated music is no longer than the video
        max_samples = int(sample_rate * final_clip.duration)
        if waveform.shape[0] > max_samples:
            waveform = waveform[:max_samples]

        # Embed the music in the video (AudioArrayClip expects an array of shape (n_samples, n_channels))
        audio_clip = AudioArrayClip(waveform.reshape(-1, 1), fps=sample_rate)
        final_clip = final_clip.set_audio(audio_clip)

        # Write the video to a temporary file and display it with the embedded music
        video_path = os.path.join(temp_dir, "slideshow.mp4")
        final_clip.write_videofile(video_path, fps=24, audio_codec="aac")
        st.video(video_path, format="video/mp4", start_time=0)