from PIL import Image
import numpy as np
import gradio as gr
import paddlehub as hub
import urllib
import urllib.request  # get_music() uses urllib.request.urlopen; import the submodule explicitly
import cv2
import re
import os
import requests
from share_btn import community_icon_html, loading_icon_html, share_js
import torch
from spectro import wav_bytes_from_spectrogram_image
from diffusers import StableDiffusionPipeline
import io
from os import path
from pydub import AudioSegment
import moviepy.video.io.ImageSequenceClip
from moviepy.editor import *
import mutagen
from mutagen.mp3 import MP3

# Remote Gradio Spaces used as helper services (image captioning and text-to-music).
img_to_text = gr.Blocks.load(name="spaces/pharma/CLIP-Interrogator")
text_to_music = gr.Interface.load("spaces/fffiloni/text-2-music")

# PaddleHub modules for language detection and translation of the user prompt.
language_translation_model = hub.Module(name='baidu_translate')
language_recognition_model = hub.Module(name='baidu_language_recognition')

# Style names, index-aligned with the "styles" dropdown (which is created with type="index").
style_list = ['古风', '油画', '水彩', '卡通', '二次元', '浮世绘', '蒸汽波艺术', 'low poly', '像素风格', '概念艺术', '未来主义', '赛博朋克', '写实风格', '洛丽塔风格', '巴洛克风格', '超现实主义', '默认']
style_list_EN = ['Chinese Ancient Style', 'Oil painting', 'Watercolor', 'Cartoon', 'Anime', 'Ukiyoe', 'Vaporwave', 'low poly', 'Pixel Style', 'Conceptual Art', 'Futurism', 'Cyberpunk', 'Realistic style', 'Lolita style', 'Baroque style', 'Surrealism', 'Default']

# Tip shown to the user, keyed by detected language code.
# NOTE(review): the "jp" and "kor" tips say the text is translated into Chinese,
# but translate_language() translates into English ('en') — confirm and update the wording.
tips = {"en": "Tips: The input text will be translated into English for generation",
        "jp": "ヒント: 入力テキストは生成のために中国語に翻訳されます",
        "kor": "힌트: 입력 텍스트는 생성을 위해 중국어로 번역됩니다"}

# Incremented on every translation request; its value is written into the hidden
# trigger textbox so that the textbox's change event fires the generation step.
count = 0

# Stable Diffusion v1.5 for text -> image.
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Riffusion for text -> spectrogram image.
model_id2 = "riffusion/riffusion-model-v1"
pipe2 = StableDiffusionPipeline.from_pretrained(model_id2, torch_dtype=torch.float16)
pipe2 = pipe2.to("cuda")


def translate_language_example(text_prompts, style_indx):
    """Adapter for gr.Examples: ignore the style input and translate the prompt."""
    return translate_language(text_prompts)


def translate_language(text_prompts):
    """Detect the prompt language and translate non-English prompts to English.

    Args:
        text_prompts: The raw prompt typed by the user.

    Returns:
        A dict keyed by Gradio components (defined in the UI section of this
        file). On success it carries the translated prompt, the language-tip
        update and a new value for the hidden trigger component; on failure it
        carries the error message for the status box and hides the tip.
    """
    global count
    try:
        count += 1
        language_code = language_recognition_model.recognize(text_prompts)
        if language_code != 'en':
            text_prompts = language_translation_model.translate(text_prompts, language_code, 'en')
    except Exception as e:
        # Surface any detection/translation failure in the status box instead of crashing the UI.
        return {status_text: str(e), language_tips_text: gr.update(visible=False)}

    # Fall back to the English tip for languages without a dedicated message.
    tips_text = tips.get(language_code, tips['en'])

    # NOTE(review): the tip is hidden for 'zh' although the translation target is 'en';
    # this looks inherited from a version that translated into Chinese — confirm intent.
    if language_code == 'zh':
        tips_update = gr.update(visible=False)
    else:
        tips_update = gr.update(visible=True, value=tips_text)

    return {language_tips_text: tips_update,
            translated_language: text_prompts,
            trigger_component: gr.update(value=count, visible=False)}


def get_result(text_prompts, style_indx, musicAI_indx):
    """Run the full pipeline: prompt -> image -> caption -> music -> video.

    Args:
        text_prompts: The (already translated) prompt text.
        style_indx: Index into style_list / style_list_EN.
        musicAI_indx: Music backend selector forwarded to get_music().

    Returns:
        A dict keyed by Gradio components with the spectrogram image (or None),
        the merged video path and the status message.
    """
    style = style_list[style_indx]
    prompt = style + "," + text_prompts
    sdresult = pipe(prompt)
    # Swap in a placeholder picture when the safety checker flags the generated image.
    if sdresult.nsfw_content_detected[0]:
        image_output = Image.open("nsfw_placeholder.jpg")
    else:
        image_output = sdresult.images[0]
    print("Generated image with prompt " + prompt)

    # Write the image to disk so it can be handed to the CLIP Interrogator Space.
    # The format is inferred from the ".png" extension; passing image_output.format
    # would write JPEG data under a .png name when the JPEG placeholder is in use.
    imagefile = "imageoutput.png"
    image_output.save(imagefile)

    interrogate_prompt = img_to_text(imagefile, "ViT-L (best for Stable Diffusion 1.*)", "fast", fn_index=1)[0]
    print(interrogate_prompt)

    spec_image, music_output = get_music(interrogate_prompt + ", " + style_list_EN[style_indx], musicAI_indx)
    video_merged = merge_video(music_output, image_output)
    return {spec_result: spec_image, video_result: video_merged, status_text: 'Success'}


def get_music(prompt, musicAI_indx):
    """Generate an mp3 file for the prompt with the selected music backend.

    Args:
        prompt: Text description used to generate the music.
        musicAI_indx: 0 uses the local Riffusion pipeline; any other value
            uses the remote text-to-music Space.

    Returns:
        A tuple (spectrogram_image_or_None, mp3_file_name).
    """
    if musicAI_indx == 0:
        spec = pipe2(prompt).images[0]
        print(spec)
        # The first element of the helper's result is written out as the wav payload
        # (presumably an io.BytesIO — verify against spectro.py).
        wav = wav_bytes_from_spectrogram_image(spec)
        wav_file_name = "output.wav"
        with open(wav_file_name, "wb") as f:
            f.write(wav[0].getbuffer())

        # Convert to mp3. The original code read an undefined name and called
        # export() on the helper's result instead of on the loaded AudioSegment.
        mp3file_name = "audio.mp3"
        segment = AudioSegment.from_wav(wav_file_name)
        # export() returns the open output file; close it so the data is flushed
        # before merge_video() reads the mp3.
        segment.export(mp3file_name, format="mp3").close()
        return spec, mp3file_name
    else:
        result = text_to_music(prompt, fn_index=0)
        print(f"""————— NEW RESULTS prompt : {prompt} music : {result} ——————— """)
        url = result
        mp3file_name = "file.mp3"
        # Context managers guarantee both the HTTP response and the file are closed.
        with urllib.request.urlopen(url) as response, open(mp3file_name, 'wb') as f:
            f.write(response.read())
        return None, mp3file_name


def merge_video(mp3file_name, image):
    """Build a still-image video as long as the mp3 and mux the audio into it.

    Args:
        mp3file_name: Path of the mp3 file to use as the soundtrack.
        image: PIL image repeated for every frame.
            Assumes a 512x512 RGB image to match the writer's frame size — TODO confirm.

    Returns:
        The path of the merged video file ('mergedvideo.mp4').
    """
    print('wav audio converted to mp3 audio' )
    print('now getting duration of this mp3 audio' )
    # Audio duration in whole seconds.
    audio_length = int(MP3(mp3file_name).info.length)
    print('Audio length is :',audio_length)

    file_name = 'video_no_audio.mp4'
    fps = 12
    # Clips shorter than one second would otherwise produce a video with no frames.
    slide_time = max(audio_length, 1)
    fourcc = cv2.VideoWriter.fourcc(*'MJPG')
    out = cv2.VideoWriter(file_name, fourcc, fps, (512, 512))

    # OpenCV expects BGR channel order; the same frame is written for the whole duration.
    cv_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    for _ in range(slide_time * fps):
        out.write(cv_img)
    out.release()
    print('video clip created successfully from images')

    # Load the silent video and the audio track, then combine them.
    print('Starting video and audio merge')
    videoclip = VideoFileClip(file_name)
    print('loading video-clip')
    audioclip = AudioFileClip(mp3file_name)
    print('loading mp3-format audio')
    mergedclip = videoclip.set_audio(audioclip)
    print('video and audio merged successfully')

    # Log duration and frame rate of the merged clip.
    print('Getting size and frame count of merged video file')
    duration = mergedclip.duration
    frame_count = mergedclip.fps
    print('duration is:',duration)
    print('frame count :', frame_count)
    mergedclip.to_videofile('mergedvideo.mp4')
    return 'mergedvideo.mp4'
# Page metadata.
title="文生图生音乐视频 Text to Image to Music to Video with Riffusion"
description="An AI art generation pipeline, which supports text-to-image-to-music task."

# Custom stylesheet handed to gr.Blocks. Written as adjacent string literals
# (one rule per line) that concatenate to the original single-line CSS text.
css = (
    " .gradio-container { font-family: 'IBM Plex Sans', sans-serif; }"
    " .gr-button { color: white; border-color: black; background: black; }"
    " input[type='range'] { accent-color: black; }"
    " .dark input[type='range'] { accent-color: #dfdfdf; }"
    " .container { max-width: 730px; margin: auto; padding-top: 1.5rem; }"
    " #gallery { min-height: 22rem; margin-bottom: 15px; margin-left: auto; margin-right: auto;"
    " border-bottom-right-radius: .5rem !important; border-bottom-left-radius: .5rem !important; }"
    " #gallery>div>.h-full { min-height: 20rem; }"
    " .details:hover { text-decoration: underline; }"
    " .gr-button { white-space: nowrap; }"
    " .gr-button:focus { border-color: rgb(147 197 253 / var(--tw-border-opacity)); outline: none;"
    " box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);"
    " --tw-border-opacity: 1;"
    " --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);"
    " --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);"
    " --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));"
    " --tw-ring-opacity: .5; }"
    " .footer { margin-bottom: 45px; margin-top: 35px; text-align: center; border-bottom: 1px solid #e5e5e5; }"
    " .footer>p { font-size: .8rem; display: inline-block; padding: 0 10px; transform: translateY(10px); background: white; }"
    " .dark .footer { border-color: #303030; }"
    " .dark .footer>p { background: #0b0f19; }"
    " .prompt h4{ margin: 1.25em 0 .25em 0; font-weight: bold; font-size: 115%; } "
)

block = gr.Blocks(css=css)

# Example rows for gr.Examples: [prompt, style dropdown choice].
examples = [
    [
        '蒙娜丽莎,赛博朋克,宝丽来,33毫米',
        '蒸汽波艺术(Vaporwave)'
    ],
    [
        '一条由闪电制成的令人敬畏的龙',
        '概念艺术(Conceptual Art)'
    ],
    [
        'An awesome dragon made of lightning',
        '概念艺术(Conceptual Art)'
    ],
    [
        '少女在时代广场,舞蹈',
        '写实风格(Realistic style)'
    ],
    [
        'Peking Opera at New York',
        '默认(Default)'
    ],
    [
        '古风少女',
        '水彩(Watercolor)'
    ],
    [
        '辐射游戏角色',
        '默认(Default)'
    ],
    [
        'Fallout game character',
        '默认(Default)'
    ],
    [
        'Traditional Chinese Painting',
        '古风(Ancient Style)'
    ],
    [
        '原神游戏截图,pixiv, 二次元绘画作品',
        '二次元(Anime)'
    ],
    [
        'Genshin Impact Game Screenshot, pixiv, Anime Painting Artworks',
        '二次元(Anime)'
    ],
    [
        '原神角色设定, 哪吒, pixiv, 二次元绘画',
        '二次元(Anime)'
    ],
    [
        'Genshin Impact Character Design, Harry Potter, pixiv, Anime Painting',
        '二次元(Anime)'
    ],
    [
        '巨狼,飘雪,蓝色大片烟雾,毛发细致,烟雾缭绕,高清,3d,cg感,侧面照',
        '默认(Default)'
    ],
    [
        '汉服少女,中国山水画,青山绿水,溪水长流,古风,科技都市,丹青水墨,中国风',
        '赛博朋克(Cyberpunk)'
    ],
    [
        '戴着墨镜的赛博朋克女孩肖像,在夕阳下的城市中, 油画风格',
        '赛博朋克(Cyberpunk)'
    ],
    [
        'Portrait of a cyberpunk girl with sunglasses, in the city sunset, oil painting',
        '赛博朋克(Cyberpunk)'
    ],
    [
        '暗黑破坏神',
        '默认(Default)'
    ],
    [
        '火焰,凤凰,少女,未来感,高清,3d,精致面容,cg感,古风,唯美,毛发细致,上半身立绘',
        '默认(Default)'
    ],
    [
        '浮世绘日本科幻哑光绘画,概念艺术,动漫风格神道寺禅园英雄动作序列,包豪斯',
        '默认(Default)'
    ],
    [
        '一只猫坐在椅子上,戴着一副墨镜,海盗风格',
        '默认(Default)'
    ],
    [
        '稲妻で作られた畏敬の念を抱かせる竜、コンセプトアート',
        '油画(Oil painting)'
    ],
    [
        '번개로 만든 경외스러운 용, 개념 예술',
        '油画(Oil painting)'
    ],
    [
        '梵高猫头鹰',
        '蒸汽波艺术(Vaporwave)'
    ],
    [
        '萨尔瓦多·达利描绘古代文明的超现实主义梦幻油画',
        '写实风格(Realistic style)'
    ],
    [
        '夕阳日落时,阳光落在云层上,海面波涛汹涌,风景,胶片感',
        '默认(Default)'
    ],
    [
        'Sunset, the sun falls on the clouds, the sea is rough, the scenery is filmy',
        '油画(Oil painting)'
    ],
    [
        '夕日が沈むと、雲の上に太陽の光が落ち、海面は波が荒く、風景、フィルム感',
        '油画(Oil painting)'
    ],
    [
        '석양이 질 때 햇빛이 구름 위에 떨어지고, 해수면의 파도가 용솟음치며, 풍경, 필름감',
        '油画(Oil painting)'
    ],
]

with block:
    # Page header.
    gr.HTML(
        """

文生图生音乐视频

Text to Image to Music to Video

Powered by Riffusion Model V1, Mubert AI, Stable Diffusion V1.5, CLIP Interrogator, fffiloni's Riffusion Text-to-Music and Baidu Language Translation projects

"""
    )
    with gr.Group():
        with gr.Box():
            # Prompt box and generate button side by side.
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                text = gr.Textbox(
                    label="Prompt",
                    show_label=False,
                    max_lines=1,
                    placeholder="Enter your prompt, multiple languages are supported now.",
                ).style(
                    border=(True, False, True, True),
                    rounded=(True, False, False, True),
                    container=False,
                )

                btn = gr.Button("Generate image").style(
                    margin=False,
                    rounded=(False, True, True, False),
                )
        language_tips_text = gr.Textbox(label="language tips", show_label=False, visible=False, max_lines=1)
        # type="index" makes both dropdowns pass the selected position (not the label) to callbacks.
        styles = gr.Dropdown(label="风格(style)", choices=['古风(Ancient Style)', '油画(Oil painting)', '水彩(Watercolor)', '卡通(Cartoon)', '二次元(Anime)', '浮世绘(Ukiyoe)', '蒸汽波艺术(Vaporwave)', 'low poly', '像素风格(Pixel Style)', '概念艺术(Conceptual Art)', '未来主义(Futurism)', '赛博朋克(Cyberpunk)', '写实风格(Realistic style)', '洛丽塔风格(Lolita style)', '巴洛克风格(Baroque style)', '超现实主义(Surrealism)', '默认(Default)'], value='默认(Default)', type="index")
        musicAI = gr.Dropdown(label="音乐生成技术(AI Music Generator)", choices=['Riffusion', 'Mubert AI'], value='Riffusion', type="index")
        status_text = gr.Textbox(
            label="处理状态(Process status)",
            show_label=True,
            max_lines=1,
            interactive=False
        )

        video_result = gr.Video(type=None, label='Final Merged video')
        spec_result = gr.Image()

        # Hidden component used for triggering the inference function: the translation
        # step writes a new value into it and its change event runs get_result.
        # (The keyword was previously misspelled as "vaule".)
        trigger_component = gr.Textbox(value="", visible=False)
        # Hidden holder for the translated prompt handed to get_result.
        translated_language = gr.Textbox(value="", visible=False)

        ex = gr.Examples(examples=examples, fn=translate_language_example, inputs=[text, styles], outputs=[language_tips_text, status_text, trigger_component, translated_language], cache_examples=False)
        ex.dataset.headers = [""]

        # Step 1: translate the prompt (on Enter or button click).
        text.submit(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
        btn.click(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
        # Step 2: a change of the hidden trigger starts generation.
        trigger_component.change(fn=get_result, inputs=[translated_language, styles, musicAI], outputs=[spec_result, video_result, status_text])

        gr.Markdown(
            """ Space by [@DGSpitzer](https://www.youtube.com/channel/UCzzsYBF4qwtMwJaPJZ5SuPg)❤️ [@大谷的游戏创作小屋](https://space.bilibili.com/176003) [![Twitter Follow](https://img.shields.io/twitter/follow/DGSpitzer?label=%40DGSpitzer&style=social)](https://twitter.com/DGSpitzer) ![visitors](https://visitor-badge.glitch.me/badge?page_id=dgspitzer_txt2img2video) """
        )
        gr.HTML(''' ''')

block.queue(concurrency_count=128).launch()