from PIL import Image
import numpy as np
import gradio as gr
#import paddlehub as hub
import urllib
import cv2
import re
import os
import requests
from share_btn import community_icon_html, loading_icon_html, share_js
import torch
from spectro import wav_bytes_from_spectrogram_image
from diffusers import StableDiffusionPipeline
from diffusers import EulerAncestralDiscreteScheduler
import io
from os import path
from pydub import AudioSegment
import moviepy.video.io.ImageSequenceClip
from moviepy.editor import *
import mutagen
from mutagen.mp3 import MP3
from mutagen.wave import WAVE
import time
import base64
import gradio as gr
from sentence_transformers import SentenceTransformer
import httpx
import json
from utils import get_tags_for_prompts, get_mubert_tags_embeddings, get_pat

# Sentence-embedding model and the Mubert tag embeddings derived from it;
# both are passed to get_tags_for_prompts() when a prompt is mapped to tags.
minilm = SentenceTransformer('all-MiniLM-L6-v2')
mubert_tags_embeddings = get_mubert_tags_embeddings(minilm)


def get_track_by_tags(tags, pat, duration, maxit=20, loop=False):
    """Request a Mubert track for *tags* and wait until it can be downloaded.

    Posts a ``RecordTrackTTM`` request to the Mubert B2B API, then polls the
    returned download link once per second, at most *maxit* times.

    Args:
        tags: Value sent as the ``tags`` request parameter.
        pat: Value sent as the ``pat`` request parameter.
        duration: Value sent as the ``duration`` request parameter.
        maxit: Maximum number of polling attempts on the download link.
        loop: Request mode ``"loop"`` when true, ``"track"`` otherwise.

    Returns:
        The download URL as soon as a GET on it answers with HTTP 200, or
        ``None`` if it still does not after *maxit* attempts.

    Raises:
        AssertionError: If the response ``status`` field is not 1. The
            message is the error text reported by the API.
    """
    mode = "loop" if loop else "track"
    r = httpx.post('https://api-b2b.mubert.com/v2/RecordTrackTTM',
                   json={
                       "method": "RecordTrackTTM",
                       "params": {
                           "pat": pat,
                           "duration": duration,
                           "tags": tags,
                           "mode": mode
                       }
                   })

    rdata = r.json()
    if rdata['status'] != 1:
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped under `python -O`; the exception type is kept for callers.
        raise AssertionError(rdata['error']['text'])
    trackurl = rdata['data']['tasks'][0]['download_link']

    print('Generating track ', end='')
    for _ in range(maxit):
        if httpx.get(trackurl).status_code == 200:
            return trackurl
        time.sleep(1)
    # The link never became available within maxit attempts.
    return None


def generate_track_by_prompt(prompt):
    """Generate a 30-second Mubert track for a text prompt.

    The prompt is mapped to tags with get_tags_for_prompts() and the track is
    requested through get_track_by_tags() in non-loop mode.

    Args:
        prompt: Text describing the desired music.

    Returns:
        Whatever get_track_by_tags() returns (the download URL, or ``None``
        when the track did not become available). If any step raises, the
        exception message is returned as a string instead, so callers must
        check that the result is a URL before using it.
    """
    try:
        pat = get_pat("mail@mail.com")
        _, tags = get_tags_for_prompts(minilm, mubert_tags_embeddings, [prompt])[0]
        result = get_track_by_tags(tags, pat, 30, loop=False)
        print(result)
        return result
    except Exception as e:
        # Deliberate best-effort: the error text is handed back to the caller.
        return str(e)


#img_to_text = gr.Blocks.load(name="spaces/fffiloni/CLIP-Interrogator-2")
#text_to_music = gr.Interface.load("spaces/fffiloni/text-2-music")

#language_translation_model = hub.Module(name='baidu_translate')
#language_recognition_model =
#hub.Module(name='baidu_language_recognition')

# Disabled list of Chinese style names, kept for reference.
# style_list = ['古风', '油画', '水彩', '卡通', '二次元', '浮世绘', '蒸汽波艺术', 'low poly', '像素风格', '概念艺术', '未来主义', '赛博朋克', '写实风格', '洛丽塔风格', '巴洛克风格', '超现实主义', '默认']
# English style names; indexed by the value of the style dropdown (type="index").
style_list_EN = ['Chinese Ancient Style', 'Oil painting', 'Watercolor', 'Cartoon', 'Anime', 'Ukiyoe', 'Vaporwave', 'low poly', 'Pixel Style', 'Conceptual Art', 'Futurism', 'Cyberpunk', 'Realistic style', 'Lolita style', 'Baroque style', 'Surrealism', 'Detailed']

# Hint shown to the user, keyed by detected language code.
tips = {"en": "Tips: The input text will be translated into English for generation",
        "jp": "ヒント: 入力テキストは生成のために中国語に翻訳されます",
        "kor": "힌트: 입력 텍스트는 생성을 위해 중국어로 번역됩니다"}

# Incremented on every request; its value is written to trigger_component so
# that the component's change event fires and starts generation.
count = 0

# Stable Diffusion pipeline used for the cover image.
model_id = "runwayml/stable-diffusion-v1-5"
eulera = EulerAncestralDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, scheduler=eulera)
pipe = pipe.to("cuda")

# Riffusion pipeline used to generate spectrogram images for music.
model_id2 = "riffusion/riffusion-model-v1"
pipe2 = StableDiffusionPipeline.from_pretrained(model_id2, torch_dtype=torch.float16)
pipe2 = pipe2.to("cuda")


def translate_language_example(text_prompts, style_indx):
    """Example-click handler: forwards the prompt to translate_language().

    The style index is accepted because the examples supply two inputs, but
    it is not used here.
    """
    return translate_language(text_prompts)


def translate_language(text_prompts):
    """Prepare the prompt for generation and trigger the inference step.

    Language recognition and translation are currently disabled, so the
    language is always treated as English and the prompt is passed through
    unchanged.

    Args:
        text_prompts: Prompt text entered by the user.

    Returns:
        A dict of Gradio component updates: the language-tips box, the
        (possibly translated) prompt, and the hidden trigger component, whose
        new value is the incremented request counter. On failure the dict
        also carries the error text for ``status_text``.
    """
    global count
    count += 1
    trigger_update = gr.update(value=count, visible=False)
    try:
        #language_code = language_recognition_model.recognize(text_prompts)
        language_code = 'en'
        #if language_code != 'en':
            #text_prompts = language_translation_model.translate(text_prompts, language_code, 'en')
    except Exception as e:
        return {status_text: str(e),
                language_tips_text: gr.update(visible=False),
                translated_language: text_prompts,
                trigger_component: trigger_update}

    if language_code == 'en':
        tips_update = gr.update(visible=False)
    else:
        # Unknown language codes fall back to the English hint.
        tips_update = gr.update(visible=True, value=tips.get(language_code, tips['en']))
    return {language_tips_text: tips_update,
            translated_language: text_prompts,
            trigger_component: trigger_update}


def get_result(text_prompts, style_indx, musicAI_indx, duration):
    """Generate the cover image, the music track and the merged video.

    Args:
        text_prompts: Prompt text (output of translate_language()).
        style_indx: Index into ``style_list_EN``.
        musicAI_indx: Music generator index, passed to get_music().
        duration: Requested music duration in seconds, passed to get_music().

    Returns:
        A dict of Gradio component updates. On success it contains the
        spectrogram, image, audio file path, video path, a 'Success' status
        and the share widgets made visible. If music or video generation
        fails, it contains only the image and an error status.
    """
    style = style_list_EN[style_indx]
    prompt = style + "," + text_prompts

    sdresult = pipe(prompt, negative_prompt="out of focus, scary, creepy, evil, disfigured, missing limbs, ugly, gross, missing fingers", num_inference_steps=50, guidance_scale=7, width=576, height=576)
    if sdresult.nsfw_content_detected[0]:
        image_output = Image.open("nsfw_placeholder.jpg")
    else:
        image_output = sdresult.images[0]

    print("Generated image with prompt " + prompt)

    imagefile = "imageoutput.png"

    #img_np = np.array(image_output[0])
    #img_nparray= cv2.cvtColor(img_np, cv2.COLOR_BGR2RGBA)
    #img_blue_correction = Image.fromarray(img_nparray)
    #img_blue_correction.save(imagefile, img_blue_correction.format)
    # Let PIL pick the format from the ".png" extension. Passing the image's
    # own format would write JPEG data into the .png file for the placeholder.
    image_output.save(imagefile)

    interrogate_prompt = prompt
    #interrogate_prompt = img_to_text(imagefile, 'fast', 4, fn_index=1)[0]
    print(interrogate_prompt)

    try:
        spec_image, music_output = get_music(interrogate_prompt + ", " + style_list_EN[style_indx], musicAI_indx, duration)
        video_merged = merge_video(music_output, image_output)
    except Exception as e:
        # UI boundary: report the failure in the status box (same style as
        # translate_language) instead of failing the whole event.
        print(f"Music/video generation failed: {e!r}")
        return {imgfile_result: image_output,
                status_text: f"Error: {e}"}

    return {spec_result: spec_image,
            imgfile_result: image_output,
            musicfile_result: "audio.wav",
            video_result: video_merged,
            status_text: 'Success',
            share_button: gr.update(visible=True),
            community_icon: gr.update(visible=True),
            loading_icon: gr.update(visible=True)}


def get_music(prompt, musicAI_indx, duration):
    """Generate a music track and write it to ``audio.mp3`` and ``audio.wav``.

    Args:
        prompt: Text prompt for the music generator.
        musicAI_indx: 0 selects Riffusion; any other value selects Mubert.
        duration: Requested duration in seconds. Only used by the Riffusion
            branch; the Mubert branch ignores it.

    Returns:
        ``(spectrogram_image, "audio.mp3")`` for Riffusion, or
        ``(None, "audio.mp3")`` for Mubert.

    Raises:
        RuntimeError: If the Mubert branch did not produce a download URL.
    """
    import urllib.request  # plain `import urllib` does not guarantee the submodule is loaded

    mp3file_name = "audio.mp3"
    wavfile_name = "audio.wav"

    if musicAI_indx == 0:
        # 512 px of spectrogram width for 5 s, plus 128 px per extra second.
        width_duration = 512 + (int(duration) - 5) * 128
        spec = pipe2(prompt, height=512, width=width_duration).images[0]
        print(spec)
        wav = wav_bytes_from_spectrogram_image(spec)
        with open(wavfile_name, "wb") as f:
            f.write(wav[0].getbuffer())

        # Convert to mp3, for video merging function.
        # export() returns the open output file; close it right away.
        AudioSegment.from_wav(wavfile_name).export(mp3file_name, format="mp3").close()
        return spec, mp3file_name

    #result = text_to_music(prompt, fn_index=0)
    result = generate_track_by_prompt(prompt)

    print(f"""—————
NEW RESULTS
prompt : {prompt}
music : {result}
———————
""")

    # generate_track_by_prompt() hands back an error message (or None) on
    # failure, so make sure we really got a URL before downloading.
    if not (isinstance(result, str) and result.startswith(("http://", "https://"))):
        raise RuntimeError(f"Mubert track generation failed: {result}")

    with urllib.request.urlopen(result) as response, open(mp3file_name, 'wb') as f:
        f.write(response.read())

    # Convert to wav, for sharing function only supports wav file.
    AudioSegment.from_mp3(mp3file_name).export(wavfile_name, format="wav").close()
    return None, mp3file_name


def merge_video(mp3file_name, image):
    """Build a still-image video for *image* and mux the mp3 audio into it.

    Writes ``video_no_audio.mp4`` (the silent still-image video) and
    ``mergedvideo.mp4`` (the same video with the audio track attached).

    Args:
        mp3file_name: Path of the mp3 audio file.
        image: PIL image shown for the whole length of the video.

    Returns:
        The path of the merged video, ``'mergedvideo.mp4'``.
    """
    import math

    print('wav audio converted to mp3 audio')
    print('now getting duration of this mp3 audio')
    # Round up (at least 1 s) so the video covers the whole audio track.
    audio_length = max(1, math.ceil(MP3(mp3file_name).info.length))
    print('Audio length is :', audio_length)

    file_name = 'video_no_audio.mp4'
    fps = 12
    slide_time = audio_length
    fourcc = cv2.VideoWriter.fourcc(*'MJPG')

    cv_img = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
    # W, H must be the same as the input image, so take them from the frame.
    frame_height, frame_width = cv_img.shape[:2]
    out = cv2.VideoWriter(file_name, fourcc, fps, (frame_width, frame_height))
    try:
        for _ in range(slide_time * fps):
            out.write(cv_img)
    finally:
        out.release()

    print('video clip created successfully from images')

    # loading video file
    print('Starting video and audio merge')
    videoclip = VideoFileClip(file_name)
    try:
        print('loading video-clip')

        # loading audio file
        audioclip = AudioFileClip(mp3file_name)
        try:
            print('loading mp3-format audio')
            # adding audio to the video clip
            mergedclip = videoclip.set_audio(audioclip)
            print('video and audio merged successfully')

            print('Getting size and frame count of merged video file')
            print('duration is:', mergedclip.duration)
            print('fps :', mergedclip.fps)
            mergedclip.write_videofile('mergedvideo.mp4')
        finally:
            audioclip.close()
    finally:
        videoclip.close()
    return 'mergedvideo.mp4'


def change_music_generator(current_choice):
    """Show the duration slider only when Riffusion (index 0) is selected."""
    return gr.update(visible=(current_choice == 0))


title = "文生图生音乐视频 Text to Image to Music to Video with Riffusion"

description = "An AI art generation pipeline, which supports text-to-image-to-music task."

css = """
        .gradio-container {
            font-family: 'IBM Plex Sans', sans-serif;
        }
        .gr-button {
            color: white;
            border-color: black;
            background: black;
        }
        input[type='range'] {
            accent-color: black;
        }
        .dark input[type='range'] {
            accent-color: #dfdfdf;
        }
        .container {
            max-width: 730px;
            margin: auto;
            padding-top: 1.5rem;
        }
        #gallery {
            min-height: 22rem;
            margin-bottom: 15px;
            margin-left: auto;
            margin-right: auto;
            border-bottom-right-radius: .5rem !important;
            border-bottom-left-radius: .5rem !important;
        }
        #gallery>div>.h-full {
            min-height: 20rem;
        }
        .details:hover {
            text-decoration: underline;
        }
        .gr-button {
            white-space: nowrap;
        }
        .gr-button:focus {
            border-color: rgb(147 197 253 / var(--tw-border-opacity));
            outline: none;
            box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
            --tw-border-opacity: 1;
            --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
            --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
            --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
            --tw-ring-opacity: .5;
        }
        .footer {
            margin-bottom: 45px;
            margin-top: 35px;
            text-align: center;
            border-bottom: 1px solid #e5e5e5;
        }
        .footer>p {
            font-size: .8rem;
            display: inline-block;
            padding: 0 10px;
            transform: translateY(10px);
            background: white;
        }
        .dark .footer {
            border-color: #303030;
        }
        .dark .footer>p {
            background: #0b0f19;
        }
        .prompt h4{
            margin: 1.25em 0 .25em 0;
            font-weight: bold;
            font-size: 115%;
        }
        #share-btn-container {
            display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
        }
        #share-btn {
            all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;right:0;
        }
        #share-btn * {
            all: unset;
        }
        #share-btn-container div:nth-child(-n+2){
            width: auto !important;
            min-height: 0px !important;
        }
        #share-btn-container .wrap {
            display: none !important;
        }
"""

block = gr.Blocks(css=css)

# Each example is [prompt, style dropdown label].
examples = [
    [
        '蒙娜丽莎,赛博朋克,宝丽来,33毫米',
        '蒸汽波艺术(Vaporwave)'
    ],
    [
        '一条由闪电制成的令人敬畏的龙',
        '概念艺术(Conceptual Art)'
    ],
    [
        'An awesome dragon made of lightning',
        '概念艺术(Conceptual Art)'
    ],
    [
        '少女在时代广场,舞蹈',
        '写实风格(Realistic style)'
    ],
    [
        'Peking Opera at New York',
        '默认(Default)'
    ],
    [
        '古风少女',
        '水彩(Watercolor)'
    ],
    [
        '辐射游戏角色',
        '默认(Default)'
    ],
    [
        'Fallout game character',
        '默认(Default)'
    ],
    [
        'Traditional Chinese Painting',
        '古风(Ancient Style)'
    ],
    [
        '原神游戏截图,pixiv, 二次元绘画作品',
        '二次元(Anime)'
    ],
    [
        'Genshin Impact Game Screenshot, pixiv, Anime Painting Artworks',
        '二次元(Anime)'
    ],
    [
        '原神角色设定, 哪吒, pixiv, 二次元绘画',
        '二次元(Anime)'
    ],
    [
        'Genshin Impact Character Design, Harry Potter, pixiv, Anime Painting',
        '二次元(Anime)'
    ],
    [
        '巨狼,飘雪,蓝色大片烟雾,毛发细致,烟雾缭绕,高清,3d,cg感,侧面照',
        '默认(Default)'
    ],
    [
        '汉服少女,中国山水画,青山绿水,溪水长流,古风,科技都市,丹青水墨,中国风',
        '赛博朋克(Cyberpunk)'
    ],
    [
        '戴着墨镜的赛博朋克女孩肖像,在夕阳下的城市中, 油画风格',
        '赛博朋克(Cyberpunk)'
    ],
    [
        'Portrait of a cyberpunk girl with sunglasses, in the city sunset, oil painting',
        '赛博朋克(Cyberpunk)'
    ],
    [
        '暗黑破坏神',
        '默认(Default)'
    ],
    [
        '火焰,凤凰,少女,未来感,高清,3d,精致面容,cg感,古风,唯美,毛发细致,上半身立绘',
        '默认(Default)'
    ],
    [
        '浮世绘日本科幻哑光绘画,概念艺术,动漫风格神道寺禅园英雄动作序列,包豪斯',
        '默认(Default)'
    ],
    [
        '一只猫坐在椅子上,戴着一副墨镜,海盗风格',
        '默认(Default)'
    ],
    [
        '稲妻で作られた畏敬の念を抱かせる竜、コンセプトアート',
        '油画(Oil painting)'
    ],
    [
        '번개로 만든 경외스러운 용, 개념 예술',
        '油画(Oil painting)'
    ],
    [
        '梵高猫头鹰',
        '蒸汽波艺术(Vaporwave)'
    ],
    [
        '萨尔瓦多·达利描绘古代文明的超现实主义梦幻油画',
        '写实风格(Realistic style)'
    ],
    [
        '夕阳日落时,阳光落在云层上,海面波涛汹涌,风景,胶片感',
        '默认(Default)'
    ],
    [
        'Sunset, the sun falls on the clouds, the sea is rough, the scenery is filmy',
        '油画(Oil painting)'
    ],
    [
        '夕日が沈むと、雲の上に太陽の光が落ち、海面は波が荒く、風景、フィルム感',
        '油画(Oil painting)'
    ],
    [
        '석양이 질 때 햇빛이 구름 위에 떨어지고, 해수면의 파도가 용솟음치며, 풍경, 필름감',
        '油画(Oil painting)'
    ],
]

# NOTE(review): the nesting of the layout below is reconstructed from syntax;
# the original indentation was lost. Confirm against the rendered UI.
with block:
    gr.HTML(
        """

文生图生音乐视频

Text to Image to Music to Video

Powered by Riffusion Model V1, Mubert AI, Stable Diffusion V1.5, CLIP Interrogator, fffiloni's Riffusion Text-to-Music and Baidu Language Translation projects

 """
    )
    with gr.Group():
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                text = gr.Textbox(
                    label="Prompt",
                    show_label=False,
                    max_lines=1,
                    placeholder="Enter your prompt, multiple languages are supported now.",
                    elem_id="input-prompt",
                ).style(
                    border=(True, False, True, True),
                    rounded=(True, False, False, True),
                    container=False,
                )
                btn = gr.Button("Generate image").style(
                    margin=False,
                    rounded=(False, True, True, False),
                )
        language_tips_text = gr.Textbox(label="language tips", show_label=False, visible=False, max_lines=1)
        styles = gr.Dropdown(label="风格(style)", choices=['古风(Ancient Style)', '油画(Oil painting)', '水彩(Watercolor)',
        '卡通(Cartoon)', '二次元(Anime)', '浮世绘(Ukiyoe)', '蒸汽波艺术(Vaporwave)', 'low poly',
        '像素风格(Pixel Style)', '概念艺术(Conceptual Art)', '未来主义(Futurism)', '赛博朋克(Cyberpunk)', '写实风格(Realistic style)',
        '洛丽塔风格(Lolita style)', '巴洛克风格(Baroque style)', '超现实主义(Surrealism)', '默认(Default)'], value='默认(Default)', type="index")
        musicAI = gr.Dropdown(label="音乐生成技术(AI Music Generator)", choices=['Riffusion', 'Mubert AI'], value='Riffusion', type="index")
        duration_input = gr.Slider(label="Duration in seconds", minimum=5, maximum=10, step=1, value=5, elem_id="duration-slider", visible=True)
        status_text = gr.Textbox(
            label="处理状态(Process status)",
            show_label=True,
            max_lines=1,
            interactive=False
        )

        with gr.Column(elem_id="col-container"):
            with gr.Group(elem_id="share-btn-container"):
                community_icon = gr.HTML(community_icon_html, visible=False)
                loading_icon = gr.HTML(loading_icon_html, visible=False)
                share_button = gr.Button("Share to community", elem_id="share-btn", visible=False)

        share_button.click(None, [], [], _js=share_js)

        video_result = gr.Video(type=None, label='Final Merged video', elem_id="output-video")
        imgfile_result = gr.Image(label="Art Cover", elem_id="output-img")
        musicfile_result = gr.Audio(type='filepath', label="Generated Music Track", elem_id="output-music")
        spec_result = gr.Image(label="Spectrogram Image")

        # This component is used for triggering the inference function.
        trigger_component = gr.Textbox(value="", visible=False)
        translated_language = gr.Textbox(value="", visible=False)

    ex = gr.Examples(examples=examples, fn=translate_language_example, inputs=[text, styles], outputs=[language_tips_text, status_text, trigger_component, translated_language], cache_examples=False)
    ex.dataset.headers = [""]

    musicAI.change(fn=change_music_generator, inputs=[musicAI], outputs=[duration_input])
    # Submitting the prompt only prepares it; writing a new value to
    # trigger_component is what starts get_result().
    text.submit(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
    btn.click(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])

    trigger_component.change(fn=get_result, inputs=[translated_language, styles, musicAI, duration_input], outputs=[spec_result, imgfile_result, musicfile_result, video_result, status_text, share_button, community_icon, loading_icon])

    gr.Markdown(
        """
Space by [@DGSpitzer](https://www.youtube.com/channel/UCzzsYBF4qwtMwJaPJZ5SuPg)❤️ [@大谷的游戏创作小屋](https://space.bilibili.com/176003)
[![Twitter Follow](https://img.shields.io/twitter/follow/DGSpitzer?label=%40DGSpitzer&style=social)](https://twitter.com/DGSpitzer)
![visitors](https://visitor-badge.glitch.me/badge?page_id=dgspitzer_txt2img2video)
        """
    )
    gr.HTML('''
    ''')

block.queue().launch()