TXT-2-IMG-2-MUSIC-2-VIDEO-w-RIFFUSION

Runtime error

TXT-2-IMG-2-MUSIC-2-VIDEO-w-RIFFUSION / app.py

Update app.py

677f5d3 over 1 year ago

17.8 kB

	from PIL import Image
	import numpy as np
	import gradio as gr
	import paddlehub as hub
	import urllib
	import cv2

	import torch

	from spectro import wav_bytes_from_spectrogram_image
	from diffusers import StableDiffusionPipeline

	import io
	from os import path
	from pydub import AudioSegment
	import moviepy.video.io.ImageSequenceClip
	from moviepy.editor import *
	import mutagen
	from mutagen.mp3 import MP3

	# Remote CLIP-Interrogator Space loaded as a callable; get_result() calls it
	# with an image file path and fn_index=1 and uses element [0] of the result.
	img_to_text = gr.Blocks.load(name="spaces/pharma/CLIP-Interrogator")

	# PaddleHub modules used by translate_language(): one recognizes the prompt's
	# language code, the other translates the prompt to 'zh'.
	# NOTE(review): get_result() calls `model.generate_image(...)` and get_music()
	# calls `text_to_music(...)`, but neither `model` nor `text_to_music` is
	# defined anywhere in the visible file -- both need a definition before the
	# pipeline can run end to end.
	language_translation_model = hub.Module(name='baidu_translate')
	language_recognition_model = hub.Module(name='baidu_language_recognition')

	# Style names passed to the image generator, indexed by the "styles" dropdown
	# position (the dropdown is declared with type="index").
	style_list = [
	    '古风',
	    '油画',
	    '水彩',
	    '卡通',
	    '二次元',
	    '浮世绘',
	    '蒸汽波艺术',
	    'low poly',
	    '像素风格',
	    '概念艺术',
	    '未来主义',
	    '赛博朋克',
	    '写实风格',
	    '洛丽塔风格',
	    '巴洛克风格',
	    '超现实主义',
	    '探索无限',
	]

	# English counterparts at the same indices; appended to the music prompt.
	# The last entry is intentionally empty.
	style_list_EN = [
	    'Chinese Ancient Style',
	    'Oil painting',
	    'Watercolor',
	    'Cartoon',
	    'Anime',
	    'Ukiyoe',
	    'Vaporwave',
	    'low poly',
	    'Pixel Style',
	    'Conceptual Art',
	    'Futurism',
	    'Cyberpunk',
	    'Realistic style',
	    'Lolita style',
	    'Baroque style',
	    'Surrealism',
	    '',
	]

	# Hint shown when a prompt had to be translated, keyed by detected language code.
	tips = dict(
	    en="Tips: The input text will be translated into Chinese for generation",
	    jp="ヒント: 入力テキストは生成のために中国語に翻訳されます",
	    kor="힌트: 입력 텍스트는 생성을 위해 중국어로 번역됩니다",
	)

	# Request counter; translate_language() bumps it and writes it into the hidden
	# trigger textbox so that component's change event fires.
	count = 0

	# Riffusion checkpoint, loaded through the diffusers StableDiffusionPipeline API.
	# NOTE(review): `pipe2` is not referenced by any function in the visible file.
	model_id2 = "riffusion/riffusion-model-v1"

	# Moving the pipeline to "cuda" unconditionally raises at import time on
	# CPU-only hardware, so fall back to CPU there. Half precision is only
	# requested on GPU; the CPU fallback uses float32.
	_pipe2_device = "cuda" if torch.cuda.is_available() else "cpu"
	_pipe2_dtype = torch.float16 if _pipe2_device == "cuda" else torch.float32
	pipe2 = StableDiffusionPipeline.from_pretrained(model_id2, torch_dtype=_pipe2_dtype)
	pipe2 = pipe2.to(_pipe2_device)


	def translate_language_example(text_prompts, style_indx):
	    """Adapter used by the examples widget.

	    The examples table supplies both a prompt and a style, but only the prompt
	    is forwarded; `style_indx` is accepted and ignored.

	    Returns:
	        Whatever translate_language(text_prompts) returns.
	    """
	    component_updates = translate_language(text_prompts)
	    return component_updates

	def translate_language(text_prompts):
	    """Detect the prompt's language and translate it to Chinese when needed.

	    Increments the module-level `count` on every call and writes it to the
	    hidden trigger textbox.

	    Args:
	        text_prompts: prompt text entered by the user.

	    Returns:
	        A dict keyed by Gradio components. On failure it carries the error
	        text for `status_text` and hides the language tip. On success it
	        carries the (possibly translated) prompt, the trigger update, and a
	        language-tip update that is visible only for non-'zh' input.
	    """
	    global count
	    try:
	        count += 1
	        language_code = language_recognition_model.recognize(text_prompts)
	        if language_code != 'zh':
	            text_prompts = language_translation_model.translate(text_prompts, language_code, 'zh')
	    except Exception as e:
	        return {status_text: str(e), language_tips_text: gr.update(visible=False)}

	    if language_code == 'zh':
	        # Already Chinese: nothing was translated, so no tip is shown.
	        tip_update = gr.update(visible=False)
	    else:
	        # Languages without a dedicated tip fall back to the English one.
	        tip_update = gr.update(visible=True, value=tips.get(language_code, tips['en']))

	    return {
	        language_tips_text: tip_update,
	        translated_language: text_prompts,
	        trigger_component: gr.update(value=count, visible=False),
	    }


	def get_result(text_prompts, style_indx):
	    """Run the text -> image -> caption -> music -> video pipeline.

	    Args:
	        text_prompts: prompt text passed to the image generator.
	        style_indx: index into `style_list` / `style_list_EN`.

	    Returns:
	        A dict keyed by Gradio components: `video_result` holds the merged
	        video path (or None on failure) and `status_text` holds 'Success' or
	        the error message of whichever stage failed.
	    """
	    import traceback

	    print(text_prompts)
	    print(style_indx)
	    try:
	        style = style_list[style_indx]

	        # NOTE(review): `model` is not defined anywhere in the visible file, so
	        # this call currently ends in the except branch with a NameError message.
	        results = model.generate_image(
	            text_prompts=text_prompts, style=style, visualization=False, topk=1)
	    except Exception as e:
	        error_text = str(e)
	        return {video_result:None, status_text:error_text}

	    print("Ernie Vilg Output test: " + str(results))

	    # The later stages (captioning, music download, video merge) can fail just
	    # like generation does; report those failures through status_text too
	    # instead of letting them escape the event handler.
	    try:
	        image_output = results[:1]
	        if not image_output:
	            raise ValueError("image generation returned no results")
	        first_image = image_output[0]

	        # Not every image object carries a `filename` attribute.
	        print("file name: " + str(getattr(first_image, "filename", "")))

	        # The captioning Space is called with a file path, so write the image to disk.
	        imagefile = "imageoutput.png"
	        first_image.save(imagefile, first_image.format)

	        interrogate_prompt = img_to_text(imagefile, fn_index=1)[0]
	        print(interrogate_prompt)
	        music_output = get_music(interrogate_prompt + ", " + style_list_EN[style_indx])

	        video_merged = merge_video(music_output, image_output)
	    except Exception as e:
	        # Keep the full traceback in the server log; the UI only shows the message.
	        traceback.print_exc()
	        return {video_result:None, status_text:str(e)}

	    return {video_result:video_merged, status_text:'Success'}

	def get_music(prompt):
	    """Generate music for `prompt` and download it to a local MP3 file.

	    Args:
	        prompt: text prompt forwarded to the text-to-music callable.

	    Returns:
	        The local path "file.mp3" that the downloaded audio was written to.
	    """
	    # The file only does `import urllib`, which does not guarantee that the
	    # `urllib.request` submodule is loaded; import it explicitly here.
	    import urllib.request

	    # NOTE(review): `text_to_music` is not defined anywhere in the visible
	    # file; its result is used below as a downloadable URL.
	    result = text_to_music(prompt, fn_index=0)

	    print(f"""—————
	NEW RESULTS
	prompt : {prompt}
	music : {result}
	———————
	""")

	    url = result
	    save_as = "file.mp3"

	    # Context managers close both the HTTP response and the output file even
	    # when the download fails part-way.
	    with urllib.request.urlopen(url) as response, open(save_as, 'wb') as f:
	        f.write(response.read())

	    return save_as

	def merge_video(music, img_list):
	    """Render the image(s) as a still video matching the audio length, then add the audio.

	    Args:
	        music: path to an MP3 file (read with mutagen and moviepy).
	        img_list: iterable of images convertible with np.array(); the
	            conversion assumes RGB channel order -- TODO confirm with caller.

	    Returns:
	        The path 'mergedvideo.mp4' of the written video with audio.
	    """
	    print('wav audio converted to mp3 audio' )
	    print('now getting duration of this mp3 audio' )
	    # Whole seconds of audio; the fractional part is truncated.
	    audio_length = int(MP3(music).info.length)
	    print('Audio length is :',audio_length)

	    file_name = 'video_no_audio.mp4'
	    fps = 12
	    frame_size = (1024, 1024)  # (width, height) the writer is opened with
	    # Sub-second audio would otherwise produce a video with no frames at all.
	    slide_time = max(audio_length, 1)
	    fourcc = cv2.VideoWriter.fourcc(*'MJPG')
	    out = cv2.VideoWriter(file_name, fourcc, fps, frame_size)

	    try:
	        for image in img_list:
	            cv_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
	            # Frames that do not match the writer's size would typically be
	            # dropped without an error, so resize any image that differs.
	            if (cv_img.shape[1], cv_img.shape[0]) != frame_size:
	                cv_img = cv2.resize(cv_img, frame_size)
	            for _ in range(slide_time * fps):
	                out.write(cv_img)
	    finally:
	        # Always finalize the container, even if a frame conversion fails.
	        out.release()

	    print('video clip created successfully from images')

	    # loading video file
	    print('Starting video and audio merge')
	    videoclip = VideoFileClip(file_name)
	    try:
	        print('loading video-clip')

	        # loading audio file
	        audioclip = AudioFileClip(music)
	        try:
	            print('loading mp3-format audio')
	            # adding audio to the video clip
	            mergedclip = videoclip.set_audio(audioclip)
	            print('video and audio merged successfully')

	            print('Getting size and frame count of merged video file')
	            duration = mergedclip.duration
	            clip_fps = mergedclip.fps
	            print('duration is:',duration)
	            # This value is the clip's frames-per-second, not a frame count.
	            print('fps :', clip_fps)
	            # write_videofile is the current name; to_videofile is a deprecated alias.
	            mergedclip.write_videofile('mergedvideo.mp4')
	        finally:
	            audioclip.close()
	    finally:
	        # Release the reader resources held by the clip.
	        videoclip.close()

	    return 'mergedvideo.mp4'

	# Page title and description strings; neither is referenced elsewhere in the
	# visible file.
	title="文生图生音乐视频 Text to Image to Music to Video"

	description="An AI art generation pipeline, which supports text-to-image-to-music task."

	# Custom stylesheet handed to gr.Blocks(css=...).
	# The --tw-ring-shadow declaration uses calc(3px + var(...)): calc() needs an
	# explicit operator between its operands, and without the "+" the whole
	# declaration is invalid CSS.
	css = """
	.gradio-container {
	font-family: 'IBM Plex Sans', sans-serif;
	}
	.gr-button {
	color: white;
	border-color: black;
	background: black;
	}
	input[type='range'] {
	accent-color: black;
	}
	.dark input[type='range'] {
	accent-color: #dfdfdf;
	}
	.container {
	max-width: 730px;
	margin: auto;
	padding-top: 1.5rem;
	}
	#gallery {
	min-height: 22rem;
	margin-bottom: 15px;
	margin-left: auto;
	margin-right: auto;
	border-bottom-right-radius: .5rem !important;
	border-bottom-left-radius: .5rem !important;
	}
	#gallery>div>.h-full {
	min-height: 20rem;
	}
	.details:hover {
	text-decoration: underline;
	}
	.gr-button {
	white-space: nowrap;
	}
	.gr-button:focus {
	border-color: rgb(147 197 253 / var(--tw-border-opacity));
	outline: none;
	box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
	--tw-border-opacity: 1;
	--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
	--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
	--tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
	--tw-ring-opacity: .5;
	}
	.footer {
	margin-bottom: 45px;
	margin-top: 35px;
	text-align: center;
	border-bottom: 1px solid #e5e5e5;
	}
	.footer>p {
	font-size: .8rem;
	display: inline-block;
	padding: 0 10px;
	transform: translateY(10px);
	background: white;
	}
	.dark .footer {
	border-color: #303030;
	}
	.dark .footer>p {
	background: #0b0f19;
	}
	.prompt h4{
	margin: 1.25em 0 .25em 0;
	font-weight: bold;
	font-size: 115%;
	}
	"""

	# Root Blocks container for the UI, created with the custom `css` string.
	block = gr.Blocks(css=css)

	# Rows for the gr.Examples widget: [prompt text, style dropdown label].
	examples = [
	    ['蒙娜丽莎，赛博朋克，宝丽来，33毫米', '蒸汽波艺术(Vaporwave)'],
	    ['一条由闪电制成的令人敬畏的龙', '概念艺术(Conceptual Art)'],
	    ['An awesome dragon made of lightning', '概念艺术(Conceptual Art)'],
	    ['嫦娥在时代广场，戏曲', '写实风格(Realistic style)'],
	    ['Peking Opera at New York', '探索无限(Explore infinity)'],
	    ['古风少女', '水彩(Watercolor)'],
	    ['辐射游戏角色', '探索无限(Explore infinity)'],
	    ['Fallout game character', '探索无限(Explore infinity)'],
	    ['Traditional Chinese Painting', '古风(Ancient Style)'],
	    ['原神游戏截图，pixiv, 二次元绘画作品', '二次元(Anime)'],
	    ['Genshin Impact Game Screenshot, pixiv, Anime Painting Artworks', '二次元(Anime)'],
	    ['原神角色设定, 哪吒, pixiv, 二次元绘画', '二次元(Anime)'],
	    ['Genshin Impact Character Design, Harry Potter, pixiv, Anime Painting', '二次元(Anime)'],
	    ['巨狼，飘雪，蓝色大片烟雾，毛发细致，烟雾缭绕，高清，3d，cg感，侧面照', '探索无限(Explore infinity)'],
	    ['汉服少女，中国山水画，青山绿水，溪水长流，古风，科技都市，丹青水墨，中国风', '赛博朋克(Cyberpunk)'],
	    ['戴着墨镜的赛博朋克女孩肖像，在夕阳下的城市中, 油画风格', '赛博朋克(Cyberpunk)'],
	    ['Portrait of a cyberpunk girl with sunglasses, in the city sunset, oil painting', '赛博朋克(Cyberpunk)'],
	    ['暗黑破坏神', '探索无限(Explore infinity)'],
	    ['火焰，凤凰，少女，未来感，高清，3d，精致面容，cg感，古风，唯美，毛发细致，上半身立绘', '探索无限(Explore infinity)'],
	    ['浮世绘日本科幻哑光绘画，概念艺术，动漫风格神道寺禅园英雄动作序列，包豪斯', '探索无限(Explore infinity)'],
	    ['一只猫坐在椅子上，戴着一副墨镜,海盗风格', '探索无限(Explore infinity)'],
	    ['稲妻で作られた畏敬の念を抱かせる竜、コンセプトアート', '油画(Oil painting)'],
	    ['번개로 만든 경외스러운 용, 개념 예술', '油画(Oil painting)'],
	    ['梵高猫头鹰', '蒸汽波艺术(Vaporwave)'],
	    ['萨尔瓦多·达利描绘古代文明的超现实主义梦幻油画', '写实风格(Realistic style)'],
	    ['夕阳日落时，阳光落在云层上，海面波涛汹涌，风景，胶片感', '探索无限(Explore infinity)'],
	    ['Sunset, the sun falls on the clouds, the sea is rough, the scenery is filmy', '油画(Oil painting)'],
	    ['夕日が沈むと、雲の上に太陽の光が落ち、海面は波が荒く、風景、フィルム感', '油画(Oil painting)'],
	    ['석양이 질 때 햇빛이 구름 위에 떨어지고, 해수면의 파도가 용솟음치며, 풍경, 필름감', '油画(Oil painting)'],
	]

	# UI layout and event wiring. Flow: submitting the prompt (Enter or button)
	# runs translate_language, which writes the translated prompt and a new
	# counter value into two hidden textboxes; the counter textbox's change event
	# then runs get_result, which fills the video player and the status box.
	# NOTE(review): the source's indentation was lost, so the nesting of the
	# layout containers below is reconstructed -- confirm the rendered layout.
	with block:
	    gr.HTML(
	        """
	<div style="text-align: center; max-width: 650px; margin: 0 auto;">
	<div
	style="
	display: inline-flex;
	gap: 0.8rem;
	font-size: 1.75rem;
	margin-bottom: 10px;
	margin-left: 220px;
	justify-content: center;
	"
	>
	</div>
	<div
	style="
	display: inline-flex;
	align-items: center;
	gap: 0.8rem;
	font-size: 1.75rem;
	margin-bottom: 10px;
	justify-content: center;
	">
	<h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 15px;">文生图生音乐视频</h1>
	</div>
	<div
	style="
	display: inline-flex;
	align-items: center;
	gap: 0.8rem;
	font-size: 1.75rem;
	margin-bottom: 10px;
	justify-content: center;
	">
	<h1 style="font-weight: 900; margin-bottom: 7px;">Text to Image to Music to Video</h1>
	</div>
	<p style="margin-bottom: 10px; font-size: 94%">
	Powered by <a href="https://huggingface.co/spaces/PaddlePaddle/ERNIE-ViLG" target="_blank">ERNIE-ViLG 2.0</a>, <a href="https://huggingface.co/spaces/Mubert/Text-to-Music" target="_blank">Mubert AI</a>, <a href="https://huggingface.co/spaces/pharma/CLIP-Interrogator" target="_blank">CLIP Interrogator</a> and fffiloni's <a href="https://huggingface.co/spaces/fffiloni/img-to-music" target="_blank">Image to Music</a> projects
	</p>
	</div>
	"""
	    )
	    with gr.Group():
	        with gr.Box():
	            with gr.Row().style(mobile_collapse=False, equal_height=True):
	                text = gr.Textbox(
	                    label="Prompt",
	                    show_label=False,
	                    max_lines=1,
	                    placeholder="Enter your prompt, multiple languages are supported now.",
	                ).style(
	                    border=(True, False, True, True),
	                    rounded=(True, False, False, True),
	                    container=False,
	                )

	                btn = gr.Button("Generate image").style(
	                    margin=False,
	                    rounded=(False, True, True, False),
	                )
	        language_tips_text = gr.Textbox(label="language tips", show_label=False, visible=False, max_lines=1)
	        # type="index" makes the dropdown hand its selected position (not the
	        # label) to get_result, which indexes style_list / style_list_EN with it.
	        styles = gr.Dropdown(
	            label="风格(style)",
	            choices=[
	                '古风(Ancient Style)', '油画(Oil painting)', '水彩(Watercolor)',
	                '卡通(Cartoon)', '二次元(Anime)', '浮世绘(Ukiyoe)', '蒸汽波艺术(Vaporwave)', 'low poly',
	                '像素风格(Pixel Style)', '概念艺术(Conceptual Art)', '未来主义(Futurism)', '赛博朋克(Cyberpunk)', '写实风格(Realistic style)',
	                '洛丽塔风格(Lolita style)', '巴洛克风格(Baroque style)', '超现实主义(Surrealism)', '探索无限(Explore infinity)',
	            ],
	            value='探索无限(Explore infinity)',
	            type="index",
	        )
	        status_text = gr.Textbox(
	            label="处理状态(Process status)",
	            show_label=True,
	            max_lines=1,
	            interactive=False
	        )

	        video_result = gr.Video(type=None, label='Final Merged video')

	        # Hidden plumbing components. The keyword was misspelled `vaule` in the
	        # original, so the intended initial value was never actually passed.
	        trigger_component = gr.Textbox(value="", visible=False) # This component is used for triggering inference function.
	        translated_language = gr.Textbox(value="", visible=False)

	        ex = gr.Examples(examples=examples, fn=translate_language_example, inputs=[text, styles], outputs=[language_tips_text, status_text, trigger_component, translated_language], cache_examples=False)
	        ex.dataset.headers = [""]

	        text.submit(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
	        btn.click(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
	        trigger_component.change(fn=get_result, inputs=[translated_language, styles], outputs=[video_result, status_text])

	        gr.Markdown(
	            """
	### <u>[Prompt Tutorial 公式教程...](https://github.com/PaddlePaddle/PaddleHub/blob/develop/modules/image/text_to_image/ernie_vilg/README.md#四-prompt-指南)([Explore more...](https://github.com/PaddlePaddle/PaddleHub/blob/develop/modules/image/text_to_image/ernie_vilg/README.md#四-prompt-指南))</u>
	"""
	        )
	        gr.Markdown(
	            """
	Space by [@DGSpitzer](https://www.youtube.com/channel/UCzzsYBF4qwtMwJaPJZ5SuPg)❤️ [@大谷的游戏创作小屋](https://space.bilibili.com/176003)
	[![Twitter Follow](https://img.shields.io/twitter/follow/DGSpitzer?label=%40DGSpitzer&style=social)](https://twitter.com/DGSpitzer)
	![visitors](https://visitor-badge.glitch.me/badge?page_id=dgspitzer_txt2img2video)
	"""
	        )
	        gr.HTML('''
	<div class="footer">
	<p>Model：<a href="https://github.com/PaddlePaddle/PaddleHub" style="text-decoration: underline;" target="_blank">PaddleHub</a> and <a href="https://wenxin.baidu.com/ernie-vilg" style="text-decoration: underline;" target="_blank">文心大模型</a>
	</p>
	</div>
	''')

	# Enable request queuing (concurrency_count=128) and start the Gradio app.
	block.queue(concurrency_count=128).launch()