Spaces:

PushkarA07
/

Cover-Gen-audio2image

Sleeping

App Files Files Community

PushkarA07 committed on Mar 18, 2023

Commit

e520534

1 Parent(s): 386d22a

Create app.py

Browse files

Files changed (1) hide show

app.py +129 -0

app.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import os
+os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
+from diffusers import LDMTextToImagePipeline
+import gradio as gr
+import PIL.Image
+import numpy as np
+import random
+import torch
+import subprocess
+from transformers import Wav2Vec2Processor, Wav2Vec2Tokenizer
+from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer
+from transformers import WhisperForConditionalGeneration, WhisperConfig, WhisperProcessor
+import torchaudio
+import nltk
+from pydub import AudioSegment
+import re
+from datasets import load_dataset
+from transformers import AutoModelWithLMHead, AutoTokenizer, set_seed, pipeline
+import torch
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+import torch
+from diffusers import StableDiffusionPipeline, AutoencoderKL, UNet2DConditionModel, PNDMScheduler, DPMSolverMultistepScheduler, LMSDiscreteScheduler
+from transformers import CLIPTextModel, CLIPTokenizer
+from tqdm.auto import tqdm
+from torch import autocast
+from PIL import Image
+torch_device = "cuda" if torch.cuda.is_available() else "cpu"  # device name string; generate_image moves its models and latents here
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # torch.device form of the same choice; used by generate_prompt
+def generate_lyrics(sample):
+    model_name = "openai/whisper-tiny.en"
+    model_config = WhisperConfig.from_pretrained(model_name)
+    processor = WhisperProcessor.from_pretrained(model_name)
+    asr_model = WhisperForConditionalGeneration.from_pretrained(model_name, config=model_config)
+    asr_model.eval()
+    input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
+    transcript = asr_model.generate(input_features)
+    predicted_ids = asr_model.generate(input_features)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+    lyrics = transcription[0]
+    return lyrics
+def generate_summary(lyrics):
+    summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
+    summary = summarizer(lyrics)
+    return summary
+def generate_prompt(summary):
+    model_name = 'gpt2'
+    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+    model = GPT2LMHeadModel.from_pretrained(model_name)
+    model = model.to(device)
+    prompt = f"Create an image that represents the feeling of '{summary}'"
+    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
+    output = model.generate(input_ids, do_sample=True, max_length=100, temperature=0.7)
+    prompt_text = tokenizer.decode(output[0], skip_special_tokens=True)
+    return prompt_text
+def generate_image(prompt,
+        height = 512,                        # default height of Stable Diffusion
+        width = 512 ,                        # default width of Stable Diffusion
+        num_inference_steps = 50  ,          # Number of denoising steps
+        guidance_scale = 7.5 ,               # Scale for classifier-free guidance
+        generator = torch.manual_seed(32),   # Seed generator to create the inital latent noise
+        batch_size = 1,):
+    pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+    pipe = pipe.to(torch_device)
+    vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+    unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
+    scheduler = DPMSolverMultistepScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+    vae = vae.to(torch_device)
+    text_encoder = text_encoder.to(torch_device)
+    unet = unet.to(torch_device)
+    text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
+    with torch.no_grad():
+        text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
+    max_length = text_input.input_ids.shape[-1]
+    uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")
+    with torch.no_grad():
+        uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
+    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+    latents = torch.randn((batch_size, unet.in_channels, height // 8, width // 8), generator=generator,)
+    latents = latents.to(torch_device)
+    scheduler.set_timesteps(num_inference_steps)
+    latents = latents * scheduler.init_noise_sigma
+    for t in tqdm(scheduler.timesteps):
+        latent_model_input = torch.cat([latents] * 2)
+        latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+        with torch.no_grad():
+            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        latents = scheduler.step(noise_pred, t, latents).prev_sample
+    latents = 1 / 0.18215 * latents
+    with torch.no_grad():
+        image = vae.decode(latents).sample
+    image = (image / 2 + 0.5).clamp(0, 1)
+    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+    images = (image * 255).round().astype("uint8")
+    pil_images = [Image.fromarray(image) for image in images]
+    f_images = pil_images
+    return f_images
+def predict(audio, steps=100, seed=42, guidance_scale=6.0):
+    generator = torch.manual_seed(seed)
+    lyrics = generate_lyrics(audio)
+    summary_1 = generate_summary(lyrics)
+    prompt_text_1 = generate_prompt(summary_1[0]['summary_text'])
+    images = generate_image(prompt= prompt_text_1, generator= generator, num_inference_steps=steps, guidance_scale=guidance_scale)
+    return images[0]
+random_seed = random.randint(0, 2147483647)
+gr.Interface(
+    predict,
+    inputs=[
+        gr.Audio(source="upload", type="filepath"),
+        # gr.inputs.Textbox(label='Text', default='a chalk pastel drawing of a llama wearing a wizard hat'),
+        gr.inputs.Slider(1, 100, label='Inference Steps', default=50, step=1),
+        gr.inputs.Slider(0, 2147483647, label='Seed', default=random_seed, step=1),
+        gr.inputs.Slider(1.0, 20.0, label='Guidance Scale - how much the prompt will influence the results', default=6.0, step=0.1),
+    ],
+    examples=[[load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")][0]["audio"]],
+    outputs=gr.Image(shape=[256,256], type="pil", elem_id="output_image"),
+    css="#output_image{width: 256px}",
+    title="Cover Generator (audio-to-image)",
+    description="Application of OpenAI tools such as Whisper, ChatGPT, and DALL-E to produce covers for the given audio",
+).launch()