# Hugging Face Space: ysharma/text-to-ner-to-image-to-video
# Space status when this copy was captured: "Runtime error".
# File size at capture: 7,668 bytes.
# (The per-line commit-hash column emitted by the web file viewer has been
# dropped; it was page chrome, not part of the script.)

import gradio as gr
import moviepy.video.io.ImageSequenceClip
#image_folder= '/content/gdrive/My Drive/AI/sample_imgs/'
from PIL import Image
#import os, sys
from pydub import AudioSegment
# Import everything needed to edit video clips
from moviepy.editor import *
import numpy as np
import mutagen
from mutagen.wave import WAVE

#path = "/content/gdrive/My Drive/AI/sample_imgs/"
#dirs = os.listdir( path )

def resize(img_list, size=(256, 256)):
    """Load every image and scale it to one common size for video assembly.

    Args:
        img_list: Iterable of items accepted by ``PIL.Image.open``. The
            caller passes whatever the image-generation interface returned
            (presumably file paths -- confirm against that interface).
        size: Target ``(width, height)`` in pixels. Defaults to the
            256x256 that was previously hard-coded.

    Returns:
        A list of ``numpy.ndarray`` frames, one per input image, in input
        order. An empty input yields an empty list.
    """
    print("** inside resize **")
    print(img_list)
    resize_img_list = []
    for item in img_list:
        # The context manager releases the underlying file handle as soon as
        # the pixel data has been copied into the resized image.
        with Image.open(item) as im:
            # Normalising to RGB keeps every frame the same array shape even
            # when the inputs mix modes (RGBA, palette, greyscale).
            # Image.LANCZOS is the same filter as the ANTIALIAS alias, which
            # was removed in Pillow 10.
            resized = im.convert("RGB").resize(size, Image.LANCZOS)
        resize_img_list.append(np.array(resized))
    # Only peek at the first frame when there is one; the unconditional
    # debug print used to raise IndexError on empty input.
    if resize_img_list:
        print(type(resize_img_list[0]))
    return resize_img_list
                
#def resize():
#    for item in dirs:
#        if os.path.isfile(path+item):
#            im = Image.open(path+item)
#            f, e = os.path.splitext(path+item)
#            imResize = im.resize((256,256), Image.ANTIALIAS)
#            imResize.save(f + ' resized.jpg', 'JPEG', quality=90)

#resize_img_list = resize(img_list)


#image_files = [os.path.join(image_folder,img)
#               for img in resize_img_list
#               if img.endswith(".jpg")]
#print(image_files)

#def images_to_video(fps, resize_img_list):
#    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
#    return clip
#clip.write_videofile('/content/gdrive/My Drive/AI/my_vid_20apr.mp4')


def merge_audio_video(entities_num, resize_img_list, text_input):
    """Narrate ``text_input`` and lay the narration over a slideshow of frames.

    The text is synthesised with ``text2speech``, converted to ``audio.mp3``,
    and the frames are paced so that ``entities_num`` frames span the whole
    narration. The silent slideshow is written to ``my_vid_tmp.mp4`` and then
    re-opened and combined with the audio.

    Args:
        entities_num: Number of entities (one frame each); used as the
            numerator of the frame-rate calculation.
        resize_img_list: Non-empty list of equally sized image arrays.
        text_input: Text to turn into speech.

    Returns:
        The moviepy video clip with the narration attached as its audio.

    Raises:
        ValueError: If there are no frames to show, or the synthesised
            audio has zero duration (no frame rate can be derived).
    """
    print("** inside merge aud vid **")
    print(type(resize_img_list))
    if entities_num <= 0 or not resize_img_list:
        raise ValueError("at least one image frame is required to build the video")
    print(type(resize_img_list[0]))

    #Convert text to speech using facebook's latest model from HF hub
    speech = text2speech(text_input)
    print('type of speech : ',type(speech))
    print(speech)
    wav_audio = AudioSegment.from_file(speech, "flac")
    #convert flac to mp3 audio format
    print('flac audio read', type(wav_audio))
    # export() hands back the open output file; close it so the MP3 is fully
    # flushed and the handle is not leaked before moviepy reads the file.
    wav_audio.export("audio.mp3", format="mp3").close()
    print('flac audio converted to mp3 audio' )
    print('now getting duration of this mp3 audio' )
    # Take the duration from the already-decoded segment. The previous code
    # pointed mutagen's WAVE (RIFF) parser at an MP3 file, and truncated the
    # length to int, which divides by zero for clips shorter than a second.
    audio_length = wav_audio.duration_seconds
    if audio_length <= 0:
        raise ValueError("synthesised speech has zero duration; cannot derive a frame rate")

    #Calculate the desired frame per second based on given audio length and entities identified
    fps = entities_num / audio_length
    fps = float(format(fps, '.5f'))
    print('fps is: ',fps)

    #String a list of images into a video and write to disk
    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')
    print('video clip created from images')

    # loading video file
    print('Starting video and audio merge')
    videoclip = VideoFileClip('my_vid_tmp.mp4')
    print('loading video-clip audio')

    # loading audio file
    audioclip = AudioFileClip('audio.mp3')
    print('loading mp3-format audio')
    # adding audio to the video clip
    mergedclip = videoclip.set_audio(audioclip)
    print('video and audio merged')

    return mergedclip
    

# Text-to-speech interface, loaded once at import time from the hosted
# "facebook/fastspeech2-en-ljspeech" model. text2speech() calls this object
# directly with the input text.
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")

def text2speech(text):
    """Run ``text`` through the module-level ``fastspeech`` interface.

    Logs the interface and its result (each preceded by its type) for
    debugging, then returns whatever the interface produced.
    """
    print('inside testtospeech')
    for detail in (type(fastspeech), fastspeech):
        print(detail)
    synthesized = fastspeech(text)
    for detail in (type(synthesized), synthesized):
        print(detail)
    return synthesized
    
def engine(text_input):
    """Turn free text into a narrated slideshow video.

    Pipeline: extract named entities from ``text_input``, generate one image
    per entity, resize the images to a common size, then merge them with a
    spoken rendition of the text.

    Args:
        text_input: The text entered in the UI.

    Returns:
        Path of the rendered MP4 file. The Gradio video output this function
        feeds expects a file path rather than a moviepy clip object, so the
        merged clip is written to disk here.

    Raises:
        ValueError: If no entity is recognised in ``text_input``; without
            entities there are no frames to build a video from.
    """
    print(" ** Inside Enngine **")
    #Extract entities from text
    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large")
    entities = ner(text_input)
    # Keep only the tuples that carry no None member; each kept tuple is
    # treated as one recognised entity below.
    entities = [tupl for tupl in entities if None not in tupl]
    entities_num = len(entities)
    if not entities:
        raise ValueError("no named entities were found in the input text")

    #Generate images using multimodelart's space for each entity identified above
    # Load the remote interface once instead of once per entity.
    image_generator = gr.Interface.load("spaces/multimodalart/latentdiffusion")
    # Positional arguments presumably map to
    # (prompt, steps, width, height, images, scale) -- confirm against the space.
    img_list = [
        image_generator(ent[0], '50', '256', '256', '1', 10)[0]
        for ent in entities
    ]

    print('img_list size:',len(img_list))
    #Resizing all images produced to same size
    resize_img_list = resize(img_list)
    print('back from resize')

    #Merge video and audio created above
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)

    # Render the merged clip to a file and hand the path to Gradio.
    output_path = 'mergedvideo.mp4'
    mergedclip.write_videofile(output_path)
    return output_path
    
#image = gr.outputs.Image(type="pil", label="output image")
# Gradio UI wiring: a five-line text box feeds engine(), whose result is shown
# in a video player. The interface is launched immediately with request
# queuing enabled, so `app` holds the return value of launch().
#
# A longer sample input that exercises many entity types:
# "On April 17th Sunday George celebrated Easter. He is staying at Empire State building with his parents. He is a citizen of Canada and speaks English and French fluently. His role model is former president Obama. He got 1000 dollar from his mother to visit Disney World and to buy new iPhone mobile.  George likes watching Game of Thrones."
app = gr.Interface(
    fn=engine,
    inputs=gr.inputs.Textbox(lines=5, label="Input Text"),
    outputs=gr.outputs.Video(type=None, label=None),
    examples=['Apple'],
    description="Takes a text as input and reads it out to you.",
).launch(enable_queue=True)
                   
 
 #get_audio = gr.Button("generate audio")
 #get_audio.click(text2speech, inputs=text, outputs=speech)
 
#def greet(name):
#    return "Hello " + name + "!!"

#iface = gr.Interface(fn=greet, inputs="text", outputs="text")
#iface.launch()