from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
from transformers import pipeline
from PIL import Image
import gradio as gr
import torch

# Text-to-speech model: turns the result sentence into audio.
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")

# Load the segmentation pipeline ONCE at import time.
# (The original re-instantiated the feature extractor, model, and pipeline
# inside object_classify on every request — very slow per call.)
feature_extractor = SegformerFeatureExtractor.from_pretrained(
    "nvidia/segformer-b2-finetuned-ade-512-512"
)
model = SegformerForSemanticSegmentation.from_pretrained(
    "nvidia/segformer-b2-finetuned-ade-512-512"
)
object_detector = pipeline(
    task="image-segmentation",
    model=model,
    feature_extractor=feature_extractor,
)


def object_classify(img1, img2):
    """Find objects present in ``img1`` but absent from ``img2``.

    Both arguments are PIL images (Gradio supplies them with type='pil').
    Each image is run through the SegFormer segmentation pipeline, which
    returns a list of dicts carrying a ``'label'`` key per detected segment.

    Returns:
        tuple: (missing_objects, audio) where ``missing_objects`` is the list
        of labels found in img1 but not img2, and ``audio`` is the FastSpeech2
        text-to-speech rendering of a sentence describing them.
    """
    segments_1 = object_detector(img1)
    segments_2 = object_detector(img2)

    # Collect the label of every detected segment in each image.
    labels_1 = {segment["label"] for segment in segments_1}
    labels_2 = {segment["label"] for segment in segments_2}

    # Labels present in the first image but not in the second.
    missing_objects = list(labels_1 - labels_2)

    # Build the sentence read aloud by the TTS model, with grammar matched
    # to the number of missing items.
    if not missing_objects:
        sentence = "The missing items are None"
    elif len(missing_objects) == 1:
        sentence = " ".join(["The missing item is a"] + missing_objects)
    else:
        sentence = " ".join(["The missing items are"] + missing_objects)

    return missing_objects, fastspeech(sentence)


TITLE = "Missing Items using Nvidia Segformer"
# BUG FIX: the original string literal was broken across a raw newline
# (a SyntaxError); the two halves are rejoined here.
DESCRIPTION = (
    "Input two pictures. First image being the original and second is the "
    "one with the missing item/s. This will output a list of items that are "
    "missing and an audio version of it"
)
# BUG FIX: the interface takes TWO inputs, so each example row must supply
# both images. The original gave two one-image rows, which Gradio cannot
# map onto a two-input function.
EXAMPLES = [["Bedroom_1.jpg", "Bedroom_2.jpg"]]
INPUTS = [
    gr.inputs.Image(type="pil", label="Original Image"),
    gr.inputs.Image(type="pil", label="Second Image"),
]
OUTPUTS = [
    gr.outputs.Textbox(label="Missing Item/s is/are"),
    gr.outputs.Audio(type="auto", label="Missing Items Audio"),
]

interface = gr.Interface(
    object_classify,
    INPUTS,
    OUTPUTS,
    examples=EXAMPLES,
    title=TITLE,
    description=DESCRIPTION,
    allow_flagging="never",
)
interface.launch()