from functools import lru_cache

import gradio as gr
import torch
from PIL import Image
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
from transformers import pipeline

#this converts text to speech
#Loads the FastSpeech2 (LJSpeech) model from the Hugging Face hub as a callable
#Gradio interface; invoked later in object_classify with the sentence to speak.
#NOTE(review): this downloads/loads the remote model at import time.
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")

#cache the segmentation pipeline so the model is downloaded and built only
#once, not on every call to object_classify (the original rebuilt it per call)
@lru_cache(maxsize=1)
def _get_object_detector():
  """Build and cache the SegFormer image-segmentation pipeline."""
  feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b2-finetuned-ade-512-512")
  model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b2-finetuned-ade-512-512")
  return pipeline(task="image-segmentation", model=model, feature_extractor=feature_extractor)


#this function detects the objects in the room
def object_classify(img1, img2):
  """Detect which objects present in the first image are missing from the second.

  Args:
    img1: PIL image — the original scene.
    img2: PIL image — the same scene with item(s) removed.

  Returns:
    tuple: (list of missing object labels,
            audio produced by the fastspeech TTS interface reading the result).
  """
  object_detector = _get_object_detector()

  #the pipeline returns a list of dicts (one per segment); keep only the labels
  objects_1 = [segment['label'] for segment in object_detector(img1)]
  objects_2 = [segment['label'] for segment in object_detector(img2)]

  #labels present in the first image but absent from the second
  missing_objects = list(set(objects_1) - set(objects_2))

  #build the sentence that fastspeech will read aloud
  if not missing_objects:
    tts_words = ['The missing items are', 'None']
  elif len(missing_objects) == 1:
    tts_words = ['The missing item is a'] + missing_objects
  else:
    tts_words = ['The missing items are'] + missing_objects

  gonna_process = ' '.join(tts_words)

  return missing_objects, fastspeech(gonna_process)
 
  
TITLE = 'Missing Items using Nvidia Segformer'
DESCRIPTION = 'Input two pictures. First image being the original and second is the one with the missing item/s. This will output a list of items that are missing and an audio version of it'

#BUGFIX: each example row must supply one value per input component — the
#interface has two image inputs, so a single example is a two-element list
#(the original [['Bedroom_1.jpg'],['Bedroom_2.jpg']] rendered as two broken
#one-image examples)
EXAMPLES = [['Bedroom_1.jpg', 'Bedroom_2.jpg']]

#two PIL image inputs: the original scene and the scene with item(s) removed
INPUTS=[gr.inputs.Image(type = 'pil', label='Original Image'),gr.inputs.Image(type = 'pil', label='Second Image')]
#two outputs: the textual list of missing labels and the spoken version
OUTPUTS=[gr.outputs.Textbox(label='Missing Item/s is/are'),gr.outputs.Audio(type="auto", label="Missing Items Audio")]


interface=gr.Interface(object_classify,
                       INPUTS,
                       OUTPUTS,
                       examples = EXAMPLES,
                       title = TITLE, 
                       description=DESCRIPTION, allow_flagging="never")


interface.launch()