from functools import lru_cache

import gradio as gr
import torch
from PIL import Image
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
from transformers import pipeline

#this converts text to speech
#Loads the FastSpeech2 (LJSpeech) model from the Hugging Face hub as a callable
#Gradio interface; invoked later in object_classify with the sentence to speak.
#NOTE(review): this downloads/loads the remote model at import time.
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")

#cache the segmentation pipeline so the model is downloaded and built only
#once, not on every call to object_classify (the original rebuilt it per call)
@lru_cache(maxsize=1)
def _get_object_detector():
  """Build and cache the SegFormer image-segmentation pipeline."""
  feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b2-finetuned-ade-512-512")
  model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b2-finetuned-ade-512-512")
  return pipeline(task="image-segmentation", model=model, feature_extractor=feature_extractor)


#this function detects the objects in the room
def object_classify(img1, img2):
  """Detect which objects present in the first image are missing from the second.

  Args:
    img1: PIL image — the original scene.
    img2: PIL image — the same scene with item(s) removed.

  Returns:
    tuple: (list of missing object labels,
            audio produced by the fastspeech TTS interface reading the result).
  """
  object_detector = _get_object_detector()

  #the pipeline returns a list of dicts (one per segment); keep only the labels
  objects_1 = [segment['label'] for segment in object_detector(img1)]
  objects_2 = [segment['label'] for segment in object_detector(img2)]

  #labels present in the first image but absent from the second
  missing_objects = list(set(objects_1) - set(objects_2))

  #build the sentence that fastspeech will read aloud
  if not missing_objects:
    tts_words = ['The missing items are', 'None']
  elif len(missing_objects) == 1:
    tts_words = ['The missing item is a'] + missing_objects
  else:
    tts_words = ['The missing items are'] + missing_objects

  gonna_process = ' '.join(tts_words)

  return missing_objects, fastspeech(gonna_process)
 
  
TITLE = 'Missing Items using Nvidia Segformer'
DESCRIPTION = 'Input two pictures. First image being the original and second is the one with the missing item/s. This will output a list of items that are missing and an audio version of it'

#BUGFIX: each example row must supply one value per input component — the
#interface has two image inputs, so a single example is a two-element list
#(the original [['Bedroom_1.jpg'],['Bedroom_2.jpg']] rendered as two broken
#one-image examples)
EXAMPLES = [['Bedroom_1.jpg', 'Bedroom_2.jpg']]

#two PIL image inputs: the original scene and the scene with item(s) removed
INPUTS=[gr.inputs.Image(type = 'pil', label='Original Image'),gr.inputs.Image(type = 'pil', label='Second Image')]
#two outputs: the textual list of missing labels and the spoken version
OUTPUTS=[gr.outputs.Textbox(label='Missing Item/s is/are'),gr.outputs.Audio(type="auto", label="Missing Items Audio")]


interface=gr.Interface(object_classify,
                       INPUTS,
                       OUTPUTS,
                       examples = EXAMPLES,
                       title = TITLE, 
                       description=DESCRIPTION, allow_flagging="never")


interface.launch()