JuanMa360 committed on
Commit
7ea81c0
1 Parent(s): 8a23b20

feat: init

app.py ADDED
@@ -0,0 +1,153 @@
+ import gradio as gr
+ from transformers import CLIPProcessor, CLIPModel, pipeline
+
+ # CLIP is used for zero-shot scoring; the processor configuration ships in
+ # this repo's tokenizer/ directory.
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ processor = CLIPProcessor.from_pretrained("tokenizer")
+ # No checkpoint is specified, so this uses the pipeline's default
+ # visual-question-answering model.
+ vqa_pipeline = pipeline("visual-question-answering")
+
+ space_type_labels = ["living room", "bedroom", "kitchen", "terrace", "closet", "bathroom", "dining room", "office", "garage", "garden",
+                      "balcony", "attic", "hallway", "laundry room", "home gym", "playroom", "storage room", "studio", "is_exterior", "empty_interior_room", "others"]
+
+ equipment_questions = [
+     "Does the image show outdoor furniture?",
+     "Does the image show a parasol?",
+     "Does the image show a pergola?",
+     "Does the image show a grill?",
+     "Does the image show a heater?",
+     "Does the image show outdoor lighting?",
+     "Does the image show planters?",
+     "Does the image show water features?",
+     "Does the image show floor coverings?",
+     "Does the image show decorative items?",
+     "Does the image show entertainment equipment?",
+     "Does the image show protective materials?"
+ ]
+
+ # Per-question weights for the terrace equipment score; they sum to 1.0.
+ weights = {
+     "Does the image show outdoor furniture?": 0.15,
+     "Does the image show a parasol?": 0.05,
+     "Does the image show a pergola?": 0.1,
+     "Does the image show a grill?": 0.15,
+     "Does the image show a heater?": 0.1,
+     "Does the image show outdoor lighting?": 0.1,
+     "Does the image show planters?": 0.05,
+     "Does the image show water features?": 0.1,
+     "Does the image show floor coverings?": 0.05,
+     "Does the image show decorative items?": 0.05,
+     "Does the image show entertainment equipment?": 0.05,
+     "Does the image show protective materials?": 0.05
+ }
+
+ luminosity_classes = [
+     'A picture of a room filled with abundant natural light, with many or few windows or a large balcony, regardless of whether it is night, without objects that prevent the light from passing through.',
+     'a picture of a room in the dark',
+     'A picture of a room with artificial lights like lamps or headlamps'
+ ]
+ luminosity_labels = ['natural_light', 'no_light', 'artificial_light']
+
+ view_questions = [
+     "Is this a panoramic view?",
+     "Is this a city view?",
+     "Is this a view of greenery?",
+     "Is this a mountain view?",
+     "Is this a view of the sea?"
+ ]
+ view_labels = ['panoramic', 'city', 'greenery', 'mountain', 'sea']
+
+ certainty_classes = ['windows, balcony or terrace with a view outwards', 'exterior appearance of a house or apartment', 'unreal or fake image of any view']
+
+ render_classes = ['is_unrealistic_image_render', 'is_image_real']
+
+ # Minimum probability for accepting the top space-type label; at 0 the top
+ # label is always accepted.
+ threshold = 0
+
+ def calculate_equipment_score(image_results, weights):
+     # Sum the weights of every question the VQA model answered "yes" to.
+     return sum(weights[question] for question, present in image_results.items() if present)
+
+ def clip_label_probs(image, prompts, labels):
+     # Zero-shot CLIP scoring: softmax over image-prompt similarities,
+     # returned as a {label: probability} dict.
+     inputs = processor(text=prompts, images=image, return_tensors="pt", padding=True)
+     outputs = model(**inputs)
+     probs = outputs.logits_per_image.softmax(dim=1).squeeze().tolist()
+     return dict(zip(labels, probs))
+
+ def calculate_luminosity_score(processed_image):
+     return clip_label_probs(processed_image, luminosity_classes, luminosity_labels)
+
+ def calculate_space_type(processed_image):
+     return clip_label_probs(processed_image, space_type_labels, space_type_labels)
+
+ def certainty(processed_image):
+     return clip_label_probs(processed_image, certainty_classes, certainty_classes)
+
+ def views(processed_image):
+     return clip_label_probs(processed_image, view_questions, view_labels)
+
+ def calculate_is_render(processed_image):
+     # Probability that CLIP judges the image to be an unrealistic render.
+     render_score = clip_label_probs(processed_image, render_classes, render_classes)
+     return render_score['is_unrealistic_image_render']
+
+ def generate_answer(image):
+     processed_image = image
+
+     image_data = {
+         "image_context": None,
+         "equipment_score": None,
+         "luminosity_score": None,
+         "view_type": {"views": None, "certainty_score": None}
+     }
+
+     space_type_score = calculate_space_type(processed_image)
+     max_space_type = max(space_type_score, key=space_type_score.get)
+     if space_type_score[max_space_type] >= threshold:
+         space_type = max_space_type.lower()
+         # Note: "patio" is not in space_type_labels, so this remap only
+         # fires if that label is ever added.
+         if space_type == "patio":
+             space_type = "terrace"
+         image_data["image_context"] = space_type
+
+     # Terraces additionally get an equipment score from yes/no VQA questions.
+     image_results = {}
+     if image_data["image_context"] == "terrace":
+         for question in equipment_questions:
+             result = vqa_pipeline(processed_image, question, top_k=1)
+             image_results[question] = result[0]['answer'].lower() == "yes"
+         image_data["equipment_score"] = calculate_equipment_score(image_results, weights)
+
+     luminosity_score = calculate_luminosity_score(processed_image)
+     image_data["luminosity_score"] = luminosity_score['natural_light']
+
+     image_data["view_type"]["views"] = views(processed_image)
+
+     # Keep only the probability of the first certainty class (a genuine
+     # outward view through windows, a balcony or a terrace).
+     certainty_score = certainty(processed_image)
+     image_data["view_type"]["certainty_score"] = list(certainty_score.values())[0]
+
+     image_data["is_render"] = calculate_is_render(processed_image)
+
+     return image_data
+
+
+ image_input = gr.Image(type="pil", label="Upload Image")
+
+ # The returned dict is rendered as plain text in the UI.
+ iface = gr.Interface(
+     fn=generate_answer,
+     inputs=[image_input],
+     outputs="text",
+     title="Vision intelligence",
+     description="Upload an image"
+ )
+
+ iface.launch()
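
For reference, a minimal sketch of calling generate_answer outside the Gradio UI; the file name is hypothetical and the commented values are illustrative, but the keys mirror the dict assembled in the function above:

from PIL import Image

image = Image.open("sample_room.jpg")  # hypothetical local test photo
result = generate_answer(image)
print(result)
# Illustrative shape of the output (values made up):
# {'image_context': 'terrace',
#  'equipment_score': 0.35,    # set only when the context is "terrace"
#  'luminosity_score': 0.81,   # probability of natural light
#  'view_type': {'views': {'panoramic': 0.4, 'city': 0.3, 'greenery': 0.1, 'mountain': 0.1, 'sea': 0.1},
#                'certainty_score': 0.62},
#  'is_render': 0.04}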
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ transformers
+ accelerate
tokenizer/.DS_Store ADDED
Binary file (6.15 kB)
 
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "crop_size": 224,
+   "do_center_crop": true,
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "CLIPFeatureExtractor",
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "size": 224
+ }
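
This is the configuration that CLIPProcessor.from_pretrained("tokenizer") in app.py picks up. A minimal sketch to confirm the 224x224 resize/crop it specifies, using a throwaway solid-color image (illustrative only):

from PIL import Image
from transformers import CLIPProcessor

processor = CLIPProcessor.from_pretrained("tokenizer")  # reads the files in this directory
batch = processor(text=["a kitchen"], images=Image.new("RGB", (640, 480)), return_tensors="pt", padding=True)
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 224, 224]), matching size/crop_size above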
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": "<|endoftext|>"}
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": "<|endoftext|>", "add_prefix_space": false, "errors": "replace", "do_lower_case": true, "name_or_path": "./clip_ViT_B_32/", "model_max_length": 77}
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff