update gradio ui
- app.py +38 -4
- main.py +0 -20
- main_gradio.py +0 -84
- models/__pycache__/blip2_model.cpython-38.pyc +0 -0
- models/__pycache__/controlnet_model.cpython-38.pyc +0 -0
- models/__pycache__/gpt_model.cpython-38.pyc +0 -0
- models/__pycache__/grit_model.cpython-38.pyc +0 -0
- models/__pycache__/image_text_transformation.cpython-38.pyc +0 -0
- models/__pycache__/region_semantic.cpython-38.pyc +0 -0
- models/blip2_model.py +5 -7
- models/controlnet_model.py +7 -2
- models/gpt_model.py +10 -6
- models/grit_model.py +5 -4
- models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc +0 -0
- models/grit_src/image_dense_captions.py +5 -3
- models/image_text_transformation.py +23 -8
- models/region_semantic.py +6 -5
- models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc +0 -0
- models/segment_models/__pycache__/semgent_anything_model.cpython-38.pyc +0 -0
- models/segment_models/semantic_segment_anything_model.py +2 -3
- models/segment_models/semgent_anything_model.py +3 -4
app.py
CHANGED
@@ -5,6 +5,32 @@ from PIL import Image
 import base64
 from io import BytesIO
 from models.image_text_transformation import ImageTextTransformation
+import argparse
+import torch
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo', 'gpt4'], default='gpt-3.5-turbo')
+parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to True if you want to use BLIP2 Image Caption')
+parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to True if you want to use Dense Caption')
+parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=False, help='Set this flag to True if you want to use semantic segmentation')
+parser.add_argument('--image_caption_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
+parser.add_argument('--dense_caption_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, < 6G GPU is not recommended>')
+parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
+parser.add_argument('--contolnet_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, <6G GPU is not recommended>')
+
+args = parser.parse_args()
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+if device == "cuda":
+    args.image_caption_device = "cuda"
+    args.dense_caption_device = "cuda"
+    args.semantic_segment_device = "cuda"
+    args.contolnet_device = "cuda"
+else:
+    args.image_caption_device = "cpu"
+    args.dense_caption_device = "cpu"
+    args.semantic_segment_device = "cpu"
+    args.contolnet_device = "cpu"
 
 def pil_image_to_base64(image):
     buffered = BytesIO()
@@ -17,7 +43,8 @@ def add_logo():
         logo_base64 = base64.b64encode(f.read()).decode()
     return logo_base64
 
-def process_image(image_src, processor):
+def process_image(image_src, options, processor):
+    processor.args.semantic_segment = "Semantic Segment" in options
     gen_text = processor.image_to_text(image_src)
     gen_image = processor.text_to_image(gen_text)
     gen_image_str = pil_image_to_base64(gen_image)
@@ -38,10 +65,11 @@ def process_image(image_src, processor):
 
     return custom_output
 
-processor = ImageTextTransformation()
+processor = ImageTextTransformation(args)
 
 # Create Gradio input and output components
 image_input = gr.inputs.Image(type='filepath', label="Input Image")
+semantic_segment_checkbox = gr.inputs.Checkbox(label="Semantic Segment", default=False)
 
 logo_base64 = add_logo()
 # Create the title with the logo
@@ -49,12 +77,18 @@ title_with_logo = f'<img src="data:image/jpeg;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'
 
 # Create Gradio interface
 interface = gr.Interface(
-    fn=lambda image: process_image(image, processor),  # Pass the processor object using a lambda function
-    inputs=image_input,
+    fn=lambda image, options, devices: process_image(image, options, devices, processor),
+    inputs=[image_input,
+            gr.CheckboxGroup(
+                label="Options",
+                choices=["Semantic Segment"],
+            ),
+    ],
     outputs=gr.outputs.HTML(),
    title=title_with_logo,
    description="""
    This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
+    \n Semantic segment is very slow in cpu(~8m), best use on gpu or run local.
    """
 )

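Note that, as committed, the new lambda declares three parameters and forwards four positional arguments, while process_image accepts three and the interface only defines two input components (the image and the "Options" checkbox group). A minimal consistent wiring, offered as a sketch rather than what the commit contains, would be:

# Hypothetical correction (not in the commit): match the lambda arity to the two
# Gradio inputs and to process_image(image_src, options, processor).
interface = gr.Interface(
    fn=lambda image, options: process_image(image, options, processor),
    inputs=[
        image_input,
        gr.CheckboxGroup(label="Options", choices=["Semantic Segment"]),
    ],
    outputs=gr.outputs.HTML(),
    title=title_with_logo,
)
interface.launch()
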
main.py
DELETED
@@ -1,20 +0,0 @@
-import argparse
-from models.image_text_transformation import ImageTextTransformation
-from utils.util import display_images_and_text
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--image_src', default='examples/1.jpg')
-    parser.add_argument('--out_image_name', default='output/1_result.jpg')
-    args = parser.parse_args()
-
-    processor = ImageTextTransformation()
-    generated_text = processor.image_to_text(args.image_src)
-    generated_image = processor.text_to_image(generated_text)
-    ## then text to image
-    print("*" * 50)
-    print("Generated Text:")
-    print(generated_text)
-    print("*" * 50)
-
-    results = display_images_and_text(args.image_src, generated_image, generated_text, args.out_image_name)

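The deleted script built ImageTextTransformation() with no arguments, which no longer matches the new constructor. A minimal local-run sketch under the new interface (the image path is the same example the deleted script used; flag values are illustrative):

# Sketch of a local run now that main.py is gone: ImageTextTransformation only reads
# attribute-style flags, so a SimpleNamespace mirroring app.py's argparse options works.
from types import SimpleNamespace
from models.image_text_transformation import ImageTextTransformation

args = SimpleNamespace(
    image_caption=True, dense_caption=True, semantic_segment=False,
    image_caption_device="cpu", dense_caption_device="cpu",
    semantic_segment_device="cpu", contolnet_device="cpu",  # spelling follows the commit
)
processor = ImageTextTransformation(args)
generated_text = processor.image_to_text("examples/1.jpg")
generated_image = processor.text_to_image(generated_text)
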
main_gradio.py
DELETED
@@ -1,84 +0,0 @@
-import gradio as gr
-import cv2
-import numpy as np
-from PIL import Image
-import base64
-from io import BytesIO
-from models.image_text_transformation import ImageTextTransformation
-
-def pil_image_to_base64(image):
-    buffered = BytesIO()
-    image.save(buffered, format="JPEG")
-    img_str = base64.b64encode(buffered.getvalue()).decode()
-    return img_str
-
-def add_logo():
-    with open("examples/logo.png", "rb") as f:
-        logo_base64 = base64.b64encode(f.read()).decode()
-    return logo_base64
-
-def process_image(image_src, processor):
-    gen_text = processor.image_to_text(image_src)
-    gen_image = processor.text_to_image(gen_text)
-    gen_image_str = pil_image_to_base64(gen_image)
-    # Combine the outputs into a single HTML output
-    custom_output = f'''
-    <h2>Image->Text->Image:</h2>
-    <div style="display: flex; flex-wrap: wrap;">
-        <div style="flex: 1;">
-            <h3>Image2Text</h3>
-            <p>{gen_text}</p>
-        </div>
-        <div style="flex: 1;">
-            <h3>Text2Image</h3>
-            <img src="data:image/jpeg;base64,{gen_image_str}" width="100%" />
-        </div>
-    </div>
-    <h2>Using Source Image to do Retrieval on COCO:</h2>
-    <div style="display: flex; flex-wrap: wrap;">
-        <div style="flex: 1;">
-            <h3>Retrieval Top-3 Text</h3>
-            <p>{gen_text}</p>
-        </div>
-        <div style="flex: 1;">
-            <h3>Retrieval Top-3 Image</h3>
-            <img src="data:image/jpeg;base64,{gen_image_str}" width="100%" />
-        </div>
-    </div>
-    <h2>Using Generated texts to do Retrieval on COCO:</h2>
-    <div style="display: flex; flex-wrap: wrap;">
-        <div style="flex: 1;">
-            <h3>Retrieval Top-3 Text</h3>
-            <p>{gen_text}</p>
-        </div>
-        <div style="flex: 1;">
-            <h3>Retrieval Top-3 Image</h3>
-            <img src="data:image/jpeg;base64,{gen_image_str}" width="100%" />
-        </div>
-    </div>
-    '''
-
-    return custom_output
-
-processor = ImageTextTransformation()
-
-# Create Gradio input and output components
-image_input = gr.inputs.Image(type='filepath', label="Input Image")
-
-logo_base64 = add_logo()
-# Create the title with the logo
-title_with_logo = f'<img src="data:image/jpeg;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'
-
-# Create Gradio interface
-interface = gr.Interface(
-    fn=lambda image: process_image(image, processor),  # Pass the processor object using a lambda function
-    inputs=image_input,
-    outputs=gr.outputs.HTML(),
-    title=title_with_logo,
-    description="""
-    This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
-    """
-)
-
-# Launch the interface
-interface.launch()

models/__pycache__/blip2_model.cpython-38.pyc
CHANGED
Binary files a/models/__pycache__/blip2_model.cpython-38.pyc and b/models/__pycache__/blip2_model.cpython-38.pyc differ

models/__pycache__/controlnet_model.cpython-38.pyc
CHANGED
Binary files a/models/__pycache__/controlnet_model.cpython-38.pyc and b/models/__pycache__/controlnet_model.cpython-38.pyc differ

models/__pycache__/gpt_model.cpython-38.pyc
CHANGED
Binary files a/models/__pycache__/gpt_model.cpython-38.pyc and b/models/__pycache__/gpt_model.cpython-38.pyc differ

models/__pycache__/grit_model.cpython-38.pyc
CHANGED
Binary files a/models/__pycache__/grit_model.cpython-38.pyc and b/models/__pycache__/grit_model.cpython-38.pyc differ

models/__pycache__/image_text_transformation.cpython-38.pyc
CHANGED
Binary files a/models/__pycache__/image_text_transformation.cpython-38.pyc and b/models/__pycache__/image_text_transformation.cpython-38.pyc differ

models/__pycache__/region_semantic.cpython-38.pyc
CHANGED
Binary files a/models/__pycache__/region_semantic.cpython-38.pyc and b/models/__pycache__/region_semantic.cpython-38.pyc differ

models/blip2_model.py
CHANGED
@@ -5,14 +5,11 @@ import torch
 
 
 class ImageCaptioning:
-    def __init__(self):
-        self.device =
-        # self.processor, self.model = None, None
+    def __init__(self, device):
+        self.device = device
         self.processor, self.model = self.initialize_model()
 
     def initialize_model(self):
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        # self.device = "cpu" # for low gpu memory devices
         if self.device == 'cpu':
             self.data_type = torch.float32
         else:
@@ -29,9 +26,10 @@ class ImageCaptioning:
         inputs = self.processor(images=image, return_tensors="pt").to(self.device, self.data_type)
         generated_ids = self.model.generate(**inputs)
         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-        print('*'*100 + '\
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
+        print('\nStep1, BLIP2 caption:')
         print(generated_text)
-        print('\
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
         return generated_text
 
     def image_caption_debug(self, image_src):

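ImageCaptioning now trusts the caller's device string and keeps the float16-on-GPU / float32-on-CPU split shown in the context lines. A small standalone sketch of that dtype rule (the BLIP-2 checkpoint name is an assumption, not read from this diff):

# Sketch: the device string decides the dtype; float16 halves memory on GPU,
# float32 stays safe on CPU.
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
data_type = torch.float16 if device == "cuda" else torch.float32
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=data_type
).to(device)
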
models/controlnet_model.py
CHANGED
@@ -10,8 +10,8 @@ from diffusers import (
 
 
 class TextToImage:
-    def __init__(self):
-
+    def __init__(self, device):
+        self.device = device
         self.model = self.initialize_model()
 
     def initialize_model(self):
@@ -29,6 +29,7 @@ class TextToImage:
             pipeline.scheduler.config
         )
         pipeline.enable_model_cpu_offload()
+        pipeline.to(self.device)
         return pipeline
 
     @staticmethod
@@ -42,8 +43,12 @@ class TextToImage:
         return image
 
     def text_to_image(self, text, image):
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
+        print('\nStep5, Text to Image:')
         image = self.preprocess_image(image)
         generated_image = self.model(text, image, num_inference_steps=20).images[0]
+        print("Generated image has been svaed.")
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
         return generated_image
 
     def text_to_image_debug(self, text, image):

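The new pipeline.to(self.device) is layered on top of enable_model_cpu_offload(); the two overlap, since offloading already manages placement through accelerate hooks, and newer diffusers releases warn about or reject a .to() call once offloading is enabled. A sketch of the usual either/or choice, as an assumption about intent rather than what was committed:

# Hypothetical helper (not in the commit): pick one placement strategy per device.
def place_pipeline(pipeline, device):
    if device == "cuda":
        # Offload keeps GPU memory low by moving submodules on demand.
        pipeline.enable_model_cpu_offload()
    else:
        # On CPU there is nothing to offload; a plain move is enough.
        pipeline = pipeline.to(device)
    return pipeline
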
models/gpt_model.py
CHANGED
@@ -1,9 +1,10 @@
 import openai
 
 class ImageToText:
-    def __init__(self, api_key):
+    def __init__(self, api_key, gpt_version="gpt-3.5-turbo"):
         self.template = self.initialize_template()
         openai.api_key = api_key
+        self.gpt_version = gpt_version
 
     def initialize_template(self):
         prompt_prefix_1 = """Generate only an informative and nature paragraph based on the given information(a,b,c,d):\n"""
@@ -16,6 +17,7 @@ class ImageToText:
     Use nouns rather than coordinates to show position information of each object.
     No more than 7 sentences.
     Only use one paragraph.
+    Describe position detailedly.
     Do not appear number.
     """
         template = f"{prompt_prefix_1}{prompt_prefix_2}{{width}}X{{height}}{prompt_prefix_3}{{caption}}{prompt_prefix_4}{{dense_caption}}{prompt_prefix_5}{{region_semantic}}{prompt_suffix}"
@@ -23,15 +25,17 @@ class ImageToText:
 
     def paragraph_summary_with_gpt(self, caption, dense_caption, region_semantic, width, height):
         question = self.template.format(width=width, height=height, caption=caption, dense_caption=dense_caption, region_semantic=region_semantic)
-        print('*'*100)
-        print(
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
+        print('\nStep4, Paragraph Summary with GPT-3:')
+        print('\033[1;34m' + "Question:".ljust(10) + '\033[1;36m' + question + '\033[0m')
         completion = openai.ChatCompletion.create(
-            model=
+            model=self.gpt_version,
             messages = [
                 {"role": "user", "content" : question}]
         )
-
-        print('
+
+        print('\033[1;34m' + "ChatGPT Response:".ljust(18) + '\033[1;32m' + completion['choices'][0]['message']['content'] + '\033[0m')
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
         return completion['choices'][0]['message']['content']
 
     def paragraph_summary_with_gpt_debug(self, caption, dense_caption, width, height):

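app.py now parses --gpt_version and ImageToText accepts a gpt_version keyword, but the init_models change further down still calls ImageToText(openai_key) without forwarding it, so the default "gpt-3.5-turbo" is always used. A usage sketch of the new parameter (the caption strings are illustrative):

# Sketch: the stored version string becomes the `model` argument of ChatCompletion.create.
import os
from models.gpt_model import ImageToText

gpt = ImageToText(os.environ["OPENAI_KEY"], gpt_version="gpt-3.5-turbo")
paragraph = gpt.paragraph_summary_with_gpt(
    caption="a dog running on a beach",
    dense_caption="dog: [12, 30, 220, 190]; ",
    region_semantic="dog: [12, 30, 220, 190]; ",
    width=640,
    height=480,
)
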
models/grit_model.py
CHANGED
@@ -2,8 +2,8 @@ import os
 from models.grit_src.image_dense_captions import image_caption_api
 
 class DenseCaptioning():
-    def __init__(self):
-        self.
+    def __init__(self, device):
+        self.device = device
 
 
     def initialize_model(self):
@@ -18,9 +18,10 @@ class DenseCaptioning():
         return dense_caption
 
     def image_dense_caption(self, image_src):
-        dense_caption = image_caption_api(image_src)
+        dense_caption = image_caption_api(image_src, self.device)
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
         print("Step2, Dense Caption:\n")
         print(dense_caption)
-        print('\
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
         return dense_caption
 
models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc
CHANGED
Binary files a/models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc and b/models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc differ
models/grit_src/image_dense_captions.py
CHANGED
@@ -50,12 +50,14 @@ def setup_cfg(args):
     return cfg
 
 
-def get_parser():
+def get_parser(device):
     arg_dict = {'config_file': "models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml", 'cpu': False, 'confidence_threshold': 0.5, 'test_task': 'DenseCap', 'opts': ["MODEL.WEIGHTS", "pretrained_models/grit_b_densecap_objectdet.pth"]}
+    if device == "cpu":
+        arg_dict["cpu"] = True
     return arg_dict
 
-def image_caption_api(image_src):
-    args2 = get_parser()
+def image_caption_api(image_src, device):
+    args2 = get_parser(device)
     cfg = setup_cfg(args2)
     demo = VisualizationDemo(cfg)
     if image_src:

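With the cpu flag now derived from the caller's device string, the GRiT entry point can be exercised directly; a short usage sketch (the image path is illustrative):

# Sketch: "cpu" sets arg_dict["cpu"] = True before setup_cfg builds the detectron2
# config, while "cuda" leaves the default GPU inference path untouched.
from models.grit_src.image_dense_captions import image_caption_api

dense_caption = image_caption_api("examples/1.jpg", "cpu")
print(dense_caption)
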
models/image_text_transformation.py
CHANGED
@@ -18,27 +18,42 @@ def pil_image_to_base64(image):
 
 
 class ImageTextTransformation:
-    def __init__(self):
+    def __init__(self, args):
         # Load your big model here
+        self.args = args
         self.init_models()
         self.ref_image = None
 
     def init_models(self):
         openai_key = os.environ['OPENAI_KEY']
-        self.image_caption_model = ImageCaptioning()
-        self.dense_caption_model = DenseCaptioning()
+        print('\033[1;34m' + "Welcome to the Image2Paragraph toolbox...".center(50, '-') + '\033[0m')
+        print('\033[1;33m' + "Initializing models...".center(50, '-') + '\033[0m')
+        print('\033[1;31m' + "This is time-consuming, please wait...".center(50, '-') + '\033[0m')
+        self.image_caption_model = ImageCaptioning(device=self.args.image_caption_device)
+        self.dense_caption_model = DenseCaptioning(device=self.args.dense_caption_device)
         self.gpt_model = ImageToText(openai_key)
-        self.controlnet_model = TextToImage()
-        self.region_semantic_model = RegionSemantic()
+        self.controlnet_model = TextToImage(device=self.args.contolnet_device)
+        self.region_semantic_model = RegionSemantic(device=self.args.semantic_segment_device)
+        print('\033[1;32m' + "Model initialization finished!".center(50, '-') + '\033[0m')
 
 
     def image_to_text(self, img_src):
         # the information to generate paragraph based on the context
         self.ref_image = Image.open(img_src)
         width, height = read_image_width_height(img_src)
-        image_caption = self.image_caption_model.image_caption(img_src)
-        dense_caption = self.dense_caption_model.image_dense_caption(img_src)
-        region_semantic = self.region_semantic_model.region_semantic(img_src)
+        print(self.args)
+        if self.args.image_caption:
+            image_caption = self.image_caption_model.image_caption(img_src)
+        else:
+            image_caption = " "
+        if self.args.dense_caption:
+            dense_caption = self.dense_caption_model.image_dense_caption(img_src)
+        else:
+            dense_caption = " "
+        if self.args.semantic_segment:
+            region_semantic = self.region_semantic_model.region_semantic(img_src)
+        else:
+            region_semantic = " "
         generated_text = self.gpt_model.paragraph_summary_with_gpt(image_caption, dense_caption, region_semantic, width, height)
         return generated_text
 
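init_models still reads os.environ['OPENAI_KEY'] directly, so a missing secret raises a bare KeyError before any model loads. A defensive lookup, offered as a sketch of an assumption rather than something in the commit, would fail with a clearer message:

# Sketch: make a missing OPENAI_KEY fail loudly and early.
import os

openai_key = os.environ.get("OPENAI_KEY")
if not openai_key:
    raise RuntimeError("OPENAI_KEY is not set; add it as a Space secret or environment variable before starting the app.")
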
models/region_semantic.py
CHANGED
@@ -3,12 +3,13 @@ from models.segment_models.semantic_segment_anything_model import SemanticSegment
 
 
 class RegionSemantic():
-    def __init__(self):
+    def __init__(self, device):
+        self.device = device
         self.init_models()
 
     def init_models(self):
-        self.segment_model = SegmentAnything()
-        self.semantic_segment_model = SemanticSegment()
+        self.segment_model = SegmentAnything(self.device)
+        self.semantic_segment_model = SemanticSegment(self.device)
 
     def semantic_prompt_gen(self, anns):
         """
@@ -21,12 +22,12 @@ class RegionSemantic():
         # Select the top 10 largest regions
         top_10_largest_regions = sorted_annotations[:10]
         semantic_prompt = ""
-        print('*'*100)
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
         print("\nStep3, Semantic Prompt:")
         for region in top_10_largest_regions:
             semantic_prompt += region['class_name'] + ': ' + str(region['bbox']) + "; "
         print(semantic_prompt)
-        print('*'*100)
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
         return semantic_prompt
 
     def region_semantic(self, img_src):

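semantic_prompt_gen keeps only the ten largest regions and serializes them as "class: bbox; " pairs. A self-contained sketch of that formatting, assuming the annotations carry the SAM-style 'area' field plus the 'class_name' added by the semantic model:

# Sketch with made-up annotations; only the formatting mirrors semantic_prompt_gen.
anns = [
    {"class_name": "sky", "bbox": [0, 0, 640, 200], "area": 128000},
    {"class_name": "dog", "bbox": [10, 220, 200, 180], "area": 36000},
    {"class_name": "ball", "bbox": [300, 350, 40, 40], "area": 1600},
]
sorted_annotations = sorted(anns, key=lambda a: a["area"], reverse=True)
semantic_prompt = ""
for region in sorted_annotations[:10]:
    semantic_prompt += region["class_name"] + ": " + str(region["bbox"]) + "; "
print(semantic_prompt)  # sky: [0, 0, 640, 200]; dog: [10, 220, 200, 180]; ball: [300, 350, 40, 40];
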
models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc
CHANGED
Binary files a/models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc and b/models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc differ

models/segment_models/__pycache__/semgent_anything_model.cpython-38.pyc
CHANGED
Binary files a/models/segment_models/__pycache__/semgent_anything_model.cpython-38.pyc and b/models/segment_models/__pycache__/semgent_anything_model.cpython-38.pyc differ

models/segment_models/semantic_segment_anything_model.py
CHANGED
@@ -15,12 +15,11 @@ from models.segment_models.configs.coco_id2label import CONFIG as CONFIG_COCO_ID
 nlp = spacy.load('en_core_web_sm')
 
 class SemanticSegment():
-    def __init__(self):
+    def __init__(self, device):
+        self.device = device
         self.model_init()
 
     def model_init(self):
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        # self.device = 'cpu'
         self.init_clip()
         self.init_oneformer_ade20k()
         self.init_oneformer_coco()

models/segment_models/semgent_anything_model.py
CHANGED
@@ -3,14 +3,13 @@ from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
 import torch
 
 class SegmentAnything:
-    def __init__(self, arch="vit_h", pretrained_weights="pretrained_models/sam_vit_h_4b8939.pth"):
-
+    def __init__(self, device, arch="vit_h", pretrained_weights="pretrained_models/sam_vit_h_4b8939.pth"):
+        self.device = device
         self.model = self.initialize_model(arch, pretrained_weights)
 
     def initialize_model(self, arch, pretrained_weights):
-        device = "cuda" if torch.cuda.is_available() else "cpu"
         sam = sam_model_registry[arch](checkpoint=pretrained_weights)
-        sam.to(device=device)
+        sam.to(device=self.device)
         mask_generator = SamAutomaticMaskGenerator(sam)
         return mask_generator
 
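SegmentAnything now places the SAM backbone on the caller's device before wrapping it in SamAutomaticMaskGenerator. A usage sketch (the example image path is illustrative):

# Sketch: the generator expects an RGB numpy array and returns one dict per mask
# ('segmentation', 'bbox', 'area', ...), which RegionSemantic consumes downstream.
import numpy as np
from PIL import Image
from models.segment_models.semgent_anything_model import SegmentAnything

segmenter = SegmentAnything(device="cpu")
image = np.array(Image.open("examples/1.jpg").convert("RGB"))
masks = segmenter.model.generate(image)
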