Awiny committed on
Commit
b510b75
·
1 Parent(s): 5d6f4ba

update SAM version

Browse files
app_w_sam.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import cv2
3
+ import numpy as np
4
+ from PIL import Image
5
+ import base64
6
+ from io import BytesIO
7
+ from models.image_text_transformation import ImageTextTransformation
8
+ import argparse
9
+ import torch
10
+
11
# ---------------------------------------------------------------------------
# Command-line options
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument(
    '--gpt_version',
    choices=['gpt-3.5-turbo', 'gpt4'],
    default='gpt-3.5-turbo',
)

# Feature switches. NOTE(review): each is `store_true` with `default=True`,
# so the value is True whether or not the flag is given on the command line.
parser.add_argument(
    '--image_caption',
    action='store_true',
    default=True,
    help='Set this flag to True if you want to use BLIP2 Image Caption',
)
parser.add_argument(
    '--dense_caption',
    action='store_true',
    default=True,
    help='Set this flag to True if you want to use Dense Caption',
)
parser.add_argument(
    '--semantic_segment',
    action='store_true',
    default=True,
    help='Set this flag to True if you want to use semantic segmentation',
)

# Per-model device options. The option name `contolnet_device` (sic) is read
# under that exact spelling elsewhere, so it must not be "corrected" here.
parser.add_argument(
    '--image_caption_device',
    choices=['cuda', 'cpu'],
    default='cpu',
    help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended',
)
parser.add_argument(
    '--dense_caption_device',
    choices=['cuda', 'cpu'],
    default='cpu',
    help='Select the device: cuda or cpu, < 6G GPU is not recommended>',
)
parser.add_argument(
    '--semantic_segment_device',
    choices=['cuda', 'cpu'],
    default='cpu',
    help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended',
)
parser.add_argument(
    '--contolnet_device',
    choices=['cuda', 'cpu'],
    default='cpu',
    help='Select the device: cuda or cpu, <6G GPU is not recommended>',
)

args = parser.parse_args()

# ---------------------------------------------------------------------------
# Device selection
# ---------------------------------------------------------------------------
# The device values parsed above are overwritten unconditionally: the image
# captioner always runs on CPU, every other model follows CUDA availability.
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"

args.image_caption_device = "cpu"
args.dense_caption_device = device
args.semantic_segment_device = device
args.contolnet_device = device
36
+
37
def pil_image_to_base64(image):
    """Encode an image as a base64 string of its JPEG representation.

    JPEG has no alpha channel or palette support, so an image whose mode the
    JPEG encoder cannot write (e.g. ``RGBA``, ``LA``, ``P``) is converted to
    ``RGB`` first instead of letting ``save`` raise ``OSError``.

    Args:
        image: An object with the PIL ``Image`` interface (``mode``,
            ``convert`` and ``save``).

    Returns:
        str: Base64 text of the JPEG bytes (without any ``data:`` URI prefix).
    """
    # Modes Pillow's JPEG writer accepts directly; anything else is
    # normalised to RGB before encoding.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if getattr(image, "mode", "RGB") not in jpeg_writable_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()
42
+
43
def add_logo():
    """Return the contents of ``examples/logo.png`` as base64 text.

    The path is resolved relative to the current working directory.

    Returns:
        str: Base64 encoding of the raw file bytes.
    """
    with open("examples/logo.png", "rb") as logo_file:
        raw_bytes = logo_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()
47
+
48
def process_image(image_src, options=None, processor=None):
    """Run the image->text pipeline (optionally text->image) and render HTML.

    Args:
        image_src: Input image reference, passed unchanged to
            ``processor.image_to_text``.
        options: Collection of selected option labels. ``"Semantic Segment"``
            sets ``processor.args.semantic_segment``; ``"Image Generation"``
            additionally calls ``processor.text_to_image`` on the generated
            text. ``None`` is treated as no option selected.
        processor: Object providing ``args``, ``image_to_text`` and
            ``text_to_image``.

    Returns:
        str: An HTML fragment with the image caption, dense caption, region
        semantics and generated paragraph, followed by the generated image
        (as an inline base64 JPEG) when image generation was requested.
    """
    # Local import so the block does not depend on a file-level import.
    import html

    print(options)
    if options is None:
        options = []
    processor.args.semantic_segment = "Semantic Segment" in options
    image_generation_status = "Image Generation" in options
    image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text(image_src)

    # The raw (unescaped) text is what the image generator must receive.
    generated_section = ''
    if image_generation_status:
        gen_image = processor.text_to_image(gen_text)
        gen_image_str = pil_image_to_base64(gen_image)
        generated_section = f'''
        <h2>Text->Image:</h2>
        <div style="display: flex; flex-wrap: wrap;">
            <div style="flex: 1;">
                <h3>Generated Image</h3>
                <img src="data:image/jpeg;base64,{gen_image_str}" width="400" style="vertical-align: middle;">
            </div>
        </div>
        '''

    # Model output is free text: escape it so characters such as '<' or '&'
    # are displayed literally rather than interpreted as markup.
    safe_caption, safe_dense, safe_region, safe_text = (
        html.escape(str(part))
        for part in (image_caption, dense_caption, region_semantic, gen_text)
    )

    # Combine the outputs into a single HTML output
    custom_output = f'''
    <h2>Image->Text:</h2>
    <div style="display: flex; flex-wrap: wrap;">
        <div style="flex: 1;">
            <h3>Image Caption</h3>
            <p>{safe_caption}</p>
        </div>
        <div style="flex: 1;">
            <h3>Dense Caption</h3>
            <p>{safe_dense}</p>
        </div>
        <div style="flex: 1;">
            <h3>Region Semantic</h3>
            <p>{safe_region}</p>
        </div>
    </div>
    <div style="display: flex; flex-wrap: wrap;">
        <div style="flex: 1;">
            <h3>GPT4 Reasoning:</h3>
            <p>{safe_text}</p>
        </div>
    </div>
    '''
    return custom_output + generated_section
93
+
94
# Build the pipeline object once; it is shared by every request handled below.
processor = ImageTextTransformation(args)

# Create Gradio input and output components
image_input = gr.inputs.Image(type='filepath', label="Input Image")
# NOTE(review): these two checkboxes are never passed to the interface (the
# CheckboxGroup below supplies the options); kept so the module-level names
# remain available.
semantic_segment_checkbox = gr.inputs.Checkbox(label="Semantic Segment", default=False)
image_generation_checkbox = gr.inputs.Checkbox(label="Image Generation", default=False)


# Markdown badges shown above the description text.
extra_title = r'![vistors](https://visitor-badge.glitch.me/badge?page_id=fingerrec.Image2Paragraph)' + '\n' + \
              r'[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-md-dark.svg)](https://huggingface.co/spaces/Awiny/Image2Paragraph?duplicate=true)' + '\n\n'


logo_base64 = add_logo()
# Create the title with the logo. The logo file is a PNG, so the data URI
# declares image/png (it previously claimed image/jpeg).
title_with_logo = \
    f'<img src="data:image/png;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'

examples = [
    ["examples/test_4.jpg"],
]

# Create Gradio interface
interface = gr.Interface(
    # Bind the shared processor; Gradio supplies the image and selected options.
    fn=lambda image, options: process_image(image, options, processor),
    inputs=[image_input,
            gr.CheckboxGroup(
                label="Options",
                choices=["Image Generation", "Semantic Segment"],
            ),
            ],
    outputs=gr.outputs.HTML(),
    title=title_with_logo,
    examples=examples,
    description=extra_title +"""
    Image.txt. This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
    \n Github: https://github.com/showlab/Image2Paragraph
    \n Twitter: https://twitter.com/awinyimgprocess/status/1646225454599372800?s=46&t=HvOe9T2n35iFuCHP5aIHpQ
    \n Since GPU is expensive, we use CPU for demo and not include semantic segment anything. Run code local with gpu or google colab we provided for fast speed.
    \n Ttext2image model is controlnet ( very slow in cpu(~2m)), which used canny edge as reference.
    \n To speed up, we generate image with small size 384, run the code local for high-quality sample.
    """
)

# Launch the interface
interface.launch()
models/__pycache__/image_text_transformation.cpython-38.pyc CHANGED
Binary files a/models/__pycache__/image_text_transformation.cpython-38.pyc and b/models/__pycache__/image_text_transformation.cpython-38.pyc differ
 
models/blip2_model.py CHANGED
@@ -1,6 +1,6 @@
1
  from PIL import Image
2
  import requests
3
- from transformers import Blip2Processor, Blip2ForConditionalGeneration
4
  import torch
5
  from utils.util import resize_long_edge
6
 
@@ -15,10 +15,13 @@ class ImageCaptioning:
15
  self.data_type = torch.float32
16
  else:
17
  self.data_type = torch.float16
18
- processor = Blip2Processor.from_pretrained("pretrained_models/blip2-opt-2.7b")
19
- model = Blip2ForConditionalGeneration.from_pretrained(
20
- "pretrained_models/blip2-opt-2.7b", torch_dtype=self.data_type
21
- )
 
 
 
22
  model.to(self.device)
23
  return processor, model
24
 
 
1
  from PIL import Image
2
  import requests
3
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration, BlipProcessor, BlipForConditionalGeneration
4
  import torch
5
  from utils.util import resize_long_edge
6
 
 
15
  self.data_type = torch.float32
16
  else:
17
  self.data_type = torch.float16
18
+ # uncomment to load a stronger captioner
19
+ # processor = Blip2Processor.from_pretrained("pretrained_models/blip2-opt-2.7b")
20
+ # model = Blip2ForConditionalGeneration.from_pretrained(
21
+ # "pretrained_models/blip2-opt-2.7b", torch_dtype=self.data_type
22
+ # )
23
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
24
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
25
  model.to(self.device)
26
  return processor, model
27
 
models/image_text_transformation.py CHANGED
@@ -35,7 +35,8 @@ class ImageTextTransformation:
35
  self.gpt_model = ImageToText(openai_key)
36
  self.controlnet_model = TextToImage(device=self.args.contolnet_device)
37
  # time-consuming on CPU, run locally
38
- # self.region_semantic_model = RegionSemantic(device=self.args.semantic_segment_device)
 
39
  print('\033[1;32m' + "Model initialization finished!".center(50, '-') + '\033[0m')
40
 
41
 
 
35
  self.gpt_model = ImageToText(openai_key)
36
  self.controlnet_model = TextToImage(device=self.args.contolnet_device)
37
  # time-consuming on CPU, run locally
38
+ if self.args.semantic_segment:
39
+ self.region_semantic_model = RegionSemantic(device=self.args.semantic_segment_device)
40
  print('\033[1;32m' + "Model initialization finished!".center(50, '-') + '\033[0m')
41
 
42
 
models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc CHANGED
Binary files a/models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc and b/models/segment_models/__pycache__/semantic_segment_anything_model.cpython-38.pyc differ
 
models/segment_models/semantic_segment_anything_model.py CHANGED
@@ -27,27 +27,30 @@ class SemanticSegment():
27
  self.init_clipseg()
28
 
29
  def init_clip(self):
30
- model_name = "pretrained_models/clip-vit-large-patch14"
 
31
  self.clip_processor = CLIPProcessor.from_pretrained(model_name)
32
  self.clip_model = CLIPModel.from_pretrained(model_name).to(self.device)
33
 
34
  def init_oneformer_ade20k(self):
35
- model_name = "pretrained_models/oneformer_ade20k_swin_large"
 
36
  self.oneformer_ade20k_processor = OneFormerProcessor.from_pretrained(model_name)
37
  self.oneformer_ade20k_model = OneFormerForUniversalSegmentation.from_pretrained(model_name).to(self.device)
38
 
39
  def init_oneformer_coco(self):
40
- model_name = "pretrained_models/oneformer_coco_swin_large"
41
  self.oneformer_coco_processor = OneFormerProcessor.from_pretrained(model_name)
42
  self.oneformer_coco_model = OneFormerForUniversalSegmentation.from_pretrained(model_name).to(self.device)
43
 
44
  def init_blip(self):
45
- model_name = "pretrained_models/blip-image-captioning-large"
 
46
  self.blip_processor = BlipProcessor.from_pretrained(model_name)
47
  self.blip_model = BlipForConditionalGeneration.from_pretrained(model_name).to(self.device)
48
 
49
  def init_clipseg(self):
50
- model_name = "pretrained_models/clipseg-rd64-refined"
51
  self.clipseg_processor = AutoProcessor.from_pretrained(model_name)
52
  self.clipseg_model = CLIPSegForImageSegmentation.from_pretrained(model_name).to(self.device)
53
  self.clipseg_processor.image_processor.do_resize = False
 
27
  self.init_clipseg()
28
 
29
  def init_clip(self):
30
+ # model_name = "openai/clip-vit-large-patch14"
31
+ model_name = "openai/clip-vit-base-patch32"
32
  self.clip_processor = CLIPProcessor.from_pretrained(model_name)
33
  self.clip_model = CLIPModel.from_pretrained(model_name).to(self.device)
34
 
35
  def init_oneformer_ade20k(self):
36
+ # model_name = "shi-labs/oneformer_ade20k_swin_large"
37
+ model_name = "shi-labs/oneformer_ade20k_swin_tiny"
38
  self.oneformer_ade20k_processor = OneFormerProcessor.from_pretrained(model_name)
39
  self.oneformer_ade20k_model = OneFormerForUniversalSegmentation.from_pretrained(model_name).to(self.device)
40
 
41
  def init_oneformer_coco(self):
42
+ model_name = "shi-labs/oneformer_coco_swin_large"
43
  self.oneformer_coco_processor = OneFormerProcessor.from_pretrained(model_name)
44
  self.oneformer_coco_model = OneFormerForUniversalSegmentation.from_pretrained(model_name).to(self.device)
45
 
46
  def init_blip(self):
47
+ model_name = "Salesforce/blip-image-captioning-base"
48
+ # model_name = "Salesforce/blip-image-captioning-large"
49
  self.blip_processor = BlipProcessor.from_pretrained(model_name)
50
  self.blip_model = BlipForConditionalGeneration.from_pretrained(model_name).to(self.device)
51
 
52
  def init_clipseg(self):
53
+ model_name = "CIDAS/clipseg-rd64-refined"
54
  self.clipseg_processor = AutoProcessor.from_pretrained(model_name)
55
  self.clipseg_model = CLIPSegForImageSegmentation.from_pretrained(model_name).to(self.device)
56
  self.clipseg_processor.image_processor.do_resize = False