ivelin committed on
Commit
45a140a
1 Parent(s): ec06acb

fix: hf app prediction coordinate adjustment

Browse files

Signed-off-by: ivelin <ivelin.eth@gmail.com>

Inference_Playground_Donut_UI_RefExp_Gradio.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -19,8 +19,8 @@ processor.image_processor.do_align_long_axis = False
19
  processor.image_processor.do_resize = False
20
  processor.image_processor.do_thumbnail = False
21
  processor.image_processor.do_pad = False
22
- processor.image_processor.do_rescale = False
23
-
24
  print(f'processor image size: {processor.image_processor.size}')
25
 
26
  model = VisionEncoderDecoderModel.from_pretrained(
@@ -41,9 +41,10 @@ def prepare_image_for_encoder(image=None, output_image_size=None):
41
  """
42
  assert image is not None
43
  assert output_image_size is not None
44
- image.thumbnail(output_image_size)
45
- oimg = Image.new(mode=image.mode, size=output_image_size, color=0)
46
- oimg.paste(image, box=(0, 0))
 
47
  return oimg
48
 
49
 
@@ -93,7 +94,12 @@ def process_refexp(image: Image, prompt: str):
93
  prompt = prompt[:80].lower()
94
 
95
  # prepare encoder inputs
96
- pixel_values = processor(image, return_tensors="pt").pixel_values
 
 
 
 
 
97
 
98
  # prepare decoder inputs
99
  task_prompt = "<s_refexp><s_prompt>{user_input}</s_prompt><s_target_center>"
@@ -146,21 +152,20 @@ def process_refexp(image: Image, prompt: str):
146
  except ValueError:
147
  y = 0
148
  # replace str with float coords
149
- center_point = {"x": x, "y": y, "decoder output sequence": sequence}
 
150
  print(f"predicted center_point with float coordinates: {center_point}")
151
 
152
- print(f"image object: {image}")
153
- print(f"image size: {image.size}")
154
- width, height = image.size
155
- print(f"image width, height: {width, height}")
156
  print(f"processed prompt: {prompt}")
157
 
158
  # convert coordinates from tensor image size to input image size
159
  out_size = (
160
  processor.image_processor.size['width'], processor.image_processor.size['height'])
161
  translate_point_coords_from_out_to_in(
162
- point=center_point, input_image_size=image.size, output_image_size=out_size)
163
 
 
164
  x = math.floor(width*center_point["x"])
165
  y = math.floor(height*center_point["y"])
166
 
 
19
  processor.image_processor.do_resize = False
20
  processor.image_processor.do_thumbnail = False
21
  processor.image_processor.do_pad = False
22
+ # processor.image_processor.do_rescale = False
23
+ processor.image_processor.do_normalize = True
24
  print(f'processor image size: {processor.image_processor.size}')
25
 
26
  model = VisionEncoderDecoderModel.from_pretrained(
 
41
  """
42
  assert image is not None
43
  assert output_image_size is not None
44
+ img2 = image.copy()
45
+ img2.thumbnail(output_image_size)
46
+ oimg = Image.new(mode=img2.mode, size=output_image_size, color=0)
47
+ oimg.paste(img2, box=(0, 0))
48
  return oimg
49
 
50
 
 
94
  prompt = prompt[:80].lower()
95
 
96
  # prepare encoder inputs
97
+ out_size = (
98
+ processor.image_processor.size['width'], processor.image_processor.size['height'])
99
+ in_size = image.size
100
+ prepped_image = prepare_image_for_encoder(
101
+ image, output_image_size=out_size)
102
+ pixel_values = processor(prepped_image, return_tensors="pt").pixel_values
103
 
104
  # prepare decoder inputs
105
  task_prompt = "<s_refexp><s_prompt>{user_input}</s_prompt><s_target_center>"
 
152
  except ValueError:
153
  y = 0
154
  # replace str with float coords
155
+ center_point = {"x": x, "y": y,
156
+ "decoder output sequence (before x,y adjustment)": sequence}
157
  print(f"predicted center_point with float coordinates: {center_point}")
158
 
159
+ print(f"input image size: {in_size}")
 
 
 
160
  print(f"processed prompt: {prompt}")
161
 
162
  # convert coordinates from tensor image size to input image size
163
  out_size = (
164
  processor.image_processor.size['width'], processor.image_processor.size['height'])
165
  translate_point_coords_from_out_to_in(
166
+ point=center_point, input_image_size=in_size, output_image_size=out_size)
167
 
168
+ width, height = in_size
169
  x = math.floor(width*center_point["x"])
170
  y = math.floor(height*center_point["y"])
171
 
requirements.txt CHANGED
@@ -2,3 +2,4 @@ torch
2
  git+https://github.com/huggingface/transformers.git
3
  sentencepiece
4
  Pillow
 
 
2
  git+https://github.com/huggingface/transformers.git
3
  sentencepiece
4
  Pillow
5
+ gradio