ivelin committed
Commit ec06acb
1 Parent(s): 12b6ec4

fix: image preprocessing


Signed-off-by: ivelin <ivelin.eth@gmail.com>

.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__/
Inference_Playground_Donut_UI_RefExp_Gradio.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -12,34 +12,64 @@ pretrained_revision = 'main'
 # use 'main' for latest revision
 print(f"Loading model checkpoint: {pretrained_repo_name}")
 
-processor = DonutProcessor.from_pretrained(pretrained_repo_name, revision=pretrained_revision, use_auth_token="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")
-model = VisionEncoderDecoderModel.from_pretrained(pretrained_repo_name, use_auth_token="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", revision=pretrained_revision)
+processor = DonutProcessor.from_pretrained(
+    pretrained_repo_name, revision=pretrained_revision, use_auth_token="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")
+processor.image_processor.do_align_long_axis = False
+# do not manipulate image size and position
+processor.image_processor.do_resize = False
+processor.image_processor.do_thumbnail = False
+processor.image_processor.do_pad = False
+processor.image_processor.do_rescale = False
+
+print(f'processor image size: {processor.image_processor.size}')
+
+model = VisionEncoderDecoderModel.from_pretrained(
+    pretrained_repo_name, use_auth_token="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", revision=pretrained_revision)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 
+
+def prepare_image_for_encoder(image=None, output_image_size=None):
+    """
+    First, resizes the input image to fill as much as possible of the output image size
+    while preserving aspect ratio. Positions the resized image at (0,0) and fills
+    the rest of the gap space in the output image with black(0).
+    Args:
+        image: PIL image
+        output_image_size: (width, height) tuple
+    """
+    assert image is not None
+    assert output_image_size is not None
+    image.thumbnail(output_image_size)
+    oimg = Image.new(mode=image.mode, size=output_image_size, color=0)
+    oimg.paste(image, box=(0, 0))
+    return oimg
+
+
 def translate_point_coords_from_out_to_in(point=None, input_image_size=None, output_image_size=None):
     """
     Convert relative prediction coordinates from resized encoder tensor image
     to original input image size.
     Args:
-        original_point: x, y coordinates of the point coordinates in [0..1] range in the original image
+        original_point: x, y coordinates of the point coordinates in [0..1] range in the original image
         input_image_size: (width, height) tuple
         output_image_size: (width, height) tuple
-    """
+    """
     assert point is not None
     assert input_image_size is not None
     assert output_image_size is not None
-    # print(f"point={point}, input_image_size={input_image_size}, output_image_size={output_image_size}")
+    print(
+        f"point={point}, input_image_size={input_image_size}, output_image_size={output_image_size}")
     input_width, input_height = input_image_size
     output_width, output_height = output_image_size
-
+
     ratio = min(output_width/input_width, output_height/input_height)
-
+
     resized_height = int(input_height*ratio)
-    # print(f'>>> resized_height={resized_height}')
     resized_width = int(input_width*ratio)
-    # print(f'>>> resized_width={resized_width}')
+    print(f'>>> resized_width={resized_width}')
+    print(f'>>> resized_height={resized_height}')
 
     if resized_height == input_height and resized_width == input_width:
         return
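
The hunk above disables every geometric step of the Donut image processor (do_align_long_axis, do_resize, do_thumbnail, do_pad, plus do_rescale) and moves that responsibility into the new prepare_image_for_encoder helper, so the app controls exactly where the screenshot lands on the encoder canvas. A minimal sketch of how the pieces are meant to fit together, assuming the processor settings above; the file name and the RGB conversion are illustrative, not from the commit:

    from PIL import Image

    # Encoder canvas size comes from the processor config
    # (a {'height': ..., 'width': ...} dict in this transformers version).
    encoder_size = (processor.image_processor.size['width'],
                    processor.image_processor.size['height'])

    # Hypothetical input; the helper letterboxes it at (0, 0) over black padding.
    image = Image.open("screenshot.png").convert("RGB")
    image = prepare_image_for_encoder(image, output_image_size=encoder_size)

    # With the geometric steps disabled, the processor is left to tensorize
    # (and, per the remaining config, normalize) the already-prepared image.
    pixel_values = processor(image, return_tensors="pt").pixel_values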
@@ -51,8 +81,9 @@ def translate_point_coords_from_out_to_in(point=None, input_image_size=None, out
     if resized_height < output_height:
         # adjust for padding pixels
         point['y'] *= (output_height / resized_height)
-    # print(f"translated point={point}, resized_image_size: {resized_width, resized_height}")
-
+    print(
+        f"translated point={point}, resized_image_size: {resized_width, resized_height}")
+
 
 def process_refexp(image: Image, prompt: str):
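
translate_point_coords_from_out_to_in undoes that letterboxing on the way back out: the model predicts coordinates relative to the padded encoder canvas, and because the image is pasted at (0, 0), padding only extends right and down, so scaling by canvas size over resized size restores image-relative coordinates. A worked example with hypothetical numbers (assuming the x branch, elided from this diff, mirrors the y branch shown here):

    # A 1000x500 screenshot on a 1280x960 encoder canvas:
    # ratio = min(1280/1000, 960/500) = 1.28, so the pasted image is 1280x640
    # and the bottom 320 rows of the canvas are black padding.
    point = {'x': 0.5, 'y': 0.25}
    translate_point_coords_from_out_to_in(
        point=point, input_image_size=(1000, 500), output_image_size=(1280, 960))
    print(point)  # x unchanged (no horizontal padding); y = 0.25 * 960/640 = 0.375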
 
@@ -125,9 +156,11 @@ def process_refexp(image: Image, prompt: str):
     print(f"processed prompt: {prompt}")
 
     # convert coordinates from tensor image size to input image size
-    out_size = (processor.image_processor.size[1], processor.image_processor.size[0])
-    translate_point_coords_from_out_to_in(point=center_point, input_image_size=image.size, output_image_size=out_size)
-
+    out_size = (
+        processor.image_processor.size['width'], processor.image_processor.size['height'])
+    translate_point_coords_from_out_to_in(
+        point=center_point, input_image_size=image.size, output_image_size=out_size)
+
     x = math.floor(width*center_point["x"])
     y = math.floor(height*center_point["y"])
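
The out_size change tracks a transformers API shift: the left side of the hunk indexes image_processor.size as a height-first sequence (size[1], size[0]), while the right side reads it as a {'height': ..., 'width': ...} dict, the form newer releases expose. If the app had to run against both versions, a hedged compatibility shim could look like this (assumption: size takes only these two forms, as the two sides of the hunk show):

    size = processor.image_processor.size
    if isinstance(size, dict):
        out_size = (size['width'], size['height'])
    else:
        out_size = (size[1], size[0])  # older form stores (height, width)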
 
@@ -183,4 +216,4 @@ demo = gr.Interface(fn=process_refexp,
                     cache_examples=False
                     )
 
-demo.launch()
+demo.launch(share=True)
 