ivelin committed
Commit ec06acb
1 Parent(s): 12b6ec4

fix: image preprocessing


Signed-off-by: ivelin <ivelin.eth@gmail.com>

.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__/
Inference_Playground_Donut_UI_RefExp_Gradio.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -12,34 +12,64 @@ pretrained_revision = 'main'
 # use 'main' for latest revision
 print(f"Loading model checkpoint: {pretrained_repo_name}")
 
-processor = DonutProcessor.from_pretrained(pretrained_repo_name, revision=pretrained_revision, use_auth_token="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")
-model = VisionEncoderDecoderModel.from_pretrained(pretrained_repo_name, use_auth_token="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", revision=pretrained_revision)
+processor = DonutProcessor.from_pretrained(
+    pretrained_repo_name, revision=pretrained_revision, use_auth_token="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")
+processor.image_processor.do_align_long_axis = False
+# do not manipulate image size and position
+processor.image_processor.do_resize = False
+processor.image_processor.do_thumbnail = False
+processor.image_processor.do_pad = False
+processor.image_processor.do_rescale = False
+
+print(f'processor image size: {processor.image_processor.size}')
+
+model = VisionEncoderDecoderModel.from_pretrained(
+    pretrained_repo_name, use_auth_token="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", revision=pretrained_revision)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 
+
+def prepare_image_for_encoder(image=None, output_image_size=None):
+    """
+    First, resizes the input image to fill as much as possible of the output image size
+    while preserving aspect ratio. Positions the resized image at (0,0) and fills
+    the rest of the gap space in the output image with black(0).
+    Args:
+        image: PIL image
+        output_image_size: (width, height) tuple
+    """
+    assert image is not None
+    assert output_image_size is not None
+    image.thumbnail(output_image_size)
+    oimg = Image.new(mode=image.mode, size=output_image_size, color=0)
+    oimg.paste(image, box=(0, 0))
+    return oimg
+
+
 def translate_point_coords_from_out_to_in(point=None, input_image_size=None, output_image_size=None):
     """
     Convert relative prediction coordinates from resized encoder tensor image
     to original input image size.
     Args:
-        original_point: x, y coordinates of the point coordinates in [0..1] range in the original image
+        original_point: x, y coordinates of the point coordinates in [0..1] range in the original image
         input_image_size: (width, height) tuple
         output_image_size: (width, height) tuple
-    """
+    """
     assert point is not None
     assert input_image_size is not None
     assert output_image_size is not None
-    # print(f"point={point}, input_image_size={input_image_size}, output_image_size={output_image_size}")
+    print(
+        f"point={point}, input_image_size={input_image_size}, output_image_size={output_image_size}")
     input_width, input_height = input_image_size
     output_width, output_height = output_image_size
-
+
     ratio = min(output_width/input_width, output_height/input_height)
-
+
     resized_height = int(input_height*ratio)
-    # print(f'>>> resized_height={resized_height}')
     resized_width = int(input_width*ratio)
-    # print(f'>>> resized_width={resized_width}')
+    print(f'>>> resized_width={resized_width}')
+    print(f'>>> resized_height={resized_height}')
 
     if resized_height == input_height and resized_width == input_width:
         return
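
The hunk above disables every geometric step of the Donut image processor (do_align_long_axis, do_resize, do_thumbnail, do_pad, plus do_rescale) and moves that responsibility into the new prepare_image_for_encoder helper, so the app controls exactly where the screenshot lands on the encoder canvas. A minimal sketch of how the pieces are meant to fit together, assuming the processor settings above; the file name and the RGB conversion are illustrative, not from the commit:

    from PIL import Image

    # Encoder canvas size comes from the processor config
    # (a {'height': ..., 'width': ...} dict in this transformers version).
    encoder_size = (processor.image_processor.size['width'],
                    processor.image_processor.size['height'])

    # Hypothetical input; the helper letterboxes it at (0, 0) over black padding.
    image = Image.open("screenshot.png").convert("RGB")
    image = prepare_image_for_encoder(image, output_image_size=encoder_size)

    # With the geometric steps disabled, the processor is left to tensorize
    # (and, per the remaining config, normalize) the already-prepared image.
    pixel_values = processor(image, return_tensors="pt").pixel_values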
@@ -51,8 +81,9 @@ def translate_point_coords_from_out_to_in(point=None, input_image_size=None, out
     if resized_height < output_height:
         # adjust for padding pixels
         point['y'] *= (output_height / resized_height)
-    # print(f"translated point={point}, resized_image_size: {resized_width, resized_height}")
-
+    print(
+        f"translated point={point}, resized_image_size: {resized_width, resized_height}")
+
 
 def process_refexp(image: Image, prompt: str):
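
translate_point_coords_from_out_to_in undoes that letterboxing on the way back out: the model predicts coordinates relative to the padded encoder canvas, and because the image is pasted at (0, 0), padding only extends right and down, so scaling by canvas size over resized size restores image-relative coordinates. A worked example with hypothetical numbers (assuming the x branch, elided from this diff, mirrors the y branch shown here):

    # A 1000x500 screenshot on a 1280x960 encoder canvas:
    # ratio = min(1280/1000, 960/500) = 1.28, so the pasted image is 1280x640
    # and the bottom 320 rows of the canvas are black padding.
    point = {'x': 0.5, 'y': 0.25}
    translate_point_coords_from_out_to_in(
        point=point, input_image_size=(1000, 500), output_image_size=(1280, 960))
    print(point)  # x unchanged (no horizontal padding); y = 0.25 * 960/640 = 0.375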
 
@@ -125,9 +156,11 @@ def process_refexp(image: Image, prompt: str):
     print(f"processed prompt: {prompt}")
 
     # convert coordinates from tensor image size to input image size
-    out_size = (processor.image_processor.size[1], processor.image_processor.size[0])
-    translate_point_coords_from_out_to_in(point=center_point, input_image_size=image.size, output_image_size=out_size)
-
+    out_size = (
+        processor.image_processor.size['width'], processor.image_processor.size['height'])
+    translate_point_coords_from_out_to_in(
+        point=center_point, input_image_size=image.size, output_image_size=out_size)
+
     x = math.floor(width*center_point["x"])
     y = math.floor(height*center_point["y"])
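
The out_size change tracks a transformers API shift: the left side of the hunk indexes image_processor.size as a height-first sequence (size[1], size[0]), while the right side reads it as a {'height': ..., 'width': ...} dict, the form newer releases expose. If the app had to run against both versions, a hedged compatibility shim could look like this (assumption: size takes only these two forms, as the two sides of the hunk show):

    size = processor.image_processor.size
    if isinstance(size, dict):
        out_size = (size['width'], size['height'])
    else:
        out_size = (size[1], size[0])  # older form stores (height, width)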
 
@@ -183,4 +216,4 @@ demo = gr.Interface(fn=process_refexp,
                     cache_examples=False
                     )
 
-demo.launch()
+demo.launch(share=True)
 