ui-refexp-click

Sleeping

App Files Files Community

ivelin committed on Feb 8, 2023

Commit

45a140a

•

1 Parent(s): ec06acb

fix: hf app prediction coordinate adjustment

Browse files

Signed-off-by: ivelin <ivelin.eth@gmail.com>

Files changed (3) hide show

Inference_Playground_Donut_UI_RefExp_Gradio.ipynb +0 -0
app.py +17 -12
requirements.txt +1 -0

Inference_Playground_Donut_UI_RefExp_Gradio.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

app.py CHANGED Viewed

@@ -19,8 +19,8 @@ processor.image_processor.do_align_long_axis = False
 processor.image_processor.do_resize = False
 processor.image_processor.do_thumbnail = False
 processor.image_processor.do_pad = False
-processor.image_processor.do_rescale = False
 print(f'processor image size: {processor.image_processor.size}')
 model = VisionEncoderDecoderModel.from_pretrained(
@@ -41,9 +41,10 @@ def prepare_image_for_encoder(image=None, output_image_size=None):
     """
     assert image is not None
     assert output_image_size is not None
-    image.thumbnail(output_image_size)
-    oimg = Image.new(mode=image.mode, size=output_image_size, color=0)
-    oimg.paste(image, box=(0, 0))
     return oimg
@@ -93,7 +94,12 @@ def process_refexp(image: Image, prompt: str):
     prompt = prompt[:80].lower()
     # prepare encoder inputs
-    pixel_values = processor(image, return_tensors="pt").pixel_values
     # prepare decoder inputs
     task_prompt = "<s_refexp><s_prompt>{user_input}</s_prompt><s_target_center>"
@@ -146,21 +152,20 @@ def process_refexp(image: Image, prompt: str):
     except ValueError:
         y = 0
     # replace str with float coords
-    center_point = {"x": x, "y": y, "decoder output sequence": sequence}
     print(f"predicted center_point with float coordinates: {center_point}")
-    print(f"image object: {image}")
-    print(f"image size: {image.size}")
-    width, height = image.size
-    print(f"image width, height: {width, height}")
     print(f"processed prompt: {prompt}")
     # convert coordinates from tensor image size to input image size
     out_size = (
         processor.image_processor.size['width'], processor.image_processor.size['height'])
     translate_point_coords_from_out_to_in(
-        point=center_point, input_image_size=image.size, output_image_size=out_size)
     x = math.floor(width*center_point["x"])
     y = math.floor(height*center_point["y"])

 processor.image_processor.do_resize = False
 processor.image_processor.do_thumbnail = False
 processor.image_processor.do_pad = False
+# processor.image_processor.do_rescale = False
+processor.image_processor.do_normalize = True
 print(f'processor image size: {processor.image_processor.size}')
 model = VisionEncoderDecoderModel.from_pretrained(
     """
     assert image is not None
     assert output_image_size is not None
+    img2 = image.copy()
+    img2.thumbnail(output_image_size)
+    oimg = Image.new(mode=img2.mode, size=output_image_size, color=0)
+    oimg.paste(img2, box=(0, 0))
     return oimg
     prompt = prompt[:80].lower()
     # prepare encoder inputs
+    out_size = (
+        processor.image_processor.size['width'], processor.image_processor.size['height'])
+    in_size = image.size
+    prepped_image = prepare_image_for_encoder(
+        image, output_image_size=out_size)
+    pixel_values = processor(prepped_image, return_tensors="pt").pixel_values
     # prepare decoder inputs
     task_prompt = "<s_refexp><s_prompt>{user_input}</s_prompt><s_target_center>"
     except ValueError:
         y = 0
     # replace str with float coords
+    center_point = {"x": x, "y": y,
+                    "decoder output sequence (before x,y adjustment)": sequence}
     print(f"predicted center_point with float coordinates: {center_point}")
+    print(f"input image size: {in_size}")
     print(f"processed prompt: {prompt}")
     # convert coordinates from tensor image size to input image size
     out_size = (
         processor.image_processor.size['width'], processor.image_processor.size['height'])
     translate_point_coords_from_out_to_in(
+        point=center_point, input_image_size=in_size, output_image_size=out_size)
+    width, height = in_size
     x = math.floor(width*center_point["x"])
     y = math.floor(height*center_point["y"])

requirements.txt CHANGED Viewed

@@ -2,3 +2,4 @@ torch
 git+https://github.com/huggingface/transformers.git
 sentencepiece
 Pillow

 git+https://github.com/huggingface/transformers.git
 sentencepiece
 Pillow
+gradio