Spaces:

ivelin
/

ui-refexp

Runtime error

App Files Files Community

ivelin committed on Jan 9, 2023

Commit

e0dd23e

•

1 Parent(s): 1952dd0

fix: result rendering

Browse files

Signed-off-by: ivelin <ivelin.eth@gmail.com>

Files changed (1) hide show

app.py +22 -3

app.py CHANGED Viewed

@@ -13,13 +13,13 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
-def process_document(image, question):
     # prepare encoder inputs
     pixel_values = processor(image, return_tensors="pt").pixel_values
     # prepare decoder inputs
     task_prompt = "<s_refexp><s_prompt>{user_input}</s_prompt><s_refexp>"
-    prompt = task_prompt.replace("{user_input}", question)
     decoder_input_ids = processor.tokenizer(
         prompt, add_special_tokens=False, return_tensors="pt").input_ids
@@ -43,8 +43,27 @@ def process_document(image, question):
         processor.tokenizer.pad_token, "")
     # remove first task start token
     sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
-    return processor.token2json(sequence)
 description = "Gradio Demo for Donut RefExp task, an instance of `VisionEncoderDecoderModel` fine-tuned on UIBert RefExp Dataset (UI Referring Expression). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."

 model.to(device)
+def process_document(image, prompt):
     # prepare encoder inputs
     pixel_values = processor(image, return_tensors="pt").pixel_values
     # prepare decoder inputs
     task_prompt = "<s_refexp><s_prompt>{user_input}</s_prompt><s_refexp>"
+    prompt = task_prompt.replace("{user_input}", prompt)
     decoder_input_ids = processor.tokenizer(
         prompt, add_special_tokens=False, return_tensors="pt").input_ids
         processor.tokenizer.pad_token, "")
     # remove first task start token
     sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
+    bbox = processor.token2json(sequence)
+    print(f"predicted bounding box: {bbox}")
+    width, height = image.size
+    print(f"image width, height: {width, height}")
+    print(f"prompt: {sample['prompt']}")
+    xmin = math.floor(width*bbox["xmin"])
+    ymin = math.floor(height*bbox["ymin"])
+    xmax = math.floor(width*bbox["xmax"])
+    ymax = math.floor(height*bbox["ymax"])
+    print(
+        f"to image pixel values: xmin, ymin, xmax, ymax: {xmin, ymin, xmax, ymax}")
+    shape = [(xmin, ymin), (xmax, ymax)]
+    # create rectangle image
+    img1 = ImageDraw.Draw(image)
+    img1.rectangle(shape, outline="green", width=5)
+    return image, bbox
 description = "Gradio Demo for Donut RefExp task, an instance of `VisionEncoderDecoderModel` fine-tuned on UIBert RefExp Dataset (UI Referring Expression). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."