ivelin committed on
Commit
e0dd23e
1 Parent(s): 1952dd0

fix: result rendering

Browse files

Signed-off-by: ivelin <ivelin.eth@gmail.com>

Files changed (1) hide show
  1. app.py +22 -3
app.py CHANGED
@@ -13,13 +13,13 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
13
  model.to(device)
14
 
15
 
16
- def process_document(image, question):
17
  # prepare encoder inputs
18
  pixel_values = processor(image, return_tensors="pt").pixel_values
19
 
20
  # prepare decoder inputs
21
  task_prompt = "<s_refexp><s_prompt>{user_input}</s_prompt><s_refexp>"
22
- prompt = task_prompt.replace("{user_input}", question)
23
  decoder_input_ids = processor.tokenizer(
24
  prompt, add_special_tokens=False, return_tensors="pt").input_ids
25
 
@@ -43,8 +43,27 @@ def process_document(image, question):
43
  processor.tokenizer.pad_token, "")
44
  # remove first task start token
45
  sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
 
 
46
 
47
- return processor.token2json(sequence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
 
50
  description = "Gradio Demo for Donut RefExp task, an instance of `VisionEncoderDecoderModel` fine-tuned on UIBert RefExp Dataset (UI Referring Expression). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
 
13
  model.to(device)
14
 
15
 
16
+ def process_document(image, prompt):
17
  # prepare encoder inputs
18
  pixel_values = processor(image, return_tensors="pt").pixel_values
19
 
20
  # prepare decoder inputs
21
  task_prompt = "<s_refexp><s_prompt>{user_input}</s_prompt><s_refexp>"
22
+ prompt = task_prompt.replace("{user_input}", prompt)
23
  decoder_input_ids = processor.tokenizer(
24
  prompt, add_special_tokens=False, return_tensors="pt").input_ids
25
 
 
43
  processor.tokenizer.pad_token, "")
44
  # remove first task start token
45
  sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
46
+ bbox = processor.token2json(sequence)
47
+ print(f"predicted bounding box: {bbox}")
48
 
49
+ width, height = image.size
50
+ print(f"image width, height: {width, height}")
51
+ print(f"prompt: {prompt}")
52
+
53
+ xmin = math.floor(width*bbox["xmin"])
54
+ ymin = math.floor(height*bbox["ymin"])
55
+ xmax = math.floor(width*bbox["xmax"])
56
+ ymax = math.floor(height*bbox["ymax"])
57
+
58
+ print(
59
+ f"to image pixel values: xmin, ymin, xmax, ymax: {xmin, ymin, xmax, ymax}")
60
+
61
+ shape = [(xmin, ymin), (xmax, ymax)]
62
+
63
+ # create rectangle image
64
+ img1 = ImageDraw.Draw(image)
65
+ img1.rectangle(shape, outline="green", width=5)
66
+ return image, bbox
67
 
68
 
69
  description = "Gradio Demo for Donut RefExp task, an instance of `VisionEncoderDecoderModel` fine-tuned on UIBert RefExp Dataset (UI Referring Expression). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."