Spaces:
Runtime error
Runtime error
ivelin
committed on
Commit
•
e0dd23e
1
Parent(s):
1952dd0
fix: result rendering
Browse files
Signed-off-by: ivelin <ivelin.eth@gmail.com>
app.py
CHANGED
@@ -13,13 +13,13 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
13 |
model.to(device)
|
14 |
|
15 |
|
16 |
-
def process_document(image,
|
17 |
# prepare encoder inputs
|
18 |
pixel_values = processor(image, return_tensors="pt").pixel_values
|
19 |
|
20 |
# prepare decoder inputs
|
21 |
task_prompt = "<s_refexp><s_prompt>{user_input}</s_prompt><s_refexp>"
|
22 |
-
prompt = task_prompt.replace("{user_input}",
|
23 |
decoder_input_ids = processor.tokenizer(
|
24 |
prompt, add_special_tokens=False, return_tensors="pt").input_ids
|
25 |
|
@@ -43,8 +43,27 @@ def process_document(image, question):
|
|
43 |
processor.tokenizer.pad_token, "")
|
44 |
# remove first task start token
|
45 |
sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
|
|
|
|
|
46 |
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
|
50 |
description = "Gradio Demo for Donut RefExp task, an instance of `VisionEncoderDecoderModel` fine-tuned on UIBert RefExp Dataset (UI Referring Expression). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
|
|
|
13 |
model.to(device)
|
14 |
|
15 |
|
16 |
+
def process_document(image, prompt):
|
17 |
# prepare encoder inputs
|
18 |
pixel_values = processor(image, return_tensors="pt").pixel_values
|
19 |
|
20 |
# prepare decoder inputs
|
21 |
task_prompt = "<s_refexp><s_prompt>{user_input}</s_prompt><s_refexp>"
|
22 |
+
prompt = task_prompt.replace("{user_input}", prompt)
|
23 |
decoder_input_ids = processor.tokenizer(
|
24 |
prompt, add_special_tokens=False, return_tensors="pt").input_ids
|
25 |
|
|
|
43 |
processor.tokenizer.pad_token, "")
|
44 |
# remove first task start token
|
45 |
sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
|
46 |
+
bbox = processor.token2json(sequence)
|
47 |
+
print(f"predicted bounding box: {bbox}")
|
48 |
|
49 |
+
width, height = image.size
|
50 |
+
print(f"image width, height: {width, height}")
|
51 |
+
print(f"prompt: {sample['prompt']}")
|
52 |
+
|
53 |
+
xmin = math.floor(width*bbox["xmin"])
|
54 |
+
ymin = math.floor(height*bbox["ymin"])
|
55 |
+
xmax = math.floor(width*bbox["xmax"])
|
56 |
+
ymax = math.floor(height*bbox["ymax"])
|
57 |
+
|
58 |
+
print(
|
59 |
+
f"to image pixel values: xmin, ymin, xmax, ymax: {xmin, ymin, xmax, ymax}")
|
60 |
+
|
61 |
+
shape = [(xmin, ymin), (xmax, ymax)]
|
62 |
+
|
63 |
+
# create rectangle image
|
64 |
+
img1 = ImageDraw.Draw(image)
|
65 |
+
img1.rectangle(shape, outline="green", width=5)
|
66 |
+
return image, bbox
|
67 |
|
68 |
|
69 |
description = "Gradio Demo for Donut RefExp task, an instance of `VisionEncoderDecoderModel` fine-tuned on UIBert RefExp Dataset (UI Referring Expression). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
|