kh-CHEUNG committed on
Commit 4eea76c · verified · 1 Parent(s): 8e5abee

Update app.py

Files changed (1)
  1. app.py +46 -2
app.py CHANGED
@@ -1,11 +1,45 @@
+import numpy as np
+import re
 import streamlit as st
 import torch
 from transformers import AutoProcessor, UdopForConditionalGeneration
-from PIL import Image
+from PIL import Image, ImageDraw
 # from datasets import load_dataset
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+# UDOP uses 501 special loc ("location") tokens
+LAYOUT_VOCAB_SIZE = 501
+
+def extract_coordinates(string):
+    # Using regular expression to find all numbers in the string
+    numbers = re.findall(r'\d+', string)
+
+    # Converting the numbers to integers
+    numbers = list(map(int, numbers))
+
+    # Ensuring there are exactly 4 numbers
+    if len(numbers) != 4:
+        numbers = numbers[-4:]
+
+    # Extracting coordinates
+    x1, y1, x2, y2 = numbers
+
+    return [x1, y1, x2, y2]
+
+
+def unnormalize_box(box, image_width, image_height):
+    x1 = box[0] / LAYOUT_VOCAB_SIZE * image_width
+    y1 = box[1] / LAYOUT_VOCAB_SIZE * image_height
+    x2 = box[2] / LAYOUT_VOCAB_SIZE * image_width
+    y2 = box[3] / LAYOUT_VOCAB_SIZE * image_height
+    return [x1, y1, x2, y2]
+
+
+# Get the coordinates from the output text and denormalize them
+coordinates = extract_coordinates(output_text)
+coordinates = unnormalize_box(coordinates, unnormalized_image.width, unnormalized_image.height)
+
 processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=True)
 model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large")
 
@@ -53,11 +87,21 @@ with col2:
         match task_type:
             case "Classification":
                 output_text = processor.batch_decode(model_output, skip_special_tokens=True)[0]
+                st.write(output_text)
             case "Question Answering":
                 output_text = processor.batch_decode(model_output, skip_special_tokens=True)[0]
+                st.write(output_text)
             case "Layout Analysis":
                 output_text = processor.batch_decode(model_output, skip_special_tokens=False)[0]
-        st.write(output_text)
+                mean = processor.image_processor.image_mean
+                std = processor.image_processor.image_std
+                unnormalized_image = (encoding.pixel_values.squeeze().numpy() * np.array(std)[:, None, None]) + np.array(mean)[:, None, None]
+                unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
+                unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
+                unnormalized_image = Image.fromarray(unnormalized_image)
+                draw = ImageDraw.Draw(unnormalized_image)
+                draw.rectangle(coordinates, outline="red")
+                st.image(unnormalized_image, caption="Output Image")
     elif testButton and selected_file == "None":
         st.write("Please upload and select a document (/an image).")
 
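
For reference on the two helpers introduced above: UDOP emits layout as `<loc_k>` tokens, with k indexing the 501-entry layout vocabulary that the diff names LAYOUT_VOCAB_SIZE, so a predicted box has to be rescaled by the page's width and height before it can be drawn. A minimal usage sketch, assuming `extract_coordinates` and `unnormalize_box` from the diff are in scope; the decoded string and the 1000x800 page size are invented for illustration:

# Hypothetical decoded output for a layout prompt; the loc values are made up.
sample_output = "</s><loc_100><loc_250><loc_300><loc_400>"

box = extract_coordinates(sample_output)     # -> [100, 250, 300, 400]
pixel_box = unnormalize_box(box, 1000, 800)  # -> approx. [199.6, 399.2, 598.8, 638.7]
print(pixel_box)

Note that this only works because the "Layout Analysis" case decodes with skip_special_tokens=False: the loc tokens survive decoding, which is what gives the regex its four numbers to find.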
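
One thing to note about the hunks above: `coordinates = extract_coordinates(output_text)` and the `unnormalize_box` call are added at module level, where `output_text` and `unnormalized_image` are not yet defined; their inputs only become available inside the "Layout Analysis" case, after decoding. Below is a hedged sketch of that branch with the steps in dependency order. It assumes `processor`, `encoding`, `model_output`, and the two helpers are already defined in app.py (with numpy, PIL, and streamlit imported as in the diff), and otherwise follows the committed code line for line:

# Sketch only: the "Layout Analysis" branch with every value computed before it is used.
output_text = processor.batch_decode(model_output, skip_special_tokens=False)[0]

# Undo the processor's image normalization so the (resized) page can be displayed.
mean = processor.image_processor.image_mean
std = processor.image_processor.image_std
pixels = encoding.pixel_values.squeeze().numpy() * np.array(std)[:, None, None] + np.array(mean)[:, None, None]
pixels = np.moveaxis((pixels * 255).astype(np.uint8), 0, -1)
unnormalized_image = Image.fromarray(pixels)

# Map the predicted loc-token box onto this image's pixel grid, then draw it.
coordinates = extract_coordinates(output_text)
coordinates = unnormalize_box(coordinates, unnormalized_image.width, unnormalized_image.height)
draw = ImageDraw.Draw(unnormalized_image)
draw.rectangle(coordinates, outline="red")
st.image(unnormalized_image, caption="Output Image")

Because the box is drawn on the processor-resized image (encoding.pixel_values), that image's width and height are the right scale factors for unnormalize_box; drawing on the original upload would require that file's dimensions instead.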