kh-CHEUNG committed on
Commit 4eea76c · verified · 1 Parent(s): 8e5abee

Update app.py

Files changed (1)
  1. app.py +46 -2
app.py CHANGED
@@ -1,11 +1,45 @@
+import numpy as np
+import re
 import streamlit as st
 import torch
 from transformers import AutoProcessor, UdopForConditionalGeneration
-from PIL import Image
+from PIL import Image, ImageDraw
 # from datasets import load_dataset
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+# UDOP uses 501 special loc ("location") tokens
+LAYOUT_VOCAB_SIZE = 501
+
+def extract_coordinates(string):
+    # Using regular expression to find all numbers in the string
+    numbers = re.findall(r'\d+', string)
+
+    # Converting the numbers to integers
+    numbers = list(map(int, numbers))
+
+    # Ensuring there are exactly 4 numbers
+    if len(numbers) != 4:
+        numbers = numbers[-4:]
+
+    # Extracting coordinates
+    x1, y1, x2, y2 = numbers
+
+    return [x1, y1, x2, y2]
+
+
+def unnormalize_box(box, image_width, image_height):
+    x1 = box[0] / LAYOUT_VOCAB_SIZE * image_width
+    y1 = box[1] / LAYOUT_VOCAB_SIZE * image_height
+    x2 = box[2] / LAYOUT_VOCAB_SIZE * image_width
+    y2 = box[3] / LAYOUT_VOCAB_SIZE * image_height
+    return [x1, y1, x2, y2]
+
+
+# Get the coordinates from the output text and denormalize them
+coordinates = extract_coordinates(output_text)
+coordinates = unnormalize_box(coordinates, unnormalized_image.width, unnormalized_image.height)
+
 processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=True)
 model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large")
 
@@ -53,11 +87,21 @@ with col2:
         match task_type:
             case "Classification":
                 output_text = processor.batch_decode(model_output, skip_special_tokens=True)[0]
+                st.write(output_text)
             case "Question Answering":
                 output_text = processor.batch_decode(model_output, skip_special_tokens=True)[0]
+                st.write(output_text)
             case "Layout Analysis":
                 output_text = processor.batch_decode(model_output, skip_special_tokens=False)[0]
-        st.write(output_text)
+                mean = processor.image_processor.image_mean
+                std = processor.image_processor.image_std
+                unnormalized_image = (encoding.pixel_values.squeeze().numpy() * np.array(std)[:, None, None]) + np.array(mean)[:, None, None]
+                unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
+                unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
+                unnormalized_image = Image.fromarray(unnormalized_image)
+                draw = ImageDraw.Draw(unnormalized_image)
+                draw.rectangle(coordinates, outline="red")
+                st.image(unnormalized_image, caption="Output Image")
     elif testButton and selected_file == "None":
         st.write("Please upload and select a document (/an image).")
 
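
For reference on the two helpers introduced above: UDOP emits layout as `<loc_k>` tokens, with k indexing the 501-entry layout vocabulary that the diff names LAYOUT_VOCAB_SIZE, so a predicted box has to be rescaled by the page's width and height before it can be drawn. A minimal usage sketch, assuming `extract_coordinates` and `unnormalize_box` from the diff are in scope; the decoded string and the 1000x800 page size are invented for illustration:

# Hypothetical decoded output for a layout prompt; the loc values are made up.
sample_output = "</s><loc_100><loc_250><loc_300><loc_400>"

box = extract_coordinates(sample_output)     # -> [100, 250, 300, 400]
pixel_box = unnormalize_box(box, 1000, 800)  # -> approx. [199.6, 399.2, 598.8, 638.7]
print(pixel_box)

Note that this only works because the "Layout Analysis" case decodes with skip_special_tokens=False: the loc tokens survive decoding, which is what gives the regex its four numbers to find.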
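
One thing to note about the hunks above: `coordinates = extract_coordinates(output_text)` and the `unnormalize_box` call are added at module level, where `output_text` and `unnormalized_image` are not yet defined; their inputs only become available inside the "Layout Analysis" case, after decoding. Below is a hedged sketch of that branch with the steps in dependency order. It assumes `processor`, `encoding`, `model_output`, and the two helpers are already defined in app.py (with numpy, PIL, and streamlit imported as in the diff), and otherwise follows the committed code line for line:

# Sketch only: the "Layout Analysis" branch with every value computed before it is used.
output_text = processor.batch_decode(model_output, skip_special_tokens=False)[0]

# Undo the processor's image normalization so the (resized) page can be displayed.
mean = processor.image_processor.image_mean
std = processor.image_processor.image_std
pixels = encoding.pixel_values.squeeze().numpy() * np.array(std)[:, None, None] + np.array(mean)[:, None, None]
pixels = np.moveaxis((pixels * 255).astype(np.uint8), 0, -1)
unnormalized_image = Image.fromarray(pixels)

# Map the predicted loc-token box onto this image's pixel grid, then draw it.
coordinates = extract_coordinates(output_text)
coordinates = unnormalize_box(coordinates, unnormalized_image.width, unnormalized_image.height)
draw = ImageDraw.Draw(unnormalized_image)
draw.rectangle(coordinates, outline="red")
st.image(unnormalized_image, caption="Output Image")

Because the box is drawn on the processor-resized image (encoding.pixel_values), that image's width and height are the right scale factors for unnormalize_box; drawing on the original upload would require that file's dimensions instead.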