jordyvl committed on
Commit
6c0ff19
1 Parent(s): 713451f

will not load due to hardcoded paths

Browse files
Files changed (3) hide show
  1. .gitattributes +0 -35
  2. Arial.ttf +0 -0
  3. app.py +228 -4
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Arial.ttf ADDED
Binary file (276 kB). View file
 
app.py CHANGED
@@ -1,9 +1,233 @@
 
 
 
1
  import gradio as gr
 
 
 
 
 
2
 
 
 
 
3
 
4
- def greet(name):
5
- return "Hello " + name + "!!"
6
 
 
7
 
8
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
9
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import pandas as pd
4
  import gradio as gr
5
+ from collections import OrderedDict
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ from io import BytesIO
8
+ import PyPDF2
9
+ import pdf2image
10
 
11
# Size limits used when handling PDFs.
MAX_PAGES = 50  # intended cap on the number of pages handled per PDF
MAX_PDF_SIZE = 100_000_000  # bytes (100 MB)
# Embedded images smaller than this in BOTH dimensions are skipped by pdf_to_grid.
MIN_WIDTH, MIN_HEIGHT = 150, 150

# Plan for this app:
#
#   * Load the diagnostic dataset.
#   * Keep a pointer to the local PDF/grid files.
#   * Visualize PDF/grid files based on the dropdown values and a (randomly)
#     sampled combination of them.
#
# --> truly interactive visualization of diagnostic samples and their questions

# Data locations. The defaults are the original author's local paths; set the
# environment variables to point the app at the data on any other machine.
PDF_PATH = Path(
    os.environ.get(
        "DUDE_PDF_PATH",
        "/home/jordy/Downloads/DUDE_train-val-test_binaries/PDF",
    )
)
DIAGNOSTIC_PATH = os.environ.get(
    "DUDE_DIAGNOSTIC_PATH",
    "/home/jordy/code/DUchallenge/DUeval/diagnostic_test-updated.csv",
)

# Human-readable labels for the answer-type identifiers.
answer_types = {
    "abstractive": "Abstractive",
    "extractive": "Extractive",
    "not-answerable": "Not Answerable",
    "list/abstractive": "Abstractive List",
    "list/extractive": "Extractive List",
}

# The diagnostic table stays None when the CSV is not available, so that
# importing this module does not fail on machines without the data.
DIAGNOSTIC_TEST = None
if os.path.exists(DIAGNOSTIC_PATH):
    DIAGNOSTIC_TEST = pd.read_csv(DIAGNOSTIC_PATH)
40
+
41
# Filterable diagnostic categories, in display order. Each list of values ends
# with None, which stands for "do not filter on this category".
meta_cats = OrderedDict(
    [
        ("complexity", ["meta", "multihop", "other_hard", "simple", None]),
        (
            "evidence",
            [
                "handwriting",
                "layout",
                "plain",
                "table_or_list",
                "visual_chart",
                "visual_checkbox",
                "visual_color",
                "visual_image",
                "visual_logo",
                "visual_map",
                "visual_other",
                "visual_signature",
                "visual_stamp",
                None,
            ],
        ),
        ("form", ["date", "numeric", "other", "proper", None]),
        ("operation", ["arithmetic", "comparison", "counting", "normalization", None]),
        ("type", ["abstractive", "extractive", None]),
    ]
)

# Column names of the diagnostic table: one "<category>_<value>" flag per entry
# of meta_cats (derived, so the two cannot drift apart), followed by the two
# numeric columns.
diagnostic_cats = [
    f"{category}_{value}"
    for category, values in meta_cats.items()
    for value in values
    if value is not None
] + ["num_pages", "num_tokens"]
96
+ # DIAGNOSTIC_TEST = DIAGNOSTIC_TEST[interest_cols + ["row_hash"]]
97
+
98
# One dropdown per diagnostic category, labelled with the category name. The
# last choice of every category is preselected.
sliders = []
for label, choices in meta_cats.items():
    sliders.append(gr.Dropdown(choices=choices, value=choices[-1], label=label))

# Fixed argument list, one entry per dropdown in the same order.
# (Alternative: [slider.value for slider in sliders].)
slider_defaults = [None, "visual_checkbox", None, None, None]
101
+
102
+
103
def compute_grid(n, max_cols=6):
    """Return ``(rows, cols)`` of a near-square grid holding at least ``n`` cells.

    The column count is ``ceil(sqrt(n))`` capped at ``max_cols``; the row count
    is the smallest one that fits all ``n`` cells. ``n <= 0`` yields ``(0, 0)``.
    """
    if n <= 0:
        return 0, 0
    cols = int(n**0.5)
    if cols * cols < n:
        cols += 1  # round the square root up
    cols = max(1, min(cols, max_cols))
    rows = -(-n // cols)  # ceiling division
    return rows, cols


def equal_image_grid(images):
    """Tile ``images`` into a single RGB grid image.

    Images with a zero width or height are dropped. The remaining images are
    rescaled (aspect ratio preserved) to the smallest width among them and
    pasted row by row into cells as wide as that width and as tall as the
    tallest rescaled image.

    Args:
        images: iterable of PIL images.

    Returns:
        The grid as a new PIL image, or None when no usable image remains.
    """
    # Drop degenerate images first so the grid is sized for what is pasted.
    usable = [im for im in images if im.height > 0 and im.width > 0]
    if not usable:
        return None

    min_width = min(im.width for im in usable)
    resized = [
        im.resize(
            # Clamp the height so extremely wide images do not round down to 0.
            (min_width, max(1, int(im.height * min_width / im.width))),
            resample=Image.BICUBIC,
        )
        for im in usable
    ]

    cell_w = min_width
    cell_h = max(im.height for im in resized)
    rows, cols = compute_grid(len(resized))

    grid = Image.new("RGB", size=(cols * cell_w, rows * cell_h))
    for index, im in enumerate(resized):
        grid.paste(im, box=((index % cols) * cell_w, (index // cols) * cell_h))
    return grid
134
+
135
+
136
def add_pagenumbers(im_list, height_scale=40):
    """Draw a 1-based running number near the bottom-right corner of each image.

    The images are modified in place; nothing is returned.

    Args:
        im_list: sequence of PIL images; the number drawn is the position in
            this sequence plus one.
        height_scale: divisor applied to the geometric mean of width and height
            to obtain the font size (larger value -> smaller digits).
    """
    fonts = {}  # font objects keyed by size, so equal-sized images share one

    for index, image in enumerate(im_list):
        width, height = image.size
        # Clamp to 1: very small or thin images would otherwise yield size 0.
        fontsize = max(1, int((width * height) ** 0.5 / height_scale))
        if fontsize not in fonts:
            # NOTE(review): "Arial.ttf" is resolved by Pillow's font lookup,
            # not relative to this file - confirm the font is found when the
            # app is started from another directory.
            fonts[fontsize] = ImageFont.truetype("Arial.ttf", fontsize)
        margin = 2 * fontsize
        ImageDraw.Draw(image).text(
            (width - margin, height - margin),
            str(index + 1),
            fill="#D00917",
            font=fonts[fontsize],
            spacing=4,
            align="right",
        )
154
+
155
+
156
def pdf_to_grid(pdf_path):
    """Build one grid image from the content of a PDF.

    The images embedded in the first MAX_PAGES pages are extracted with PyPDF2;
    images smaller than MIN_WIDTH x MIN_HEIGHT in both dimensions are skipped.
    If anything goes wrong during extraction, the pages are rendered with
    pdf2image instead (slower, but simpler). The collected images are numbered
    and tiled into a grid.

    Args:
        pdf_path: path of the PDF file.

    Returns:
        The grid image, or None when no image could be obtained.
    """
    images = []
    try:
        # Opening the reader inside the try lets unreadable PDFs reach the
        # pdf2image fallback as well.
        reader = PyPDF2.PdfReader(pdf_path)
        for page_index, page in enumerate(reader.pages):
            if page_index >= MAX_PAGES:
                break
            for embedded in page.images:
                im = Image.open(BytesIO(embedded.data))
                if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
                    continue
                # Image.open is lazy; decode now so corrupt image data is
                # caught here rather than later while drawing.
                im.load()
                images.append(im)
    except Exception as e:
        # Deliberate best-effort: report the problem and render the pages.
        print(f"{pdf_path} PyPDF get_images {e}")
        images = pdf2image.convert_from_path(pdf_path, last_page=MAX_PAGES)

    if not images:
        return None
    # NOTE(review): numbers follow the order of the collected images; with
    # several embedded images per page they are not page numbers - confirm.
    add_pagenumbers(images)
    return equal_image_grid(images)
182
+
183
+
184
def main(complexity, evidence, form, operation, type):
    """Sample one diagnostic question matching the selected categories.

    Each argument is the selected value of the category with the same name in
    ``meta_cats``; a falsy value (None) means "do not filter on this category".
    Matching rows of the diagnostic table are visited in random order until one
    is found whose PDF exists under ``PDF_PATH / "test"`` and can be turned
    into a grid image.

    Returns:
        A 5-tuple ``(question, answer, diagnostics, grid, pdf_path)`` matching
        the five output components. When nothing can be shown, the first
        element carries an explanatory message, the two text fields are empty
        and the image and file are None.
    """

    def failure(message):
        # Always return five values, one per output component.
        return message, "", "", None, None

    if DIAGNOSTIC_TEST is None:
        return failure(f"Diagnostic dataset not found at {DIAGNOSTIC_PATH}")

    selected = [
        (cat, val)
        for cat, val in zip(meta_cats, [complexity, evidence, form, operation, type])
        if val
    ]
    # The values are interpolated into a DataFrame.query expression, so accept
    # only the known choices.
    for cat, val in selected:
        if val not in meta_cats[cat]:
            return failure(f"Unknown value {val!r} for category {cat}")

    query = " and ".join(f"{cat}_{val} == {True}" for cat, val in selected)
    # DataFrame.query rejects an empty expression; no filter means all rows.
    results = DIAGNOSTIC_TEST.query(query) if query else DIAGNOSTIC_TEST
    if len(results) == 0:
        return failure(f"No results found for query {query}")

    for _, sample in results.sample(frac=1).iterrows():
        print("Sampled: ", sample)

        # First get the PDF file; skip samples whose PDF is not available.
        pdf_path = PDF_PATH / "test" / (sample["nhash"] + ".pdf")
        if not os.path.exists(pdf_path):
            continue

        # Open and visualize as a grid image.
        grid = pdf_to_grid(pdf_path)
        if grid is None:
            continue

        # Names of the diagnostic columns whose value is truthy for this sample.
        diagnostics = ", ".join([cat for cat in diagnostic_cats if sample[cat]])
        return sample["question"], sample["answer"], diagnostics, grid, pdf_path

    return failure(f"No readable PDF found for query {query}")
218
+
219
+
220
# Smoke test: run one query before starting the UI. It is skipped when the
# diagnostic dataset could not be loaded, and the result is not unpacked, so a
# missing dataset or an empty result cannot stop the app from starting.
if DIAGNOSTIC_TEST is not None:
    main(*slider_defaults)


# Output components, in the order of the values returned by main.
outputs = [
    gr.Textbox(label="question"),
    gr.Textbox(label="answer"),
    gr.Textbox(label="diagnostics"),
    gr.Image(label="image grid of PDF"),
    gr.File(label="PDF"),
]

iface = gr.Interface(fn=main, inputs=sliders, outputs=outputs, description="Visualize diagnostic samples from DUDE")
iface.launch(share=False)