jordyvl committed on
Commit
7568689
1 Parent(s): 96f0e2c

Local test functional

Browse files
Files changed (2) hide show
  1. Arial.ttf +0 -0
  2. app.py +54 -49
Arial.ttf ADDED
Binary file (276 kB). View file
 
app.py CHANGED
@@ -1,12 +1,13 @@
1
  import os
2
- from pathlib import Path
3
- import pandas as pd
4
  import gradio as gr
5
  from collections import OrderedDict
6
  from PIL import Image, ImageDraw, ImageFont
7
  from io import BytesIO
 
 
8
  import PyPDF2
9
  import pdf2image
 
10
 
11
  MAX_PAGES = 50
12
  MAX_PDF_SIZE = 100000000 # almost 100MB
@@ -81,7 +82,7 @@ def pdf_to_grid(pdf_path):
81
  images.append(im)
82
  except Exception as e:
83
  print(f"{pdf_path} PyPDF get_images {e}")
84
- images = pdf2image.convert_from_path(pdf_path)
85
 
86
  # simpler but slower
87
  # images = pdf2image.convert_from_path(pdf_path)
@@ -92,37 +93,27 @@ def pdf_to_grid(pdf_path):
92
  return equal_image_grid(images)
93
 
94
 
95
- def main(complexity, evidence, form, operation, type):
96
- # need to write a query on diagnostic test and sample from it based on slider values
97
- # then return the sample
98
- query = " and ".join(
99
- [
100
- f"{cat}_{val} == {True}"
101
- for cat, val in zip(meta_cats.keys(), [complexity, evidence, form, operation, type])
102
- if val
103
- ]
104
- )
105
- results = DIAGNOSTIC_TEST.query(query)
106
- if len(results) == 0:
107
- return f"No results found for query {query}", "", "", "", ""
108
-
109
- for i, sample in results.sample(frac=1).iterrows():
110
- if not sample['nhash']:
111
- continue
112
- print("Sampled: ", sample["nhash"])
113
 
114
- # first get PDF file
115
- PDF, grid = None, None
116
- pdf_path = PDF_PATH / "test" / (sample["nhash"] + ".pdf")
117
- if not os.path.exists(pdf_path):
 
118
  continue
119
- PDF = pdf_path
120
- grid = pdf_to_grid(pdf_path)
121
- if not grid:
122
  continue
123
- question, answer = sample["label"] #might need to translate
124
-
125
- return label, grid, PDF
 
126
 
127
  _CLASSES = [
128
  "letter",
@@ -141,25 +132,23 @@ _CLASSES = [
141
  "questionnaire",
142
  "resume",
143
  "memo",
 
144
  ]
145
- # test
146
- # l, im, f = main(*slider_defaults)
147
 
148
- #load both datasets in memory? --> easier retrieval afterwards with seed index based on pressing button
149
- DATASETS = {
150
- 'rvl_cdip': load_dataset(
151
- "bdpc/rvl_cdip_mp",
152
- split="test"),
153
- 'rvl_cdip_N': load_dataset(
154
- "bdpc/rvl_cdip_mp",
155
- split="test")
156
- }
157
-
158
- meta_cats = {'dataset': ['rvl_cdip', 'rvl_cdip_N'],
159
- 'label': _CLASSES
160
- }
161
  sliders = [gr.Dropdown(choices=choices, value=choices[-1], label=label) for label, choices in meta_cats.items()]
162
- slider_defaults = [slider.value for slider in sliders]
 
 
 
163
 
164
  outputs = [
165
  gr.Textbox(label="label"),
@@ -167,5 +156,21 @@ outputs = [
167
  gr.File(label="PDF"),
168
  ]
169
 
170
- iface = gr.Interface(fn=main, inputs=sliders, outputs=outputs, description="Visualize PDF samples from multi-page (PDF) document classification datasets", title='Beyond Document Page Classification: Examples')
171
- iface.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
 
2
  import gradio as gr
3
  from collections import OrderedDict
4
  from PIL import Image, ImageDraw, ImageFont
5
  from io import BytesIO
6
+ import time
7
+ import tempfile
8
  import PyPDF2
9
  import pdf2image
10
+ from datasets import load_dataset
11
 
12
  MAX_PAGES = 50
13
  MAX_PDF_SIZE = 100000000 # almost 100MB
 
82
  images.append(im)
83
  except Exception as e:
84
  print(f"{pdf_path} PyPDF get_images {e}")
85
+ images = pdf2image.convert_from_bytes(pdf_path)
86
 
87
  # simpler but slower
88
  # images = pdf2image.convert_from_path(pdf_path)
 
93
  return equal_image_grid(images)
94
 
95
 
96
def main(dataset, label):
    """Sample one document from ``dataset`` and return it for display.

    Args:
        dataset: Key into ``DATASETS`` selecting which split to sample from.
        label: Class name (an entry of ``_CLASSES``) to filter on. A falsy
            value (``None`` or ``""``) accepts a sample of any class.

    Returns:
        A ``(label, grid, pdf_path)`` tuple, one item per Gradio output:
        the class name of the sample, the value returned by ``pdf_to_grid``
        for it, and the path of a temporary ``.pdf`` file containing the
        document. When no usable sample is found, a message is returned in
        the first slot with ``None`` for the grid and the file.
    """
    # Seed from the wall clock so that each call gets a different shuffle.
    seed = int(time.time() * 1000) % 1000000
    shuffled_dataset = DATASETS[dataset].shuffle(buffer_size=10, seed=seed)

    for sample in shuffled_dataset:
        # The two datasets do not agree on the name of the label column.
        label_column = "label" if "label" in sample else "labels"
        filelabel = _CLASSES[sample[label_column]]
        if label and filelabel != label:
            continue

        # NOTE(review): assumed to be the raw PDF bytes, since it is wrapped
        # in BytesIO and written to a binary file below -- confirm.
        pdf_bytes = sample["file"]
        grid = pdf_to_grid(BytesIO(pdf_bytes))
        if grid is None:
            continue

        # The file output needs a path on disk, not the bytes themselves.
        # delete=False keeps the file after the handle is closed (closing
        # also flushes the content); the file is not cleaned up here.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
            pdf_file.write(pdf_bytes)
        return filelabel, grid, pdf_file.name

    # Always hand back three values so the interface outputs stay in sync.
    return f"No sample found for label {label!r} in dataset {dataset!r}", None, None
116
+
117
 
118
  _CLASSES = [
119
  "letter",
 
132
  "questionnaire",
133
  "resume",
134
  "memo",
135
+ ''
136
  ]
 
 
137
 
138
# Test splits of both datasets, opened with streaming=True and keyed by the
# name offered in the "dataset" dropdown.
DATASETS = OrderedDict()
DATASETS["rvl_cdip"] = load_dataset("bdpc/rvl_cdip_mp", split="test", streaming=True)
DATASETS["rvl_cdip_N"] = load_dataset("bdpc/rvl_cdip_n_mp", split="test", streaming=True)

# One dropdown per category; each dropdown starts on the last of its choices.
meta_cats = {
    "dataset": list(DATASETS),
    "label": _CLASSES,
}
sliders = [
    gr.Dropdown(choices=options, value=options[-1], label=category)
    for category, options in meta_cats.items()
]
# Default arguments for a manual call to main: default dataset, no label filter.
slider_defaults = [sliders[0].value, None]
149
+
150
+ # test
151
+ # l, im, f = main(*slider_defaults)
152
 
153
  outputs = [
154
  gr.Textbox(label="label"),
 
156
  gr.File(label="PDF"),
157
  ]
158
 
159
+ DESCRIPTION = """
160
+ Visualize PDF samples from multi-page (PDF) document classification datasets @ https://huggingface.co/datasets/bdpc
161
+
162
+ - **dataset**: dataset name
163
+ - **label**: label name
164
+
165
+ The first time that the app is launched, it will download the datasets, which can take a few minutes.
166
+ For fastest response, choose the rvl_cdip_N dataset, which is considerably smaller to iterate over.
167
+ """
168
+
169
# Build the Gradio interface around ``main`` and start serving it.
iface = gr.Interface(
    main,
    sliders,
    outputs,
    title="Beyond Document Page Classification: Examples",
    description=DESCRIPTION,
)
iface.launch(share=True)