Alex Strick van Linschoten committed on
Commit
f4f594a
1 Parent(s): c6ed7c3

add area calculations and delete model

Browse files
Files changed (2) hide show
  1. 2022-01-15-vfnet-post-self-train.pth +0 -3
  2. app.py +46 -4
2022-01-15-vfnet-post-self-train.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8db6b7adeef1d66f4e8684bdca6fb9fb4720ad149e4994e10a5af3e26bfc2507
3
- size 131183383
 
 
 
app.py CHANGED
@@ -13,7 +13,6 @@ from icevision.all import *
13
  from icevision.models.checkpoint import *
14
  from PIL import Image as PILImage
15
 
16
- # checkpoint_path = "./2022-01-15-vfnet-post-self-train.pth"
17
  checkpoint_path = "./allsynthetic-imgsize768.pth"
18
  checkpoint_and_model = model_from_checkpoint(checkpoint_path)
19
  model = checkpoint_and_model["model"]
@@ -33,11 +32,38 @@ learn = load_learner(
33
  labels = learn.dls.vocab
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def predict(pdf, confidence, generate_file):
37
  filename_without_extension = pdf.name[:-4]
38
  document = fitz.open(pdf.name)
39
  results = []
40
  images = []
 
41
  tmp_dir = tempfile.gettempdir()
42
  for page_num, page in enumerate(document, start=1):
43
  image_pixmap = page.get_pixmap()
@@ -77,6 +103,9 @@ def predict(pdf, confidence, generate_file):
77
  tmp_dir, filename_without_extension, "redacted_pages.pdf"
78
  )
79
  if generate_file:
 
 
 
80
  pdf = FPDF()
81
  pdf.set_auto_page_break(0)
82
  imagelist = sorted(
@@ -109,7 +138,11 @@ def predict(pdf, confidence, generate_file):
109
  font_size=16,
110
  label_color="#FF59D6",
111
  )
112
- print(pred_dict)
 
 
 
 
113
  pred_dict["img"].save(
114
  os.path.join(
115
  tmp_dir, filename_without_extension, f"pred-{image}"
@@ -123,10 +156,19 @@ def predict(pdf, confidence, generate_file):
123
  )
124
  pdf.output(report, "F")
125
 
126
- text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\n The redacted page numbers were: {', '.join(redacted_pages)}."
 
 
 
 
 
 
 
 
 
127
 
128
  if generate_file:
129
- return text_output, images, report
130
  else:
131
  return text_output, images, None
132
 
13
  from icevision.models.checkpoint import *
14
  from PIL import Image as PILImage
15
 
 
16
  checkpoint_path = "./allsynthetic-imgsize768.pth"
17
  checkpoint_and_model = model_from_checkpoint(checkpoint_path)
18
  model = checkpoint_and_model["model"]
32
  labels = learn.dls.vocab
33
 
34
 
35
def get_content_area(pred_dict) -> int:
    """Return the area of the first bounding box labelled "content".

    ``pred_dict["labels"]`` and ``pred_dict["bboxes"]`` are read as parallel
    sequences: the box for a label sits at the same position as the label.
    Each box must expose ``xmin``, ``xmax``, ``ymin`` and ``ymax`` attributes.

    Returns 0 when no label equals "content". When several boxes carry the
    "content" label, only the first one contributes to the result.
    """
    labels = pred_dict["labels"]
    if "content" in labels:
        # Gather every "content" box first, in label order.
        matching_boxes = []
        for position, name in enumerate(labels):
            if name == "content":
                matching_boxes.append(pred_dict["bboxes"][position])
        first_box = matching_boxes[0]
        width = first_box.xmax - first_box.xmin
        height = first_box.ymax - first_box.ymin
        return width * height
    return 0
45
+
46
+
47
def get_redaction_area(pred_dict) -> int:
    """Return the summed area of every bounding box labelled "redaction".

    ``pred_dict["labels"]`` and ``pred_dict["bboxes"]`` are read as parallel
    sequences: the box for a label sits at the same position as the label.
    Each box must expose ``xmin``, ``xmax``, ``ymin`` and ``ymax`` attributes.

    Returns 0 when no label equals "redaction". Box areas are simply added
    together, so any overlap between boxes is counted more than once.
    """
    labels = pred_dict["labels"]
    if "redaction" in labels:
        # Collect the matching boxes before measuring any of them.
        matching_boxes = []
        for position, name in enumerate(labels):
            if name == "redaction":
                matching_boxes.append(pred_dict["bboxes"][position])
        total_area = 0
        for box in matching_boxes:
            total_area = total_area + (box.xmax - box.xmin) * (box.ymax - box.ymin)
        return total_area
    return 0
59
+
60
+
61
  def predict(pdf, confidence, generate_file):
62
  filename_without_extension = pdf.name[:-4]
63
  document = fitz.open(pdf.name)
64
  results = []
65
  images = []
66
+ total_redacted_image_areas = 0
67
  tmp_dir = tempfile.gettempdir()
68
  for page_num, page in enumerate(document, start=1):
69
  image_pixmap = page.get_pixmap()
103
  tmp_dir, filename_without_extension, "redacted_pages.pdf"
104
  )
105
  if generate_file:
106
+ total_image_areas = 0
107
+ total_content_areas = 0
108
+ total_redaction_area = 0
109
  pdf = FPDF()
110
  pdf.set_auto_page_break(0)
111
  imagelist = sorted(
138
  font_size=16,
139
  label_color="#FF59D6",
140
  )
141
+
142
+ total_image_areas += pred_dict["width"] * pred_dict["height"]
143
+ total_content_areas += get_content_area(pred_dict)
144
+ total_redaction_area += get_redaction_area(pred_dict)
145
+
146
  pred_dict["img"].save(
147
  os.path.join(
148
  tmp_dir, filename_without_extension, f"pred-{image}"
156
  )
157
  pdf.output(report, "F")
158
 
159
+ total_redaction_proportion = round(
160
+ (total_redaction_area / total_image_areas) * 100, 1
161
+ )
162
+ content_redaction_proportion = round(
163
+ (total_redaction_area / total_content_areas) * 100, 1
164
+ )
165
+
166
+ text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\n The redacted page numbers were: {', '.join(redacted_pages)}. "
167
+
168
+ redaction_analysis = f"{total_redaction_proportion}% of the total area of the redacted pages was redacted. \n {content_redaction_proportion}% of the actual content of those redacted pages was redacted."
169
 
170
  if generate_file:
171
+ return text_output + redaction_analysis, images, report
172
  else:
173
  return text_output, images, None
174